In [1]:
import pandas as pd


# Feature Engineering

In [2]:
df=pd.read_csv('../_data/operations_inscope_CATEGORIES.csv',index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 102291 entries, 0 to 128030
Data columns (total 58 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   op_id              102291 non-null  int64  
 1   subject_id         102291 non-null  int64  
 2   hadm_id            102291 non-null  int64  
 3   opdate             102291 non-null  int64  
 4   age                102291 non-null  int64  
 5   sex                102291 non-null  object 
 6   weight             101186 non-null  float64
 7   height             101650 non-null  float64
 8   race               102291 non-null  object 
 9   asa                99438 non-null   float64
 10  emop               102291 non-null  int64  
 11  department         102291 non-null  object 
 12  antype             102291 non-null  object 
 13  icd10_pcs          102291 non-null  object 
 14  category_desc      102291 non-null  object 
 15  desc_short         102291 non-null  object 
 16  categor

## Identify LOS Outliers within each surgery type

We use `groupby`('icd10_pcs') groups the data by the unique values in the 'icd10_pcs'.  
 For each group, `transform` applies the calculate_percentile_90 function to values within that group.   

This group-specific 90th percentile is then used to determine if a value is an outlier within its category.
Lastly, the is_outlier column is created to indicate with 1 if the record is an outlier in its category or 0 otherwise. We also clean up by dropping the intermediate 'percentile_90' column.


In [3]:
import pandas as pd
import numpy as np

# Assuming df is your DataFrame, 'column_name' is the column you're examining,
# and 'category_column' is the column with the categorical data
df['LOS']=(df['discharge_time']-df['orout_time'])/1440 # LOS is time following surgery to dishcarge converted to days. 

# Define a function to calculate the 75th percentile for a series
def calculate_percentile_75(series):
    return series.quantile(0.90)

# Apply the function to each group's 'icd10_pcs' and create a new column for the 75th percentile
df['percentile_90'] = df.groupby('icd10_pcs')['LOS'].transform(calculate_percentile_75)

# Create a new column with 0/1 based on the condition, using the group-specific 75th percentile
df['is_outlier'] = np.where(df['LOS'] > df['percentile_90'], 1, 0)

# Drop the 'percentile_75' column if it's no longer needed
df.drop('percentile_90', axis=1, inplace=True)

df



Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,hb,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier
0,484069807,178742874,229842382,0,30,F,48.0,153.0,Asian,,...,8.7,,27.2,232.0,4.2,138.0,,5.62,4.131944,0
1,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,,...,11.0,18.0,38.1,124.0,4.2,140.0,0.5,,47.920139,1
2,478413008,133278262,277235295,0,35,F,54.0,,Asian,,...,12.7,,8.3,244.0,,,,17.62,3.760417,0
7,466411896,100259714,241547739,1440,50,F,66.0,157.0,Asian,2.0,...,,,,,,,,,2.447917,0
8,467425045,134213281,225860669,1440,60,F,62.0,154.0,Asian,1.0,...,,,,,,,,,3.493056,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128026,449124488,138484174,228449654,4999680,50,F,58.0,162.0,Asian,2.0,...,,23.7,,,3.8,134.0,,,8.378472,0
128027,461252752,126772283,273139806,2880,70,F,53.0,162.0,Asian,2.0,...,,,,,3.7,142.0,,,2.614583,0
128028,471834474,144363433,275833861,2880,65,F,51.0,152.0,Asian,2.0,...,,,,,3.8,143.0,,,5.447917,0
128029,419787421,195835964,293939099,12960,85,M,74.0,171.0,Asian,4.0,...,,22.6,,,3.9,137.0,,,9.309028,0


### Classify if LOS was "Prolonged"
* set business rule as Prolonged is an LOS that is greater 1 std dev than the average. 

In [4]:
import pandas as pd
import numpy as np

# Define functions to calculate the mean and standard deviation for a series
def calculate_mean(series):
    return series.mean()

def calculate_std(series):
    return series.std()

# Apply the functions to each group's 'column_name' to create new columns for the mean and standard deviation
df['group_mean'] = df.groupby('icd10_pcs')['LOS'].transform(calculate_mean)
df['group_std'] = df.groupby('icd10_pcs')['LOS'].transform(calculate_std)

# Define one standard deviation away from the mean
df['mean_plus_std'] = df['group_mean'] + df['group_std']

# Create a new column with 0/1 based on the condition, 
# using the group-specific mean and standard deviation
df['prolonged_LOS'] = np.where(df['LOS'] > df['group_std'], 1, 0)

# Drop the 'group_mean' and 'group_std' columns if they're no longer needed
df.drop(['group_mean', 'group_std', 'mean_plus_std'], axis=1, inplace=True)
df
# Optionally, if you still need the filtered DataFrame without outliers, you can filter the DataFrame
# df_filtered = df[(df['prolonged_LOS'] == 1 )&(df['is_outlier'] == 0) ]
# df_filtered


Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,prolonged_LOS
0,484069807,178742874,229842382,0,30,F,48.0,153.0,Asian,,...,,27.2,232.0,4.2,138.0,,5.62,4.131944,0,0
1,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,,...,18.0,38.1,124.0,4.2,140.0,0.5,,47.920139,1,1
2,478413008,133278262,277235295,0,35,F,54.0,,Asian,,...,,8.3,244.0,,,,17.62,3.760417,0,1
7,466411896,100259714,241547739,1440,50,F,66.0,157.0,Asian,2.0,...,,,,,,,,2.447917,0,1
8,467425045,134213281,225860669,1440,60,F,62.0,154.0,Asian,1.0,...,,,,,,,,3.493056,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128026,449124488,138484174,228449654,4999680,50,F,58.0,162.0,Asian,2.0,...,23.7,,,3.8,134.0,,,8.378472,0,0
128027,461252752,126772283,273139806,2880,70,F,53.0,162.0,Asian,2.0,...,,,,3.7,142.0,,,2.614583,0,1
128028,471834474,144363433,275833861,2880,65,F,51.0,152.0,Asian,2.0,...,,,,3.8,143.0,,,5.447917,0,1
128029,419787421,195835964,293939099,12960,85,M,74.0,171.0,Asian,4.0,...,22.6,,,3.9,137.0,,,9.309028,0,1


In [None]:
df

## Identify Subject that had ICU visit (and duration)

In [5]:
## Create flag if subject went to ICU

df['icu_visit'] = (df['icuin_time'] > 0).astype(int)
df[df['icu_visit']== True]

# df['ICU_dur']= (df['icuout_time']-df['icuin_time'])
# Elected to omit ICR duration as most are Nan and woud increase complexity to drop later.

## 13771  subjects went to ICU post op. 



Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,prolonged_LOS,icu_visit
1,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,,...,38.1,124.0,4.2,140.0,0.5,,47.920139,1,1,1
12,487807079,145247513,259299532,74880,25,M,47.0,172.0,Asian,2.0,...,,,,,,,23.131944,1,0,1
20,480242953,187733661,238035661,10080,55,F,46.0,152.0,Asian,2.0,...,,,3.3,139.0,,,16.302083,0,0,1
22,485183540,160631734,285012362,1440,70,M,68.0,172.0,Asian,3.0,...,,,4.1,139.0,,,6.295139,0,0,1
27,432945014,126077562,272790506,2880,50,F,61.0,149.0,Asian,2.0,...,,,,,3.8,,17.305556,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127986,494319208,150497300,289152391,3483360,80,F,52.0,150.0,Asian,4.0,...,10.6,227.0,3.3,141.0,0.5,15.79,60.829861,1,1,1
127989,474723417,152774881,253900635,14400,80,F,66.0,145.0,Asian,3.0,...,,,3.5,143.0,,,8.111111,0,0,1
127993,468666428,137181591,298918264,5760,65,F,61.0,158.0,Asian,2.0,...,,,3.5,136.0,,,7.152778,0,0,1
128008,465468763,150497300,289152391,3483360,80,F,52.0,150.0,Asian,4.0,...,10.6,227.0,3.3,141.0,0.5,15.79,61.020833,1,1,1


In [6]:
#########################
#
# Calculate OR Duration and Anesthetic Duration
#
#########################

#Operation Duration
df['or_duration']=(df['opend_time']-df['opstart_time'])

#Time under anasthetic
df['anesth_duration']=(df['anend_time']-df['anstart_time'])

df

Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,prolonged_LOS,icu_visit,or_duration,anesth_duration
0,484069807,178742874,229842382,0,30,F,48.0,153.0,Asian,,...,4.2,138.0,,5.62,4.131944,0,0,0,90.0,115.0
1,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,,...,4.2,140.0,0.5,,47.920139,1,1,1,170.0,195.0
2,478413008,133278262,277235295,0,35,F,54.0,,Asian,,...,,,,17.62,3.760417,0,1,0,100.0,125.0
7,466411896,100259714,241547739,1440,50,F,66.0,157.0,Asian,2.0,...,,,,,2.447917,0,1,0,45.0,70.0
8,467425045,134213281,225860669,1440,60,F,62.0,154.0,Asian,1.0,...,,,,,3.493056,0,1,0,70.0,90.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128026,449124488,138484174,228449654,4999680,50,F,58.0,162.0,Asian,2.0,...,3.8,134.0,,,8.378472,0,0,0,160.0,180.0
128027,461252752,126772283,273139806,2880,70,F,53.0,162.0,Asian,2.0,...,3.7,142.0,,,2.614583,0,1,0,60.0,75.0
128028,471834474,144363433,275833861,2880,65,F,51.0,152.0,Asian,2.0,...,3.8,143.0,,,5.447917,0,1,0,55.0,75.0
128029,419787421,195835964,293939099,12960,85,M,74.0,171.0,Asian,4.0,...,3.9,137.0,,,9.309028,0,1,1,130.0,170.0


In [7]:
## Drop extra column. Exclude Outliers from further analysis. 
cols_to_drop = ['is_outlier', 'art_dbp','chart_time_y','chart_time_x','inhosp_death_time','orin_time', 'orout_time', 'opstart_time', 'opend_time','admission_time', 'discharge_time','anstart_time' ,'anend_time', 'cpbon_time','cpboff_time','icuin_time','icuout_time', 'subject_id_y', 'chart_time_x','art_mbp','art_sbp','art_sbp','bt','cvp','alp','alt','ast', 'emop']
df = df[df['is_outlier'] == 0 ]
df = df.drop(columns=cols_to_drop)

df

Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,platelet,potassium,sodium,total_bilirubin,wbc,LOS,prolonged_LOS,icu_visit,or_duration,anesth_duration
0,484069807,178742874,229842382,0,30,F,48.0,153.0,Asian,,...,232.0,4.2,138.0,,5.62,4.131944,0,0,90.0,115.0
2,478413008,133278262,277235295,0,35,F,54.0,,Asian,,...,244.0,,,,17.62,3.760417,1,0,100.0,125.0
7,466411896,100259714,241547739,1440,50,F,66.0,157.0,Asian,2.0,...,,,,,,2.447917,1,0,45.0,70.0
8,467425045,134213281,225860669,1440,60,F,62.0,154.0,Asian,1.0,...,,,,,,3.493056,1,0,70.0,90.0
9,461473883,134195201,265770645,1440,35,F,50.0,160.0,Asian,1.0,...,124.0,3.9,138.0,0.6,6.31,4.236111,0,0,115.0,150.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128026,449124488,138484174,228449654,4999680,50,F,58.0,162.0,Asian,2.0,...,,3.8,134.0,,,8.378472,0,0,160.0,180.0
128027,461252752,126772283,273139806,2880,70,F,53.0,162.0,Asian,2.0,...,,3.7,142.0,,,2.614583,1,0,60.0,75.0
128028,471834474,144363433,275833861,2880,65,F,51.0,152.0,Asian,2.0,...,,3.8,143.0,,,5.447917,1,0,55.0,75.0
128029,419787421,195835964,293939099,12960,85,M,74.0,171.0,Asian,4.0,...,,3.9,137.0,,,9.309028,1,1,130.0,170.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91862 entries, 0 to 128030
Data columns (total 38 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   op_id            91862 non-null  int64  
 1   subject_id       91862 non-null  int64  
 2   hadm_id          91862 non-null  int64  
 3   opdate           91862 non-null  int64  
 4   age              91862 non-null  int64  
 5   sex              91862 non-null  object 
 6   weight           90910 non-null  float64
 7   height           91326 non-null  float64
 8   race             91862 non-null  object 
 9   asa              89532 non-null  float64
 10  department       91862 non-null  object 
 11  antype           91862 non-null  object 
 12  icd10_pcs        91862 non-null  object 
 13  category_desc    91862 non-null  object 
 14  desc_short       91862 non-null  object 
 15  category_id      91862 non-null  object 
 16  hr               79842 non-null  float64
 17  pip             

## Impute Medians for Missing Data
* cannot impute means across the whole data set (mix of male, female, age)
* means are imputed within each category (group be: sex, age, race)
* Note: this step is not performed in teh pipelines as important to find means for each 'group' of subject (ie: find mean platlet count for same gender, same age, same ASA)

In [None]:
## Confirm the column index that you want to include in the imput calcs.
df.info()

In [10]:
import pandas as pd
import numpy as np

# df_no_outlier = df[df['is_outlier'] == 0]
# Range to evaluate columns 32:58 (inclusive)

category_columns = ['age', 'sex', 'asa']  # asa is the id from Amercican Society of Anthropologists.
start_col = 17  # Start index 
end_col = 34    # End index (exclusive) 

# Iterate over each column in the specified range and apply groupby and transform
for col in df.iloc[:, start_col:end_col].columns:
    # Group by the categorical columns and fill NaN with the mean of the group for each specific column
    df[col] = df.groupby(category_columns)[col].transform(lambda x: x.fillna(x.mean()))

# Now df has the missing values imputed with the mean of their respective column within each group defined by 'category_columns'




In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91862 entries, 0 to 128030
Data columns (total 38 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   op_id            91862 non-null  int64  
 1   subject_id       91862 non-null  int64  
 2   hadm_id          91862 non-null  int64  
 3   opdate           91862 non-null  int64  
 4   age              91862 non-null  int64  
 5   sex              91862 non-null  object 
 6   weight           90910 non-null  float64
 7   height           91326 non-null  float64
 8   race             91862 non-null  object 
 9   asa              89532 non-null  float64
 10  department       91862 non-null  object 
 11  antype           91862 non-null  object 
 12  icd10_pcs        91862 non-null  object 
 13  category_desc    91862 non-null  object 
 14  desc_short       91862 non-null  object 
 15  category_id      91862 non-null  object 
 16  hr               79842 non-null  float64
 17  pip             

In [12]:
df = df.dropna()
df.shape


(76742, 38)

In [13]:
#########################
#
# Data EXport 
#
#########################
df.to_csv('../_data/operations_imputed_CLEAN_v2.csv')
