In [1]:
import pandas as pd


# Feature Engineering

In [2]:
df=pd.read_csv('../../_data/operations_inscope_CATEGORIES.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102291 entries, 0 to 102290
Data columns (total 59 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         102291 non-null  int64  
 1   op_id              102291 non-null  int64  
 2   subject_id         102291 non-null  int64  
 3   hadm_id            102291 non-null  int64  
 4   opdate             102291 non-null  int64  
 5   age                102291 non-null  int64  
 6   sex                102291 non-null  object 
 7   weight             101186 non-null  float64
 8   height             101650 non-null  float64
 9   race               102291 non-null  object 
 10  asa                99438 non-null   float64
 11  emop               102291 non-null  int64  
 12  department         102291 non-null  object 
 13  antype             102291 non-null  object 
 14  icd10_pcs          102291 non-null  object 
 15  category_desc      102291 non-null  object 
 16  de

## Identify LOS Outliers within each surgery type

We use `groupby`('icd10_pcs') groups the data by the unique values in the 'icd10_pcs'.  
 For each group, `transform` applies the calculate_percentile_75 function to values within that group.   
 This group-specific 75th percentile is then used to determine if a value is an outlier within its category.

Lastly, the is_outlier column is created to indicate with 1 if the record is an outlier in its category or 0 otherwise. We also clean up by dropping the intermediate 'percentile_75' column.


In [5]:
import pandas as pd
import numpy as np

# Assuming df is your DataFrame, 'column_name' is the column you're examining,
# and 'category_column' is the column with the categorical data
df['LOS']=(df['discharge_time']-df['orout_time'])/1440 # LOS is time following surgery to dishcarge converted to days. 

# Define a function to calculate the 75th percentile for a series
def calculate_percentile_75(series):
    return series.quantile(0.75)

# Apply the function to each group's 'icd10_pcs' and create a new column for the 75th percentile
df['percentile_75'] = df.groupby('icd10_pcs')['LOS'].transform(calculate_percentile_75)

# Create a new column with 0/1 based on the condition, using the group-specific 75th percentile
df['is_outlier'] = np.where(df['LOS'] > df['percentile_75'], 1, 0)

# Drop the 'percentile_75' column if it's no longer needed
df.drop('percentile_75', axis=1, inplace=True)

df



Unnamed: 0.1,Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,...,hb,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier
0,0,484069807,178742874,229842382,0,30,F,48.0,153.0,Asian,...,8.7,,27.2,232.0,4.2,138.0,,5.62,4.131944,0
1,1,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,...,11.0,18.0,38.1,124.0,4.2,140.0,0.5,,47.920139,1
2,2,478413008,133278262,277235295,0,35,F,54.0,,Asian,...,12.7,,8.3,244.0,,,,17.62,3.760417,0
3,7,466411896,100259714,241547739,1440,50,F,66.0,157.0,Asian,...,,,,,,,,,2.447917,0
4,8,467425045,134213281,225860669,1440,60,F,62.0,154.0,Asian,...,,,,,,,,,3.493056,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102286,128026,449124488,138484174,228449654,4999680,50,F,58.0,162.0,Asian,...,,23.7,,,3.8,134.0,,,8.378472,0
102287,128027,461252752,126772283,273139806,2880,70,F,53.0,162.0,Asian,...,,,,,3.7,142.0,,,2.614583,0
102288,128028,471834474,144363433,275833861,2880,65,F,51.0,152.0,Asian,...,,,,,3.8,143.0,,,5.447917,0
102289,128029,419787421,195835964,293939099,12960,85,M,74.0,171.0,Asian,...,,22.6,,,3.9,137.0,,,9.309028,1


In [6]:
import plotly.express as px


fig = px.violin(df_no_outlier, y="LOS", x="icd10_pcs", color="sex", box=True,hover_data=df.columns)
fig.show()

NameError: name 'df_no_outlier' is not defined

## Identify Subject that had ICU visit

In [7]:
## Create flag if subject went to ICU

df['icu_visit'] = df['icuin_time'] > 0
df[df['icu_visit']== True]

## 13771  subjects went to ICU post op. 



Unnamed: 0.1,Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,...,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,icu_visit
1,1,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,...,18.0,38.1,124.0,4.2,140.0,0.5,,47.920139,1,True
8,12,487807079,145247513,259299532,74880,25,M,47.0,172.0,Asian,...,,,,,,,,23.131944,1,True
16,20,480242953,187733661,238035661,10080,55,F,46.0,152.0,Asian,...,27.9,,,3.3,139.0,,,16.302083,0,True
17,22,485183540,160631734,285012362,1440,70,M,68.0,172.0,Asian,...,23.7,,,4.1,139.0,,,6.295139,0,True
22,27,432945014,126077562,272790506,2880,50,F,61.0,149.0,Asian,...,,,,,,3.8,,17.305556,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102255,127986,494319208,150497300,289152391,3483360,80,F,52.0,150.0,Asian,...,,10.6,227.0,3.3,141.0,0.5,15.79,60.829861,1,True
102257,127989,474723417,152774881,253900635,14400,80,F,66.0,145.0,Asian,...,27.1,,,3.5,143.0,,,8.111111,0,True
102260,127993,468666428,137181591,298918264,5760,65,F,61.0,158.0,Asian,...,24.1,,,3.5,136.0,,,7.152778,0,True
102271,128008,465468763,150497300,289152391,3483360,80,F,52.0,150.0,Asian,...,,10.6,227.0,3.3,141.0,0.5,15.79,61.020833,1,True


In [8]:
df = df.drop(df.columns[0], axis=1)
df

Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,icu_visit
0,484069807,178742874,229842382,0,30,F,48.0,153.0,Asian,,...,,27.2,232.0,4.2,138.0,,5.62,4.131944,0,False
1,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,,...,18.0,38.1,124.0,4.2,140.0,0.5,,47.920139,1,True
2,478413008,133278262,277235295,0,35,F,54.0,,Asian,,...,,8.3,244.0,,,,17.62,3.760417,0,False
3,466411896,100259714,241547739,1440,50,F,66.0,157.0,Asian,2.0,...,,,,,,,,2.447917,0,False
4,467425045,134213281,225860669,1440,60,F,62.0,154.0,Asian,1.0,...,,,,,,,,3.493056,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102286,449124488,138484174,228449654,4999680,50,F,58.0,162.0,Asian,2.0,...,23.7,,,3.8,134.0,,,8.378472,0,False
102287,461252752,126772283,273139806,2880,70,F,53.0,162.0,Asian,2.0,...,,,,3.7,142.0,,,2.614583,0,False
102288,471834474,144363433,275833861,2880,65,F,51.0,152.0,Asian,2.0,...,,,,3.8,143.0,,,5.447917,0,False
102289,419787421,195835964,293939099,12960,85,M,74.0,171.0,Asian,4.0,...,22.6,,,3.9,137.0,,,9.309028,1,True


## Impute Medians for Missing Data
* cannot impute means across the whole data set (mix of male, female, age)
* means are imputed within each category (group be: sex, age, race)

In [9]:
import pandas as pd
import numpy as np

# df_no_outlier = df[df['is_outlier'] == 0]
# Range to evaluate columns 32:58 (inclusive)

category_columns = ['age', 'sex', 'asa']  # asa is the id from Amercican Society of Anthropologists.
start_col = 33  # Start index 
end_col = 59    # End index (exclusive) 

# Iterate over each column in the specified range and apply groupby and transform
for col in df.iloc[:, start_col:end_col].columns:
    # Group by the categorical columns and fill NaN with the mean of the group for each specific column
    df[col] = df.groupby(category_columns)[col].transform(lambda x: x.fillna(x.mean()))

# Now df has the missing values imputed with the mean of their respective column within each group defined by 'category_columns'




In [10]:
df

Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,icu_visit
0,484069807,178742874,229842382,0,30,F,48.0,153.0,Asian,,...,,,,,,,,,0,False
1,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,,...,,,,,,,,,1,True
2,478413008,133278262,277235295,0,35,F,54.0,,Asian,,...,,,,,,,,,0,False
3,466411896,100259714,241547739,1440,50,F,66.0,157.0,Asian,2.0,...,24.479548,20.129223,222.547341,3.776111,139.071958,0.843907,9.112005,2.447917,0,False
4,467425045,134213281,225860669,1440,60,F,62.0,154.0,Asian,1.0,...,24.740786,23.777308,216.100800,3.840183,140.009748,0.795798,8.213167,3.493056,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102286,449124488,138484174,228449654,4999680,50,F,58.0,162.0,Asian,2.0,...,23.700000,20.129223,222.547341,3.800000,134.000000,0.843907,9.112005,8.378472,0,False
102287,461252752,126772283,273139806,2880,70,F,53.0,162.0,Asian,2.0,...,24.816717,21.062938,213.019082,3.700000,142.000000,0.775521,8.629752,2.614583,0,False
102288,471834474,144363433,275833861,2880,65,F,51.0,152.0,Asian,2.0,...,24.793033,21.333279,211.248968,3.800000,143.000000,0.778727,8.612343,5.447917,0,False
102289,419787421,195835964,293939099,12960,85,M,74.0,171.0,Asian,4.0,...,22.600000,16.725000,213.000000,3.900000,137.000000,1.800000,10.642000,9.309028,1,True


In [11]:

df.to_csv('../../_data/operations_imputed_CLEAN.csv')
