In [1]:
import pandas as pd


# Feature Engineering

In [2]:
df=pd.read_csv('../../_data/operations_inscope_CATEGORIES.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102291 entries, 0 to 102290
Data columns (total 59 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         102291 non-null  int64  
 1   op_id              102291 non-null  int64  
 2   subject_id         102291 non-null  int64  
 3   hadm_id            102291 non-null  int64  
 4   opdate             102291 non-null  int64  
 5   age                102291 non-null  int64  
 6   sex                102291 non-null  object 
 7   weight             101186 non-null  float64
 8   height             101650 non-null  float64
 9   race               102291 non-null  object 
 10  asa                99438 non-null   float64
 11  emop               102291 non-null  int64  
 12  department         102291 non-null  object 
 13  antype             102291 non-null  object 
 14  icd10_pcs          102291 non-null  object 
 15  category_desc      102291 non-null  object 
 16  de

## Identify LOS Outliers within each surgery type

We use `groupby`('icd10_pcs') groups the data by the unique values in the 'icd10_pcs'.  
 For each group, `transform` applies the calculate_percentile_90 function to values within that group.   

This group-specific 90th percentile is then used to determine if a value is an outlier within its category.
Lastly, the is_outlier column is created to indicate with 1 if the record is an outlier in its category or 0 otherwise. We also clean up by dropping the intermediate 'percentile_90' column.


In [3]:
import pandas as pd
import numpy as np

# Assuming df is your DataFrame, 'column_name' is the column you're examining,
# and 'category_column' is the column with the categorical data
df['LOS']=(df['discharge_time']-df['orout_time'])/1440 # LOS is time following surgery to dishcarge converted to days. 

# Define a function to calculate the 75th percentile for a series
def calculate_percentile_75(series):
    return series.quantile(0.90)

# Apply the function to each group's 'icd10_pcs' and create a new column for the 75th percentile
df['percentile_90'] = df.groupby('icd10_pcs')['LOS'].transform(calculate_percentile_75)

# Create a new column with 0/1 based on the condition, using the group-specific 75th percentile
df['is_outlier'] = np.where(df['LOS'] > df['percentile_90'], 1, 0)

# Drop the 'percentile_75' column if it's no longer needed
df.drop('percentile_90', axis=1, inplace=True)

df



Unnamed: 0.1,Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,...,hb,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier
0,0,484069807,178742874,229842382,0,30,F,48.0,153.0,Asian,...,8.7,,27.2,232.0,4.2,138.0,,5.62,4.131944,0
1,1,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,...,11.0,18.0,38.1,124.0,4.2,140.0,0.5,,47.920139,1
2,2,478413008,133278262,277235295,0,35,F,54.0,,Asian,...,12.7,,8.3,244.0,,,,17.62,3.760417,0
3,7,466411896,100259714,241547739,1440,50,F,66.0,157.0,Asian,...,,,,,,,,,2.447917,0
4,8,467425045,134213281,225860669,1440,60,F,62.0,154.0,Asian,...,,,,,,,,,3.493056,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102286,128026,449124488,138484174,228449654,4999680,50,F,58.0,162.0,Asian,...,,23.7,,,3.8,134.0,,,8.378472,0
102287,128027,461252752,126772283,273139806,2880,70,F,53.0,162.0,Asian,...,,,,,3.7,142.0,,,2.614583,0
102288,128028,471834474,144363433,275833861,2880,65,F,51.0,152.0,Asian,...,,,,,3.8,143.0,,,5.447917,0
102289,128029,419787421,195835964,293939099,12960,85,M,74.0,171.0,Asian,...,,22.6,,,3.9,137.0,,,9.309028,0


### Classify if LOS was "Prolonged"
* set business rule as Prolonged is an LOS that is greater 1 std dev than the average. 

In [4]:
import pandas as pd
import numpy as np

# Define functions to calculate the mean and standard deviation for a series
def calculate_mean(series):
    return series.mean()

def calculate_std(series):
    return series.std()

# Apply the functions to each group's 'column_name' to create new columns for the mean and standard deviation
df['group_mean'] = df.groupby('icd10_pcs')['LOS'].transform(calculate_mean)
df['group_std'] = df.groupby('icd10_pcs')['LOS'].transform(calculate_std)

# Define one standard deviation away from the mean
df['mean_plus_std'] = df['group_mean'] + df['group_std']

# Create a new column with 0/1 based on the condition, 
# using the group-specific mean and standard deviation
df['prolonged_LOS'] = np.where(df['LOS'] > df['group_std'], 1, 0)

# Drop the 'group_mean' and 'group_std' columns if they're no longer needed
df.drop(['group_mean', 'group_std', 'mean_plus_std'], axis=1, inplace=True)
df
# Optionally, if you still need the filtered DataFrame without outliers, you can filter the DataFrame
# df_filtered = df[(df['prolonged_LOS'] == 1 )&(df['is_outlier'] == 0) ]
# df_filtered


Unnamed: 0.1,Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,...,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,prolonged_LOS
0,0,484069807,178742874,229842382,0,30,F,48.0,153.0,Asian,...,,27.2,232.0,4.2,138.0,,5.62,4.131944,0,0
1,1,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,...,18.0,38.1,124.0,4.2,140.0,0.5,,47.920139,1,1
2,2,478413008,133278262,277235295,0,35,F,54.0,,Asian,...,,8.3,244.0,,,,17.62,3.760417,0,1
3,7,466411896,100259714,241547739,1440,50,F,66.0,157.0,Asian,...,,,,,,,,2.447917,0,1
4,8,467425045,134213281,225860669,1440,60,F,62.0,154.0,Asian,...,,,,,,,,3.493056,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102286,128026,449124488,138484174,228449654,4999680,50,F,58.0,162.0,Asian,...,23.7,,,3.8,134.0,,,8.378472,0,0
102287,128027,461252752,126772283,273139806,2880,70,F,53.0,162.0,Asian,...,,,,3.7,142.0,,,2.614583,0,1
102288,128028,471834474,144363433,275833861,2880,65,F,51.0,152.0,Asian,...,,,,3.8,143.0,,,5.447917,0,1
102289,128029,419787421,195835964,293939099,12960,85,M,74.0,171.0,Asian,...,22.6,,,3.9,137.0,,,9.309028,0,1


In [5]:
df

Unnamed: 0.1,Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,...,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,prolonged_LOS
0,0,484069807,178742874,229842382,0,30,F,48.0,153.0,Asian,...,,27.2,232.0,4.2,138.0,,5.62,4.131944,0,0
1,1,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,...,18.0,38.1,124.0,4.2,140.0,0.5,,47.920139,1,1
2,2,478413008,133278262,277235295,0,35,F,54.0,,Asian,...,,8.3,244.0,,,,17.62,3.760417,0,1
3,7,466411896,100259714,241547739,1440,50,F,66.0,157.0,Asian,...,,,,,,,,2.447917,0,1
4,8,467425045,134213281,225860669,1440,60,F,62.0,154.0,Asian,...,,,,,,,,3.493056,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102286,128026,449124488,138484174,228449654,4999680,50,F,58.0,162.0,Asian,...,23.7,,,3.8,134.0,,,8.378472,0,0
102287,128027,461252752,126772283,273139806,2880,70,F,53.0,162.0,Asian,...,,,,3.7,142.0,,,2.614583,0,1
102288,128028,471834474,144363433,275833861,2880,65,F,51.0,152.0,Asian,...,,,,3.8,143.0,,,5.447917,0,1
102289,128029,419787421,195835964,293939099,12960,85,M,74.0,171.0,Asian,...,22.6,,,3.9,137.0,,,9.309028,0,1


## Identify Subject that had ICU visit

In [11]:
## Create flag if subject went to ICU

df['icu_visit'] = (df['icuin_time'] > 0).astype(int)
df[df['icu_visit']== True]

## 13771  subjects went to ICU post op. 



Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,prolonged_LOS,icu_visit
16,480242953,187733661,238035661,10080,55,F,46.0,152.0,Asian,2.0,...,22.491722,215.576433,3.300000,139.000000,0.822992,8.626221,16.302083,0,0,1
17,485183540,160631734,285012362,1440,70,M,68.0,172.0,Asian,3.0,...,15.496070,175.766284,4.100000,139.000000,1.102778,9.852588,6.295139,0,0,1
22,432945014,126077562,272790506,2880,50,F,61.0,149.0,Asian,2.0,...,20.414394,223.140988,3.768724,139.116926,3.800000,8.957105,17.305556,0,0,1
23,462673015,153210670,240653087,10080,70,M,63.0,158.0,Asian,2.0,...,10.900000,120.000000,4.300000,140.000000,0.900000,5.500000,17.409722,0,1,1
24,461524976,194549663,293384214,2880,50,M,75.0,171.0,Asian,2.0,...,19.161972,208.538462,4.300000,132.000000,1.069805,10.275723,11.069444,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102243,475125418,128375644,289406001,24480,50,F,61.0,155.0,Asian,3.0,...,16.902174,206.610169,4.300000,135.000000,1.390698,9.255439,18.312500,0,0,1
102254,447857589,163754043,274747413,11520,70,M,59.0,168.0,Asian,4.0,...,14.952941,161.800000,5.200000,135.000000,1.266667,9.582941,15.361111,0,0,1
102257,474723417,152774881,253900635,14400,80,F,66.0,145.0,Asian,3.0,...,18.791346,204.385246,3.500000,143.000000,0.954651,9.295366,8.111111,0,0,1
102260,468666428,137181591,298918264,5760,65,F,61.0,158.0,Asian,2.0,...,21.422042,212.006881,3.500000,136.000000,0.780793,8.557983,7.152778,0,0,1


In [12]:
## Drop extra column. Exclude Outliers from further analysis. 

df = df.drop(df.columns[0], axis=1)
df = df[df['is_outlier'] == 0 ]
df

Unnamed: 0,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,emop,...,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,prolonged_LOS,icu_visit
0,178742874,229842382,0,30,F,48.0,153.0,Asian,,1,...,,,,,,,,0,0,0
2,133278262,277235295,0,35,F,54.0,,Asian,,1,...,,,,,,,,0,1,0
3,100259714,241547739,1440,50,F,66.0,157.0,Asian,2.0,0,...,20.414394,223.140988,3.768724,139.116926,0.851504,8.957105,2.447917,0,1,0
4,134213281,225860669,1440,60,F,62.0,154.0,Asian,1.0,0,...,23.938716,217.282759,3.846584,140.033084,0.744921,8.200501,3.493056,0,1,0
5,134195201,265770645,1440,35,F,50.0,160.0,Asian,1.0,0,...,13.300000,124.000000,3.900000,138.000000,0.600000,6.310000,4.236111,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102286,138484174,228449654,4999680,50,F,58.0,162.0,Asian,2.0,0,...,20.414394,223.140988,3.800000,134.000000,0.851504,8.957105,8.378472,0,0,0
102287,126772283,273139806,2880,70,F,53.0,162.0,Asian,2.0,0,...,21.154115,212.690065,3.700000,142.000000,0.760224,8.535719,2.614583,0,1,0
102288,144363433,275833861,2880,65,F,51.0,152.0,Asian,2.0,0,...,21.422042,212.006881,3.800000,143.000000,0.780793,8.557983,5.447917,0,1,0
102289,195835964,293939099,12960,85,M,74.0,171.0,Asian,4.0,0,...,19.733333,218.800000,3.900000,137.000000,1.800000,9.932500,9.309028,0,1,1


## Impute Medians for Missing Data
* cannot impute means across the whole data set (mix of male, female, age)
* means are imputed within each category (group be: sex, age, race)

In [13]:
import pandas as pd
import numpy as np

# df_no_outlier = df[df['is_outlier'] == 0]
# Range to evaluate columns 32:58 (inclusive)

category_columns = ['age', 'sex', 'asa']  # asa is the id from Amercican Society of Anthropologists.
start_col = 33  # Start index 
end_col = 59    # End index (exclusive) 

# Iterate over each column in the specified range and apply groupby and transform
for col in df.iloc[:, start_col:end_col].columns:
    # Group by the categorical columns and fill NaN with the mean of the group for each specific column
    df[col] = df.groupby(category_columns)[col].transform(lambda x: x.fillna(x.mean()))

# Now df has the missing values imputed with the mean of their respective column within each group defined by 'category_columns'




In [14]:
df

Unnamed: 0,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,emop,...,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,prolonged_LOS,icu_visit
0,178742874,229842382,0,30,F,48.0,153.0,Asian,,1,...,,,,,,,,,0,0
2,133278262,277235295,0,35,F,54.0,,Asian,,1,...,,,,,,,,,1,0
3,100259714,241547739,1440,50,F,66.0,157.0,Asian,2.0,0,...,20.414394,223.140988,3.768724,139.116926,0.851504,8.957105,2.447917,0.0,1,0
4,134213281,225860669,1440,60,F,62.0,154.0,Asian,1.0,0,...,23.938716,217.282759,3.846584,140.033084,0.744921,8.200501,3.493056,0.0,1,0
5,134195201,265770645,1440,35,F,50.0,160.0,Asian,1.0,0,...,13.300000,124.000000,3.900000,138.000000,0.600000,6.310000,4.236111,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102286,138484174,228449654,4999680,50,F,58.0,162.0,Asian,2.0,0,...,20.414394,223.140988,3.800000,134.000000,0.851504,8.957105,8.378472,0.0,0,0
102287,126772283,273139806,2880,70,F,53.0,162.0,Asian,2.0,0,...,21.154115,212.690065,3.700000,142.000000,0.760224,8.535719,2.614583,0.0,1,0
102288,144363433,275833861,2880,65,F,51.0,152.0,Asian,2.0,0,...,21.422042,212.006881,3.800000,143.000000,0.780793,8.557983,5.447917,0.0,1,0
102289,195835964,293939099,12960,85,M,74.0,171.0,Asian,4.0,0,...,19.733333,218.800000,3.900000,137.000000,1.800000,9.932500,9.309028,0.0,1,1


In [15]:
#########################
#
# Data EXport 
#
#########################
df.to_csv('../../_data/operations_imputed_CLEAN.csv')
