In [1]:
import pandas as pd


# Feature Engineering

In [2]:
df=pd.read_csv('../_data/operations_inscope.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58815 entries, 0 to 58814
Data columns (total 58 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   op_id              58815 non-null  int64  
 1   subject_id         58815 non-null  int64  
 2   hadm_id            58815 non-null  int64  
 3   opdate             58815 non-null  int64  
 4   age                58815 non-null  int64  
 5   sex                58815 non-null  object 
 6   weight             58180 non-null  float64
 7   height             58461 non-null  float64
 8   race               58815 non-null  object 
 9   asa                56842 non-null  float64
 10  emop               58815 non-null  int64  
 11  department         58815 non-null  object 
 12  antype             58815 non-null  object 
 13  icd10_pcs          58815 non-null  object 
 14  category_desc      58815 non-null  object 
 15  desc_short         58815 non-null  object 
 16  category_id        588

## Identify LOS Outliers within each surgery type

We use `groupby`('icd10_pcs') groups the data by the unique values in the 'icd10_pcs'.  
 For each group, `transform` applies the calculate_percentile_75 function to values within that group.   
 This group-specific 75th percentile is then used to determine if a value is an outlier within its category.

Lastly, the is_outlier column is created to indicate with 1 if the record is an outlier in its category or 0 otherwise. We also clean up by dropping the intermediate 'percentile_75' column.


In [3]:
import pandas as pd
import numpy as np

# Assuming df is your DataFrame, 'column_name' is the column you're examining,
# and 'category_column' is the column with the categorical data
df['LOS']=(df['discharge_time']-df['orout_time'])/1440 # LOS is time following surgery to dishcarge converted to days. 

# Define a function to calculate the 75th percentile for a series
def calculate_percentile_75(series):
    return series.quantile(0.75)

# Apply the function to each group's 'icd10_pcs' and create a new column for the 75th percentile
df['percentile_75'] = df.groupby('icd10_pcs')['LOS'].transform(calculate_percentile_75)

# Create a new column with 0/1 based on the condition, using the group-specific 75th percentile
df['is_outlier'] = np.where(df['LOS'] > df['percentile_75'], 1, 0)

# Drop the 'percentile_75' column if it's no longer needed
df.drop('percentile_75', axis=1, inplace=True)

df
# Optionally, if you still need the filtered DataFrame without outliers, you can filter the DataFrame



Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,hb,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier
0,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,,...,11.0,18.0,38.1,124.0,4.2,140.0,0.5,,47.920139,1
1,478413008,133278262,277235295,0,35,F,54.0,,Asian,,...,12.7,,8.3,244.0,,,,17.62,3.760417,0
2,467425045,134213281,225860669,1440,60,F,62.0,154.0,Asian,1.0,...,,,,,,,,,3.493056,0
3,461473883,134195201,265770645,1440,35,F,50.0,160.0,Asian,1.0,...,10.4,,13.3,124.0,3.9,138.0,0.6,6.31,4.236111,0
4,466389608,160947402,262240911,1440,60,F,52.0,152.0,Asian,1.0,...,,,,,,,,,1.607639,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58810,497537801,130062652,263988404,3909600,50,M,64.0,169.0,Asian,2.0,...,,,,,3.5,136.0,,,1.447917,0
58811,461252752,126772283,273139806,2880,70,F,53.0,162.0,Asian,2.0,...,,,,,3.7,142.0,,,2.614583,0
58812,471834474,144363433,275833861,2880,65,F,51.0,152.0,Asian,2.0,...,,,,,3.8,143.0,,,5.447917,0
58813,419787421,195835964,293939099,12960,85,M,74.0,171.0,Asian,4.0,...,,22.6,,,3.9,137.0,,,9.309028,1


## Identify Subject that had ICU visit

In [8]:
## Create flag if subject went to ICU

df['icu_visit'] = (df['icuin_time'] > 0).astype(int)
df[df['icu_visit']== True]

## 7250 subjects went to ICU post op. 



Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,icu_visit,or_duration,anesth_duration
0,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,,...,,,,,,,1,1,170.0,195.0
5,487807079,145247513,259299532,74880,25,M,47.0,172.0,Asian,2.0,...,244.000000,4.001266,138.897436,0.822807,9.888197,23.131944,1,1,25.0,55.0
13,480242953,187733661,238035661,10080,55,F,46.0,152.0,Asian,2.0,...,218.004494,3.300000,139.000000,0.797994,8.319523,16.302083,0,1,455.0,510.0
17,432945014,126077562,272790506,2880,50,F,61.0,149.0,Asian,2.0,...,228.864230,3.841283,139.555034,3.800000,8.571333,17.305556,0,1,425.0,485.0
18,461524976,194549663,293384214,2880,50,M,75.0,171.0,Asian,2.0,...,199.367424,4.300000,132.000000,1.132530,9.246938,11.069444,0,1,430.0,460.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58794,475125418,128375644,289406001,24480,50,F,61.0,155.0,Asian,3.0,...,204.026316,4.300000,135.000000,1.490323,9.317778,18.312500,0,1,30.0,55.0
58795,486837329,104340242,298141008,105120,65,M,48.0,161.0,Asian,4.0,...,144.000000,4.105000,135.050000,1.861538,7.880769,106.173611,1,1,45.0,65.0
58802,447857589,163754043,274747413,11520,70,M,59.0,168.0,Asian,4.0,...,153.666667,5.200000,135.000000,1.309091,9.741538,15.361111,1,1,265.0,315.0
58803,494319208,150497300,289152391,3483360,80,F,52.0,150.0,Asian,4.0,...,227.000000,3.300000,141.000000,0.500000,15.790000,60.829861,1,1,225.0,265.0


In [9]:
#########################
#
# Calculate OR Duration and Anesthetic Duration
#
#########################

#Operation Duration
df['or_duration']=(df['opend_time']-df['opstart_time'])

#Time under anasthetic
df['anesth_duration']=(df['anend_time']-df['anstart_time'])

## Impute Medians for Missing Data
* cannot impute means across the whole data set (mix of male, female, age)
* means are imputed within each category (group be: sex, age, race)

In [10]:
import pandas as pd
import numpy as np

# df_no_outlier = df[df['is_outlier'] == 0]
# Range to evaluate columns 32:58 (inclusive)

category_columns = ['age', 'sex', 'asa']  # asa is the id from Amercican Society of Anthropologists.
start_col = 33  # Start index 
end_col = 59    # End index (exclusive) 

# Iterate over each column in the specified range and apply groupby and transform
for col in df.iloc[:, start_col:end_col].columns:
    # Group by the categorical columns and fill NaN with the mean of the group for each specific column
    df[col] = df.groupby(category_columns)[col].transform(lambda x: x.fillna(x.mean()))

# Now df has the missing values imputed with the mean of their respective column within each group defined by 'category_columns'
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58815 entries, 0 to 58814
Data columns (total 63 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   op_id              58815 non-null  int64  
 1   subject_id         58815 non-null  int64  
 2   hadm_id            58815 non-null  int64  
 3   opdate             58815 non-null  int64  
 4   age                58815 non-null  int64  
 5   sex                58815 non-null  object 
 6   weight             58180 non-null  float64
 7   height             58461 non-null  float64
 8   race               58815 non-null  object 
 9   asa                56842 non-null  float64
 10  emop               58815 non-null  int64  
 11  department         58815 non-null  object 
 12  antype             58815 non-null  object 
 13  icd10_pcs          58815 non-null  object 
 14  category_desc      58815 non-null  object 
 15  desc_short         58815 non-null  object 
 16  category_id        588

Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,icu_visit,or_duration,anesth_duration
0,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,,...,,,,,,,1,1,170.0,195.0
1,478413008,133278262,277235295,0,35,F,54.0,,Asian,,...,,,,,,,0,0,100.0,125.0
2,467425045,134213281,225860669,1440,60,F,62.0,154.0,Asian,1.0,...,215.854671,3.843406,140.403423,0.825000,7.995625,3.493056,0,0,70.0,90.0
3,461473883,134195201,265770645,1440,35,F,50.0,160.0,Asian,1.0,...,124.000000,3.900000,138.000000,0.600000,6.310000,4.236111,0,0,115.0,150.0
4,466389608,160947402,262240911,1440,60,F,52.0,152.0,Asian,1.0,...,215.854671,3.843406,140.403423,0.825000,7.995625,1.607639,0,0,30.0,90.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58810,497537801,130062652,263988404,3909600,50,M,64.0,169.0,Asian,2.0,...,199.367424,3.500000,136.000000,1.132530,9.246938,1.447917,0,0,5.0,25.0
58811,461252752,126772283,273139806,2880,70,F,53.0,162.0,Asian,2.0,...,206.355948,3.700000,142.000000,0.852152,8.525825,2.614583,0,0,60.0,75.0
58812,471834474,144363433,275833861,2880,65,F,51.0,152.0,Asian,2.0,...,212.325744,3.800000,143.000000,0.766667,8.280235,5.447917,0,0,55.0,75.0
58813,419787421,195835964,293939099,12960,85,M,74.0,171.0,Asian,4.0,...,164.000000,3.900000,137.000000,1.800000,9.695000,9.309028,1,1,130.0,170.0


In [None]:
#########################
#
# Export Operations to CSV
#
#########################

df.to_csv('../_data/operations_imputed_CLEAN.csv')
