In [1]:
import pandas as pd


In [9]:
# Read the in-scope operations extract and print a per-column summary
# (non-null counts and dtypes) as a quick sanity check of the load.
df = pd.read_csv('../_data/operations_inscope.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58815 entries, 0 to 58814
Data columns (total 58 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   op_id              58815 non-null  int64  
 1   subject_id         58815 non-null  int64  
 2   hadm_id            58815 non-null  int64  
 3   opdate             58815 non-null  int64  
 4   age                58815 non-null  int64  
 5   sex                58815 non-null  object 
 6   weight             58180 non-null  float64
 7   height             58461 non-null  float64
 8   race               58815 non-null  object 
 9   asa                56842 non-null  float64
 10  emop               58815 non-null  int64  
 11  department         58815 non-null  object 
 12  antype             58815 non-null  object 
 13  icd10_pcs          58815 non-null  object 
 14  category_desc      58815 non-null  object 
 15  desc_short         58815 non-null  object 
 16  category_id        588

## Objective: Identify LOS Outliers within each surgery type

`groupby('icd10_pcs')` groups the data by the unique values in the 'icd10_pcs' column.  
 For each group, `transform` applies the calculate_percentile_75 function to values within that group.   
 This group-specific 75th percentile is then used to determine if a value is an outlier within its category.

Lastly, the is_outlier column is created to indicate with 1 if the record is an outlier in its category or 0 otherwise. We also clean up by dropping the intermediate 'percentile_75' column.


In [20]:
import pandas as pd
import numpy as np

# LOS (length of stay) is the time from the end of surgery ('orout_time') to
# discharge ('discharge_time'), converted to days.
# NOTE(review): dividing by 24 * 60 = 1440 assumes both columns are in minutes — confirm.
df['LOS'] = df['discharge_time'].sub(df['orout_time']).div(24 * 60)

# Helper that reduces a series of values to its 75th percentile
def calculate_percentile_75(series):
    """Return the 75th percentile of ``series`` using its ``quantile`` method."""
    upper_quartile = series.quantile(0.75)
    return upper_quartile

# Per-row threshold: the 75th percentile of LOS within the row's own 'icd10_pcs'
# group. transform() returns a Series aligned with df's index, so it can be
# compared directly without storing (and later dropping) a temporary column.
los_threshold = df.groupby('icd10_pcs')['LOS'].transform(calculate_percentile_75)

# 1 if the row's LOS exceeds its group's 75th percentile, else 0.
# Rows with a missing LOS compare as False and are therefore flagged 0.
df['is_outlier'] = np.where(df['LOS'] > los_threshold, 1, 0)

# Non-outlier rows only. .copy() makes this an independent frame so that later
# column assignments on it do not raise SettingWithCopyWarning.
df_no_outlier = df[df['is_outlier'] == 0].copy()

# Summary statistics of the filtered frame; as the cell's last expression this
# is what gets displayed.
df_no_outlier.describe()


Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,weight,height,asa,emop,orin_time,...,hb,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier
count,44227.0,44227.0,44227.0,44227.0,44227.0,43778.0,44016.0,42955.0,44227.0,44227.0,...,12754.0,9944.0,10416.0,11972.0,17264.0,17135.0,8134.0,11744.0,44227.0,44227.0
mean,449977500.0,150117600.0,250173600.0,204105.0,56.058629,62.245374,161.595352,1.727273,0.097316,204824.0,...,12.093747,23.863936,21.713297,212.305212,3.924631,139.196177,0.957155,8.834136,3.924024,0.0
std,28866880.0,28857160.0,28903350.0,626342.4,15.463223,11.911047,83.332957,0.627437,0.296391,626346.6,...,1.958106,2.92545,12.177941,72.319081,0.452148,3.616322,1.083636,4.07644,4.742772,0.0
min,400000500.0,100000800.0,200002400.0,0.0,20.0,-1.0,-1.0,1.0,0.0,10.0,...,4.1,4.6,0.0,19.0,2.0,115.0,0.1,0.56,-1.385417,0.0
25%,425052800.0,125095800.0,225177800.0,1440.0,45.0,54.0,155.0,1.0,0.0,2030.0,...,10.7,22.1,11.6,167.0,3.6,137.0,0.5,5.93,1.302083,0.0
50%,449942300.0,150345500.0,250358400.0,2880.0,60.0,61.0,161.0,2.0,0.0,3345.0,...,12.1,23.9,20.5,207.0,3.9,140.0,0.7,7.94,2.482639,0.0
75%,475074200.0,175007500.0,275109100.0,10080.0,70.0,69.0,167.0,2.0,0.0,10635.0,...,13.4,25.7,30.4,250.0,4.2,141.0,1.0,10.8925,5.284722,0.0
max,499997600.0,199999400.0,299998900.0,5132160.0,90.0,354.0,17409.0,6.0,1.0,5133090.0,...,25.5,46.8,97.0,1113.0,7.95,181.0,24.8,85.62,79.399306,0.0


In [17]:
import plotly.express as px


# Violin plot (with an embedded box plot) of LOS per procedure code, split by
# sex, for the non-outlier rows only.
# hover_data is taken from the plotted frame itself: a column added to df after
# df_no_outlier was created would not exist in it and plotly would reject it.
fig = px.violin(
    df_no_outlier,
    y="LOS",
    x="icd10_pcs",
    color="sex",
    box=True,
    hover_data=df_no_outlier.columns,
)
fig.show()

In [23]:
# Flag rows with an ICU admission: True when 'icuin_time' is greater than 0.
# NOTE(review): assumes a missing or zero 'icuin_time' means no ICU stay — confirm.
df['icu_visit'] = df['icuin_time'].gt(0)

# Show only the flagged rows.
# Original note from the recorded run: 7250 subjects went to ICU post op.
df[df['icu_visit']]



Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,icu_visit
0,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,,...,18.0,38.1,124.0,4.2,140.0,0.5,,47.920139,1,True
5,487807079,145247513,259299532,74880,25,M,47.0,172.0,Asian,2.0,...,,,,,,,,23.131944,1,True
13,480242953,187733661,238035661,10080,55,F,46.0,152.0,Asian,2.0,...,27.9,,,3.3,139.0,,,16.302083,0,True
17,432945014,126077562,272790506,2880,50,F,61.0,149.0,Asian,2.0,...,,,,,,3.8,,17.305556,0,True
18,461524976,194549663,293384214,2880,50,M,75.0,171.0,Asian,2.0,...,18.8,,,4.3,132.0,,,11.069444,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58794,475125418,128375644,289406001,24480,50,F,61.0,155.0,Asian,3.0,...,24.2,,,4.3,135.0,,,18.312500,0,True
58795,486837329,104340242,298141008,105120,65,M,48.0,161.0,Asian,4.0,...,,,,,,,,106.173611,1,True
58802,447857589,163754043,274747413,11520,70,M,59.0,168.0,Asian,4.0,...,24.3,,,5.2,135.0,,,15.361111,1,True
58803,494319208,150497300,289152391,3483360,80,F,52.0,150.0,Asian,4.0,...,,10.6,227.0,3.3,141.0,0.5,15.79,60.829861,1,True
