In [1]:
import pandas as pd


In [9]:
# Read the in-scope operations table and print its column / dtype summary.
df = pd.read_csv(filepath_or_buffer='../_data/operations_inscope.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58815 entries, 0 to 58814
Data columns (total 58 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   op_id              58815 non-null  int64  
 1   subject_id         58815 non-null  int64  
 2   hadm_id            58815 non-null  int64  
 3   opdate             58815 non-null  int64  
 4   age                58815 non-null  int64  
 5   sex                58815 non-null  object 
 6   weight             58180 non-null  float64
 7   height             58461 non-null  float64
 8   race               58815 non-null  object 
 9   asa                56842 non-null  float64
 10  emop               58815 non-null  int64  
 11  department         58815 non-null  object 
 12  antype             58815 non-null  object 
 13  icd10_pcs          58815 non-null  object 
 14  category_desc      58815 non-null  object 
 15  desc_short         58815 non-null  object 
 16  category_id        588

## Objective: Identify LOS Outliers within each surgery type

`groupby('icd10_pcs')` groups the data by the unique values in the 'icd10_pcs' column.  
 For each group, `transform` applies the `calculate_percentile_75` function to the LOS values within that group.  
 This group-specific 75th percentile is then used to determine whether a value is an outlier within its category.

Lastly, the `is_outlier` column is created: it is 1 if the record is an outlier within its category and 0 otherwise. We also clean up by dropping the intermediate 'percentile_75' column.


In [13]:
import pandas as pd
import numpy as np

# LOS (length of stay): time from 'orout_time' (end of surgery) to
# 'discharge_time', divided by 1440 to express it in days.
# NOTE(review): assumes both columns are in minutes (1440 min = 1 day) — confirm.
df['LOS'] = df['discharge_time'].sub(df['orout_time']).div(1440)

# Helper used as the per-group callable for groupby(...).transform below.
def calculate_percentile_75(series):
    """Return the 75th percentile (upper quartile) of ``series``."""
    upper_quartile = series.quantile(q=0.75)
    return upper_quartile

# Broadcast each procedure group's ('icd10_pcs') 75th-percentile LOS onto every
# row of that group so it can be compared row by row.
df['percentile_75'] = df.groupby('icd10_pcs')['LOS'].transform(calculate_percentile_75)

# Flag stays longer than the group's 75th percentile: 1 = outlier, 0 = not.
# A missing LOS or threshold compares False, so such rows are flagged 0.
df['is_outlier'] = np.where(df['LOS'] > df['percentile_75'], 1, 0)

# The threshold column was only a working value; drop it by rebinding `df`
# rather than mutating the frame in place.
df = df.drop(columns='percentile_75')

# Non-outlier subset. .copy() makes it independent of `df`, so later column
# assignments on it cannot raise SettingWithCopyWarning.
df_no_outlier = df[df['is_outlier'] == 0].copy()
df_no_outlier

Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,hb,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier
1,478413008,133278262,277235295,0,35,F,54.0,,Asian,,...,12.7,,8.3,244.0,,,,17.62,3.760417,0
2,467425045,134213281,225860669,1440,60,F,62.0,154.0,Asian,1.0,...,,,,,,,,,3.493056,0
3,461473883,134195201,265770645,1440,35,F,50.0,160.0,Asian,1.0,...,10.4,,13.3,124.0,3.9,138.0,0.6,6.31,4.236111,0
4,466389608,160947402,262240911,1440,60,F,52.0,152.0,Asian,1.0,...,,,,,,,,,1.607639,0
8,439560439,163619571,279388936,0,75,F,65.0,154.0,Asian,2.0,...,,,,,,,,,0.604167,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58808,473347129,164083403,269112760,4913280,85,M,69.0,173.0,Asian,3.0,...,,,,,,,,,1.510417,0
58810,497537801,130062652,263988404,3909600,50,M,64.0,169.0,Asian,2.0,...,,,,,3.5,136.0,,,1.447917,0
58811,461252752,126772283,273139806,2880,70,F,53.0,162.0,Asian,2.0,...,,,,,3.7,142.0,,,2.614583,0
58812,471834474,144363433,275833861,2880,65,F,51.0,152.0,Asian,2.0,...,,,,,3.8,143.0,,,5.447917,0
