# Creating Train + Test set for AKI Prediction
#### MLR4H Group 4 (Chi Him Ng, Dheeraj Varghese and Danila Rusinkiewicz)


In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier # Import Random Forest Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
!pip install imblearn
from imblearn.ensemble import BalancedRandomForestClassifier



In the following sections, we create the train and test sets (for temporary storage) for the different time windows (12, 6, 3 and 1 hours). We ensure that all rows belonging to a given patient ID end up in either the train set or the test set, never in both.

### 12 hours

In [None]:
# Load the dataset for the 12-hour window from the working directory.
df = pd.read_csv("data_12.csv")

In [None]:
# Shift each admission's timeline so that its first recorded hour coincides
# with the earliest hour found anywhere in the dataset.

# Earliest recorded hour per admission.
min_hours = df.groupby('admissionid')['hour'].agg('min')

# Earliest recorded hour over all admissions.
global_min_hour = min_hours.min()

# Per-admission offset: global minimum minus the admission's own minimum.
adjustment_values = min_hours.rsub(global_min_hour)

# Look up each row's offset through its admissionid and add it to 'hour'.
df['hour'] = df['hour'].add(df['admissionid'].map(adjustment_values))

In [None]:
# Express the elapsed time in ICU days. The division is element-wise and does not
# depend on the admission, so no groupby/transform is needed.
# NOTE(review): the divisor 2 presumably means two 12-hour windows per day — confirm the unit of 'hour'.
df['icu_days'] = df['hour'] / 2

In [None]:
# List the column names to check which features are available.
df.columns

Index(['admissionid', 'hour', 'creatinine', 'urine', 'measuredat',
       'baseline_creatinine', 'temp', 'heart_rate', 'systolic_ABP', 'mean_ABP',
       'dystolic_ABP', 'resp_rate', 'glucose', 'hema', 'calcium', 'kalium',
       'ox_sat', 'thrombo', 'bilirubine', 'leukocyten', 'hematocryt',
       'lactate', 'sodium', 'ph', 'comparison_result', 'has_shock',
       'has_sepsis', 'has_ventilation', 'nsaid_taken', 'vassopressor_taken',
       'antimicrobiotic_taken', 'acei_taken', 'arb_taken', 'has_aki',
       'cardiac_surgery', 'traumatology', 'vascular_surgery',
       'gastroenterology_surgery', 'lungs_oncology_surgery',
       'oncology_surgery', 'neuro_surgery', 'patientid', 'gender_Man',
       'gender_Vrouw', 'agegroup', 'weightgroup', 'heightgroup', 'Death',
       'admissionyeargroup', 'creatinine_change', 'urine_change',
       'temp_change', 'heart_rate_change', 'systolic_ABP_change',
       'mean_ABP_change', 'dystolic_ABP_change', 'resp_rate_change',
       'glucose_change'

In [None]:
# Ordinal encoding of the age bands: the first band maps to 0, the last to 5.
age_bands = ['18-39', '40-49', '50-59', '60-69', '70-79', '80+']
mapping_dict = {band: rank for rank, band in enumerate(age_bands)}

# Values that are not one of the bands above become NaN.
df['agegroup'] = df['agegroup'].map(mapping_dict)

In [None]:
# Prediction targets: for every row, take the AKI stage / AKI flag of the NEXT row
# of the same admission (presumably the following 12-hour window).
# The last row of each admission has no successor and is labelled 0.
# The shifted result is assigned directly instead of calling fillna(inplace=True)
# on df[col], because that chained in-place call does not update df under
# pandas Copy-on-Write.
df['stage_12hours'] = df.groupby('admissionid')['comparison_result'].shift(-1).fillna(0)

# Binary AKI flag for the current row: any stage >= 1 counts as AKI.
df['AKI'] = (df['comparison_result'] >= 1).astype(int)
df['AKI_12hours'] = df.groupby('admissionid')['AKI'].shift(-1).fillna(0)

In [None]:
# Show the current stage/flag next to the shifted next-row targets as a sanity check.
df[['comparison_result', 'AKI', 'stage_12hours', "AKI_12hours"]]

Unnamed: 0,comparison_result,AKI,stage_12hours,AKI_12hours
0,0,0,0.0,0.0
1,0,0,0.0,0.0
2,0,0,1.0,1.0
3,1,1,0.0,0.0
4,0,0,0.0,0.0
...,...,...,...,...
198883,0,0,0.0,0.0
198884,0,0,0.0,0.0
198885,0,0,0.0,0.0
198886,0,0,0.0,0.0


In [None]:
# Remove columns that should not be written to the train/test files.
# NOTE(review): 'stage_24hours' and 'AKI_24hours' are not created in this notebook;
# they are presumably already present in data_12.csv — this raises KeyError otherwise.
df.drop(
    columns=["antimicrobiotic_taken", "stage_24hours", "AKI_24hours"],
    inplace=True,
)

In [None]:
# Report the number of rows, then separate the rows by admission period.
print(df.shape[0])
year_group = df['admissionyeargroup']
patients_old = df.loc[year_group.eq('2003-2009')]
patients_new = df.loc[year_group.eq('2010-2016')]

198888


In [None]:
# Distinct patient ids that occur in the 2003-2009 rows.
train_patients = pd.unique(patients_old['patientid'])

In [None]:
# NOTE: this draws 5% of the ROWS of the 2010-2016 subset, not 5% of its unique patients.
# Every patient whose id is drawn is subsequently moved to the training set with all of
# their rows, so patients with more rows are more likely to be selected.
train_patients_2 = patients_new['patientid'].sample(frac=0.05, random_state=42)  # Use a specific random_state for reproducibility


In [None]:
# Decide which rows of the 2010-2016 subset join the training set:
#   * rows of patients who also appear in the 2003-2009 subset, and
#   * rows of the patients drawn into train_patients_2.
in_old_cohort = patients_new['patientid'].isin(train_patients)
in_sample = patients_new['patientid'].isin(train_patients_2)

new_subset = patients_new.loc[in_old_cohort]
# Exclude rows already covered by new_subset so they are not appended twice.
new_subset_2 = patients_new.loc[in_sample & ~in_old_cohort]

patients_old = pd.concat([patients_old, new_subset, new_subset_2], ignore_index=True)

# Remove BOTH subsets from the test candidates. A boolean mask drops them in one step,
# keeps the original row order and does not rely on matching every column the way an
# outer merge with an indicator would.
patients_new = patients_new.loc[~(in_old_cohort | in_sample)].reset_index(drop=True)

# Every patient must be entirely in the training set or entirely in the test set.
assert not patients_new['patientid'].isin(patients_old['patientid']).any(), \
    "patient overlap between train and test sets"

In [None]:
# Row counts of the training and test sets.
print(patients_old.shape[0], patients_new.shape[0])

178396 20492


In [None]:
# Write the training and test sets to CSV without the index column.
for frame, csv_name in (
    (patients_old, 'train_data_12h.csv'),
    (patients_new, 'test_data_12h.csv'),
):
    frame.to_csv(csv_name, index=False)

### 6 hours

In [None]:
# Load the dataset for the 6-hour window from the working directory.
df = pd.read_csv("data_6.csv")

In [None]:
# Shift each admission's timeline so that its first recorded hour coincides
# with the earliest hour found anywhere in the dataset.

# Earliest recorded hour per admission.
min_hours = df.groupby('admissionid')['hour'].agg('min')

# Earliest recorded hour over all admissions.
global_min_hour = min_hours.min()

# Per-admission offset: global minimum minus the admission's own minimum.
adjustment_values = min_hours.rsub(global_min_hour)

# Look up each row's offset through its admissionid and add it to 'hour'.
df['hour'] = df['hour'].add(df['admissionid'].map(adjustment_values))

In [None]:
# Express the elapsed time in ICU days. The division is element-wise and does not
# depend on the admission, so no groupby/transform is needed.
# NOTE(review): the divisor 4 presumably means four 6-hour windows per day — confirm the unit of 'hour'.
df['icu_days'] = df['hour'] / 4

In [None]:
# Ordinal encoding of the age bands: the first band maps to 0, the last to 5.
age_bands = ['18-39', '40-49', '50-59', '60-69', '70-79', '80+']
mapping_dict = {band: rank for rank, band in enumerate(age_bands)}

# Values that are not one of the bands above become NaN.
df['agegroup'] = df['agegroup'].map(mapping_dict)

In [None]:
# Prediction targets: for every row, take the AKI stage / AKI flag of the NEXT row
# of the same admission (presumably the following 6-hour window).
# The last row of each admission has no successor and is labelled 0.
# The shifted result is assigned directly instead of calling fillna(inplace=True)
# on df[col], because that chained in-place call does not update df under
# pandas Copy-on-Write.
df['stage_6hours'] = df.groupby('admissionid')['comparison_result'].shift(-1).fillna(0)

# Binary AKI flag for the current row: any stage >= 1 counts as AKI.
df['AKI'] = (df['comparison_result'] >= 1).astype(int)
df['AKI_6hours'] = df.groupby('admissionid')['AKI'].shift(-1).fillna(0)

In [None]:
# Show the current stage/flag next to the shifted next-row targets as a sanity check.
df[['comparison_result', 'AKI', 'stage_6hours', "AKI_6hours"]]

Unnamed: 0,comparison_result,AKI,stage_6hours,AKI_6hours
0,0,0,0.0,0.0
1,0,0,0.0,0.0
2,0,0,0.0,0.0
3,0,0,0.0,0.0
4,0,0,0.0,0.0
...,...,...,...,...
385407,0,0,0.0,0.0
385408,0,0,0.0,0.0
385409,0,0,0.0,0.0
385410,0,0,0.0,0.0


In [None]:
# Remove the column that should not be written to the train/test files.
df.drop(columns=["antimicrobiotic_taken"], inplace=True)

In [None]:
# Report the number of rows, then separate the rows by admission period.
print(df.shape[0])
year_group = df['admissionyeargroup']
patients_old = df.loc[year_group.eq('2003-2009')]
patients_new = df.loc[year_group.eq('2010-2016')]

385412


In [None]:
# Distinct patient ids that occur in the 2003-2009 rows.
train_patients = pd.unique(patients_old['patientid'])

In [None]:
# NOTE: this draws 1.5% of the ROWS of the 2010-2016 subset, not 1.5% of its unique patients.
# Every patient whose id is drawn is subsequently moved to the training set with all of
# their rows, so patients with more rows are more likely to be selected.
train_patients_2 = patients_new['patientid'].sample(frac=0.015, random_state=42)  # Use a specific random_state for reproducibility


In [None]:
# Decide which rows of the 2010-2016 subset join the training set:
#   * rows of patients who also appear in the 2003-2009 subset, and
#   * rows of the patients drawn into train_patients_2.
in_old_cohort = patients_new['patientid'].isin(train_patients)
in_sample = patients_new['patientid'].isin(train_patients_2)

new_subset = patients_new.loc[in_old_cohort]
# Exclude rows already covered by new_subset so they are not appended twice.
new_subset_2 = patients_new.loc[in_sample & ~in_old_cohort]

patients_old = pd.concat([patients_old, new_subset, new_subset_2], ignore_index=True)

# Remove BOTH subsets from the test candidates. A boolean mask drops them in one step,
# keeps the original row order and does not rely on matching every column the way an
# outer merge with an indicator would.
patients_new = patients_new.loc[~(in_old_cohort | in_sample)].reset_index(drop=True)

# Every patient must be entirely in the training set or entirely in the test set.
assert not patients_new['patientid'].isin(patients_old['patientid']).any(), \
    "patient overlap between train and test sets"

In [None]:
# Row counts of the training and test sets.
print(patients_old.shape[0], patients_new.shape[0])

323431 61981


In [None]:
# Write the training and test sets to CSV without the index column.
for frame, csv_name in (
    (patients_old, 'train_data_6h.csv'),
    (patients_new, 'test_data_6h.csv'),
):
    frame.to_csv(csv_name, index=False)

### 3 hours

In [None]:
# Load the dataset for the 3-hour window from the working directory.
df = pd.read_csv("data_3.csv")

In [None]:
# Shift each admission's timeline so that its first recorded hour coincides
# with the earliest hour found anywhere in the dataset.

# Earliest recorded hour per admission.
min_hours = df.groupby('admissionid')['hour'].agg('min')

# Earliest recorded hour over all admissions.
global_min_hour = min_hours.min()

# Per-admission offset: global minimum minus the admission's own minimum.
adjustment_values = min_hours.rsub(global_min_hour)

# Look up each row's offset through its admissionid and add it to 'hour'.
df['hour'] = df['hour'].add(df['admissionid'].map(adjustment_values))

In [None]:
# Express the elapsed time in ICU days. The division is element-wise and does not
# depend on the admission, so no groupby/transform is needed.
# NOTE(review): the divisor 8 presumably means eight 3-hour windows per day — confirm the unit of 'hour'.
df['icu_days'] = df['hour'] / 8

In [None]:
# Ordinal encoding of the age bands: the first band maps to 0, the last to 5.
age_bands = ['18-39', '40-49', '50-59', '60-69', '70-79', '80+']
mapping_dict = {band: rank for rank, band in enumerate(age_bands)}

# Values that are not one of the bands above become NaN.
df['agegroup'] = df['agegroup'].map(mapping_dict)

In [None]:
# Prediction targets: for every row, take the AKI stage / AKI flag of the NEXT row
# of the same admission (presumably the following 3-hour window).
# The last row of each admission has no successor and is labelled 0.
# The shifted result is assigned directly instead of calling fillna(inplace=True)
# on df[col], because that chained in-place call does not update df under
# pandas Copy-on-Write.
df['stage_3hours'] = df.groupby('admissionid')['comparison_result'].shift(-1).fillna(0)

# Binary AKI flag for the current row: any stage >= 1 counts as AKI.
df['AKI'] = (df['comparison_result'] >= 1).astype(int)
df['AKI_3hours'] = df.groupby('admissionid')['AKI'].shift(-1).fillna(0)

In [None]:
# Show the current stage/flag next to the shifted next-row targets as a sanity check.
df[['comparison_result', 'AKI', 'stage_3hours', "AKI_3hours"]]

Unnamed: 0,comparison_result,AKI,stage_3hours,AKI_3hours
0,0,0,0.0,0.0
1,0,0,0.0,0.0
2,0,0,0.0,0.0
3,0,0,0.0,0.0
4,0,0,0.0,0.0
...,...,...,...,...
685471,0,0,0.0,0.0
685472,0,0,0.0,0.0
685473,0,0,0.0,0.0
685474,0,0,0.0,0.0


In [None]:
# Remove the column that should not be written to the train/test files.
df.drop(columns=["antimicrobiotic_taken"], inplace=True)

In [None]:
# Report the number of rows, then separate the rows by admission period.
print(df.shape[0])
year_group = df['admissionyeargroup']
patients_old = df.loc[year_group.eq('2003-2009')]
patients_new = df.loc[year_group.eq('2010-2016')]

685476


In [None]:
# Distinct patient ids that occur in the 2003-2009 rows.
train_patients = pd.unique(patients_old['patientid'])

In [None]:
# NOTE: this draws 1.2% of the ROWS of the 2010-2016 subset, not 1.2% of its unique patients.
# Every patient whose id is drawn is subsequently moved to the training set with all of
# their rows, so patients with more rows are more likely to be selected.
train_patients_2 = patients_new['patientid'].sample(frac=0.012, random_state=42)  # Use a specific random_state for reproducibility


In [None]:
# Decide which rows of the 2010-2016 subset join the training set:
#   * rows of patients who also appear in the 2003-2009 subset, and
#   * rows of the patients drawn into train_patients_2.
in_old_cohort = patients_new['patientid'].isin(train_patients)
in_sample = patients_new['patientid'].isin(train_patients_2)

new_subset = patients_new.loc[in_old_cohort]
# Exclude rows already covered by new_subset so they are not appended twice.
new_subset_2 = patients_new.loc[in_sample & ~in_old_cohort]

patients_old = pd.concat([patients_old, new_subset, new_subset_2], ignore_index=True)

# Remove BOTH subsets from the test candidates. A boolean mask drops them in one step,
# keeps the original row order and does not rely on matching every column the way an
# outer merge with an indicator would.
patients_new = patients_new.loc[~(in_old_cohort | in_sample)].reset_index(drop=True)

# Every patient must be entirely in the training set or entirely in the test set.
assert not patients_new['patientid'].isin(patients_old['patientid']).any(), \
    "patient overlap between train and test sets"

In [None]:
# Row counts of the training and test sets.
print(patients_old.shape[0], patients_new.shape[0])

598056 87420


In [None]:
# Write the training and test sets to CSV without the index column.
for frame, csv_name in (
    (patients_old, 'train_data_3h.csv'),
    (patients_new, 'test_data_3h.csv'),
):
    frame.to_csv(csv_name, index=False)

### 1 hour

In [None]:
# Load the dataset for the 1-hour window from the working directory.
df = pd.read_csv("data_1.csv")

In [None]:
# Shift each admission's timeline so that its first recorded hour coincides
# with the earliest hour found anywhere in the dataset.

# Earliest recorded hour per admission.
min_hours = df.groupby('admissionid')['hour'].agg('min')

# Earliest recorded hour over all admissions.
global_min_hour = min_hours.min()

# Per-admission offset: global minimum minus the admission's own minimum.
adjustment_values = min_hours.rsub(global_min_hour)

# Look up each row's offset through its admissionid and add it to 'hour'.
df['hour'] = df['hour'].add(df['admissionid'].map(adjustment_values))

In [None]:
# Inspect the 'hour' values after the alignment step.
df['hour']

0            0
1            1
2            2
3            3
4            4
          ... 
1993369    254
1993370    256
1993371    258
1993372    260
1993373    262
Name: hour, Length: 1993374, dtype: int64

In [None]:
# Express the elapsed time in ICU days. The division is element-wise and does not
# depend on the admission, so no groupby/transform is needed.
# NOTE(review): the divisor 24 presumably means 24 one-hour windows per day — confirm the unit of 'hour'.
df['icu_days'] = df['hour'] / 24

In [None]:
# Ordinal encoding of the age bands: the first band maps to 0, the last to 5.
age_bands = ['18-39', '40-49', '50-59', '60-69', '70-79', '80+']
mapping_dict = {band: rank for rank, band in enumerate(age_bands)}

# Values that are not one of the bands above become NaN.
df['agegroup'] = df['agegroup'].map(mapping_dict)

In [None]:
# Prediction targets: for every row, take the AKI stage / AKI flag of the NEXT row
# of the same admission (presumably the following 1-hour window).
# The last row of each admission has no successor and is labelled 0.
# The shifted result is assigned directly instead of calling fillna(inplace=True)
# on df[col], because that chained in-place call does not update df under
# pandas Copy-on-Write.
df['stage_1hours'] = df.groupby('admissionid')['comparison_result'].shift(-1).fillna(0)

# Binary AKI flag for the current row: any stage >= 1 counts as AKI.
df['AKI'] = (df['comparison_result'] >= 1).astype(int)
df['AKI_1hours'] = df.groupby('admissionid')['AKI'].shift(-1).fillna(0)

In [None]:
# Show the current stage/flag next to the shifted next-row targets as a sanity check.
df[['comparison_result', 'AKI', 'stage_1hours', "AKI_1hours"]]

Unnamed: 0,comparison_result,AKI,stage_1hours,AKI_1hours
0,0,0,0.0,0.0
1,0,0,0.0,0.0
2,0,0,0.0,0.0
3,0,0,0.0,0.0
4,0,0,0.0,0.0
...,...,...,...,...
1993369,0,0,0.0,0.0
1993370,0,0,0.0,0.0
1993371,0,0,0.0,0.0
1993372,0,0,0.0,0.0


In [None]:
# Remove the column that should not be written to the train/test files.
df.drop(columns=["antimicrobiotic_taken"], inplace=True)

In [None]:
# Report the number of rows, then separate the rows by admission period.
print(df.shape[0])
year_group = df['admissionyeargroup']
patients_old = df.loc[year_group.eq('2003-2009')]
patients_new = df.loc[year_group.eq('2010-2016')]

1993374


In [None]:
# Distinct patient ids that occur in the 2003-2009 rows.
train_patients = pd.unique(patients_old['patientid'])

In [None]:
# NOTE: this draws 0.29% of the ROWS of the 2010-2016 subset, not 0.29% of its unique patients.
# Every patient whose id is drawn is subsequently moved to the training set with all of
# their rows, so patients with more rows are more likely to be selected.
train_patients_2 = patients_new['patientid'].sample(frac=0.0029, random_state=42)  # Use a specific random_state for reproducibility


In [None]:
# Decide which rows of the 2010-2016 subset join the training set:
#   * rows of patients who also appear in the 2003-2009 subset, and
#   * rows of the patients drawn into train_patients_2.
in_old_cohort = patients_new['patientid'].isin(train_patients)
in_sample = patients_new['patientid'].isin(train_patients_2)

new_subset = patients_new.loc[in_old_cohort]
# Exclude rows already covered by new_subset so they are not appended twice.
new_subset_2 = patients_new.loc[in_sample & ~in_old_cohort]

patients_old = pd.concat([patients_old, new_subset, new_subset_2], ignore_index=True)

# Remove BOTH subsets from the test candidates. A boolean mask drops them in one step,
# keeps the original row order and does not rely on matching every column the way an
# outer merge with an indicator would.
patients_new = patients_new.loc[~(in_old_cohort | in_sample)].reset_index(drop=True)

# Every patient must be entirely in the training set or entirely in the test set.
assert not patients_new['patientid'].isin(patients_old['patientid']).any(), \
    "patient overlap between train and test sets"

In [None]:
# Row counts of the training and test sets.
print(patients_old.shape[0], patients_new.shape[0])

1675865 317509


In [None]:
# Write the training and test sets to CSV without the index column.
for frame, csv_name in (
    (patients_old, 'train_data_1h.csv'),
    (patients_new, 'test_data_1h.csv'),
):
    frame.to_csv(csv_name, index=False)