In [41]:
import pandas as pd
import numpy as np
from io import StringIO
Train = pd.read_csv("Data/train.csv")
Test = pd.read_csv("Data/test.csv")

In [42]:
# fill null values with median for temp_apache
Train['temp_apache'].fillna(Train['temp_apache'].median(), inplace=True)

# fill null values with median for d1_potassium_max
Train['d1_potassium_max'].fillna(Train['d1_potassium_max'].median(), inplace=True)

# fill null values with median for apache_4a_hospital_death_prob
Train['apache_4a_hospital_death_prob'].fillna(Train['apache_4a_hospital_death_prob'].median(), inplace=True)

# fill null values with median for apache_4a_icu_death_prob
Train['apache_4a_icu_death_prob'].fillna(Train['apache_4a_icu_death_prob'].median(), inplace=True)
#since all these r heavily skewed andaffected by the outliers we will fill them using median imputation

Test['temp_apache'].fillna(Test['temp_apache'].median(), inplace=True)

# fill null values with median for d1_potassium_max
Test['d1_potassium_max'].fillna(Test['d1_potassium_max'].median(), inplace=True)

# fill null values with median for apache_4a_hospital_death_prob
Test['apache_4a_hospital_death_prob'].fillna(Test['apache_4a_hospital_death_prob'].median(), inplace=True)

# fill null values with median for apache_4a_icu_death_prob
Test['apache_4a_icu_death_prob'].fillna(Test['apache_4a_icu_death_prob'].median(), inplace=True)

# group the dataframe by apache_2_bodysystem and calculate the mean age for each group
mean_age_by_bodysystem = Train.groupby('apache_2_bodysystem')['age'].mean()

# define a function that takes a row of the dataframe as input and returns the mean age of the corresponding apache_2_bodysystem
def fill_age(row):
    if pd.isnull(row['age']):
        if pd.isnull(row['apache_2_bodysystem']):
            return np.nan
        else:
            return mean_age_by_bodysystem[row['apache_2_bodysystem']]
    else:
        return row['age']

# apply the function to each row of the dataframe and fill the missing age values with the corresponding mean age
Train['age'] = Train.apply(fill_age, axis=1)

# group the dataframe by apache_2_bodysystem and calculate the mean age for each group
mean_age_by_bodysystem = Test.groupby('apache_2_bodysystem')['age'].mean()

# define a function that takes a row of the dataframe as input and returns the mean age of the corresponding apache_2_bodysystem
def fill_age(row):
    if pd.isnull(row['age']):
        if pd.isnull(row['apache_2_bodysystem']):
            return np.nan
        else:
            return mean_age_by_bodysystem[row['apache_2_bodysystem']]
    else:
        return row['age']

# apply the function to each row of the dataframe and fill the missing age values with the corresponding mean age
Test['age'] = Test.apply(fill_age, axis=1)

#for all binary columns we will apply mode imputation for missing values
#first we will create a list of all binary columns
binary_colsTest = ['elective_surgery', 'apache_post_operative', 'gcs_unable_apache', 'intubated_apache', 'ventilated_apache','immunosuppression', 'solid_tumor_with_metastasis']

binary_colsTrain = ['elective_surgery', 'apache_post_operative', 'gcs_unable_apache', 'intubated_apache', 'ventilated_apache','immunosuppression', 'solid_tumor_with_metastasis','hospital_death']
#now we will apply mode imputation on these columns
from sklearn.impute import SimpleImputer
binary_colsTest = [col for col in Train.columns if Train[col].dtype == 'object' or col in binary_colsTest]
binary_colsTrain = [col for col in Test.columns if Test[col].dtype == 'object' or col in binary_colsTrain]

imputer = SimpleImputer(strategy='most_frequent')
Train[binary_colsTrain] = imputer.fit_transform(Train[binary_colsTrain])
Test[binary_colsTest] = imputer.fit_transform(Test[binary_colsTest])

numeric_cols = [col for col in Train.select_dtypes(include=[np.number]).columns if col not in binary_colsTrain]
numeric_colsTest = [col for col in Test.select_dtypes(include=[np.number]).columns if col not in binary_colsTrain]

from sklearn.impute import KNNImputer

# create an instance of KNNImputer with k=3
imputer = KNNImputer(n_neighbors=3)

# fill missing values in Train dataframe
Train[numeric_cols] = imputer.fit_transform(Train[numeric_cols])

# fill missing values in Test dataframe
Test[numeric_colsTest] = imputer.fit_transform(Test[numeric_colsTest])

In [43]:
#next we drop simialr record columns
Train = Train.drop('apache_3j_bodysystem', axis=1)
Test = Test.drop('apache_3j_bodysystem', axis=1)

In [44]:
dropcolumns=['d1_diasbp_noninvasive_min','h1_sysbp_max','h1_mbp_max', 'h1_mbp_noninvasive_max', 'h1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min', 'h1_diasbp_noninvasive_min']
Train= Train.drop(dropcolumns, axis=1)
Test= Test.drop(dropcolumns, axis=1)

In [45]:
#one hot encode Train and Test
Train = pd.get_dummies(Train)
Test = pd.get_dummies(Test)


In [46]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
Train_scaled = scaler.fit_transform(Train)
Train_scaled = pd.DataFrame(Train_scaled, columns=Train.columns)


In [47]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
Test_scaled = scaler.fit_transform(Test)
Test_scaled = pd.DataFrame(Test_scaled, columns=Test.columns)


In [48]:
#split Train into Train and Val
from sklearn.model_selection import train_test_split
X = Train_scaled.loc[: , Train_scaled.columns != 'hospital_death']
y = Train_scaled['hospital_death']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [49]:

import time
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_curve, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [77]:
rf = RandomForestClassifier(max_depth=3,n_estimators=100, criterion='log_loss')
rf.fit(X_train, y_train)
md_probs = rf.predict_proba(X_val)
md_probs = md_probs[:,1]
md_auc = roc_auc_score(y_val, md_probs)
print("RFC" , " : ", md_auc)

RFC  :  0.8474678585880603


I should have used max_features='sqrt'

In [171]:
#record the start time
nb_c = CatBoostClassifier(iterations=650, depth=3, learning_rate=0.1, loss_function='Logloss', verbose=False)
nb_c.fit(X_train,y_train)
md_probs = nb_c.predict_proba(X_val)
md_probs = md_probs[:,1]
md_auc = roc_auc_score(y_val, md_probs)
print("Cat Boost" , " : ", md_auc)

Cat Boost  :  0.877734741018164


In [None]:
0.877734741018164=650 3


In [82]:
y_new_predBAG = nb_c.predict_proba(Test_scaled)
hospital_death = y_new_predBAG[:, 1]



In [161]:
# Create a DataFrame for the predictions
predictions_df = pd.DataFrame(hospital_death, columns=['hospital_death'])

# Add the record ID from the test data to the predictions DataFrame
predictions_df.insert(0, 'RecordID', Test['RecordID'])

# Save the predictions to a CSV file
# predictions_df.to_csv('predictionsCAT.csv', index=False)
# predictions_df.to_csv('predictionsXGB.csv', index=False)
# predictions_df.to_csv('predictionsgb.csv', index=False)
# predictions_df.to_csv('predictionslgb.csv', index=False)
# predictions_df.to_csv('predictionsab.csv', index=False)
predictions_df.to_csv('predictionsBGC.csv', index=False)


In [118]:
xgb_model = xgb.XGBClassifier(max_depth=2, n_estimators=900, learning_rate=0.1)
xgb_model.fit(X_train,y_train)
md_probs = xgb_model.predict_proba(X_val)
md_probs = md_probs[:,1]
md_auc = roc_auc_score(y_val, md_probs)
print("Cat Boost" , " : ", md_auc)


Cat Boost  :  0.8765432672665414


In [109]:
y_new_predBAG = xgb_model.predict_proba(Test_scaled)
hospital_death = y_new_predBAG[:, 1]



In [143]:
gb = GradientBoostingClassifier(max_depth=2,n_estimators=950, learning_rate=0.1)
gb.fit(X_train,y_train)
md_probs = gb.predict_proba(X_val)
md_probs = md_probs[:,1]
md_auc = roc_auc_score(y_val, md_probs)
print("Cat Boost" , " : ", md_auc)


Cat Boost  :  0.8768628721662469


In [153]:
y_new_predBAG = ab.predict_proba(Test_scaled)
hospital_death = y_new_predBAG[:, 1]



In [152]:
ab = AdaBoostClassifier(n_estimators=300, learning_rate=0.1)
ab.fit(X_train,y_train)
md_probs = ab.predict_proba(X_val)
md_probs = md_probs[:,1]
md_auc = roc_auc_score(y_val, md_probs)
print("Cat Boost" , " : ", md_auc)


Cat Boost  :  0.8705784951986667


In [None]:
0.8725895555138906 600

In [138]:
lgb_model = lgb.LGBMClassifier(max_depth=2, n_estimators=650, learning_rate=0.1, num_leaves=31)
lgb_model.fit(X_train, y_train)
md_probs = lgb_model.predict_proba(X_val)
md_probs = md_probs[:,1]
md_auc = roc_auc_score(y_val, md_probs)
print("Cat Boost" , " : ", md_auc)

[LightGBM] [Info] Number of positive: 3069, number of negative: 31931
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4254
[LightGBM] [Info] Number of data points in the train set: 35000, number of used features: 84
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.087686 -> initscore=-2.342226
[LightGBM] [Info] Start training from score -2.342226
Cat Boost  :  0.8769578526131875


In [141]:
y_new_predBAG = lgb_model.predict_proba(Test_scaled)
hospital_death = y_new_predBAG[:, 1]



In [155]:
from sklearn.neighbors import KNeighborsClassifier

In [159]:
#record the start time
nb_c = KNeighborsClassifier(n_neighbors=6)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# create a pipeline with StandardScaler and KNeighborsClassifier
lgb_model = lgb.LGBMClassifier(max_depth=2, n_estimators=650, learning_rate=0.1)
bg_c = BaggingClassifier(base_estimator=lgb_model, n_estimators=650)
bg_c.fit(X_train,y_train)
md_probs = bg_c.predict_proba(X_val)
md_probs = md_probs[:,1]
md_auc = roc_auc_score(y_val, md_probs)
print("Cat Boost" , " : ", md_auc)


[LightGBM] [Info] Number of positive: 3069, number of negative: 31931
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012781 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4254
[LightGBM] [Info] Number of data points in the train set: 35000, number of used features: 84
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086857 -> initscore=-2.352628
[LightGBM] [Info] Start training from score -2.352628
[LightGBM] [Info] Number of positive: 3069, number of negative: 31931
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011683 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4254
[LightGBM] [Info] Number of data points in the train set: 35000, number of used features: 84
[LightGBM] [Info] [b

In [160]:
y_new_predBAG = bg_c.predict_proba(Test_scaled)
hospital_death = y_new_predBAG[:, 1]



In [162]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier


In [164]:
et_clf = ExtraTreesClassifier(n_estimators=650, bootstrap=True) #boostrap by default is False


In [165]:
xgb_model = xgb.XGBClassifier(max_depth=2, n_estimators=900, learning_rate=0.1)


In [166]:
v_clf = VotingClassifier(estimators=[('LGBB', bg_c), ('et', et_clf), ('xgb', xgb_model)], voting='soft')
v_clf.fit(X,y)


[LightGBM] [Info] Number of positive: 4338, number of negative: 45662
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017279 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4361
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 84
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086340 -> initscore=-2.359166
[LightGBM] [Info] Start training from score -2.359166
[LightGBM] [Info] Number of positive: 4338, number of negative: 45662
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016370 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4361
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 84
[LightGBM] [Info] [b

In [169]:
y_new_predVote = v_clf.predict_proba(Test_scaled)
hospital_death = y_new_predVote[:, 1]



In [170]:
# Create a DataFrame for the predictions
predictions_df = pd.DataFrame(hospital_death, columns=['hospital_death'])

# Add the record ID from the test data to the predictions DataFrame
predictions_df.insert(0, 'RecordID', Test['RecordID'])

# Save the predictions to a CSV file
# predictions_df.to_csv('predictionsCAT.csv', index=False)
# predictions_df.to_csv('predictionsXGB.csv', index=False)
# predictions_df.to_csv('predictionsgb.csv', index=False)
# predictions_df.to_csv('predictionslgb.csv', index=False)
# predictions_df.to_csv('predictionsab.csv', index=False)
predictions_df.to_csv('predictionsVoting.csv', index=False)


In [172]:
from sklearn.ensemble import StackingClassifier

In [173]:
estimators = [('lgb', bg_c),
                ('CAT', nb_c),
                ('rf', RandomForestClassifier(max_depth=10,n_estimators=600))]

In [175]:
st_clf = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())
st_clf.fit(X,y)

[LightGBM] [Info] Number of positive: 4338, number of negative: 45662
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049857 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4361
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 84
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.085080 -> initscore=-2.375245
[LightGBM] [Info] Start training from score -2.375245
[LightGBM] [Info] Number of positive: 4338, number of negative: 45662
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.056993 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4361
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 84
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.088820 -> initscore=-2.328129
[LightGBM] [Info] Start training from score -2.328129
[LightGBM] [In

In [180]:
st_clf = StackingClassifier(estimators=estimators, final_estimator=v_clf)
st_clf.fit(X,y)

[LightGBM] [Info] Number of positive: 4338, number of negative: 45662
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017433 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4361
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 84
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.084460 -> initscore=-2.383236
[LightGBM] [Info] Start training from score -2.383236
[LightGBM] [Info] Number of positive: 4338, number of negative: 45662
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016664 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4361
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 84
[LightGBM] [Info] [b

In [182]:
y_new_predVote = st_clf.predict_proba(Test)
hospital_death = y_new_predVote[:, 1]
# Create a DataFrame for the predictions
predictions_df = pd.DataFrame(hospital_death, columns=['hospital_death'])

# Add the record ID from the test data to the predictions DataFrame
predictions_df.insert(0, 'RecordID', Test['RecordID'])

# Save the predictions to a CSV file
# predictions_df.to_csv('predictionsCAT.csv', index=False)
# predictions_df.to_csv('predictionsXGB.csv', index=False)
# predictions_df.to_csv('predictionsgb.csv', index=False)
# predictions_df.to_csv('predictionslgb.csv', index=False)
# predictions_df.to_csv('predictionsab.csv', index=False)
predictions_df.to_csv('predictionsStack.csv', index=False)

