# Data Preprocessing

In [39]:
import pandas as pd
import numpy as np
import math

In [40]:
# Equation for Cleaning data

def clean_Sex(x):
    if x.lower() == "male" or x.lower() == "m" or x == 1:
        return "male"
    elif x.lower() == "female" or x.lower() == "f":
        return "female"

def fill_na(x):
    if math.isnan(x):
        return 0
    else:
        return x


In [41]:
#Read the data
df = pd.read_csv("TD_HOSPITAL_TRAIN.csv")
df.head()

Unnamed: 0,timeknown,cost,reflex,sex,blood,bloodchem1,bloodchem2,temperature,race,heart,...,diabetes,income,extraprimary,bloodchem6,education,psych5,psych6,information,cancer,death
0,4.0,3008.38867,11.228005,male,20.699219,2.199707,1.299805,35.59375,white,103.0,...,0.0,$11-$25k,COPD/CHF/Cirrhosis,167.5,20.0,30.0,2.0,0.0,no,1.0
1,467.0,23585.8906,9.714861,M,9.398438,,0.699951,39.0,white,50.0,...,0.0,>$50k,Cancer,480.0,16.0,11.5,1.0,10.0,metastatic,1.0
2,533.0,4046.45898,11.353296,Male,19.296875,,1.599854,38.19531,white,50.0,...,1.0,under $11k,ARF/MOSF,177.125,5.0,18.0,0.0,5.0,yes,0.0
3,68.0,,9.269058,female,7.5,2.5,0.599976,37.59375,white,80.0,...,0.0,$11-$25k,COPD/CHF/Cirrhosis,,12.0,7.0,1.839,12.0,no,1.0
4,1605.0,6457.70703,8.655387,female,15.099609,4.399414,0.699951,35.69531,white,114.0,...,0.0,under $11k,COPD/CHF/Cirrhosis,233.3125,2.0,7.0,6.0,12.0,no,0.0


In [42]:
unique_pdeath_values = df['pdeath'].unique()
unique_pdeath_values

array([nan])

In [43]:
df = df.drop(columns=['pdeath'])

In [44]:
#Apply fill_na to numeric columns
for col in df.columns:
    if df[col].dtype in [np.int64, np.float64]:
        df[col] = df[col].apply(fill_na)

In [45]:
#ONE HOT ENCODING

encoding_columns = ["sex", "dnr", "primary", "disability", "extraprimary", "cancer", "race"]

for i in encoding_columns:
    newCols = pd.get_dummies(df[i], prefix=i)
    df = pd.concat([df, newCols], axis=1)
    df = df.drop(columns=[i])


In [48]:
for i in df.columns:
    try:
        df[i] = df[i].astype(float)
    except ValueError:
        pass

In [50]:
#drop un important features
drop_cols = ["cost", "income"]
df = df.drop(columns=drop_cols)

# Model Training

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler

In [52]:
#Get features and labels
y = df['death'] 
X = df.drop(columns=['death'])

In [55]:
# Scalling
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
ada = AdaBoostClassifier()

#Hypertunining
param_grid = {'n_estimators': [50, 100, 200, 300, 400, 500, 1000],
              'learning_rate': [ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
              'algorithm': ['SAMME', 'SAMME.R']}

#train with grid search
grid = GridSearchCV(ada, param_grid, cv=5)
grid.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
y_pred = grid.predict_proba(X_test)[:,1]
rocauc = roc_auc_score(y_test, y_pred)
print(rocauc)