# Classification Models Evaluation

## 0. Installation requirements
This notebook requires scikit-learn and sklearn-genetic (imbalanced-learn, pandas, numpy and matplotlib are also imported below).
Run the following lines to install them if required:

In [None]:
# !pip install scikit-learn
# !pip install sklearn-genetic  

In [None]:
import numpy as np
import pandas as p

from imblearn.over_sampling import SMOTE

import pandas as pd
import copy
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier 
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

from sklearn import svm, linear_model

import time 
import numpy as np
from genetic_selection import GeneticSelectionCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import RocCurveDisplay


# 1. Data Pre-processing 

## Function: pre_process_data
Reads the CSV file, casts the outcome column to integer and label-encodes the gender column for processing.
Columns with fewer than 80% non-missing values are dropped; for each remaining feature, rows lying more than 4 standard deviations from the column mean (or missing a value) are removed and the feature is standardised. The dataset is then split into training and test sets. Finally, SMOTE is applied to the training set only, to address the class imbalance.

## Function: get_feats
Takes the classification model and the training data, and uses a genetic search (GeneticSelectionCV) to determine the best features to use. Candidate feature subsets are evaluated with k-fold cross-validation.

## Function: print_results 
Takes in the model and dataset. Using the training data we fit the model and evaluate the performance on the Training and Test dataset using the F-measure and AUROC scoring metric

In [None]:
def pre_process_data(csv_path='Assignment_1_data.csv', min_non_nan_pct=80, n_std=4,
                     test_size=0.3, random_state=42):
    """Load the dataset, clean it, split it and oversample the training split.

    Steps, in order:
      1. Read ``csv_path``; cast ``outcome`` to int and label-encode ``gender``.
      2. Drop every column whose share of non-missing values (measured on the
         raw file) is below ``min_non_nan_pct`` percent.
      3. For every remaining column except ``outcome``: keep only rows whose
         value lies within ``n_std`` standard deviations of the column mean,
         then standardise the column to zero mean / unit variance.
      4. Drop any row that still contains a missing value.
      5. Split into train/test with ``train_test_split``.
      6. Oversample the training split with SMOTE; the test split is untouched.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to load. It must contain ``outcome`` and
        ``gender`` columns.
    min_non_nan_pct : float
        Minimum percentage of non-missing values a column needs to be kept.
    n_std : float
        Half-width, in standard deviations, of the accepted value range.
    test_size : float
        Fraction of the cleaned rows assigned to the test split.
    random_state : int
        Seed used for both the train/test split and SMOTE.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the training pair has
        been resampled by SMOTE.

    NOTE(review): the standardisation statistics are computed on the whole
    dataset before the train/test split, so the test rows influence the
    scaling of the training rows. Left as-is to keep results reproducible;
    consider fitting the scaler on the training split only.
    """
    df = pd.read_csv(csv_path)
    df["outcome"] = df["outcome"].astype(int)
    df["gender"] = LabelEncoder().fit_transform(df["gender"])

    X = df.copy()
    y = X['outcome'].astype(int)
    print('Num Samples:', X.shape[0], 'Num Features:', X.shape[1], 'Num intubation', np.sum(y==1))
    print("Percentage of intubation:", np.round(np.sum(y==1)/X.shape[0]*100, 3) , "%")

    num_samples = X.shape[0]
    for col in df.columns:
        # Column completeness is judged on the raw file, not on the
        # progressively filtered frame.
        non_nan_percentage = df[col].count() / num_samples * 100
        if non_nan_percentage < min_non_nan_pct:
            X = X.drop(columns=[col])
            continue
        if col == 'outcome':
            # The target is neither outlier-filtered nor standardised.
            continue

        mean = X[col].mean()
        sd = X[col].std()
        # between() is inclusive on both ends and False for NaN, so rows with
        # a missing value in this column are removed here as well. The
        # explicit copy makes the assignment below operate on an independent
        # frame instead of on a filtered slice (avoids SettingWithCopyWarning).
        X = X[X[col].between(mean - n_std * sd, mean + n_std * sd)].copy()
        # Standardise using the statistics of the rows that survived the filter.
        X[col] = (X[col] - X[col].mean()) / X[col].std()

    X = X.dropna()
    y = X['outcome'].astype(int)
    X = X.drop(columns=['outcome'])

    print('Num Samples:', X.shape[0], 'Num Features:', X.shape[1], 'Num intubation', np.sum(y==1))
    print("Percentage of intubation:", np.round(np.sum(y==1)/X.shape[0]*100, 3) , "%")

    X_tr, X_test, y_tr, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Only the training split is resampled.
    sm = SMOTE(random_state=random_state)
    X_train, y_train = sm.fit_resample(X_tr, y_tr)

    return X_train, X_test, y_train, y_test

def get_feats(X, y, estimator):
    """Select a feature subset for ``estimator`` with GeneticSelectionCV.

    Parameters
    ----------
    X : DataFrame
        Training features; the column labels are used to name the selection.
    y : array-like
        Training labels.
    estimator : estimator object
        Model passed to GeneticSelectionCV as the one being cross-validated.

    Returns
    -------
    Index
        Column labels of ``X`` for which the fitted selector's ``support_``
        mask is true. The selection is also printed.
    """
    search_settings = dict(
        cv=10,
        verbose=0,
        scoring="accuracy",
        max_features=10,
        n_population=100,
        crossover_proba=0.8,
        mutation_proba=0.2,
        n_generations=5,
        crossover_independent_proba=0.5,
        mutation_independent_proba=0.05,
        tournament_size=10,
        # Larger than n_generations; presumably this means the no-improvement
        # stop never triggers - confirm against the library documentation.
        n_gen_no_change=10,
        caching=True,
        n_jobs=-1,
    )
    selector = GeneticSelectionCV(estimator, **search_settings)
    selector = selector.fit(X, y)

    chosen_columns = X.columns[selector.support_]
    print("Selected Features:", chosen_columns)
    return chosen_columns


def print_results(y_test, key, clf, X_train,y_train,X_test, case):
    """Fit ``clf`` on the training data and report F1 and AUROC on both splits.

    Parameters
    ----------
    y_test : array-like
        True labels of the test split.
    key : str
        Not used by this function; kept so existing call sites keep working.
    clf : estimator object
        Object exposing ``fit`` and ``predict``. It is fitted in place.
    X_train, y_train : array-like
        Training features and labels.
    X_test : array-like
        Test features.
    case : str
        Label printed in front of each result line.

    Returns
    -------
    tuple
        ``(auroc_train, auroc, clf, y_pred)``: AUROC on the training split,
        AUROC on the test split, the fitted estimator, and the test
        predictions.

    NOTE(review): both AUROC values are computed from the hard class
    predictions returned by ``predict``, not from scores or probabilities.
    """
    clf = clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    y_pred_train = clf.predict(X_train)

    # The printed value is labelled "F1", so compute the F1 score rather
    # than plain accuracy.
    f1_train = metrics.f1_score(y_train, y_pred_train)
    auroc_train = metrics.roc_auc_score(y_train, y_pred_train)
    print(case, " | Train Set - F1: {0:2f}, AUROC: {1:2f} ".format(f1_train, auroc_train))

    f1_test = metrics.f1_score(y_test, y_pred)
    auroc = metrics.roc_auc_score(y_test, y_pred)
    print(case, " -| Test Set F1: {0:2f}, AUROC: {1:2f} ".format(f1_test, auroc))

    return auroc_train, auroc, clf, y_pred

# 1.1 Pre-processing the data 

In [None]:

# Build the train/test splits once for all experiments in this notebook.
X_train, X_test, y_train, y_test = pre_process_data()

# Optional: cache the splits on disk ...
# X_train.to_pickle("./X_train.pkl")
# X_test.to_pickle("./X_test.pkl")
# y_train.to_pickle("./y_train.pkl")
# y_test.to_pickle("./y_test.pkl")

# ... or load previously saved splits instead of recomputing them.
# X_train = pd.read_pickle("./X_train_out.pkl")
# y_train = pd.read_pickle("./y_train_out.pkl")
# X_test = pd.read_pickle("./X_test_out.pkl")
# y_test = pd.read_pickle("./y_test_out.pkl")

summary_rule = "--------------------------------------------------------------"
print(summary_rule)

# Size and positive-class share of the training split.
n_train_rows, n_train_cols = X_train.shape[0], X_train.shape[1]
n_train_positive = np.sum(y_train == 1)
print('Train Num Samples:', n_train_rows, 'Num Features:', n_train_cols, 'Num intubation', n_train_positive)
print("Percentage of intubation:", np.round(n_train_positive / n_train_rows * 100, 3), "%")

# Same summary for the test split.
n_test_rows, n_test_cols = X_test.shape[0], X_test.shape[1]
n_test_positive = np.sum(y_test == 1)
print('Test Num Samples:', n_test_rows, 'Num Features:', n_test_cols, 'Num intubation', n_test_positive)
print("Percentage of intubation:", np.round(n_test_positive / n_test_rows * 100, 3), "%")

print(summary_rule)



# Model Selection with Parameter Tuning

In [None]:

# Baseline estimators, keyed by the display name used in logs, plots and
# output file names.
models_ = {
    'Logistic Regression': linear_model.LogisticRegression(solver="liblinear"),
    'Decision Tree': DecisionTreeClassifier(criterion='entropy'),
    'Random Forest': RandomForestClassifier(),
    'Adaboost': AdaBoostClassifier(),
    'Gradient Boost': GradientBoostingClassifier(),
    'SVM': svm.SVC(C=1.0, kernel='linear'),
}

# Parameter tuning grids-------------------------
LR_params = [{
    'C': [1, 10, 100],
    'penalty': ['l1', 'l2'],
}]
DT_params = [{
    'max_depth': [3, 5, 7],
}]
RF_params = [{
    'n_estimators': [50, 100, 500],
    'max_depth': [3, 5, 7],
    'max_samples': [1000, 4000, 8235],
}]
Gradient_params = [{
    'learning_rate': [0.001, 0.01, 0.1],
    'n_estimators': [10, 50, 100, 500],
    'subsample': [0.5, 0.7, 1.0],
    'max_depth': [3, 5, 7],
}]
Ada_params = [{
    'learning_rate': [0.001, 0.01, 0.1],
    'n_estimators': [10, 50, 100, 500],
    # Class-balanced trees of increasing depth as candidate weak learners.
    'base_estimator': [
        DecisionTreeClassifier(max_depth=tree_depth, class_weight='balanced')
        for tree_depth in (3, 5, 7)
    ],
}]
SVM_params = [{
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [100, 10, 1.0, 0.1, 0.001],
}]

# Grid for each model, keyed identically to models_.
grid_params = {
    'Logistic Regression': LR_params,
    'Decision Tree': DT_params,
    'Random Forest': RF_params,
    'Adaboost': Ada_params,
    'Gradient Boost': Gradient_params,
    'SVM': SVM_params,
}



In [None]:
# Runs three experiments per model - baseline, baseline + genetic feature
# selection, and feature selection + grid search - plotting one ROC panel per
# experiment type and collecting the metrics into Obtained_results.csv.
fig, axs = plt.subplots(1, 3, figsize=(24, 6))

experiment_name_list = []
auroc_train_list = []
auroc_test = []
runtime_list = []
feats_list = []


def _plot_and_record(case, ax, y_pred, auroc_train, auroc, runtime):
    """Draw one experiment's ROC curve on ``ax`` and append its metrics to the result lists."""
    # NOTE(review): y_pred holds the hard class predictions returned by
    # print_results, so the curve is drawn from labels rather than scores.
    RocCurveDisplay.from_predictions(
        y_true=y_test,
        y_pred=y_pred,
        ax=ax,
        label=case + '| (auroc =' + str(np.round(auroc, 3)) + ')',
    )
    experiment_name_list.append(case)
    auroc_train_list.append(auroc_train)
    auroc_test.append(auroc)
    runtime_list.append(runtime)


for key in models_:
    print("------------------------------------------------------")
    clf = models_[key]

    # Experiment 1: baseline model on all features.
    start_time = time.time()
    case = key
    auroc_train, auroc, _, y_pred = print_results(
        y_test, key, clf, X_train, y_train, X_test, case)
    runtime = time.time() - start_time
    _plot_and_record(case, axs[0], y_pred, auroc_train, auroc, runtime)

    # Experiment 2: same model on the genetically selected features.
    # The experiment label (not the bare model name) is passed as `case` so
    # the printed lines identify which experiment they belong to.
    start_time = time.time()
    case = key + " + Feature Selection"
    sel_feats = get_feats(X_train, y_train, clf)
    auroc_train, auroc, _, y_pred = print_results(
        y_test, key, clf, X_train[sel_feats], y_train, X_test[sel_feats], case)
    runtime = time.time() - start_time
    _plot_and_record(case, axs[1], y_pred, auroc_train, auroc, runtime)

    # Experiment 3: grid search over the model's parameter grid, on the
    # selected features.
    start_time = time.time()
    case = key + " + FS + Grid search"
    gsearch = GridSearchCV(clf, grid_params[key], cv=5, scoring='roc_auc', n_jobs=10)
    auroc_train, auroc, gsearch, y_pred = print_results(
        y_test, key, gsearch, X_train[sel_feats], y_train, X_test[sel_feats], case)
    runtime = time.time() - start_time
    _plot_and_record(case, axs[2], y_pred, auroc_train, auroc, runtime)
    print(gsearch.best_params_)

    # Persist the full cross-validation table, best-ranked candidates first.
    results_df = pd.DataFrame(gsearch.cv_results_)
    results_df = results_df.sort_values(by=["rank_test_score"])
    save_file_path = key + "_results.csv"
    results_df.to_csv(save_file_path)

    # One entry per experiment, in the same order as experiment_name_list.
    feats_list.append('None')
    feats_list.append(sel_feats.tolist())
    feats_list.append(sel_feats.tolist() + [gsearch.best_params_])

axs[0].set_title('Baseline')
axs[1].set_title('Baseline with Feature Selection')
axs[2].set_title('Baseline with Feature Selection and Optimisation')
plt.show()

results = {
    'Experiment': experiment_name_list,
    'auroc train': auroc_train_list,
    'auroc': auroc_test,
    'runtime': runtime_list,
    'selected feaats': feats_list,
}
df = pd.DataFrame(data=results)
df.to_csv("Obtained_results.csv")