In [4]:
#library imports
import pandas as pd
import numpy as np

import geopandas as gpd
import matplotlib.pyplot as plt
from fuzzywuzzy import process

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV



### 1.1 - Data Visualization

### 1.2 - Data Preprocessing

### 1.3 - Feature Selection

In [2]:
# Load the raw case workbook for the selected mode.
path = "../p2/"
filetype = ".xlsx"

# Per mode: 'in' is the base name of the workbook to read,
# 'out' is the base name used when the mapped frame is written to CSV.
files = {
    'test': {'in': "cases_2021_test_processed_unlabelled_2", 'out': "Testing_Data"},
    'train': {'in': "cases_2021_train_processed_2", 'out': "Training_Data"},
}

# Selects which entry of `files` is processed ('train' or 'test').
mode = 'train'

filename = files[mode]['in']
data = pd.read_excel(f"{path}{filename}{filetype}")

### 1.4 - Feature Mapping

In [3]:
# Keep a shallow copy of the full frame under `regionWork`, then rebuild `data`
# without the raw case-count columns.
regionWork = data.copy(deep=False)
case_count_columns = ['Confirmed', 'Deaths', 'Recovered', 'Active']
keep_mask = ~data.columns.isin(case_count_columns)
data = data.loc[:, keep_mask]

def genderMapping(val):
    """Encode a sex label as an integer: 1 for 'male', 0 for anything else."""
    return int(val == 'male')

def diseaseMapping(val):
    """Encode a flag as an integer: 1 when `val` is truthy, 0 otherwise."""
    return int(bool(val))

def outcomeMapping(val):
    """Encode an outcome label as a class id.

    'deceased' -> 0, 'hospitalized' -> 1, every other value -> 2.
    """
    if val == 'deceased':
        return 0
    if val == 'hospitalized':
        return 1
    return 2
        
# Encode the categorical columns element-wise with the mapping helpers.
for column, mapper in (('sex', genderMapping), ('chronic_disease_binary', diseaseMapping)):
    data[column] = np.vectorize(mapper)(data[column])

# The label column is only encoded when the frame has one.
if 'outcome_group' in data.columns:
    data['outcome_group'] = np.vectorize(outcomeMapping)(data['outcome_group'])

# Turn the confirmation date into whole days since the earliest date in the frame,
# then divide by 149.
# NOTE(review): 149 presumably is the day span of the training data - confirm.
elapsed = data['date_confirmation'] - data['date_confirmation'].min()
data['date_confirmation'] = elapsed.dt.days.astype(float) / 149

In [4]:
# Average Incident_Rate and Case_Fatality_Ratio per (province, country) first,
# then average those province-level means per country.
# NaN is replaced by '' beforehand - presumably so rows with a missing province
# are not dropped by the groupby; TODO confirm.
regionWork = (
    regionWork
    .replace(np.nan, '')
    .pipe(lambda frame: frame.loc[:, frame.columns.isin(['country', 'province', 'Incident_Rate', 'Case_Fatality_Ratio'])])
    .groupby(['province', 'country'], as_index=False)
    .mean()
    .pipe(lambda frame: frame.loc[:, frame.columns.isin(['country', 'Incident_Rate', 'Case_Fatality_Ratio'])])
    .groupby(['country'])
    .mean()
)

# Lookup tables keyed by country: {country: {'Incident_Rate': value}} and
# {country: {'Case_Fatality_Ratio': value}}.
rates = regionWork.loc[:, regionWork.columns.isin(['Incident_Rate'])].to_dict('index')
ratios = regionWork.loc[:, regionWork.columns.isin(['Case_Fatality_Ratio'])].to_dict('index')


def getRates(c):
    """Return the country-level mean Incident_Rate stored in `rates` for country `c`."""
    per_country = rates[c]
    return per_country['Incident_Rate']


def getRatios(c):
    """Return the country-level mean Case_Fatality_Ratio stored in `ratios` for country `c`."""
    per_country = ratios[c]
    return per_country['Case_Fatality_Ratio']


# Attach the country-level averages to every row, then drop the location name columns.
data['average_rate'] = data['country'].apply(getRates)
data['average_ratio'] = data['country'].apply(getRatios)
location_mask = data.columns.isin(['country', 'province'])
data = data.loc[:, ~location_mask]

In [5]:
# Write the mapped frame to "<out base name>.csv" in the working directory, without the index.
data.to_csv(f"{files[mode]['out']}.csv", index=False)

### 1.5 - Class Balancing

#### Over sampling

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

# Load the mapped training data.
data = pd.read_csv("Training_Data.csv")

# Split into the target column and everything else.
labels = data['outcome_group']
features = data.drop(columns=['outcome_group'])

# Oversample with SMOTE (seeded for repeatable output).
sm = SMOTE(random_state=42)
features_resampled, labels_resampled = sm.fit_resample(features, labels)

# Put the resampled features and labels back into a single frame.
resampled_feature_frame = pd.DataFrame(features_resampled, columns=features.columns)
resampled_label_frame = pd.DataFrame(labels_resampled, columns=['outcome_group'])
data_resampled = pd.concat([resampled_feature_frame, resampled_label_frame], axis=1)

# Persist the oversampled dataset.
data_resampled.to_csv("oversampled_data.csv", index=False)

print("Oversampled dataset saved successfully.")

#### Hybrid under and over sampling

In [None]:
# Resample with SMOTEENN, a hybrid of over- and under-sampling (seeded for repeatable output).
# Reuses `features` and `labels` from the oversampling cell.
sme = SMOTEENN(random_state=42)
features_resampled, labels_resampled = sme.fit_resample(features, labels)

# Put the resampled features and labels back into a single frame.
resampled_feature_frame = pd.DataFrame(features_resampled, columns=features.columns)
resampled_label_frame = pd.DataFrame(labels_resampled, columns=['outcome_group'])
data_resampled = pd.concat([resampled_feature_frame, resampled_label_frame], axis=1)

# Persist the hybrid-sampled dataset.
data_resampled.to_csv("Hybrid_data.csv", index=False)

print("Hybrid dataset saved successfully.")

### 1.6 - Building models and hyperparameter tuning

#### XGBoost

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier

In [None]:
# CSV location of each prepared dataset variant.
# NOTE(review): these paths point into sub-folders, while the cells that write
# Training_Data.csv / Hybrid_data.csv save into the working directory - confirm
# the files were moved there.
files = {
    'under': './Undersampled_data/Undersampled_data.csv',
    'hybrid': './Hybrid_data/Hybrid_data.csv',
    'over': './Oversampled_data/Oversampled_Data.csv',
    'normal': './Mapped_data/Training_Data.csv',
    # 'verify': './p2/cases_2021_train_processed_2.xlsx',
    'unlabelled': './Mapped_data/Testing_Data.csv',
}

# Keys of `files` used for fitting, evaluation and final prediction.
target = 'hybrid'
test = 'normal'
unlabelled = 'unlabelled'

data = pd.read_csv(files[target])
# `test` is rebound here from the key string to the loaded frame.
test = pd.read_csv(files[test])
unlabelled_data = pd.read_csv(files[unlabelled])

data

In [None]:
# Separate features and target for the fitting frame and for the evaluation frame.
target_column = 'outcome_group'
X, y = data.drop(columns=target_column), data[target_column]
xgb_X_test, xgb_y_test = test.drop(columns=target_column), test[target_column]

# Hold back 20% of the fitting frame as a verification split (seeded).
xgb_X_train, xgb_X_verify, xgb_Y_train, xgb_Y_verify = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=52,
)

#### Hyperparameter tuning using RandomizedSearchCV

In [None]:
#hybrid randomized
# Randomized hyperparameter search for XGBoost over the resampled data (X, y),
# followed by an evaluation of the best estimator on xgb_X_test / xgb_y_test.
xgb = XGBClassifier(objective='multi:softmax', nthread = 1)
params = {
        'learning_rate': [0.01, 0.02, 0.1, 0.2],
        'min_child_weight': [0.75, 1, 1.25],
        'gamma': [5.75, 6, 6.25],
        'subsample': [0.9, 0.95, 1.0],
        'colsample_bytree': [0.45, 0.5, 0.6],
        'max_depth': [2, 3, 4,5,6,7,8]
        }
folds = 5       # number of stratified CV folds
param_comb = 5  # number of parameter combinations sampled by the search

# Parameter sets recorded from earlier runs, kept for reference:
# {'subsample': 1.0, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 6, 'colsample_bytree': 0.6}
# {'subsample': 1.0, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 6, 'colsample_bytree': 0.5}
# {'subsample': 0.95, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 6.25, 'colsample_bytree': 0.6}

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc_ovo', n_jobs=4, cv=skf.split(X,y), verbose=3, random_state=1001)

random_search.fit(X, y)

print(random_search.cv_results_)
print(random_search.best_estimator_)
# 2 * AUC - 1 rescales the best AUC so that 0.5 maps to 0 and 1.0 maps to 1.
print(random_search.best_score_ * 2 - 1)
print(random_search.best_params_)

# Evaluate on the frame prepared for this purpose. The names X_test / y_test
# are not defined anywhere in this notebook; the evaluation split is
# xgb_X_test / xgb_y_test.
y_pred = random_search.best_estimator_.predict(xgb_X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(xgb_y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

macF1 = f1_score(xgb_y_test, predictions, average='macro')
print("Macro-Average_F1:", macF1)

# Class id -> outcome label, used to report one F1 score per class.
classes = {
    0:"deceased",
    1:"hospitalized",
    2:"non-hospitalized"
}

for class_id, class_name in classes.items():
    macDecF1 = f1_score(xgb_y_test, predictions, labels=[class_id], average='macro')
    print("Macro-Average", class_name, "F1:", macDecF1)

Overfitting detection was then done and below is the optimal classifier\
The code for overfitting detection is in 1.7

In [None]:
# Chosen XGBoost configuration, scored with stratified 5-fold cross-validation on
# the training split and then on the held-back verification split.
xgb = XGBClassifier(n_estimators=200, objective='multi:softmax', max_depth=19, min_child_weight=0.2)

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

xgb_train_accuracy_scores = []
xgb_train_f1_scores = []
xgb_train_deceased_f1 = []

for tIndex, vIndex in folds.split(xgb_X_train, xgb_Y_train):
    X_train2, X_val = xgb_X_train.iloc[tIndex], xgb_X_train.iloc[vIndex]
    y_train2, y_val = xgb_Y_train.iloc[tIndex], xgb_Y_train.iloc[vIndex]

    # Fit on this fold's training part only. Fitting on the whole training
    # split would put the validation rows into the training data and inflate
    # the fold scores.
    xgb.fit(X_train2, y_train2)

    # Make predictions on the validation part of the fold
    y_pred = xgb.predict(X_val)

    # Calculate the scores (label 0 is the 'deceased' class)
    train_accuracy = accuracy_score(y_val, y_pred)
    train_f1 = f1_score(y_val, y_pred, average='macro')
    train_deceased = f1_score(y_val, y_pred, labels=[0], average='macro')

    # Append scores to correct list
    xgb_train_accuracy_scores.append(train_accuracy)
    xgb_train_f1_scores.append(train_f1)
    xgb_train_deceased_f1.append(train_deceased)

# These are means of the per-fold validation scores.
print("Average Training accuracy:", np.mean(xgb_train_accuracy_scores))
print("Average Training F1:", np.mean(xgb_train_f1_scores))
print("Average Training Deceased F1:", np.mean(xgb_train_deceased_f1))

# Refit on the complete training split so that `xgb`, which is used below and by
# later cells, is trained on all of xgb_X_train rather than on the last fold.
xgb.fit(xgb_X_train, xgb_Y_train)

# Predict the verification data and calculate scores
xgb_y_pred = xgb.predict(xgb_X_verify)
xgb_test_accuracy = accuracy_score(xgb_Y_verify, xgb_y_pred)
xgb_test_f1 = f1_score(xgb_Y_verify, xgb_y_pred, average='macro')
xgb_test_deceased = f1_score(xgb_Y_verify, xgb_y_pred, labels=[0], average='macro')

print("Test Accuracy:", xgb_test_accuracy)
print("Test Macro F1:", xgb_test_f1)
print("Test Deceased F1:", xgb_test_deceased)

#### Random forest


In [7]:
# Tools setup
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt

In [8]:
# Load the hybrid-sampled dataset used for the random forest.
rf_hybrid_data = pd.read_csv("Hybrid_data.csv")
#rf_unbalanced_data = pd.read_csv("Training_Data.csv")

# Target column, and every remaining column as the feature matrix.
rf_class_label = rf_hybrid_data['outcome_group']
rf_X = rf_hybrid_data.drop('outcome_group', axis=1)
rf_X

Unnamed: 0,age,sex,latitude,longitude,date_confirmation,chronic_disease_binary,Incident_Rate,Case_Fatality_Ratio,average_rate,average_ratio
0,65,0,19.420820,76.050130,0.590604,0,2284.297169,1.942744,1296.647984,1.233688
1,73,1,14.584244,121.176289,0.724832,0,681.949809,1.779368,681.949809,1.779368
2,25,0,14.470810,121.427050,0.859060,0,681.949809,1.779368,681.949809,1.779368
3,54,1,14.580000,121.030000,0.583893,0,681.949809,1.779368,681.949809,1.779368
4,60,1,13.896130,121.046370,0.590604,0,681.949809,1.779368,681.949809,1.779368
...,...,...,...,...,...,...,...,...,...,...
34179,58,1,14.562220,121.030000,0.697987,0,681.949809,1.779368,681.949809,1.779368
34180,39,1,7.070000,125.600000,0.757163,0,681.949809,1.779368,681.949809,1.779368
34181,56,1,14.595800,120.977200,0.588851,0,681.949809,1.779368,681.949809,1.779368
34182,54,1,14.542117,121.035255,0.662666,0,681.949809,1.779368,681.949809,1.779368


In [9]:
# Hold back 20% of the data (seeded); the remaining training part is split
# again with k-fold later.
rf_X_train, rf_X_test, rf_y_train, rf_y_test = train_test_split(
    rf_X,
    rf_class_label,
    test_size=0.2,
    random_state=42,
)

# Candidate hyperparameter values explored by the grid search.
rf_param_grid = dict(
    n_estimators=[50, 100, 150, 200, 250, 300],
    max_depth=[10, 15, 20, 25, 30, 35, 40, 45],
    min_samples_split=[10, 20, 30, 40, 50, 100, 170, 340],
)

# Stratified, shuffled 5-fold splitter shared by the training and verification loops.
rf_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
####### WARNING GRID SEARCH WILL TAKE CLOSE TO 4 HOURS TO RUN ##########
# Exhaustive search over rf_param_grid, scored by macro F1 on the rf_folds splits.
rfc = RandomForestClassifier(random_state=0)

rf_grid = GridSearchCV(
    estimator=rfc,
    param_grid=rf_param_grid,
    scoring='f1_macro',
    cv=rf_folds,
    return_train_score=True,
)

# Evaluate every parameter combination.
# NOTE(review): the search is fitted on the full rf_X / rf_class_label, which
# includes the rows held back as rf_X_test - confirm this is intended.
rf_grid.fit(rf_X, rf_class_label)

print("Best parameters:", rf_grid.best_params_, sep="\n")
print("Best score:", rf_grid.best_score_, sep="\n")

####### WARNING GRID SEARCH WILL TAKE CLOSE TO 4 HOURS TO RUN ##########


The expected best parameters are {'max_depth': 40, 'min_samples_split': 10, 'n_estimators': 300}\
The expected best score is 0.9588014236895601

Overfitting detection was then done and below is the optimal classifier\
The code for overfitting detection is in 1.7

In [11]:
# Random forest with the chosen hyperparameters, scored fold by fold on the
# training split and then on the held-back split.
rf_clf = RandomForestClassifier(
    n_estimators=300,
    min_samples_split=10,
    max_depth=19,
    random_state=0,
)
rf_train_accuracy_scores, rf_train_f1_scores, rf_train_deceased_f1 = [], [], []

for fit_rows, held_rows in rf_folds.split(rf_X_train, rf_y_train):
    # Fit on the fold's training rows, predict its validation rows.
    rf_clf.fit(rf_X_train.iloc[fit_rows], rf_y_train.iloc[fit_rows])
    fold_truth = rf_y_train.iloc[held_rows]
    fold_pred = rf_clf.predict(rf_X_train.iloc[held_rows])

    # Accuracy, macro F1, and F1 restricted to label 0 (the 'deceased' class).
    rf_train_accuracy_scores.append(accuracy_score(fold_truth, fold_pred))
    rf_train_f1_scores.append(f1_score(fold_truth, fold_pred, average='macro'))
    rf_train_deceased_f1.append(f1_score(fold_truth, fold_pred, labels=[0], average='macro'))

# Report the mean of each score across the folds.
for caption, fold_scores in (
    ("Average Training accuracy:", rf_train_accuracy_scores),
    ("Average Training F1:", rf_train_f1_scores),
    ("Average Training Deceased F1:", rf_train_deceased_f1),
):
    print(caption, np.mean(fold_scores))

# Score the held-back split with the classifier as left by the last fold's fit.
rf_y_pred = rf_clf.predict(rf_X_test)
rf_test_accuracy = accuracy_score(rf_y_test, rf_y_pred)
rf_test_f1 = f1_score(rf_y_test, rf_y_pred, average='macro')
rf_test_deceased = f1_score(rf_y_test, rf_y_pred, labels=[0], average='macro')

for caption, value in (
    ("Test Accuracy:", rf_test_accuracy),
    ("Test Macro F1:", rf_test_f1),
    ("Test Deceased F1:", rf_test_deceased),
):
    print(caption, value)

Average Training accuracy: 0.9489524904037815
Average Training F1: 0.947946434525966
Average Training Deceased F1: 0.920506507053417
Test Accuracy: 0.9474915898786017
Test Macro F1: 0.9464534373046676
Test Deceased F1: 0.9171348314606742


### 1.7 - Overfitting

#### Random Forest Overfitting detection

Detect overfitting for max_depth

In [None]:
# Sweep max_depth from 10 to 30 and record, per value, the mean fold scores and
# the scores on the held-back split.
rf_total_train_accuracy, rf_total_train_f1, rf_total_train_deceased = [], [], []
rf_total_test_accuracy, rf_total_test_f1, rf_total_test_deceased = [], [], []

for depth in range(10, 31):
    clf = RandomForestClassifier(n_estimators=300, min_samples_split=10, max_depth=depth, random_state=0)

    # Per-fold scores for this max_depth value.
    rf_train_accuracy_scores, rf_train_f1_scores, rf_train_deceased_f1 = [], [], []

    for fit_rows, held_rows in rf_folds.split(rf_X_train, rf_y_train):
        # Fit on the fold's training rows, predict its validation rows.
        clf.fit(rf_X_train.iloc[fit_rows], rf_y_train.iloc[fit_rows])
        fold_truth = rf_y_train.iloc[held_rows]
        fold_pred = clf.predict(rf_X_train.iloc[held_rows])

        # Accuracy, macro F1, and F1 restricted to label 0 (the 'deceased' class).
        rf_train_accuracy_scores.append(accuracy_score(fold_truth, fold_pred))
        rf_train_f1_scores.append(f1_score(fold_truth, fold_pred, average='macro'))
        rf_train_deceased_f1.append(f1_score(fold_truth, fold_pred, labels=[0], average='macro'))

    # Mean across the folds for this value.
    rf_total_train_accuracy.append(np.mean(rf_train_accuracy_scores))
    rf_total_train_f1.append(np.mean(rf_train_f1_scores))
    rf_total_train_deceased.append(np.mean(rf_train_deceased_f1))

    # Scores on the held-back split, using the classifier as left by the last fold's fit.
    rf_y_pred = clf.predict(rf_X_test)
    rf_test_accuracy = accuracy_score(rf_y_test, rf_y_pred)
    rf_test_f1 = f1_score(rf_y_test, rf_y_pred, average='macro')
    rf_test_deceased = f1_score(rf_y_test, rf_y_pred, labels=[0], average='macro')

    rf_total_test_accuracy.append(rf_test_accuracy)
    rf_total_test_f1.append(rf_test_f1)
    rf_total_test_deceased.append(rf_test_deceased)

In [None]:
# Create plots with respect to change in max_depth to find signs of overfitting

# The x values must be exactly the max_depth values swept by the loop that
# filled the score lists (range(10, 31)); a list of a different length makes
# ax.plot fail with a shape-mismatch error.
hyperParam_values = list(range(10, 31))
assert len(hyperParam_values) == len(rf_total_train_accuracy), (
    "hyperParam_values does not match the number of recorded scores; "
    "re-run the max_depth sweep or update the range"
)

# For each metric: the figure title plus the training and testing score series.
dataLists = {
    "accuracyScores":{
        "title":"Training accuracy vs Testing accuracy",
        "training":rf_total_train_accuracy,
        "testing":rf_total_test_accuracy
    },
    "macroF1":{
        "title":"Training macroF1 vs Testing macroF1",
        "training":rf_total_train_f1,
        "testing":rf_total_test_f1
    },
    "deceased_F1":{
        "title":"Training deceased F1 vs Testing deceased F1",
        "training":rf_total_train_deceased,
        "testing":rf_total_test_deceased
    }
}

# Plot graphs for accuracy, macro f1, and deceased f1
for i in dataLists.keys():
    fig, ax = plt.subplots()

    ax.plot(hyperParam_values, dataLists[i]['training'], label='Training')
    ax.plot(hyperParam_values, dataLists[i]['testing'], label='Testing')

    ax.set_title(dataLists[i]['title'])
    # Label the axes so the figure identifies the swept hyperparameter.
    ax.set_xlabel("max_depth")
    ax.set_ylabel("Score")
    ax.grid(True)
    ax.legend()

    plt.show()

Detect overfitting for min_samples_split

In [None]:
# Sweep min_samples_split from 10 to 200 in steps of 10 and record, per value,
# the mean fold scores and the scores on the held-back split.
rf_total_train_accuracy, rf_total_train_f1, rf_total_train_deceased = [], [], []
rf_total_test_accuracy, rf_total_test_f1, rf_total_test_deceased = [], [], []

for split_size in range(10, 201, 10):
    clf = RandomForestClassifier(n_estimators=300, min_samples_split=split_size, max_depth=19, random_state=0)

    # Per-fold scores for this min_samples_split value.
    rf_train_accuracy_scores, rf_train_f1_scores, rf_train_deceased_f1 = [], [], []

    for fit_rows, held_rows in rf_folds.split(rf_X_train, rf_y_train):
        # Fit on the fold's training rows, predict its validation rows.
        clf.fit(rf_X_train.iloc[fit_rows], rf_y_train.iloc[fit_rows])
        fold_truth = rf_y_train.iloc[held_rows]
        fold_pred = clf.predict(rf_X_train.iloc[held_rows])

        # Accuracy, macro F1, and F1 restricted to label 0 (the 'deceased' class).
        rf_train_accuracy_scores.append(accuracy_score(fold_truth, fold_pred))
        rf_train_f1_scores.append(f1_score(fold_truth, fold_pred, average='macro'))
        rf_train_deceased_f1.append(f1_score(fold_truth, fold_pred, labels=[0], average='macro'))

    # Mean across the folds for this value.
    rf_total_train_accuracy.append(np.mean(rf_train_accuracy_scores))
    rf_total_train_f1.append(np.mean(rf_train_f1_scores))
    rf_total_train_deceased.append(np.mean(rf_train_deceased_f1))

    # Scores on the held-back split, using the classifier as left by the last fold's fit.
    rf_y_pred = clf.predict(rf_X_test)
    rf_test_accuracy = accuracy_score(rf_y_test, rf_y_pred)
    rf_test_f1 = f1_score(rf_y_test, rf_y_pred, average='macro')
    rf_test_deceased = f1_score(rf_y_test, rf_y_pred, labels=[0], average='macro')

    rf_total_test_accuracy.append(rf_test_accuracy)
    rf_total_test_f1.append(rf_test_f1)
    rf_total_test_deceased.append(rf_test_deceased)

In [None]:
# Create plots with respect to change in min_samples_split to find signs of overfitting

# Same values as the min_samples_split sweep: 10, 20, ..., 200.
hyperParam_values = list(range(10, 201, 10))

# For each metric: the figure title plus the training and testing score series.
dataLists = {
    "accuracyScores":{
        "title":"Training accuracy vs Testing accuracy",
        "training":rf_total_train_accuracy,
        "testing":rf_total_test_accuracy
    },
    "macroF1":{
        "title":"Training macroF1 vs Testing macroF1",
        "training":rf_total_train_f1,
        "testing":rf_total_test_f1
    },
    "deceased_F1":{
        "title":"Training deceased F1 vs Testing deceased F1",
        "training":rf_total_train_deceased,
        "testing":rf_total_test_deceased
    }
}

# Plot graphs for accuracy, macro f1, and deceased f1
for i in dataLists.keys():
    fig, ax = plt.subplots()

    ax.plot(hyperParam_values, dataLists[i]['training'], label='Training')
    ax.plot(hyperParam_values, dataLists[i]['testing'], label='Testing')

    ax.set_title(dataLists[i]['title'])
    # The titles do not name the swept hyperparameter, so label the axes.
    ax.set_xlabel("min_samples_split")
    ax.set_ylabel("Score")
    ax.grid(True)
    ax.legend()

    plt.show()

Detect overfitting for n_estimators

In [None]:
# Sweep n_estimators from 50 to 350 in steps of 50 and record, per value,
# the mean fold scores and the scores on the held-back split.
rf_total_train_accuracy, rf_total_train_f1, rf_total_train_deceased = [], [], []
rf_total_test_accuracy, rf_total_test_f1, rf_total_test_deceased = [], [], []

for tree_count in range(50, 351, 50):
    clf = RandomForestClassifier(n_estimators=tree_count, min_samples_split=10, max_depth=19, random_state=0)

    # Per-fold scores for this n_estimators value.
    rf_train_accuracy_scores, rf_train_f1_scores, rf_train_deceased_f1 = [], [], []

    for fit_rows, held_rows in rf_folds.split(rf_X_train, rf_y_train):
        # Fit on the fold's training rows, predict its validation rows.
        clf.fit(rf_X_train.iloc[fit_rows], rf_y_train.iloc[fit_rows])
        fold_truth = rf_y_train.iloc[held_rows]
        fold_pred = clf.predict(rf_X_train.iloc[held_rows])

        # Accuracy, macro F1, and F1 restricted to label 0 (the 'deceased' class).
        rf_train_accuracy_scores.append(accuracy_score(fold_truth, fold_pred))
        rf_train_f1_scores.append(f1_score(fold_truth, fold_pred, average='macro'))
        rf_train_deceased_f1.append(f1_score(fold_truth, fold_pred, labels=[0], average='macro'))

    # Mean across the folds for this value.
    rf_total_train_accuracy.append(np.mean(rf_train_accuracy_scores))
    rf_total_train_f1.append(np.mean(rf_train_f1_scores))
    rf_total_train_deceased.append(np.mean(rf_train_deceased_f1))

    # Scores on the held-back split, using the classifier as left by the last fold's fit.
    rf_y_pred = clf.predict(rf_X_test)
    rf_test_accuracy = accuracy_score(rf_y_test, rf_y_pred)
    rf_test_f1 = f1_score(rf_y_test, rf_y_pred, average='macro')
    rf_test_deceased = f1_score(rf_y_test, rf_y_pred, labels=[0], average='macro')

    rf_total_test_accuracy.append(rf_test_accuracy)
    rf_total_test_f1.append(rf_test_f1)
    rf_total_test_deceased.append(rf_test_deceased)

In [None]:
# Create plots with respect to change in n_estimators to find signs of overfitting

# Same values as the n_estimators sweep: 50, 100, ..., 350.
hyperParam_values = list(range(50, 351, 50))

# For each metric: the figure title plus the training and testing score series.
dataLists = {
    "accuracyScores":{
        "title":"Training accuracy vs Testing accuracy",
        "training":rf_total_train_accuracy,
        "testing":rf_total_test_accuracy
    },
    "macroF1":{
        "title":"Training macroF1 vs Testing macroF1",
        "training":rf_total_train_f1,
        "testing":rf_total_test_f1
    },
    "deceased_F1":{
        "title":"Training deceased F1 vs Testing deceased F1",
        "training":rf_total_train_deceased,
        "testing":rf_total_test_deceased
    }
}

# Plot graphs for accuracy, macro f1, and deceased f1
for i in dataLists.keys():
    fig, ax = plt.subplots()

    ax.plot(hyperParam_values, dataLists[i]['training'], label='Training')
    ax.plot(hyperParam_values, dataLists[i]['testing'], label='Testing')

    ax.set_title(dataLists[i]['title'])
    # The titles do not name the swept hyperparameter, so label the axes.
    ax.set_xlabel("n_estimators")
    ax.set_ylabel("Score")
    ax.grid(True)
    ax.legend()

    plt.show()

### 1.9 - Predictions

In [None]:
import csv

# Predict a class for every row of the unlabelled frame with the `xgb` model
# currently held in the kernel.
# NOTE(review): assumes unlabelled_data has the same feature columns as the
# frame the model was fitted on - confirm.
xgb_final_predictions = xgb.predict(unlabelled_data)

def create_submission_file(y_preds, file_name):
    """Write predictions to a two-column CSV submission file.

    The file starts with the header row ``"Id","Prediction"``; each following
    row holds the zero-based position of a prediction and the prediction
    itself, both converted with ``str`` and quoted.

    Args:
        y_preds: Iterable of predictions, written in iteration order.
        file_name: Path of the CSV file to create (overwritten if it exists).
    """
    # newline="" leaves line endings to the csv writer; without it, platforms
    # that translate "\n" on write end up with an extra "\r" on every row.
    with open(file_name, "w", newline="") as csvfile:
        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        wr.writerow(["Id", "Prediction"])
        wr.writerows([str(i), str(pred)] for i, pred in enumerate(y_preds))
# Write the XGBoost predictions to the submission CSV.
create_submission_file(
    y_preds=xgb_final_predictions,
    file_name="submission_XGBoost.csv",
)