In [1]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None 

from copy import deepcopy

from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# 1) Loading the merged data and dropping rows that contain null values

In [2]:
# Load the merged dataset from disk; the raw frame stays available as data_init
data_init = pd.read_csv("Data/merged_data.csv")

# Keep only complete rows: a row is discarded as soon as one of its values is missing
data = data_init.dropna(axis="index", how="any")


# 2) Creating mortality classes by discretizing the mortality rate into intervals


In [3]:
# Creating classes for mortality by discretizing the mortality_rate data
def discretize(x):
    """Return the mortality class label for a mortality rate.

    Intervals (upper bounds exclusive): below 80 -> "low", below 200 ->
    "average", below 325 -> "high", anything else -> "highest".

    A value that already is one of these four labels is returned unchanged,
    so re-running the cell on an already discretized column is harmless
    instead of failing on a str/int comparison.
    """
    if isinstance(x, str) and x in ("low", "average", "high", "highest"):
        return x
    if x < 80:
        return "low"
    if x < 200:
        return "average"
    if x < 325:
        return "high"
    return "highest"


# Build a new frame rather than writing through .loc: putting string labels into
# the existing numeric column in place is an incompatible-dtype assignment, which
# recent pandas versions deprecate.
data = data.assign(mortality_rate=data["mortality_rate"].map(discretize))


# 3) Splitting into X and y data, then into training and testing sets

In [4]:
# Data splitting into features (X) and target (y).
# Columns are addressed by position: columns 0 and 1 are left out of the model,
# column 2 is the target (presumably the discretized mortality_rate -- TODO confirm
# against the CSV layout) and every column from position 3 onwards is a feature.
# Selecting the target directly, instead of dropping an enumerated list of the
# other positions, keeps y a single column whatever the number of feature columns.
data_X = data.iloc[:, 3:]
data_y = data.iloc[:, [2]]

# Splitting into train and test sets.
# The seed is fixed so that a fresh "Restart & Run All" yields the same partition.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    data_X,
    np.ravel(data_y),  # estimators expect a 1-D target array
    test_size=.20,
    shuffle=True,
    random_state=SPLIT_SEED,
)


# 4) Model selection
## TODO: Play with parameters and find best parameters for each classifier


In [5]:
# Cross-validation and model selection

# Seed for the estimators that draw random numbers while fitting, so that
# re-running the notebook reproduces the same fitted models.
MODEL_SEED = 42

# Different classifiers for cross-validation
knn = KNeighborsClassifier(n_neighbors=10)
bayes = GaussianNB()
trees = DecisionTreeClassifier(random_state=MODEL_SEED)
svm = SVC()
network = MLPClassifier(random_state=MODEL_SEED)

# Creating list of classifiers
classifier_list = [knn, bayes, trees, svm, network]

# Best fitted estimator of each classifier, in the same order as classifier_list
best_model = []

# Iterating over classifiers
for classifier in classifier_list:
    scores = cross_validate(
        classifier,
        X_train,
        y_train,
        scoring="accuracy",
        return_train_score=True,
        return_estimator=True,
    )
    # Keep the estimator of the fold with the highest *validation* accuracy
    # ("test_score"). Ranking folds by "train_score" would favour the estimator
    # that memorised its training fold best, not the one that generalises best.
    best_fold = np.argmax(scores["test_score"])
    best_model.append(deepcopy(scores["estimator"][best_fold]))
    


# 5) Testing and results


In [6]:
# Accuracy of each selected model on the held-out test set, stored in the order of
# best_model: K-Neighbors, Bayesian Classifier, Decision Tree Classifier,
# Support Vector Machine Classifier and Multi Layer Perceptron Classifier.
test_scores = []

# Score the models obtained by cross-validation one after the other
for model in best_model:
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    test_scores.append(accuracy)

