# Model Selection

In this notebook we will test different models to find the one that gives us the best results.

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

def f1_scores(Z, y_test):
    averages = ['macro', 'micro', 'weighted']
    for avg in averages:
        score = f1_score(Z, y_test, average=avg)
        print("f1 score ({}): {}".format(avg, score))

def test_model(X, y, model_name, model):
    print("MODEL: {}".format(model_name))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model.fit(X_train, y_train)
    Z = model.predict(X_test)
    f1_scores(Z, y_test)
    
def testModelKFold(X, y, model_name, model, k):
    print("CROSS VALIDATION FOR: {}".format(model_name))
    skf = StratifiedKFold(n_splits=5)
    skf.get_n_splits(X, y)
    f1_scores = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        Z = model.predict(X_test)
        f1_scores.append(f1_score(Z, y_test))
    print("f1 scores: {}".format(f1_scores))
    print("MEAN: {}".format(np.mean(f1_scores)))
    return np.mean(f1_scores)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

models = {
    "Logistic Regression": LogisticRegression(),
    "SVC": SVC(),
    "Random Forest": RandomForestClassifier(n_estimators=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors = 3),
    "GaussianNB": GaussianNB(),
    "Perceptron": Perceptron(),
    "SGDClassifier": SGDClassifier(),
    "Decision Tree": DecisionTreeClassifier()   
}

dataset_path = "data-all.csv"
data = pd.read_csv(dataset_path, sep=";")
X = data.drop("diagnosis", axis=1).values
y = data["diagnosis"]

for model in models:
    test_model(X, y, model, models[model])
    testModelKFold(X, y, model, models[model], k=5)
    print('')

MODEL: Logistic Regression
f1 score (macro): 0.8130113705517454
f1 score (micro): 0.8479743281187325
f1 score (weighted): 0.8515095436797728
CROSS VALIDATION FOR: Logistic Regression
f1 scores: [0.89928057553956831, 0.87277556440903059, 0.88527349228611507, 0.90554298642533926, 0.86995768688293373]
MEAN: 0.8865660611085975

MODEL: SVC
f1 score (macro): 0.8272403060758199
f1 score (micro): 0.8616125150421179
f1 score (weighted): 0.8696182723463185
CROSS VALIDATION FOR: SVC
f1 scores: [0.92620137299771166, 0.87811271297509819, 0.89311939080772362, 0.89950027762354245, 0.88787061994609162]
MEAN: 0.8969608748700335

MODEL: Random Forest
f1 score (macro): 0.841896382804318
f1 score (micro): 0.8712394705174489
f1 score (weighted): 0.8748732048431905
CROSS VALIDATION FOR: Random Forest
f1 scores: [0.90597279716144297, 0.89113785557986869, 0.89020270270270285, 0.87242306543172987, 0.87429854096520754]
MEAN: 0.8868069923681903

MODEL: K-Nearest Neighbors
f1 score (macro): 0.80041075844836
f1 sc

We can check the confusion matrix for the Random Forest model.

In [43]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

model = RandomForestClassifier(n_estimators=1000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model.fit(X_train, y_train)
Z = model.predict(X_test)

print("Confusion Matrix")
print(confusion_matrix(y_test, Z))

print("\nClassfication Report")
print(classification_report(y_test, Z))

Confusion Matrix
[[ 530  214]
 [ 110 1639]]

Classfication Report
             precision    recall  f1-score   support

        0.0       0.83      0.71      0.77       744
        1.0       0.88      0.94      0.91      1749

avg / total       0.87      0.87      0.87      2493



Let us compare the performances of the random forest for different numbers of estimators and features

In [47]:
res = np.zeros([3,5])
for i_f, n_f in enumerate([1,5,11]):
    for i_e, n_e in enumerate([10,33,100,333,1000]):
        model = RandomForestClassifier(n_estimators=n_e, max_features=n_f)
        model_name = "Random Forest with %i features and %i estimators" % (n_f, n_e)
        f1 = testModelKFold(X, y, model_name, model, k=5)
        res[i_f, i_e] = f1

CROSS VALIDATION FOR: Random Forest with 1 features and 10 estimators
f1 scores: [0.88888888888888895, 0.88057633693543913, 0.88054998567745635, 0.85068702290076348, 0.86346863468634683]
MEAN: 0.872834173817779
CROSS VALIDATION FOR: Random Forest with 1 features and 33 estimators
f1 scores: [0.90029673590504455, 0.88491189427312766, 0.88068181818181823, 0.85826060788444181, 0.86544428772919613]
MEAN: 0.8779190687947256
CROSS VALIDATION FOR: Random Forest with 1 features and 100 estimators
f1 scores: [0.89994044073853496, 0.88619454395150177, 0.88373408769448381, 0.86180828545509514, 0.86753100338218714]
MEAN: 0.8798416722443605
CROSS VALIDATION FOR: Random Forest with 1 features and 333 estimators
f1 scores: [0.89988081048867707, 0.88888888888888884, 0.88316831683168318, 0.863855421686747, 0.86802030456852786]
MEAN: 0.8807627484929048
CROSS VALIDATION FOR: Random Forest with 1 features and 1000 estimators
f1 scores: [0.9009223445403155, 0.88895012400110229, 0.88493073225897656, 0.86616

In [48]:
res

array([[ 0.87283417,  0.87791907,  0.87984167,  0.88076275,  0.88173392],
       [ 0.87697497,  0.89174986,  0.88981282,  0.89248256,  0.892589  ],
       [ 0.8765871 ,  0.88703356,  0.88987962,  0.8916303 ,  0.89112902]])

## Save model

In [10]:
import dill as pickle

def dump_model(model, path):
    with open(path, 'wb') as file:
        pickle.dump(model, file)
        
# Define path to save the model file
filename = "model_v2.pk"
dir_path = "./"

# Dump model to file
dump_model(model, dir_path + filename)