# Model Selection

In this notebook we will test different models to find the one that gives us the best results.

In [19]:
import numpy as np
import pandas as pd

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

def f1_scores(Z, y_test):
    averages = ['macro', 'micro', 'weighted']
    for avg in averages:
        f1 = f1_score(Z, y_test, average=avg)
        print("f1 score ({}): {}".format(avg, f1))

def test_model(X, y, model_name, model):
    print("MODEL: {}".format(model_name))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model.fit(X_train, y_train)
    Z = model.predict(X_test)
    f1_scores(Z, y_test)
    
def testModelKFold(X, y, model_name, model, k):
    print("CROSS VALIDATION FOR: {}".format(model_name))
    skf = StratifiedKFold(n_splits=5)
    skf.get_n_splits(X, y)
    f1_scores = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        Z = model.predict(X_test)
        f1_scores.append(f1_score(Z, y_test))
    print("f1 scores: {}".format(f1_scores))
    print("MEAN: {}".format(np.mean(f1_scores)))
#     return np.mean(f1_scores)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

models = {
    "Logistic Regression": LogisticRegression(),
    "SVC": SVC(),
    "Random Forest": RandomForestClassifier(n_estimators=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors = 3),
    "GaussianNB": GaussianNB(),
    "Perceptron": Perceptron(),
    "SGDClassifier": SGDClassifier(),
    "Decision Tree": DecisionTreeClassifier()   
}

dataset_path = "data.csv"
data = pd.read_csv(dataset_path, sep=';')
data = data.sample(frac=1).reset_index(drop=True)

X = data.drop("diagnosis", axis=1).values
y = data["diagnosis"]

for model in models:
    test_model(X, y, model, models[model])
    testModelKFold(X, y, model, models[model], k=5)
    print('')

MODEL: Logistic Regression
f1 score (macro): 0.7932107452231291
f1 score (micro): 0.8109640831758034
f1 score (weighted): 0.8095896312052737
CROSS VALIDATION FOR: Logistic Regression
f1 scores: [0.734806629834254, 0.6997245179063362, 0.7566137566137566, 0.6904109589041096, 0.788659793814433]
MEAN: 0.7340431314145779

MODEL: SVC
f1 score (macro): 0.8772385256508588
f1 score (micro): 0.888468809073724
f1 score (weighted): 0.888398619802331
CROSS VALIDATION FOR: SVC
f1 scores: [0.8338028169014085, 0.7909604519774012, 0.8225352112676056, 0.7616279069767442, 0.8828337874659401]
MEAN: 0.8183520349178199

MODEL: Random Forest
f1 score (macro): 0.8759583017045749
f1 score (micro): 0.888468809073724
f1 score (weighted): 0.8879475379333428
CROSS VALIDATION FOR: Random Forest
f1 scores: [0.8619718309859156, 0.818941504178273, 0.8488063660477454, 0.8033240997229918, 0.8749999999999999]
MEAN: 0.841608760186985

MODEL: K-Nearest Neighbors
f1 score (macro): 0.8297064125676024
f1 score (micro): 0.8487

In [7]:
data.head()

Unnamed: 0,age in months at the time of the interview/test/sampling/imaging.;anxiety;hand and finger and other complex mannerisms;imagination/creativity;immediate echolalia;quality of social overtures;self-injurious behavior;shared enjoyment in interaction;tantrums,aggression,negative or disruptive behavior;unusual eye contact;diagnosis;isMale
0,35;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0;0,,
1,30;0.0;0.0;2.0;1.0;0.0;0.0;0.0;0.0;0.0;0;0,,
2,23;1.0;0.0;1.0;0.0;0.0;0.0;0.0;0.0;0.0;0;1,,
3,31;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0;1,,
4,13;1.0;1.0;1.0;2.0;1.0;1.0;1.0;1.0;0.0;0;0,,
