In [1]:
import numpy as np
from preprocessing.preprocess import load_data, split, preprocess
from utils.save_to_csv import save_data_to_csv

In [2]:
df = load_data('data/virus_hw2.csv')

In [3]:
X_train, X_val, X_test, y_train, y_val, y_test = split(df)

In [4]:
save_data_to_csv(X_train, X_val, X_test, y_train, y_val, y_test, suffix='before')

In [4]:
X_train, X_val, X_test, y_train, y_val, y_test = preprocess(X_train, X_val, X_test, y_train, y_val, y_test)

In [7]:
save_data_to_csv(X_train, X_val, X_test, y_train, y_val, y_test, suffix='after')

In [5]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [6]:
def grid_search(model, X, y, param_grid):
    clf = GridSearchCV(estimator=model, param_grid=param_grid,
                    n_jobs=-1)
    clf.fit(X, y)
    return clf 

In [7]:
models = [('SVC', svm.SVC(kernel='rbf'), dict(C= np.logspace(-10, 0, 10))),
          ('KNN', KNeighborsClassifier(), dict(n_neighbors=np.linspace(2,10,9, dtype=int))),
          ('RandomForest', RandomForestClassifier(max_depth=10, random_state=0), dict(max_depth=np.linspace(2,16,15))),
          ('LogisticRegression', LogisticRegression(random_state=0), dict()),
          ('PolynomialLinearRegression',  Pipeline([('poly', PolynomialFeatures(degree=2)),
                  ('linear', LogisticRegression())]), dict())]

chosen_models = dict()

for column in y_train.columns:
    print(f'Finding best model for column {column}')
    fitted_models = []
    for name, model, param_grid in models:
        print(f'Fitting model {name}')
        clf = grid_search(model, X_train, y_train[column], param_grid)
        fitted_models.append(clf)
    chosen_models[column] = max(fitted_models,  key = lambda m: m.best_score_)

Finding best model for column Virus
Fitting model SVC
Fitting model KNN
Fitting model RandomForest
Fitting model LogisticRegression
Fitting model PolynomialLinearRegression
Finding best model for column Spreader
Fitting model SVC
Fitting model KNN
Fitting model RandomForest
Fitting model LogisticRegression
Fitting model PolynomialLinearRegression
Finding best model for column AtRisk
Fitting model SVC
Fitting model KNN
Fitting model RandomForest
Fitting model LogisticRegression
Fitting model PolynomialLinearRegression


In [8]:
[chosen_models[key].best_score_ for key in chosen_models]

[0.808, 0.8774285714285714, 0.782]

In [9]:
chosen_models

{'Virus': GridSearchCV(estimator=RandomForestClassifier(max_depth=10, random_state=0),
              n_jobs=-1,
              param_grid={'max_depth': array([ 2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
        15., 16.])}),
 'Spreader': GridSearchCV(estimator=RandomForestClassifier(max_depth=10, random_state=0),
              n_jobs=-1,
              param_grid={'max_depth': array([ 2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
        15., 16.])}),
 'AtRisk': GridSearchCV(estimator=RandomForestClassifier(max_depth=10, random_state=0),
              n_jobs=-1,
              param_grid={'max_depth': array([ 2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
        15., 16.])})}

In [None]:
for key in chosen_models:
    chosen_models[key].score(X_val, y_val[key])