In [1]:
import pandas as pd
import numpy as np
from preprocessing.preprocess import DataPreprocessor
from preprocessing.transformations import fix_label_type
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import Pipeline

from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score




In [2]:

##################################################
## Load train dataset

df_train = pd.read_csv('Data/virus_hw5.csv')
X_ids = df_train['PatientID']
X_train, y_train = df_train.drop(labels=['TestResultsCode'], axis=1), df_train[['TestResultsCode']]
y_train = fix_label_type(y_train)

X_train = X_train[['DisciplineScore', 'TimeOnSocialActivities', 'AgeGroup', 'StepsPerYear', 'pcrResult4', 'pcrResult1',
                   'pcrResult12', 'pcrResult5', 'pcrResult16', 'pcrResult14', 'SyndromeClass']]




In [3]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)



preprocessor = DataPreprocessor().fit(X_train, y_train)
X_train = preprocessor.transform(X_train)
X_val = preprocessor.transform(X_val)

In [5]:
X_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800 entries, 0 to 1799
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DisciplineScore         1800 non-null   float64
 1   TimeOnSocialActivities  1800 non-null   float64
 2   AgeGroup                1800 non-null   float64
 3   StepsPerYear            1800 non-null   float64
 4   pcrResult4              1800 non-null   float64
 5   pcrResult1              1800 non-null   float64
 6   pcrResult12             1800 non-null   float64
 7   pcrResult5              1800 non-null   float64
 8   pcrResult16             1800 non-null   float64
 9   pcrResult14             1800 non-null   float64
 10  SyndromeClass_1.0       1800 non-null   float64
 11  SyndromeClass_2.0       1800 non-null   float64
 12  SyndromeClass_3.0       1800 non-null   float64
 13  SyndromeClass_4.0       1800 non-null   float64
dtypes: float64(14)
memory usage: 197.0 KB


In [11]:

##################################################
## Training various models

# ## Each target column has a different wanted score (with it's params)
# column_score_map = {'Virus' : (precision_score, {'average':'weighted'}), 
#     'Spreader': (recall_score, {'average': "binary", 'pos_label': "Spreader"}), 
#     'AtRisk': (recall_score, {'average': "binary", 'pos_label': "atRisk"})}

## Each target column has a different wanted score (with it's params)
column_score_map = {'Virus' : (accuracy_score, dict()), 
    'Spreader': (accuracy_score, dict()), 
    'AtRisk': (accuracy_score, dict())}

## We use grid search to optimize our parameters
def grid_search(model, X, y, param_grid, score_fn):
    clf = GridSearchCV(estimator=model, param_grid=param_grid, scoring=score_fn,
                    n_jobs=-1, verbose=4)
    clf.fit(X, y)
    return clf 

print('### Training ###')

random_forest = (
                'RandomForest', 
                RandomForestClassifier(), 
                {
                    'max_depth': np.floor(np.linspace(10, 40, 20)).astype(int),
                }
            )

ada_boost = (
                'AdaBoost', 
                AdaBoostClassifier(), 
                {
                    'learning_rate': [0.5],
                    'n_estimators': np.floor(np.linspace(1, 20, 10)).astype(int),
                    'base_estimator':  [DecisionTreeClassifier(max_depth=3,  min_samples_leaf=3)]
                }
            )

gradient_boost = (
                'GradientBoosting', 
                GradientBoostingClassifier(), 
                {
                    "min_samples_split": [2],
                    "min_samples_leaf": [1],
                    "max_depth":[3],
                    'n_estimators': np.floor(np.linspace(10, 100, 40)).astype(int)
                }
            )

models = [
#             ('SVC_rbf', svm.SVC(kernel='rbf'), dict(C=[0.1, 1, 10, 100, 1000], gamma=[0.0001, 0.001, 0.01, 0.1, 1])),
#             ('KNN', KNeighborsClassifier(), dict(n_neighbors=np.linspace(2, 10, 9, dtype=int))),
            random_forest,
            # ('LogisticRegression', LogisticRegression(max_iter=1000), dict(C=[0.1, 1, 5, 7, 10])),
            # ('PolynomialLogisticRegression',  Pipeline([('poly', PolynomialFeatures(degree=2)),
#                                                         ('linear', LogisticRegression())]), dict()),
            ada_boost,
            gradient_boost]

models_per_column = dict()

for column in column_score_map:
    print(f'# Fitting for column {column}')
    fitted_models = []
    for name, model, param_grid in models:
        print(f'## Fitting model {name}')

        score_fn, params = column_score_map[column]

        clf = grid_search(model, X_train, y_train[column], param_grid, make_scorer(score_fn, **params))
        fitted_models.append((name, clf))
    models_per_column[column] = fitted_models


# ## Each target column has a different wanted score (with it's params)
# column_score_map = {'Virus' : (precision_score, {'average':'weighted'}), 
#     'Spreader': (recall_score, {'average': "binary", 'pos_label': "Spreader"}), 
#     'AtRisk': (recall_score, {'average': "binary", 'pos_label': "atRisk"})}



IndentationError: unindent does not match any outer indentation level (<tokenize>, line 42)

In [7]:

##################################################
## Choose the best model for each task

best_models = dict()

for column in ['Virus', 'Spreader', 'AtRisk']:
    print(f'# Checking column {column}')
    fitted_models = models_per_column[column]
    best_model, best_score = None, -1
    for name, model in fitted_models:
        print(f'## Checking model {name}')
        y_hat = model.predict(X_val)
        score_function, params = column_score_map[column]
        score = score_function(y_val[column], y_hat, **params)
        if score > best_score:
            best_model, best_score = model, score
    
    best_models[column] = (best_model.best_estimator_, best_model.best_params_, best_score)

print(best_models)
    


# Checking column Virus
## Checking model RandomForest
# Checking column Spreader
## Checking model RandomForest
# Checking column AtRisk
## Checking model RandomForest
{'Virus': (RandomForestClassifier(max_depth=40), {'max_depth': 40}, 0.8161111111111111), 'Spreader': (RandomForestClassifier(max_depth=14), {'max_depth': 14}, 0.8594444444444445), 'AtRisk': (RandomForestClassifier(max_depth=10), {'max_depth': 10}, 0.8083333333333333)}


In [None]:

##################################################
## Predict on test data using chosen models

df_test = pd.read_csv('Data/virus_hw5_test.csv')
X_ret = pd.DataFrame(df_test['PatientID'], columns=['PatientID'])
X_test = df_test.drop(labels=['PatientID'], axis=1)
X_test = X_test[['DisciplineScore', 'TimeOnSocialActivities', 'AgeGroup', 'StepsPerYear', 'pcrResult4', 'pcrResult1',
                   'pcrResult12', 'pcrResult5', 'pcrResult16', 'pcrResult14', 'SyndromeClass']]
X_test = preprocessor.transform(X_test)
X_ret['Virus'] = best_models['Virus'][0].predict(X_test)
X_ret['Spreader'] = best_models['Spreader'][0].predict(X_test)
X_ret['Risk'] = best_models['AtRisk'][0].predict(X_test)

X_ret.to_csv('results/predicted.csv', index=False)


