In [1]:
import pandas as pd
import numpy as np
from preprocessing.preprocess import split
from utils.save_to_csv import save_data_to_csv
import pickle

In [2]:
# Read the homework dataset CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/virus_hw2.csv')

In [22]:
# Partition the full frame into train / validation / test features and labels.
(X_train, X_val, X_test,
 y_train, y_val, y_test) = split(df)

In [24]:
# Sanity check: display the (rows, columns) tuple of the training features.
X_train.shape

(3500, 38)

In [4]:
# Write the splits out with the 'before' suffix, i.e. prior to preprocessing.
save_data_to_csv(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    suffix='before',
)

In [5]:
from preprocessing.preprocess import DataPreprocessor
from preprocessing.transformations import fix_label_type

# Fit the preprocessor on the training split only, then reuse the fitted
# object for validation and test so all three go through the same transform.
preprocess = DataPreprocessor()
X_train = preprocess.fit_transform(X_train, y_train)
X_val, X_test = (preprocess.transform(features) for features in (X_val, X_test))

# Apply the label-type fix to every label split.
y_train, y_val, y_test = (
    fix_label_type(labels) for labels in (y_train, y_val, y_test)
)


In [None]:
# Persist the fitted preprocessor. The context manager guarantees the file is
# flushed and closed even if pickling raises part-way through.
with open('dumps/preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocess, preprocessor_file)

In [6]:
# Write the splits out again with the 'after' suffix, i.e. post-preprocessing.
save_data_to_csv(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    suffix='after',
)

In [7]:
# Remove the PatientID column from each feature split before model fitting.
X_train, X_val, X_test = (
    features.drop(columns=['PatientID'])
    for features in (X_train, X_val, X_test)
)

In [8]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

In [9]:
def grid_search(model, X, y, param_grid, scoring=None, cv=None, n_jobs=-1):
    """Fit a cross-validated grid search for ``model`` and return it.

    Args:
        model: Unfitted scikit-learn estimator (or pipeline) to tune.
        X: Training features.
        y: Training target for a single label column.
        param_grid: Parameter name -> candidate values, as accepted by
            ``GridSearchCV``. An empty dict evaluates only the estimator's
            current settings.
        scoring: Forwarded to ``GridSearchCV``. ``None`` (default) keeps the
            estimator's own ``score`` method, matching the previous behavior.
        cv: Forwarded to ``GridSearchCV``. ``None`` (default) keeps the
            library's default cross-validation strategy.
        n_jobs: Number of parallel jobs; ``-1`` (default, as before) uses all
            available processors.

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    search = GridSearchCV(estimator=model, param_grid=param_grid,
                          scoring=scoring, cv=cv, n_jobs=n_jobs)
    search.fit(X, y)
    return search

In [10]:
print('### Training ###')

# Candidate estimators as (display name, estimator, hyper-parameter grid).
# An empty grid means the estimator is cross-validated with its current settings.
models = [
    ('SVC', svm.SVC(),
     dict(C=np.logspace(-10, 10, 10), kernel=['rbf', 'poly'])),
    ('KNN', KNeighborsClassifier(),
     dict(n_neighbors=np.linspace(2, 10, 9, dtype=int))),
    # max_depth is an integer parameter: np.linspace without dtype yields
    # floats (2.0, 3.0, ...), so build the same 2..16 grid as integers instead.
    ('RandomForest', RandomForestClassifier(max_depth=10, random_state=0),
     dict(max_depth=np.arange(2, 17))),
    ('LogisticRegression', LogisticRegression(random_state=0), dict()),
    ('PolynomialLogisticRegression',
     Pipeline([('poly', PolynomialFeatures(degree=2)),
               ('linear', LogisticRegression())]),
     dict()),
]

# Maps each label column to its list of (name, fitted grid search) pairs.
models_per_column = dict()

for column in y_train.columns:
    print(f'# Fitting for column {column}')
    fitted_models = []
    for name, model, param_grid in models:
        print(f'## Fitting model {name}')
        clf = grid_search(model, X_train, y_train[column], param_grid)
        fitted_models.append((name, clf))
    models_per_column[column] = fitted_models


### Training ###
# Fitting for column Virus
## Fitting model SVC
## Fitting model KNN
## Fitting model RandomForest
## Fitting model LogisticRegression
## Fitting model PolynomialLogisticRegression
# Fitting for column Spreader
## Fitting model SVC
## Fitting model KNN
## Fitting model RandomForest
## Fitting model LogisticRegression
## Fitting model PolynomialLogisticRegression
# Fitting for column AtRisk
## Fitting model SVC
## Fitting model KNN
## Fitting model RandomForest
## Fitting model LogisticRegression
## Fitting model PolynomialLogisticRegression


In [11]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

In [12]:
print('### Validation Scores ###')

# Report layout, one entry per label column:
#   (column, precision kwargs, recall kwargs, f1 kwargs, text appended to the recall line)
# This replaces three copy-pasted scoring stanzas; the printed text is unchanged.
# NOTE(review): Virus combines 'weighted' precision with 'macro' recall/f1.
# Kept as-is so the report matches earlier runs -- confirm this is intended.
binary_spreader = dict(average="binary", pos_label="Spreader")
binary_at_risk = dict(average="binary", pos_label="atRisk")
validation_metrics = [
    ('Virus', dict(average='weighted'), dict(average='macro'), dict(average='macro'), ''),
    ('Spreader', binary_spreader, binary_spreader, binary_spreader, ' (PREFERED)'),
    ('AtRisk', binary_at_risk, binary_at_risk, binary_at_risk, ' (PREFERED)'),
]

for column, precision_kwargs, recall_kwargs, f1_kwargs, recall_note in validation_metrics:
    print('#########################################')
    print(f'# checking for column {column}')
    fitted_models = models_per_column[column]
    y_true = y_val[column]
    for name, model in fitted_models:
        print(f'# scoring for model {name}')
        y_hat = model.predict(X_val)
        accuracy = accuracy_score(y_true, y_hat)
        print(f'accuracy: {accuracy}')
        percision = precision_score(y_true, y_hat, **precision_kwargs)
        print(f'percision: {percision}')
        recall = recall_score(y_true, y_hat, **recall_kwargs)
        print(f'recall score: {recall}{recall_note}')
        f1 = f1_score(y_true, y_hat, **f1_kwargs)
        print(f'f1 score: {f1}')

### Validation Scores ###
#########################################
# checking for column Virus
# scoring for model SVC
accuracy: 0.6093333333333333
percision: 0.6042516610641108
recall score: 0.6606950636870573
f1 score: 0.6473322181177784
# scoring for model KNN
accuracy: 0.448
percision: 0.4542934562379429
recall score: 0.34400915242085794
f1 score: 0.3643691927446406
# scoring for model RandomForest
accuracy: 0.816
percision: 0.8225967629546949
recall score: 0.7981104032830184
f1 score: 0.8226055202738474
# scoring for model LogisticRegression
accuracy: 0.5786666666666667
percision: 0.5741198941255136
recall score: 0.5821970991056432
f1 score: 0.5676599049394694
# scoring for model PolynomialLogisticRegression
accuracy: 0.644
percision: 0.6422904822091414
recall score: 0.6778170674437738
f1 score: 0.6803151291414454
#########################################
# checking for column Spreader
# scoring for model SVC
accuracy: 0.8226666666666667
percision: 0.8179487179487179
recall score

In [13]:
# Selection metric per label column: (scoring function, keyword arguments).
column_score_map = {
    'Virus': (precision_score, {'average': 'weighted'}),
    'Spreader': (recall_score, {'average': "binary", 'pos_label': "Spreader"}),
    'AtRisk': (recall_score, {'average': "binary", 'pos_label': "atRisk"}),
}

# column -> (best fitted model, its validation score)
best_models = dict()

for column, (score_function, params) in column_score_map.items():
    best_model, best_score = None, -1
    for name, model in models_per_column[column]:
        score = score_function(y_val[column], model.predict(X_val), **params)
        # Strict comparison: on a tie the earlier candidate is kept.
        if score > best_score:
            best_model, best_score = model, score

    best_models[column] = (best_model, best_score)
    
    

In [34]:
# Persist the selected models. The context manager guarantees the file is
# flushed and closed even if pickling raises part-way through.
with open('dumps/best_models.pkl', 'wb') as best_models_file:
    pickle.dump(best_models, best_models_file)

TypeError: file must have a 'write' attribute

In [15]:
print('### Testing Scores ###')

# Report layout, one entry per label column:
#   (column, precision kwargs, recall kwargs, f1 kwargs, text appended to the recall line)
# This replaces three copy-pasted scoring stanzas; the printed text is unchanged.
# NOTE(review): Virus combines 'weighted' precision with 'macro' recall/f1.
# Kept as-is so the report matches earlier runs -- confirm this is intended.
binary_spreader = dict(average="binary", pos_label="Spreader")
binary_at_risk = dict(average="binary", pos_label="atRisk")
test_metrics = [
    ('Virus', dict(average='weighted'), dict(average='macro'), dict(average='macro'), ''),
    ('Spreader', binary_spreader, binary_spreader, binary_spreader, ' (PREFERED)'),
    ('AtRisk', binary_at_risk, binary_at_risk, binary_at_risk, ' (PREFERED)'),
]

for column, precision_kwargs, recall_kwargs, f1_kwargs, recall_note in test_metrics:
    print('#########################################')
    print(f'# checking for column {column}')
    # best_models[column] is (model, validation score); only the model is needed.
    # Indexing instead of unpacking into `_` leaves IPython's last-output variable alone.
    model = best_models[column][0]
    y_true = y_test[column]
    y_hat = model.predict(X_test)
    accuracy = accuracy_score(y_true, y_hat)
    print(f'accuracy: {accuracy}')
    percision = precision_score(y_true, y_hat, **precision_kwargs)
    print(f'percision: {percision}')
    recall = recall_score(y_true, y_hat, **recall_kwargs)
    print(f'recall score: {recall}{recall_note}')
    f1 = f1_score(y_true, y_hat, **f1_kwargs)
    print(f'f1 score: {f1}')

### Testing Scores ###
#########################################
# checking for column Virus
accuracy: 0.8146666666666667
percision: 0.8191983854193686
recall score: 0.7811885765110382
f1 score: 0.811035758513396
#########################################
# checking for column Spreader
accuracy: 0.8666666666666667
percision: 0.8356164383561644
recall score: 0.8840579710144928 (PREFERED)
f1 score: 0.8591549295774648
#########################################
# checking for column AtRisk
accuracy: 0.76
percision: 0.7288557213930348
recall score: 0.804945054945055 (PREFERED)
f1 score: 0.7650130548302873


In [30]:
df_unknown = pd.read_csv('data/virus_hw3_unlabeled.csv')

# Drop the TestResultsCode column, then apply the already-fitted preprocessor.
X_unknown = df_unknown.drop(labels=['TestResultsCode'], axis=1)
X_unknown = preprocess.transform(X_unknown)

# Keep the identifiers for the output file; they are not model features.
patientIds = X_unknown['PatientID']
X_unknown = X_unknown.drop(labels=['PatientID'], axis=1)

# Build the predictions on X_unknown's own index. pd.concat(axis=1) below
# aligns rows by index label, so a fresh 0..n-1 index on the predictions would
# pair them with the wrong PatientID (or add NaN rows) whenever the
# transformed frame's index is not exactly 0..n-1.
y_pred = pd.DataFrame(
    {
        column: best_models[column][0].predict(X_unknown)
        for column in ['Virus', 'Spreader', 'AtRisk']
    },
    index=X_unknown.index,
)

# Join the three per-column predictions into one underscore-separated code.
y_pred['TestResultsCode'] = y_pred[['Virus', 'Spreader', 'AtRisk']].agg('_'.join, axis=1)

Result = pd.concat([patientIds, y_pred['TestResultsCode']], axis=1)

Result.to_csv('results/predicted.csv', index=False)