# Modelling

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, plot_roc_curve, roc_auc_score, recall_score, precision_score, f1_score

from imblearn.over_sampling import SMOTE

In [None]:
# Load the combined training data with 'date' parsed as datetime and used as
# the index in a single step, so this cell can be re-run safely (the separate
# in-place set_index cell raised a KeyError when executed a second time
# because the 'date' column had already been moved into the index).
train_comb = pd.read_csv('../assets/train_comb.csv', parse_dates=['date'], index_col='date')

In [None]:
# dropping trap column for now as it is not numeralize yet
X = train_comb.drop(columns=['wnvpresent', 'trap'])
y = train_comb['wnvpresent']

In [None]:
# train/test split on data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [None]:
# Scale X data
ss = StandardScaler()

In [None]:
Xs_train, Xs_val = ss.fit_transform(X_train), ss.transform(X_val)

## Baseline Score

In [None]:
# class proportions of the target; the larger share is the accuracy obtained
# by always predicting the majority class
y.value_counts(normalize=True)

The baseline accuracy score, obtained by always predicting the majority class, would be 0.948.

## Account for Imbalanced Class

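The target column `wnvpresent` shows a massive class imbalance. We address this by oversampling the minority class with SMOTE and embedding it within a pipeline, so that resampling is applied only when the pipeline is fitted (i.e. to the training folds during cross-validation) and never to the data being predicted.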

In [None]:
# sm = SMOTE()

In [None]:
# Xsm_train, ysm_train = sm.fit_resample(Xs_train, y_train)

## Logistic Regression

In [None]:
def model_perform(classifier, X, y):
    """Print the grid-search outcome and score the refit model on (X, y).

    Parameters
    ----------
    classifier : fitted search object
        Must expose ``best_score_``, ``best_params_`` and ``predict``, plus
        either ``predict_proba`` or ``decision_function``.
    X : array-like
        Feature matrix to evaluate on.
    y : array-like
        True binary labels for ``X``.

    Returns
    -------
    dict
        Keys 'classifier', 'accuracy', 'recall', 'precision', 'f1_score' and
        'val_roc_auc_score', in that order.
    """
    print(f'Best Score: {classifier.best_score_}')
    print('Best Parameters: ')
    print(classifier.best_params_)

    y_pred = classifier.predict(X)

    # ROC AUC only needs a score that ranks the samples. Prefer the
    # positive-class probability, but fall back to the decision function so
    # models without predict_proba (e.g. SVC with probability=False) can be
    # evaluated too.
    if hasattr(classifier, 'predict_proba'):
        y_score = classifier.predict_proba(X)[:, 1]
    else:
        y_score = classifier.decision_function(X)

    return {
        'classifier': classifier,
        'accuracy': accuracy_score(y, y_pred),
        'recall': recall_score(y, y_pred),
        'precision': precision_score(y, y_pred),
        'f1_score': f1_score(y, y_pred),
        'val_roc_auc_score': roc_auc_score(y, y_score),
    }

In [None]:
# instantiate pipeline
pipe = Pipeline([('sampling', SMOTE()),
                ('logreg', LogisticRegression(solver='liblinear', random_state=100))
                ])

In [None]:
pipe.get_params()

In [None]:
# set params to gridsearch
# sampling params set to comment because it was taking too long to fit
logreg_params = {
    'logreg__penalty': ['l1','l2'],
    'logreg__C': [0.5, 1, 1.5, 2],
#    'sampling__sampling_strategy': ['minority', 'auto'],
#    'sampling__k_neighbors': [3, 5],
}

In [None]:
logreg_grid = GridSearchCV(pipe, logreg_params, verbose=2, n_jobs=-1)

In [None]:
%%time
logreg_grid.fit(Xs_train, y_train)

In [None]:
scores = model_perform(logreg_grid, Xs_val, y_val)

In [None]:
# create a list to hold the dict
logreg_results = []
logreg_results.append(scores)

In [None]:
# create a df to hold all our classifier performance
df_results = pd.DataFrame(logreg_results)
df_results

In [None]:
# ROC curve
plot_roc_curve(logreg_grid, Xs_val, y_val)

# add worst case scenario line
plt.plot([0,1], [0,1], label='baseline', linestyle='--')

# add a legend
plt.legend()

In [None]:
# confusion_matrix(y_val, y_val_pred)

In [None]:
# tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()

# print("True Negatives: %s" % tn)
# print("False Positives: %s" % fp)
# print("False Negatives: %s" % fn)
# print("True Positives: %s" % tp)

## SVM

In [None]:
# instantiate pipeline
pipe = Pipeline([('sampling', SMOTE()),
                 ('svc', SVC(probability=True, random_state=100)) # probability=True is needed to calculate predict_proba but fitting grid model would take very long
                ])

In [None]:
pipe.get_params()

In [None]:
svc_params = {
    'svc__C': [5, 7, 9],
    'svc__kernel': ['rbf'],  #'poly'
    #'sampling__sampling_strategy': ['minority', 'not minority', 'auto'],
    #'sampling__k_neighbors': [3, 5],      
}

In [None]:
svc_grid = GridSearchCV(pipe, svc_params, verbose=2, n_jobs=-1)

In [None]:
%%time
svc_grid.fit(Xs_train, y_train)

# wall time is cut by half if we don't put SVC(probability=True) in the pipeline

In [None]:
scores = model_perform(svc_grid, Xs_val, y_val)

In [None]:
# create list to hold the dict
svm_results = []
svm_results.append(scores)

In [None]:
# append dict to df to compare classifier performance

df_results = df_results.append(svm_results)
df_results

In [None]:
# ROC curve
plot_roc_curve(svc_grid, Xs_val, y_val)

# add worst case scenario line
plt.plot([0,1], [0,1], label='baseline', linestyle='--')

# add a legend
plt.legend()

In [None]:
#confusion_matrix(y_val, y_val_pred)

In [None]:
#tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()

#print("True Negatives: %s" % tn)
#print("False Positives: %s" % fp)
#print("False Negatives: %s" % fn)
#print("True Positives: %s" % tp)

## PCA

In [None]:
# instantiate pca
pca = PCA(random_state=100)

In [None]:
pca.fit(Xs_train)

In [None]:
Z_train, Z_val = pca.transform(Xs_train), pca.transform(Xs_val)

In [None]:
pca.explained_variance_ratio_.round(3)

In [None]:
pca.explained_variance_ratio_.round(3)[:10].sum()

We see that the first 10 components explain 77% of the variance.

In [None]:
# share of variance captured by the first 25 components (each ratio rounded to 3 dp)
pca.explained_variance_ratio_[:25].round(3).sum()

The first 25 components explain 100% of the variance (after rounding each ratio to three decimals).

### Logistic Regression with PCA

In [None]:
# embed pca in a pipeline
pipe = Pipeline([('sampling', SMOTE()),
                 ('pca', PCA(n_components=[20, 25, 30])),
                 ('logreg', LogisticRegression(solver='liblinear', random_state=100))
                ])

In [None]:
pipe.get_params()

In [None]:
logreg_pca_params = {
    'logreg__penalty': ['l1','l2'],
    'logreg__C': [1, 1.5, 2],
    'pca__n_components': [20, 30, 40]
    #'sampling__sampling_strategy': ['minority', 'not minority', 'auto'],
    #'sampling__k_neighbors': [3, 5],
}

In [None]:
logreg_pca_grid = GridSearchCV(pipe, logreg_pca_params, verbose=2, n_jobs=-1)

In [None]:
%%time
logreg_pca_grid.fit(Xs_train, y_train)

In [None]:
scores = model_perform(logreg_pca_grid, Xs_val, y_val)

In [None]:
# create list to hold the dict
logreg_pca_results = []
logreg_pca_results.append(scores)

In [None]:
# append dict to df to compare classifier performance
df_results = df_results.append(logreg_pca_results)
df_results

In [None]:
# ROC curve
plot_roc_curve(logreg_pca_grid, Xs_val, y_val)

# add worst case scenario line
plt.plot([0,1], [0,1], label='baseline', linestyle='--')

# add a legend
plt.legend()

In [None]:
#confusion_matrix(y_val, y_val_pred)

In [None]:
#tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()

#print("True Negatives: %s" % tn)
#print("False Positives: %s" % fp)
#print("False Negatives: %s" % fn)
#print("True Positives: %s" % tp)

### SVM with PCA

In [None]:
pipe = Pipeline([('sampling', SMOTE()),
                 ('pca', PCA(n_components=[20, 30, 40])),
                 ('svc', SVC(probability=True, random_state=100))
])

In [None]:
pipe.get_params()

In [None]:
svc_pca_params = {
    'svc__C': [5, 7, 9],
    'svc__kernel': ['rbf'],  # 'poly'
    'pca__n_components': [20, 30, 40]
    #'sampling__sampling_strategy': ['minority', 'not minority', 'auto'],
    #'sampling__k_neighbors': [3, 5],      
}

In [None]:
svc_pca_grid = GridSearchCV(pipe, svc_pca_params, verbose=2, n_jobs=-1)

In [None]:
%%time
svc_pca_grid.fit(Xs_train, y_train)

## Example of the time taken (14 mins) when SVC(probability=True) is in the pipeline

In [None]:
scores = model_perform(svc_pca_grid, Xs_val, y_val)

In [None]:
# create list to hold the dict
svc_pca_results = []
svc_pca_results.append(scores)

In [None]:
# append dict to df to compare classifier performance
df_results = df_results.append(svc_pca_results)

In [None]:
df_results.reset_index(drop=True)

In [None]:
# ROC curve
plot_roc_curve(svc_pca_grid, Xs_val, y_val)

# add worst case scenario line
plt.plot([0,1], [0,1], label='baseline', linestyle='--')

# add a legend
plt.legend()

In [None]:
#confusion_matrix(y_val, y_val_pred)

In [None]:
#tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()

#print("True Negatives: %s" % tn)
#print("False Positives: %s" % fp)
#print("False Negatives: %s" % fn)
#print("True Positives: %s" % tp)

# Feature Importance

In [None]:
# note: svm on rbf has no feature importance, only svm on linear has it
# https://stackoverflow.com/questions/41592661/determining-the-most-contributing-features-for-svm-classifier-in-sklearn

Feature Importance based on Logistic Regression

In [None]:
logreg_grid.best_estimator_[1].coef_

In [None]:
logreg_features = pd.DataFrame({'feature_names': X_train.columns,
                                'feature_importance': logreg_grid.best_estimator_[1].coef_[0]})

# top 20 features
logreg_features.sort_values(by='feature_importance', ascending=False).head(20)

# Cost Benefit Analysis

# Conclusion