<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Modelling" data-toc-modified-id="Modelling-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Modelling</a></span><ul class="toc-item"><li><span><a href="#Naive-Bayes" data-toc-modified-id="Naive-Bayes-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Naive Bayes</a></span></li><li><span><a href="#Logistic-Regression" data-toc-modified-id="Logistic-Regression-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Logistic Regression</a></span></li><li><span><a href="#Extra-Trees" data-toc-modified-id="Extra-Trees-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Extra Trees</a></span></li></ul></li></ul></div>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import geodesic

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from imblearn.over_sampling import SMOTENC, SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

from sklearn.metrics import (
    confusion_matrix, 
    plot_confusion_matrix,
    accuracy_score,
    plot_roc_curve,
    roc_auc_score,
    recall_score,
    precision_score,
    f1_score
)

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [2]:
train_weather = pd.read_csv('datasets/train_weather.csv')
test_weather = pd.read_csv('datasets/test_weather.csv')

## Modelling

In [3]:
X = train_weather.drop(columns='WnvPresent')
y = train_weather['WnvPresent']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

metrics = pd.DataFrame(index=[0])

In [20]:
def get_model(model, df, gridcv, pipe_params={}):
    
    models = {'lr': LogisticRegression(),
              'nb': MultinomialNB(),
              'rf': RandomForestClassifier(),
              'et': ExtraTreesClassifier(),
              'ada': AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
             }
    
    if model == 'nb':
            pipe = imbpipeline([
                ('sm', SMOTE(random_state=42)), 
                (model, models[model]),
            ])
    else:
            pipe = imbpipeline([
                ('ss', StandardScaler()),
                ('sm', SMOTE(random_state=42)), 
                (model, models[model]),
            ])
    
    pipe_params = pipe_params
    
    gridsearch = GridSearchCV(pipe, pipe_params, cv=gridcv, scoring='roc_auc', verbose=1)
    
    gridsearch.fit(X_train, y_train)
    best_params = gridsearch.best_params_
    print(f'Best Parameters: {best_params}')
    
    preds = gridsearch.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

    model_probs_train = gridsearch.predict_proba(X_train)[:, 1]
    model_probs_test = gridsearch.predict_proba(X_test)[:, 1]
    roc_auc = cross_val_score(pipe, X_train, y_train, scoring='roc_auc', cv=gridcv)
    
    metrics['model'] = model
    metrics['accuracy'] = accuracy_score(y_test, preds)
    metrics['specificity'] = tn / (tn+fp)
    metrics['recall'] = recall_score(y_test, preds)
    metrics['precision'] = precision_score(y_test, preds)
    metrics['AUC_CV'] = roc_auc.mean()
    metrics['AUC_train'] = roc_auc_score(y_train, model_probs_train)
    metrics['AUC_test'] = roc_auc_score(y_test, model_probs_test)
    metrics['f1_score'] = f1_score(y_test, preds)
    
    
    
    return metrics

In [None]:
# def get_model1(model, df, gridcv, pipe_params={}):
    
#     models = {'lr': LogisticRegression(),
#               'nb': MultinomialNB(),
#               'rf': RandomForestClassifier(),
#               'et': ExtraTreesClassifier(),
#               'ada': AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
#              }
    
#     if model == 'nb':
#             pipe = imbpipeline([
#                 ('sm', SMOTE(random_state=42)), 
#                 (model, models[model]),
#             ])
#     else:
#             pipe = imbpipeline([
#                 ('ss', StandardScaler()),
#                 ('sm', SMOTE(random_state=42)), 
#                 (model, models[model]),
#             ])
    
#     pipe_params = pipe_params
    
#     gridsearch = GridSearchCV(pipe, pipe_params, cv=gridcv, scoring='roc_auc', verbose=1)
    
#     gridsearch.fit(X_train, y_train)
#     best_params = gridsearch.best_params_
#     print(f'Best Parameters: {best_params}')
    
#     preds = gridsearch.predict(X_test)
#     tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

#     model_probs_train = gridsearch.predict_proba(X_train)[:, 1]
#     model_probs_test = gridsearch.predict_proba(X_test)[:, 1]
#     roc_auc = cross_val_score(pipe, X_train, y_train, scoring='roc_auc', cv=gridcv)
    
#     metrics['model'] = model
#     metrics['accuracy'] = accuracy_score(y_test, preds)
#     metrics['specificity'] = tn / (tn+fp)
#     metrics['recall'] = recall_score(y_test, preds)
#     metrics['precision'] = precision_score(y_test, preds)
#     metrics['AUC_CV'] = roc_auc.mean()
#     metrics['AUC_train'] = roc_auc_score(y_train, model_probs_train)
#     metrics['AUC_test'] = roc_auc_score(y_test, model_probs_test)
#     metrics['f1_score'] = f1_score(y_test, preds)
    
    
    
#     return metrics

### Naive Bayes

In [5]:
get_model('nb', train_weather, 5, pipe_params={
    'sm__k_neighbors': [3, 5, 10]
})

Fitting 5 folds for each of 3 candidates, totalling 15 fits


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,Naive Bayes,0.685701,0.687781,0.649123,0.105714,0.737213,0.754839,0.735733,0.181818


### Logistic Regression

In [18]:
%%time
get_model('lr', train_weather, 5, pipe_params={
    'lr__C' : np.logspace(0.01, 1, 5),
     'lr__solver' : ['saga'], #### Estimator fit failed when penalty = l1. 
          'lr__max_iter' : [2000, 3000, 5000]
})

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best Parameters: {'lr__C': 3.198895109691398, 'lr__max_iter': 2000, 'lr__solver': 'saga'}
CPU times: user 2min 12s, sys: 2.69 s, total: 2min 14s
Wall time: 2min 4s


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,LogisticRegression(),0.732893,0.729177,0.798246,0.143533,0.837943,0.843921,0.827184,0.243316


In [13]:
# %%time
# get_model('lr', train_weather, 5, pipe_params={})

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {}
CPU times: user 1.27 s, sys: 128 ms, total: 1.4 s
Wall time: 754 ms


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,LogisticRegression(),0.732893,0.729177,0.798246,0.143533,0.837943,0.843888,0.82718,0.243316


In [19]:
%%time
get_model('lr', train_weather, 5, pipe_params={
    'lr__C' : np.logspace(0.01, 1, 5),
     'lr__solver' : ['saga','newton-cg','sag','liblinear'],
             #'lr__penalty' : ['l2','none'],  #### Estimator fit failed when penalty = l1. 
          'lr__max_iter' : [2000, 3000, 5000]
})

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Parameters: {'lr__C': 3.198895109691398, 'lr__max_iter': 2000, 'lr__solver': 'saga'}
CPU times: user 4min 9s, sys: 6.89 s, total: 4min 16s
Wall time: 3min 37s


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,LogisticRegression(),0.732893,0.729177,0.798246,0.143533,0.837943,0.843921,0.827184,0.243316


### Extra Trees

In [22]:
%%time
get_model('et', train_weather, 5, pipe_params={
    'et__criterion':['gini','entropy'],
    'et__bootstrap':["True","False"],
    'et__min_samples_leaf':[1,2,5,10],
    'et__min_samples_split':[2,3],
    'et__n_estimators':[100,150,300],
})

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best Parameters: {'et__bootstrap': 'True', 'et__criterion': 'entropy', 'et__min_samples_leaf': 5, 'et__min_samples_split': 2, 'et__n_estimators': 100}
CPU times: user 9min 26s, sys: 20.2 s, total: 9min 47s
Wall time: 8min 43s


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,et,0.818782,0.829925,0.622807,0.17233,0.6742,0.915986,0.831306,0.269962
