In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import geodesic

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from imblearn.over_sampling import SMOTENC, SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

from sklearn.metrics import (
    confusion_matrix, 
    plot_confusion_matrix,
    accuracy_score,
    plot_roc_curve,
    roc_auc_score,
    recall_score,
    precision_score,
    f1_score
)

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
train_weather = pd.read_csv(r'../datasets/train_weather.csv')
test_weather = pd.read_csv(r'../datasets/test_weather.csv')

## Modelling

In [3]:
X = train_weather.drop(columns='WnvPresent')
y = train_weather['WnvPresent']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

metrics = pd.DataFrame(index=[0])

In [64]:
def get_model(model, gridcv, pipe_params={}):
    
    models = {'lr': LogisticRegression(),
              'nb': MultinomialNB(),
              'rf': RandomForestClassifier(),
              'et': ExtraTreesClassifier(),
              'ada': AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
              'knn': KNeighborsClassifier()
             }
    
    if model == 'nb':
            pipe = imbpipeline([
                ('sm', SMOTE(random_state=42)), 
                (model, models[model]),
            ])
    else:
            pipe = imbpipeline([
                ('ss', StandardScaler()),
                ('sm', SMOTE(random_state=42)), 
                (model, models[model]),
            ])
    
    pipe_params = pipe_params
    
    gridsearch = GridSearchCV(pipe, pipe_params, cv=gridcv, scoring='roc_auc', verbose=1)
    
    gridsearch.fit(X_train, y_train)
    
    preds = gridsearch.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

    model_probs_train = gridsearch.predict_proba(X_train)[:, 1]
    model_probs_test = gridsearch.predict_proba(X_test)[:, 1]
    roc_auc = cross_val_score(pipe, X_train, y_train, scoring='roc_auc', cv=gridcv)
    
    metrics['model'] = models[model]
    metrics['accuracy'] = accuracy_score(y_test, preds)
    metrics['specificity'] = tn / (tn+fp)
    metrics['recall'] = recall_score(y_test, preds)
    metrics['precision'] = precision_score(y_test, preds)
    metrics['AUC_CV'] = roc_auc.mean()
    metrics['AUC_train'] = roc_auc_score(y_train, model_probs_train)
    metrics['AUC_test'] = roc_auc_score(y_test, model_probs_test)
    metrics['f1_score'] = f1_score(y_test, preds)

    print(gridsearch.best_params_)
    print(gridsearch.best_score_)
    return metrics

## Baseline 

In [52]:
get_model('lr', 5, pipe_params={})

Fitting 5 folds for each of 1 candidates, totalling 5 fits
{}
0.8379433850233623


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,LogisticRegression(),0.732893,0.729177,0.798246,0.143533,0.837943,0.843888,0.82718,0.243316


## Random Forest

In [61]:
%%time

get_model('rf', 5, pipe_params={
    'rf__n_estimators': [100, 150],
    'rf__min_samples_split': [2, 3],
    'rf__min_samples_leaf': [2, 5, 10]
})

Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'rf__min_samples_leaf': 10, 'rf__min_samples_split': 3, 'rf__n_estimators': 100}
0.8310164709927242
CPU times: user 1min 28s, sys: 828 ms, total: 1min 29s
Wall time: 50.6 s


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,RandomForestClassifier(),0.886267,0.908728,0.491228,0.23431,0.75181,0.922901,0.833703,0.31728


## K Nearest Neighbors

In [78]:
%%time

get_model('knn', 5, pipe_params={
    'knn__leaf_size': [10, 20, 30],
    'knn__n_neighbors': [10, 25, 50, 75],
    'knn__p': [1, 2]
})

Fitting 5 folds for each of 24 candidates, totalling 120 fits
{'knn__leaf_size': 10, 'knn__n_neighbors': 50, 'knn__p': 1}
0.8056296123231377
CPU times: user 1min 52s, sys: 14.5 s, total: 2min 7s
Wall time: 50.3 s


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,KNeighborsClassifier(),0.772534,0.775561,0.719298,0.154135,0.732373,0.904529,0.815293,0.25387


- no need df condition
- add print best_params
- rename metrics['model']

In [74]:
test_weather

Unnamed: 0,Id,Tavg,DewPoint,PrecipTotal,StnPressure,AvgSpeed,Rain,Mist,Dust,CULEX PIPIENS,...,CULEX TARSALIS,CULEX TERRITANS,low_risk,medium_risk,high_risk,very_high_risk,mth_7,mth_8,mth_9,mth_10
0,1,74,56,0.00,29.28,10.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2,74,56,0.00,29.28,10.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,74,56,0.00,29.28,10.0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,4,74,56,0.00,29.28,10.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,74,56,0.00,29.28,10.0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116288,116289,71,63,0.72,29.10,7.9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
116289,116290,71,63,0.72,29.10,7.9,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
116290,116291,71,63,0.72,29.10,7.9,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
116291,116292,71,63,0.72,29.10,7.9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
