**Project 4** - Group 2 (Emma, Gabriel, Ben, Junhao)

<br>

<font size="6">Part 2: Modelling

## Load Libraries / Import Datasets

In [139]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import geodesic

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

from sklearn.metrics import (
    confusion_matrix, 
    plot_confusion_matrix,
    accuracy_score,
    plot_roc_curve,
    roc_auc_score,
    recall_score,
    precision_score,
    f1_score
)

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

import xgboost as xgb
from xgboost import XGBClassifier

pd.set_option('display.max_colwidth', 0)

In [106]:
train_weather = pd.read_csv(r'../datasets/train_weather_v3.csv')
test_weather = pd.read_csv(r'../datasets/test_weather_v3.csv')
submission = pd.read_csv(r'../datasets/sampleSubmission.csv')

The cleaned test dataset has ~13 times more entries than the train dataset.

In [3]:
train_weather.shape

(8475, 29)

In [4]:
test_weather.shape

(116293, 29)

## Functions

In [170]:
def get_model(model, gridcv, pipe_params={}):
    '''Function to apply GriSearchCV based on selected model and parameters'''
    
    #Scale X
    ss = StandardScaler()
    Xsc_train = ss.fit_transform(X_train)
    Xsc_test = ss.transform(X_test)
    
    #Models to test
    models = {'lr': LogisticRegression(),
              'nb': MultinomialNB(),
              'rf': RandomForestClassifier(),
              'et': ExtraTreesClassifier(),
              'ada': AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
              'knn': KNeighborsClassifier(),
              'gb': GradientBoostingClassifier()
              #'xgb': xgb.XGBClassifier()
             }
    
    #pipeline to include SMOTE since this is an unbalanced dataset 
    pipe = imbpipeline([
        ('sm', SMOTE(random_state=42)), 
        (model, models[model]),
            ])
    
    pipe_params = pipe_params
    
    #GriSesarchCV on user assigned parameters
    gridsearch = GridSearchCV(pipe, pipe_params, cv=gridcv, scoring='roc_auc', verbose=1, n_jobs=-1)
    
    gridsearch.fit(Xsc_train, y_train)
    
    #predict on test 
    preds = gridsearch.predict(Xsc_test)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

    model_probs_train = gridsearch.predict_proba(Xsc_train)[:, 1]
    model_probs_test = gridsearch.predict_proba(Xsc_test)[:, 1]
    roc_auc = cross_val_score(pipe, Xsc_train, y_train, scoring='roc_auc', cv=gridcv)
    best_params = gridsearch.best_params_
    
    #Metrics to produce
    summary ={
        'model': model,
        'accuracy': accuracy_score(y_test, preds),
        'specificity': tn / (tn+fp),
        'recall': recall_score(y_test, preds),
        'precision': precision_score(y_test, preds),
        'f1_score': f1_score(y_test, preds),
        'AUC_CV': roc_auc.mean(),
        'AUC_train': roc_auc_score(y_train, model_probs_train),
        'AUC_test': roc_auc_score(y_test, model_probs_test),
        'best params': best_params}
    
    summary_df = pd.DataFrame.from_dict(summary, orient='Index', columns=[str(model)])

    return summary_df

## Modelling

In [108]:
#Train/Test(aka Validation) split on Train dataset 

X = train_weather.drop(columns=['WnvPresent'])
y = train_weather['WnvPresent'] 
    
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

metrics = pd.DataFrame(index=[0])

In [166]:
X_full_train = train_weather.drop(columns=['WnvPresent'])
X_full_test = test_weather

In [None]:
#Scale of X
#Fit_transform X_train, and transform X_test (val), X_full_train, X_full_test

ss = StandardScaler()
Xsc_train = ss.fit_transform(X_train)
Xsc_test = ss.transform(X_test)
Xsc_full_train = ss.transform(X_full_train)
Xsc_full_test = ss.transform(X_full_test)

### Baseline 

For the baseline model, we run it a logistic regression model with no parameter tuning. It produced a ROC_AUC test score of 0.81. 

In [150]:
get_model('lr', 5, pipe_params={})

Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,lr
model,lr
accuracy,0.73714
specificity,0.735661
recall,0.763158
precision,0.141005
f1_score,0.23803
AUC_CV,0.833047
AUC_train,0.844657
AUC_test,0.817988
best params,{}


### Logistic Regression

In [154]:
%%time

lr_results = get_model('lr', 5, pipe_params={
    'lr__C' : np.logspace(0.01, 1, 10),
    'lr__solver': ['sag','saga', 'liblinear'],    
    'lr__max_iter': [1000, 5000, 10000]
})

lr_results

Fitting 5 folds for each of 90 candidates, totalling 450 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

CPU times: user 5.42 s, sys: 249 ms, total: 5.67 s
Wall time: 4min 29s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,lr
model,lr
accuracy,0.73714
specificity,0.735661
recall,0.763158
precision,0.141005
f1_score,0.23803
AUC_CV,0.833047
AUC_train,0.844653
AUC_test,0.817984
best params,"{'lr__C': 1.023292992280754, 'lr__max_iter': 1000, 'lr__solver': 'saga'}"


### Random Forest

In [151]:
%%time

rf_results = get_model('rf', 5, pipe_params={
    'rf__n_estimators': [100, 150],
    'rf__min_samples_split': [2, 3, 5],
    'rf__min_samples_leaf': [2, 5, 10]
})

rf_results

Fitting 5 folds for each of 18 candidates, totalling 90 fits
CPU times: user 8.7 s, sys: 101 ms, total: 8.8 s
Wall time: 25.2 s


Unnamed: 0,rf
model,rf
accuracy,0.880604
specificity,0.906733
recall,0.421053
precision,0.204255
f1_score,0.275072
AUC_CV,0.756269
AUC_train,0.926258
AUC_test,0.824266
best params,"{'rf__min_samples_leaf': 10, 'rf__min_samples_split': 3, 'rf__n_estimators': 100}"


### K Nearest Neighbors

In [153]:
%%time

knn_results = get_model('knn', 5, pipe_params={
    'knn__leaf_size': [10, 20, 30],
    'knn__n_neighbors': [10, 25, 50, 75],
    'knn__p': [1, 2]
})

knn_results

Fitting 5 folds for each of 24 candidates, totalling 120 fits
CPU times: user 8.94 s, sys: 1.36 s, total: 10.3 s
Wall time: 16.3 s


Unnamed: 0,knn
model,knn
accuracy,0.746579
specificity,0.74813
recall,0.719298
precision,0.139693
f1_score,0.233951
AUC_CV,0.729757
AUC_train,0.904566
AUC_test,0.784709
best params,"{'knn__leaf_size': 10, 'knn__n_neighbors': 50, 'knn__p': 1}"


### GradientBoost

In [171]:
%%time

gb_results = get_model('gb', 5, pipe_params={
    'gb__max_depth': [2,5,8],
    'gb__n_estimators': [100, 125, 150],
    'gb__learning_rate': [.05, .1, .15],
    'gb__min_samples_leaf':[1,2,3],
    'gb__min_samples_split':[2,3,5],
})

gb_results

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
CPU times: user 18.9 s, sys: 546 ms, total: 19.5 s
Wall time: 16min 52s


Unnamed: 0,gb
model,gb
accuracy,0.90656
specificity,0.935162
recall,0.403509
precision,0.261364
f1_score,0.317241
AUC_CV,0.837507
AUC_train,0.925149
AUC_test,0.837102
best params,"{'gb__learning_rate': 0.1, 'gb__max_depth': 5, 'gb__min_samples_leaf': 1, 'gb__min_samples_split': 2, 'gb__n_estimators': 125}"


### AdaBoost

In [None]:
# %%time

# ada_results = get_model('ada', 5, pipe_params={
#     'sm__k_neighbors': [20, 50, 80], 
#     'ada__n_estimators': [30, 50, 80],
#     'ada__learning_rate': [0.1, 0.3, 0.7, 1],
#     'ada__base_estimator__max_depth': [1, 2] 
# })

# ada_results

### XGBoost

In [None]:
# xgb_results = get_model('xgb', 5, pipe_params={
#     'xgb__eval_metric' : ['auc'],
#     'xgb__use_label_encoder' : [False],
#     'xgb__subsample' : [1], 
#     'xgb__colsample_bytree' : [0.5, 0.75, 1],  
#     'xgb__learning_rate' : [0.1, 0.17], 
#     'xgb__min_child_weight': [1, 3], 
#     'xgb__max_depth' : [3, 5],  
#     'xgb__n_estimators' : [100, 300, 500], 
#     'xgb__reg_alpha' : [1, 5, 10], 
#     'xgb__reg_lambda' : [10, 35, 50], 
#     'xgb__gamma' : [0,0.05, 0.1], 
#     'xgb__objective' : ['binary:logistic']
# })

# xgb_results

## Summary

In [172]:
overall_df = pd.concat([lr_results, rf_results, knn_results, gb_results], axis=1).T
overall_df.sort_values(ascending=False, by='AUC_test').drop(columns=['model'])

Unnamed: 0,accuracy,specificity,recall,precision,f1_score,AUC_CV,AUC_train,AUC_test,best params
gb,0.90656,0.935162,0.403509,0.261364,0.317241,0.837507,0.925149,0.837102,"{'gb__learning_rate': 0.1, 'gb__max_depth': 5, 'gb__min_samples_leaf': 1, 'gb__min_samples_split': 2, 'gb__n_estimators': 125}"
rf,0.880604,0.906733,0.421053,0.204255,0.275072,0.756269,0.926258,0.824266,"{'rf__min_samples_leaf': 10, 'rf__min_samples_split': 3, 'rf__n_estimators': 100}"
lr,0.73714,0.735661,0.763158,0.141005,0.23803,0.833047,0.844653,0.817984,"{'lr__C': 1.023292992280754, 'lr__max_iter': 1000, 'lr__solver': 'saga'}"
knn,0.746579,0.74813,0.719298,0.139693,0.233951,0.729757,0.904566,0.784709,"{'knn__leaf_size': 10, 'knn__n_neighbors': 50, 'knn__p': 1}"


### ROC-AUC curve

### Confusion Matrix

### Prediction vs True

### Coefficient Analysis

## Kaggle Model

In [163]:
#Train/Test split on full dataset

X_full_train = train_weather.drop(columns=['WnvPresent'])
X_full_test = test_weather.drop(columns=['Id'])
y_full_train = train_weather['WnvPresent']


In [15]:
%%time

ss = StandardScaler()
X_full_train_sc = ss.fit_transform(X_full_train)
X_full_test_sc = ss.transform(X_full_test)

#Pipeline
pipe = imbpipeline([
                ('sm', SMOTE(random_state=42)), 
                ('lr', LogisticRegression()),
            ])

pipe_params = {
    'lr__C' : [1.023292992280754],
    'lr__solver' : ['saga'],
    'lr__max_iter' : [1000]
}
    
gridsearch = GridSearchCV(pipe, pipe_params, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)

#Fit model
gridsearch.fit(X_full_train_sc, y_full_train)
    
preds = gridsearch.predict_proba(X_full_test_sc)


model_probs_train = gridsearch.predict_proba(X_full_train_sc)[:, 1]
model_probs_test = gridsearch.predict_proba(X_full_test_sc)[:, 1]
roc_auc = cross_val_score(pipe, X_full_train_sc, y_full_train, scoring='roc_auc', cv=5)
    
print('model', 'lr')
print('AUC_CV', roc_auc.mean())
print('AUC_train', roc_auc_score(y_full_train, model_probs_train))
print('best_params', gridsearch.best_params_)


Fitting 5 folds for each of 60 candidates, totalling 300 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

model lr
AUC_CV 0.7573432061608809
AUC_train 0.8393473819573356
best_params {'lr__C': 1.023292992280754, 'lr__max_iter': 1000, 'lr__solver': 'saga'}
0.7575306848641691
CPU times: user 7.16 s, sys: 208 ms, total: 7.37 s
Wall time: 3min 57s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [284]:
# pipe = imbpipeline([
#                 ('ss', StandardScaler()),
#                 ('sm', SMOTE(random_state=42)), 
#                 ('rf', RandomForestClassifier()),
#             ])

# pipe_params = {
#     'rf__C': np.logspace(0.01, 1, 5),
#     'rf__min_samples_split': [2, 3],
#     'rf__min_samples_leaf': [2, 5, 10]
# }
    
# gridsearch = GridSearchCV(pipe, pipe_params, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)
    
# gridsearch.fit(X_full_train, y_full_train)
    
# preds = gridsearch.predict_proba(X_full_test)


# model_probs_train = gridsearch.predict_proba(X_full_train)[:, 1]
# model_probs_test = gridsearch.predict_proba(X_full_test)[:, 1]
# roc_auc = cross_val_score(pipe, X_full_train, y_full_train, scoring='roc_auc', cv=5)
    
# print( 'model', 'ada')
# print('AUC_CV', roc_auc.mean())
# print('AUC_train', roc_auc_score(y_full_train, model_probs_train))
# print('best_params', gridsearch.best_params_)

# print(gridsearch.best_score_)


Fitting 5 folds for each of 2 candidates, totalling 10 fits
model ada
AUC_CV 0.41853426857811193
AUC_train 0.920408157138779
best_params {'rf__min_samples_leaf': 10, 'rf__min_samples_split': 2, 'rf__n_estimators': 150}
0.5626193360540922


In [5]:
ss = StandardScaler()
X_full_train_sc = ss.fit_transform(X_full_train)
X_full_test_sc = ss.transform(X_full_test)

pipe = imbpipeline([
                ('sm', SMOTE(random_state=42)), 
                ('gb', GradientBoostingClassifier()),
            ])

pipe_params = {
    'gb__max_depth': [2,5,8],
    'gb__n_estimators': [100, 125, 150],
    'gb__learning_rate': [.05, .1, .15],
    'gb__min_samples_leaf':[1,2,3],
    'gb__min_samples_split':[2,3,5],
}
    
gridsearch = GridSearchCV(pipe, pipe_params, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)
    
gridsearch.fit(X_full_train_sc, y_full_train)
    
preds = gridsearch.predict_proba(X_full_test_sc)


model_probs_train = gridsearch.predict_proba(X_full_train_sc)[:, 1]
model_probs_test = gridsearch.predict_proba(X_full_test_sc)[:, 1]
roc_auc = cross_val_score(pipe, X_full_train_sc, y_full_train, scoring='roc_auc', cv=5)
    
print('model', 'lr')
print('AUC_CV', roc_auc.mean())
print('AUC_train', roc_auc_score(y_full_train, model_probs_train))
print('best_params', gridsearch.best_params_)

print(gridsearch.best_score_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
model lr
AUC_CV 0.6213272850459075
AUC_train 0.8328218565121257
best_params {'gb__learning_rate': 0.05, 'gb__max_depth': 2, 'gb__min_samples_leaf': 1, 'gb__min_samples_split': 2, 'gb__n_estimators': 100}
0.671607943391864


In [6]:
predict_y=pd.DataFrame(preds[:,1])
predict_y =predict_y.rename(columns={0:'WnvPresent'})

In [8]:
submission = pd.read_csv(r'../datasets/sampleSubmission.csv')
submission['WnvPresent'] = predict_y['WnvPresent']

In [10]:
submission.to_csv(r'../datasets/sampleSubmission_v12.csv', index=False)

In [21]:
submission

Unnamed: 0,Id,WnvPresent
0,1,0.132141
1,2,0.051930
2,3,0.057630
3,4,0.000066
4,5,0.000071
...,...,...
116288,116289,0.000004
116289,116290,0.000004
116290,116291,0.000005
116291,116292,0.001593
