**Project 4** - Group 2 (Emma, Gabriel, Ben, Junhao)

<br>

<font size="6">Part 2: Modelling

## Load Libraries / Import Datasets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import geodesic

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

from sklearn.metrics import (
    confusion_matrix, 
    plot_confusion_matrix,
    accuracy_score,
    plot_roc_curve,
    roc_auc_score,
    recall_score,
    precision_score,
    f1_score
)

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [12]:
train_weather = pd.read_csv(r'../datasets/train_weather_v3.csv')
test_weather = pd.read_csv(r'../datasets/test_weather_v3.csv')
submission = pd.read_csv(r'../datasets/sampleSubmission.csv')

The cleaned test dataset has ~13 times more entries than the train dataset.

In [3]:
train_weather.shape

(8475, 29)

In [4]:
test_weather.shape

(116293, 29)

## Functions

In [22]:
def get_model(model, gridcv, pipe_params={}):
    
    ss = StandardScaler()
    Xsc_train = ss.fit_transform(X_train)
    Xsc_test = ss.transform(X_test)
    
    models = {'lr': LogisticRegression(),
              'nb': MultinomialNB(),
              'rf': RandomForestClassifier(),
              'et': ExtraTreesClassifier(),
              'ada': AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
              'knn': KNeighborsClassifier()
             }
    
    pipe = imbpipeline([
        ('sm', SMOTE(random_state=42)), 
        (model, models[model]),
            ])
    
    pipe_params = pipe_params
    
    gridsearch = GridSearchCV(pipe, pipe_params, cv=gridcv, scoring='roc_auc', verbose=1)
    
    gridsearch.fit(Xsc_train, y_train)
    
    preds = gridsearch.predict(Xsc_test)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

    model_probs_train = gridsearch.predict_proba(Xsc_train)[:, 1]
    model_probs_test = gridsearch.predict_proba(Xsc_test)[:, 1]
    roc_auc = cross_val_score(pipe, Xsc_train, y_train, scoring='roc_auc', cv=gridcv)
    
    metrics['model'] = models[model]
    metrics['accuracy'] = accuracy_score(y_test, preds)
    metrics['specificity'] = tn / (tn+fp)
    metrics['recall'] = recall_score(y_test, preds)
    metrics['precision'] = precision_score(y_test, preds)
    metrics['AUC_CV'] = roc_auc.mean()
    metrics['AUC_train'] = roc_auc_score(y_train, model_probs_train)
    metrics['AUC_test'] = roc_auc_score(y_test, model_probs_test)
    metrics['f1_score'] = f1_score(y_test, preds)

    print(gridsearch.best_params_)
    return metrics

In [19]:
# def get_model(model, gridcv, pipe_params={}):
    
#     ss = StandardScaler()
#     Xsc_train = ss.fit_transform(X_train)
#     Xsc_test = ss.transform(X_test)
    
#     models = {'lr': LogisticRegression(),
#               'nb': MultinomialNB(),
#               'rf': RandomForestClassifier(),
#               'et': ExtraTreesClassifier(),
#               'ada': AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
#              }
   
#     pipe = imbpipeline([
#         ('sm', SMOTE(random_state=42)), 
#         (model, models[model]),
#     ])
    
#     pipe_params = pipe_params
    
#     gridsearch = GridSearchCV(pipe, pipe_params, cv=gridcv, scoring='roc_auc', verbose=1, n_jobs=-1)
    
#     gridsearch.fit(Xsc_train, y_train)
    
#     preds = gridsearch.predict(Xsc_test)
#     tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

#     model_probs_train = gridsearch.predict_proba(Xsc_train)[:, 1]
#     model_probs_test = gridsearch.predict_proba(Xsc_test)[:, 1]
#     roc_auc = cross_val_score(pipe, Xsc_train, y_train, scoring='roc_auc', cv=gridcv)
    
#     metrics['model'] = model
#     metrics['accuracy'] = accuracy_score(y_test, preds)
#     metrics['specificity'] = tn / (tn+fp)
#     metrics['recall'] = recall_score(y_test, preds)
#     metrics['precision'] = precision_score(y_test, preds)
#     metrics['AUC_CV'] = roc_auc.mean()
#     metrics['AUC_train'] = roc_auc_score(y_train, model_probs_train)
#     metrics['AUC_test'] = roc_auc_score(y_test, model_probs_test)
#     metrics['f1_score'] = f1_score(y_test, preds)
    
#     print(gridsearch.best_params_)
    
#     return metrics, gridsearch

## Modelling

In [13]:
#Train/Test(aka Validation) split on Train dataset 

X = train_weather.drop(columns='WnvPresent')
y = train_weather['WnvPresent']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

metrics = pd.DataFrame(index=[0])

### Baseline 

In [33]:
get_model('lr', 5, pipe_params={
    'lr__max_iter' : [1000, 10000]})

Fitting 5 folds for each of 2 candidates, totalling 10 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'lr__max_iter': 1000}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,LogisticRegression(),0.743747,0.743142,0.754386,0.143095,0.832781,0.844471,0.816369,0.240559


### Logistic Regression

In [34]:
get_model('lr', 5, pipe_params={
    'lr__C' : np.logspace(0.01, 1, 10),
    'lr__solver' : ['saga', 'liblinear'],
    'lr__max_iter' : [1000, 2000, 10000]})

Fitting 5 folds for each of 60 candidates, totalling 300 fits






KeyboardInterrupt: 

## Random Forest

In [274]:
%%time

get_model('rf', 5, pipe_params={
    'rf__n_estimators': [100, 150],
    'rf__min_samples_split': [2, 3],
    'rf__min_samples_leaf': [2, 5, 10]
})

Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'rf__min_samples_leaf': 10, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}
0.8344685226910095
CPU times: user 1min 44s, sys: 1.1 s, total: 1min 45s
Wall time: 1min 7s


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,RandomForestClassifier(),0.885323,0.910723,0.438596,0.218341,0.760445,0.926043,0.819012,0.291545


## K Nearest Neighbors

In [8]:
%%time

get_model('knn', 5, pipe_params={
    'knn__leaf_size': [10, 20, 30],
    'knn__n_neighbors': [10, 25, 50, 75],
    'knn__p': [1, 2]
})

Fitting 5 folds for each of 24 candidates, totalling 120 fits
{'knn__leaf_size': 10, 'knn__n_neighbors': 50, 'knn__p': 1}
0.8056296123231377
CPU times: user 1min 55s, sys: 14.8 s, total: 2min 10s
Wall time: 49.7 s


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,KNeighborsClassifier(),0.772534,0.775561,0.719298,0.154135,0.732373,0.904529,0.815293,0.25387


- no need df condition
- add print best_params
- rename metrics['model']

In [74]:
test_weather

Unnamed: 0,Id,Tavg,DewPoint,PrecipTotal,StnPressure,AvgSpeed,Rain,Mist,Dust,CULEX PIPIENS,...,CULEX TARSALIS,CULEX TERRITANS,low_risk,medium_risk,high_risk,very_high_risk,mth_7,mth_8,mth_9,mth_10
0,1,74,56,0.00,29.28,10.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2,74,56,0.00,29.28,10.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,74,56,0.00,29.28,10.0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,4,74,56,0.00,29.28,10.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,74,56,0.00,29.28,10.0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116288,116289,71,63,0.72,29.10,7.9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
116289,116290,71,63,0.72,29.10,7.9,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
116290,116291,71,63,0.72,29.10,7.9,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
116291,116292,71,63,0.72,29.10,7.9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [36]:
%%time

get_model('lr', 5, pipe_params={
    'lr__C' : np.logspace(0.01, 1, 10),
    'lr__max_iter' : [2000, 10000]
})

Fitting 5 folds for each of 20 candidates, totalling 100 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'lr__C': 1.023292992280754, 'lr__max_iter': 2000}
CPU times: user 1min 21s, sys: 1.83 s, total: 1min 23s
Wall time: 22.7 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,LogisticRegression(),0.743747,0.743142,0.754386,0.143095,0.832781,0.844488,0.816365,0.240559


In [265]:
X_full_test.isnull().sum()

Tavg                          0
DewPoint                      0
PrecipTotal                   0
StnPressure                   0
AvgSpeed                      0
Rain                          0
Mist                          0
Dust                          0
CULEX PIPIENS                 0
CULEX PIPIENS/RESTUANS        0
CULEX RESTUANS                0
CULEX SALINARIUS              0
CULEX TARSALIS                0
CULEX TERRITANS               0
low_risk                      0
medium_risk                   0
very_high_risk                0
very_low_risk                 0
DewPoint very_high_risk       0
Tavg very_high_risk           0
DewPoint CULEX PIPIENS        0
StnPressure very_high_risk    0
DewPoint StnPressure          0
Tavg DewPoint                 0
mth_7                         0
mth_8                         0
mth_9                         0
mth_10                        0
dtype: int64

In [14]:
X_full_train = train_weather.drop(columns=['WnvPresent'])
X_full_test = test_weather.drop(columns=['Id'])
y_full_train = train_weather['WnvPresent']

In [284]:
# pipe = imbpipeline([
#                 ('ss', StandardScaler()),
#                 ('sm', SMOTE(random_state=42)), 
#                 ('rf', RandomForestClassifier()),
#             ])

# pipe_params = {
#     'rf__C': np.logspace(0.01, 1, 5),
#     'rf__min_samples_split': [2, 3],
#     'rf__min_samples_leaf': [2, 5, 10]
# }
    
# gridsearch = GridSearchCV(pipe, pipe_params, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)
    
# gridsearch.fit(X_full_train, y_full_train)
    
# preds = gridsearch.predict_proba(X_full_test)


# model_probs_train = gridsearch.predict_proba(X_full_train)[:, 1]
# model_probs_test = gridsearch.predict_proba(X_full_test)[:, 1]
# roc_auc = cross_val_score(pipe, X_full_train, y_full_train, scoring='roc_auc', cv=5)
    
# print( 'model', 'ada')
# print('AUC_CV', roc_auc.mean())
# print('AUC_train', roc_auc_score(y_full_train, model_probs_train))
# print('best_params', gridsearch.best_params_)

# print(gridsearch.best_score_)


Fitting 5 folds for each of 2 candidates, totalling 10 fits
model ada
AUC_CV 0.41853426857811193
AUC_train 0.920408157138779
best_params {'rf__min_samples_leaf': 10, 'rf__min_samples_split': 2, 'rf__n_estimators': 150}
0.5626193360540922


In [15]:
%%time

ss = StandardScaler()
X_full_train_sc = ss.fit_transform(X_full_train)
X_full_test_sc = ss.transform(X_full_test)

pipe = imbpipeline([
                ('sm', SMOTE(random_state=42)), 
                ('lr', LogisticRegression()),
            ])

pipe_params = {
    'lr__C' : np.logspace(0.01, 1, 10),
    'lr__solver' : ['saga', 'liblinear'],
    'lr__max_iter' : [1000, 2000, 10000]
}
    
gridsearch = GridSearchCV(pipe, pipe_params, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)
    
gridsearch.fit(X_full_train_sc, y_full_train)
    
preds = gridsearch.predict_proba(X_full_test_sc)


model_probs_train = gridsearch.predict_proba(X_full_train_sc)[:, 1]
model_probs_test = gridsearch.predict_proba(X_full_test_sc)[:, 1]
roc_auc = cross_val_score(pipe, X_full_train_sc, y_full_train, scoring='roc_auc', cv=5)
    
print('model', 'lr')
print('AUC_CV', roc_auc.mean())
print('AUC_train', roc_auc_score(y_full_train, model_probs_train))
print('best_params', gridsearch.best_params_)

print(gridsearch.best_score_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

model lr
AUC_CV 0.7573432061608809
AUC_train 0.8393473819573356
best_params {'lr__C': 1.023292992280754, 'lr__max_iter': 1000, 'lr__solver': 'saga'}
0.7575306848641691
CPU times: user 7.16 s, sys: 208 ms, total: 7.37 s
Wall time: 3min 57s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [59]:
ss = StandardScaler()
X_full_train_sc = ss.fit_transform(X_full_train)
X_full_test_sc = ss.transform(X_full_test)

pipe = imbpipeline([
                ('sm', SMOTE(random_state=42)), 
                ('gb', GradientBoostingClassifier()),
            ])

pipe_params = {
    'gb__max_depth': [2,5,8],
    'gb__n_estimators': [100, 125, 150],
    'gb__learning_rate': [.05, .1, .15],
    'gb__min_samples_leaf':[1,2,3],
    'gb__min_samples_split':[2,3,5],
}
    
gridsearch = GridSearchCV(pipe, pipe_params, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)
    
gridsearch.fit(X_full_train_sc, y_full_train)
    
preds = gridsearch.predict_proba(X_full_test_sc)


model_probs_train = gridsearch.predict_proba(X_full_train_sc)[:, 1]
model_probs_test = gridsearch.predict_proba(X_full_test_sc)[:, 1]
roc_auc = cross_val_score(pipe, X_full_train_sc, y_full_train, scoring='roc_auc', cv=5)
    
print('model', 'lr')
print('AUC_CV', roc_auc.mean())
print('AUC_train', roc_auc_score(y_full_train, model_probs_train))
print('best_params', gridsearch.best_params_)

print(gridsearch.best_score_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
model lr
AUC_CV 0.6234053827546482
AUC_train 0.8343595618829189
best_params {'gb__learning_rate': 0.05, 'gb__max_depth': 2, 'gb__min_samples_leaf': 3, 'gb__min_samples_split': 2, 'gb__n_estimators': 100}
0.671379982888176


In [16]:
predict_y=pd.DataFrame(preds[:,1])

In [17]:
predict_y =predict_y.rename(columns={0:'WnvPresent'})

In [18]:
submission = pd.read_csv(r'../datasets/sampleSubmission.csv')

In [19]:
submission['WnvPresent'] = predict_y['WnvPresent']

In [20]:
submission.to_csv(r'../datasets/sampleSubmission_v11.csv', index=False)

In [21]:
submission

Unnamed: 0,Id,WnvPresent
0,1,0.132141
1,2,0.051930
2,3,0.057630
3,4,0.000066
4,5,0.000071
...,...,...
116288,116289,0.000004
116289,116290,0.000004
116290,116291,0.000005
116291,116292,0.001593


In [161]:
test_weather.isnull().sum()

Id                            0
Tavg                          0
DewPoint                      0
PrecipTotal                   0
StnPressure                   0
AvgSpeed                      0
Rain                          0
Mist                          0
Dust                          0
CULEX PIPIENS                 0
CULEX PIPIENS/RESTUANS        0
CULEX RESTUANS                0
CULEX SALINARIUS              0
CULEX TARSALIS                0
CULEX TERRITANS               0
low_risk                      0
medium_risk                   0
very_high_risk                0
very_low_risk                 0
DewPoint very_high_risk       0
Tavg very_high_risk           0
DewPoint CULEX PIPIENS        0
StnPressure very_high_risk    0
DewPoint StnPressure          0
Tavg DewPoint                 0
mth_7                         0
mth_8                         0
mth_9                         0
mth_10                        0
dtype: int64

In [None]:
{
    'gb__max_depth': [4,5,6],
    'gb__n_estimators': [125],
    'gb__learning_rate': [.12, .14, .16],
    'gb__min_samples_leaf':[1,2,5,10],
    'gb__min_samples_split':[2,3],
}