In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import geodesic

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from imblearn.over_sampling import SMOTENC, SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

from sklearn.metrics import (
    confusion_matrix, 
    plot_confusion_matrix,
    accuracy_score,
    plot_roc_curve,
    roc_auc_score,
    recall_score,
    precision_score,
    f1_score
)

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [255]:
train_weather = pd.read_csv(r'../datasets/train_weather_v2.csv')
test_weather = pd.read_csv(r'../datasets/test_weather_v2.csv')

In [256]:
train_weather.shape

(8475, 29)

In [257]:
test_weather.shape

(116293, 29)

## Modelling

In [258]:
X = train_weather.drop(columns='WnvPresent')
y = train_weather['WnvPresent']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

metrics = pd.DataFrame(index=[0])

In [144]:
def get_model(model, gridcv, pipe_params={}):
    
    models = {'lr': LogisticRegression(),
              'nb': MultinomialNB(),
              'rf': RandomForestClassifier(),
              'et': ExtraTreesClassifier(),
              'ada': AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
              'knn': KNeighborsClassifier()
             }
    
    if model == 'nb':
            pipe = imbpipeline([
                ('sm', SMOTE(random_state=42)), 
                (model, models[model]),
            ])
    else:
            pipe = imbpipeline([
                ('ss', StandardScaler()),
                ('sm', SMOTE(random_state=42)), 
                (model, models[model]),
            ])
    
    pipe_params = pipe_params
    
    gridsearch = GridSearchCV(pipe, pipe_params, cv=gridcv, scoring='roc_auc', verbose=1)
    
    gridsearch.fit(X_train, y_train)
    
    preds = gridsearch.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

    model_probs_train = gridsearch.predict_proba(X_train)[:, 1]
    model_probs_test = gridsearch.predict_proba(X_test)[:, 1]
    roc_auc = cross_val_score(pipe, X_train, y_train, scoring='roc_auc', cv=gridcv)
    
    metrics['model'] = models[model]
    metrics['accuracy'] = accuracy_score(y_test, preds)
    metrics['specificity'] = tn / (tn+fp)
    metrics['recall'] = recall_score(y_test, preds)
    metrics['precision'] = precision_score(y_test, preds)
    metrics['AUC_CV'] = roc_auc.mean()
    metrics['AUC_train'] = roc_auc_score(y_train, model_probs_train)
    metrics['AUC_test'] = roc_auc_score(y_test, model_probs_test)
    metrics['f1_score'] = f1_score(y_test, preds)

    print(gridsearch.best_params_)
    print(gridsearch.best_score_)
    return metrics

## Baseline 

In [220]:
get_model('lr', 5, pipe_params={})

Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{}
0.8327291889387476


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,LogisticRegression(),0.743747,0.742643,0.763158,0.144279,0.832729,0.844461,0.816194,0.242678


## Random Forest

In [274]:
%%time

get_model('rf', 5, pipe_params={
    'rf__n_estimators': [100, 150],
    'rf__min_samples_split': [2, 3],
    'rf__min_samples_leaf': [2, 5, 10]
})

Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'rf__min_samples_leaf': 10, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}
0.8344685226910095
CPU times: user 1min 44s, sys: 1.1 s, total: 1min 45s
Wall time: 1min 7s


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,RandomForestClassifier(),0.885323,0.910723,0.438596,0.218341,0.760445,0.926043,0.819012,0.291545


## K Nearest Neighbors

In [8]:
%%time

get_model('knn', 5, pipe_params={
    'knn__leaf_size': [10, 20, 30],
    'knn__n_neighbors': [10, 25, 50, 75],
    'knn__p': [1, 2]
})

Fitting 5 folds for each of 24 candidates, totalling 120 fits
{'knn__leaf_size': 10, 'knn__n_neighbors': 50, 'knn__p': 1}
0.8056296123231377
CPU times: user 1min 55s, sys: 14.8 s, total: 2min 10s
Wall time: 49.7 s


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,KNeighborsClassifier(),0.772534,0.775561,0.719298,0.154135,0.732373,0.904529,0.815293,0.25387


- no need df condition
- add print best_params
- rename metrics['model']

In [74]:
test_weather

Unnamed: 0,Id,Tavg,DewPoint,PrecipTotal,StnPressure,AvgSpeed,Rain,Mist,Dust,CULEX PIPIENS,...,CULEX TARSALIS,CULEX TERRITANS,low_risk,medium_risk,high_risk,very_high_risk,mth_7,mth_8,mth_9,mth_10
0,1,74,56,0.00,29.28,10.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2,74,56,0.00,29.28,10.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,74,56,0.00,29.28,10.0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,4,74,56,0.00,29.28,10.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,74,56,0.00,29.28,10.0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116288,116289,71,63,0.72,29.10,7.9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
116289,116290,71,63,0.72,29.10,7.9,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
116290,116291,71,63,0.72,29.10,7.9,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
116291,116292,71,63,0.72,29.10,7.9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [157]:
%%time

get_model('lr', 5, pipe_params={
    'lr__C' : np.logspace(0.01, 1, 5),
     'lr__solver' : ['saga', 'liblinear'],
    'lr__max_iter' : [2000, 10000]
})

Fitting 5 folds for each of 30 candidates, totalling 150 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'lr__C': 1.023292992280754, 'lr__max_iter': 2000, 'lr__solver': 'saga'}
0.8351383296818078
CPU times: user 4min 41s, sys: 1.87 s, total: 4min 42s
Wall time: 3min 30s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,model,accuracy,specificity,recall,precision,AUC_CV,AUC_train,AUC_test,f1_score
0,LogisticRegression(),0.738556,0.73616,0.780702,0.144013,0.835027,0.844238,0.82382,0.243169


In [265]:
X_full_test.isnull().sum()

Tavg                          0
DewPoint                      0
PrecipTotal                   0
StnPressure                   0
AvgSpeed                      0
Rain                          0
Mist                          0
Dust                          0
CULEX PIPIENS                 0
CULEX PIPIENS/RESTUANS        0
CULEX RESTUANS                0
CULEX SALINARIUS              0
CULEX TARSALIS                0
CULEX TERRITANS               0
low_risk                      0
medium_risk                   0
very_high_risk                0
very_low_risk                 0
DewPoint very_high_risk       0
Tavg very_high_risk           0
DewPoint CULEX PIPIENS        0
StnPressure very_high_risk    0
DewPoint StnPressure          0
Tavg DewPoint                 0
mth_7                         0
mth_8                         0
mth_9                         0
mth_10                        0
dtype: int64

In [283]:
X_full_train = train_weather.drop(columns=['WnvPresent'])
X_full_test = test_weather.drop(columns=['Id'])
y_full_train = train_weather['WnvPresent']

In [270]:
pipe = imbpipeline([
                ('ss', StandardScaler()),
                ('sm', SMOTE(random_state=42)), 
                ('ada', AdaBoostClassifier()),
            ])

In [271]:
pipe.get_params()

{'memory': None,
 'steps': [('ss', StandardScaler()),
  ('sm', SMOTE(random_state=42)),
  ('ada', AdaBoostClassifier())],
 'verbose': False,
 'ss': StandardScaler(),
 'sm': SMOTE(random_state=42),
 'ada': AdaBoostClassifier(),
 'ss__copy': True,
 'ss__with_mean': True,
 'ss__with_std': True,
 'sm__k_neighbors': 5,
 'sm__n_jobs': None,
 'sm__random_state': 42,
 'sm__sampling_strategy': 'auto',
 'ada__algorithm': 'SAMME.R',
 'ada__base_estimator': None,
 'ada__learning_rate': 1.0,
 'ada__n_estimators': 50,
 'ada__random_state': None}

In [284]:
pipe = imbpipeline([
                ('ss', StandardScaler()),
                ('sm', SMOTE(random_state=42)), 
                ('rf', RandomForestClassifier()),
            ])

pipe_params = {
    'rf__n_estimators': [100,150],
    'rf__min_samples_split': [2],
    'rf__min_samples_leaf': [10]
}
    
gridsearch = GridSearchCV(pipe, pipe_params, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)
    
gridsearch.fit(X_full_train, y_full_train)
    
preds = gridsearch.predict_proba(X_full_test)


model_probs_train = gridsearch.predict_proba(X_full_train)[:, 1]
model_probs_test = gridsearch.predict_proba(X_full_test)[:, 1]
roc_auc = cross_val_score(pipe, X_full_train, y_full_train, scoring='roc_auc', cv=5)
    
print( 'model', 'ada')
print('AUC_CV', roc_auc.mean())
print('AUC_train', roc_auc_score(y_full_train, model_probs_train))
print('best_params', gridsearch.best_params_)

print(gridsearch.best_score_)


Fitting 5 folds for each of 2 candidates, totalling 10 fits
model ada
AUC_CV 0.41853426857811193
AUC_train 0.920408157138779
best_params {'rf__min_samples_leaf': 10, 'rf__min_samples_split': 2, 'rf__n_estimators': 150}
0.5626193360540922


In [285]:
predict_y=pd.DataFrame(preds[:,1])

In [286]:
predict_y =predict_y.rename(columns={0:'WnvPresent'})

In [287]:
submission = pd.read_csv(r'../datasets/sampleSubmission.csv')

In [288]:
submission['WnvPresent'] = predict_y['WnvPresent']

In [289]:
submission.to_csv(r'../datasets/sampleSubmission_v8.csv', index=False)

In [290]:
submission

Unnamed: 0,Id,WnvPresent
0,1,0.017146
1,2,0.018111
2,3,0.043178
3,4,0.024364
4,5,0.024364
...,...,...
116288,116289,0.028237
116289,116290,0.027724
116290,116291,0.033462
116291,116292,0.033462


In [161]:
test_weather.isnull().sum()

Id                            0
Tavg                          0
DewPoint                      0
PrecipTotal                   0
StnPressure                   0
AvgSpeed                      0
Rain                          0
Mist                          0
Dust                          0
CULEX PIPIENS                 0
CULEX PIPIENS/RESTUANS        0
CULEX RESTUANS                0
CULEX SALINARIUS              0
CULEX TARSALIS                0
CULEX TERRITANS               0
low_risk                      0
medium_risk                   0
very_high_risk                0
very_low_risk                 0
DewPoint very_high_risk       0
Tavg very_high_risk           0
DewPoint CULEX PIPIENS        0
StnPressure very_high_risk    0
DewPoint StnPressure          0
Tavg DewPoint                 0
mth_7                         0
mth_8                         0
mth_9                         0
mth_10                        0
dtype: int64