In [139]:
%matplotlib inline
%config InlineBackend.figure_formats = ['retina']

import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interactive, FloatSlider

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, log_loss, roc_curve, auc
from sklearn import svm

from xgboost import XGBClassifier, plot_importance

from imblearn.over_sampling import RandomOverSampler



ModuleNotFoundError: No module named 'imblearn'

In [13]:
def scores(ytest, predict):
    print("Recall:  " + str(recall_score(ytest, predict)))
    print("Accuracy:  " + str(accuracy_score(ytest, predict)))
    print()
    print(confusion_matrix(ytest, predict))

In [2]:
with open('sj_df.pickle', 'rb') as read_file:
    sj_df = pickle.load(read_file)
    
with open('iq_df.pickle', 'rb') as read_file:
    iq_df = pickle.load(read_file)

In [3]:
sj_df['isFallorWinter'] = np.where(sj_df['weekofyear'] >= 35, 1, 0)

In [116]:
iq_df['q*Td'] = iq_df['reanalysis_specific_humidity_g_per_kg'] * iq_df['reanalysis_dew_point_temp_k']

In [4]:
sj_df.head()

Unnamed: 0,outbreak,city,year,weekofyear,total_cases,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,...,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,isFallorWinter
0,0,sj,1990,18,4,1990-04-30,0.1226,0.103725,0.198483,0.177617,...,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,0
1,0,sj,1990,19,5,1990-05-07,0.1699,0.142175,0.162357,0.155486,...,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,0
2,0,sj,1990,20,4,1990-05-14,0.03225,0.172967,0.1572,0.170843,...,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,0
3,0,sj,1990,21,3,1990-05-21,0.128633,0.245067,0.227557,0.235886,...,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,0
4,0,sj,1990,22,6,1990-05-28,0.1962,0.2622,0.2512,0.24734,...,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,0


In [6]:
sj_df.outbreak.value_counts()

0    495
1    441
Name: outbreak, dtype: int64

In [117]:
iq_df.head()

Unnamed: 0,outbreak,city,year,weekofyear,total_cases,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,...,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,q*Td
1008,0,iq,2001,47,1,2001-11-19,0.3297,0.330417,0.340643,0.441571,...,88.837143,65.89,18.451429,9.228571,27.575,11.575,33.7,20.8,31.0,5475.514147
1009,0,iq,2001,48,1,2001-11-26,0.3015,0.3613,0.305571,0.369917,...,87.377143,47.46,17.427143,8.014286,28.233333,11.666667,35.3,21.8,12.2,5155.148024
1010,0,iq,2001,49,1,2001-12-03,0.325057,0.226471,0.299257,0.350471,...,87.24,46.86,17.595714,8.871429,28.425,11.65,35.5,21.2,23.1,5208.004651
1011,0,iq,2001,50,2,2001-12-10,0.326033,0.235533,0.315571,0.306643,...,96.118571,138.22,19.305714,7.785714,27.84,10.96,35.0,21.0,75.0,5743.45
1012,0,iq,2001,51,4,2001-12-17,0.222943,0.224071,0.212814,0.184129,...,93.947143,43.95,18.965714,7.385714,27.075,9.5,32.9,21.0,57.9,5636.827037


In [7]:
iq_df.outbreak.value_counts()

0    376
1     72
Name: outbreak, dtype: int64

In [118]:
mm_scaler = MinMaxScaler()
sj_train, sj_validate, sj_test = np.split(sj_df.sample(frac=1), [int(.6*len(sj_df)), int(.8*len(sj_df))])

sj_X_train = sj_train.iloc[:,6:]
sj_y_train = sj_train.iloc[:,0]

sj_X_val = sj_validate.iloc[:,6:]
sj_y_val = sj_validate.iloc[:,0]

sj_X_test = sj_test.iloc[:,6:]
sj_y_test = sj_test.iloc[:,0]

sj_X_train_mm = mm_scaler.fit_transform(sj_X_train)
sj_X_val_mm = mm_scaler.fit_transform(sj_X_val)
sj_X_test_mm = mm_scaler.fit_transform(sj_X_test)

iq_train, iq_validate, iq_test = np.split(iq_df.sample(frac=1), [int(.6*len(iq_df)), int(.8*len(iq_df))])

iq_X_train = iq_train.iloc[:,6:]
iq_y_train = iq_train.iloc[:,0]

iq_X_val = iq_validate.iloc[:,6:]
iq_y_val = iq_validate.iloc[:,0]

iq_X_test = iq_test.iloc[:,6:]
iq_y_test = iq_test.iloc[:,0]

iq_X_train_mm = mm_scaler.fit_transform(iq_X_train)
iq_X_val_mm = mm_scaler.fit_transform(iq_X_val)
iq_X_test_mm = mm_scaler.fit_transform(iq_X_test)

In [64]:
sj_train.head()

Unnamed: 0,outbreak,city,year,weekofyear,total_cases,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,...,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,isFallorWinter
279,1,sj,1995,36,40,1995-09-10,0.1071,0.1836,0.18515,0.169957,...,78.145714,20.23,18.545714,3.7,29.571429,7.914286,34.4,23.9,20.4,1
1,0,sj,1990,19,5,1990-05-07,0.1699,0.142175,0.162357,0.155486,...,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,0
290,0,sj,1995,47,19,1995-11-26,0.0334,0.114867,0.240014,0.2549,...,78.141429,90.49,16.541429,2.228571,27.314286,7.314286,32.2,22.8,5.9,1
557,0,sj,2001,3,17,2001-01-15,0.067733,0.02165,0.18585,0.179686,...,71.941429,0.0,13.921429,3.414286,25.071429,8.157143,30.6,19.4,4.8,0
436,1,sj,1998,38,54,1998-09-17,0.06275,0.0401,0.155483,0.14148,...,82.332857,35.03,18.847143,2.185714,28.728571,6.985714,34.4,24.4,39.7,1


In [59]:
logit_sj = LogisticRegression(C=100, penalty='l2')
logit_sj_fit = logit_sj.fit(sj_X_train_mm, sj_y_train)
logit_sj_predict = logit_sj_fit.predict(sj_X_val_mm)



In [60]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
print(np.mean(cross_val_score(logit_sj, sj_X_train_mm, sj_y_train, cv=kf)))

0.645290771175727




In [61]:
scores(sj_y_val, logit_sj_predict)

Recall:  0.7674418604651163
Accuracy:  0.6203208556149733

[[50 51]
 [20 66]]


In [44]:
def make_confusion_matrix(model, X_test, y_test, threshold=0.5):
    # Predict class 1 if probability of being in class 1 is greater than threshold
    # (model.predict(X_test) does this automatically with a threshold of 0.5)
    y_predict = (model.predict_proba(X_test)[:, 1] >= threshold)
    fraud_confusion = confusion_matrix(y_test, y_predict)
    plt.figure(dpi=80)
    sns.heatmap(fraud_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='d',
           xticklabels=['healthy', 'outbreak'],
           yticklabels=['healthy', 'outbreak']);
    plt.xlabel('prediction')
    plt.ylabel('actual')

In [63]:
interactive(lambda threshold: make_confusion_matrix(logit_sj, sj_X_val_mm, sj_y_val, threshold), threshold=(0.0,1.0,0.02))

interactive(children=(FloatSlider(value=0.5, description='threshold', max=1.0, step=0.02), Output()), _dom_cla…

In [85]:
sj_predictions = logit_sj_fit.predict_proba(sj_X_test_mm)[:,1]
sj_predict_final = np.where(sj_predictions > 0.3, 1, 0)

In [86]:
scores(sj_y_test, sj_predict_final)

Recall:  0.8607594936708861
Accuracy:  0.601063829787234

[[45 64]
 [11 68]]


In [123]:
iq_X_train.reset_index().head()

Unnamed: 0,index,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,...,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,q*Td
0,1243,0.206514,0.117143,0.169271,0.170071,91.4,297.992857,299.114286,293.777143,307.7,...,80.385714,91.4,15.46,10.885714,27.466667,10.433333,34.0,21.7,83.0,4541.794629
1,1039,0.179171,0.2791,0.132614,0.213286,3.18,295.808571,296.907143,292.147143,305.3,...,82.167143,3.18,13.87,10.728571,26.15,10.8,32.6,18.6,7.1,4052.080871
2,1032,0.227257,0.178243,0.177386,0.210114,101.3,297.355714,298.435714,296.444286,304.4,...,95.22,101.3,18.024286,8.042857,27.5,11.525,34.0,21.3,25.4,5343.196504
3,1216,0.442429,0.348171,0.343071,0.473143,49.87,298.31,300.092857,296.895714,307.2,...,93.177143,49.87,18.582857,8.557143,27.6,10.15,34.5,21.5,139.5,5517.170645
4,1405,0.385914,0.290383,0.336429,0.292786,91.4,296.065714,296.992857,295.167143,306.2,...,95.385714,91.4,16.68,8.128571,27.6,10.766667,33.4,21.6,2.0,4923.387943


In [132]:
rf_random.best_params_

{'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 70, 'bootstrap': True}

In [134]:
rf_iq = RandomForestClassifier(n_estimators=400, min_samples_split=10, min_samples_leaf=4, 
                               max_features='auto', max_depth=70, bootstrap=True)
rf_iq_fit = rf_iq.fit(iq_X_train_mm, iq_y_train)
rf_iq_predict = rf_iq_fit.predict(iq_X_val_mm)

In [135]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
print(np.mean(cross_val_score(rf_iq, iq_X_train_mm, iq_y_train, cv=kf)))

0.820754716981132


In [136]:
scores(iq_y_val, rf_iq_predict)

Recall:  0.0
Accuracy:  0.8888888888888888

[[80  0]
 [10  0]]


In [137]:
interactive(lambda threshold: make_confusion_matrix(rf_iq, iq_X_val_mm, iq_y_val, threshold), threshold=(0.0,1.0,0.02))

interactive(children=(FloatSlider(value=0.5, description='threshold', max=1.0, step=0.02), Output()), _dom_cla…

In [138]:
iq_y_train.value_counts()

0    220
1     48
Name: outbreak, dtype: int64

In [128]:
rf = RandomForestClassifier()
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [129]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [130]:
rf_random.fit(iq_X_train_mm, iq_y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   51.2s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

{'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 70, 'bootstrap': True}