In [161]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, roc_auc_score
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.over_sampling import RandomOverSampler, SMOTE

train = pd.read_csv('merged_train_weather-final')

In [162]:
pd.set_option('display.height', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 120)

In [163]:
train.head()

Unnamed: 0,Date,BCFG,BR,CALM,DZ,FG,FG+,FU,GR,HZ,MIFG,RA,SN,SQ,TS,TSRA,VCFG,VCTS,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Species,Block,Latitude,Longitude,WnvPresent
0,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,CULEX PIPIENS/RESTUANS,41,41.95469,-87.800991,0
1,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,CULEX RESTUANS,41,41.95469,-87.800991,0
2,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,CULEX RESTUANS,62,41.994991,-87.769279,0
3,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,CULEX PIPIENS/RESTUANS,79,41.974089,-87.824812,0
4,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,CULEX RESTUANS,79,41.974089,-87.824812,0


In [164]:
# adding year, month, and day variables

train['Year'] = train['Date'].transform(lambda x: int(x[:4]))
train['Month'] = train['Date'].transform(lambda x: int(x[5:7]))
train['Day'] = train['Date'].transform(lambda x: int(x[8:]))

In [165]:
train.head()

Unnamed: 0,Date,BCFG,BR,CALM,DZ,FG,FG+,FU,GR,HZ,MIFG,RA,SN,SQ,TS,TSRA,VCFG,VCTS,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Species,Block,Latitude,Longitude,WnvPresent,Year,Month,Day
0,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,CULEX PIPIENS/RESTUANS,41,41.95469,-87.800991,0,2007,5,29
1,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,CULEX RESTUANS,41,41.95469,-87.800991,0,2007,5,29
2,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,CULEX RESTUANS,62,41.994991,-87.769279,0,2007,5,29
3,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,CULEX PIPIENS/RESTUANS,79,41.974089,-87.824812,0,2007,5,29
4,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,CULEX RESTUANS,79,41.974089,-87.824812,0,2007,5,29


In [166]:
#adding species dummies

species_dummies = pd.get_dummies(train['Species'])
train = pd.concat([train.drop('Species', axis=1),species_dummies], axis=1)
train.drop('Date', axis=1, inplace=True)

In [167]:
block_dummies = pd.get_dummies(train['Block'])
train = pd.concat([train, block_dummies], axis=1)

In [168]:
train.head()

Unnamed: 0,BCFG,BR,CALM,DZ,FG,FG+,FU,GR,HZ,MIFG,RA,SN,SQ,TS,TSRA,VCFG,VCTS,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Block,Latitude,Longitude,WnvPresent,Year,Month,Day,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,10,11,12,13,14,15,17,18,20,21,22,24,25,27,28,29,30,33,34,35,36,37,38,39,40,41,42,43,45,46,47,48,49,50,51,52,53,55,58,60,61,62,63,64,65,66,67,68,70,71,72,73,75,77,79,80,81,82,89,90,91,93,96,98
0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,41,41.95469,-87.800991,0,2007,5,29,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,41,41.95469,-87.800991,0,2007,5,29,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,62,41.994991,-87.769279,0,2007,5,29,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,79,41.974089,-87.824812,0,2007,5,29,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,421,1917,0.0,29.415,30.1,5.8,17.0,6.95,79,41.974089,-87.824812,0,2007,5,29,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


# 1. Logistic Regression Under-Sampling

In [169]:
# X = train[['Latitude', 'Longitude', 'Heat', 'Cool', 'Depart', 'Tmin', 'Tmax', 'PrecipTotal', 'ResultDir', 'Tavg',
#         'Sunrise', 'Sunset', 'Block', 'FG', 'WetBulb', 'TS', 'Month']]

X = train[['Latitude', 'Longitude', 'Heat', 'Cool', 'Depart', 'Tmin', 'Tmax', 'PrecipTotal', 'ResultDir', 'Tavg',
        'Sunrise', 'Sunset', 'Block', 'FG', 'WetBulb', 'TS', 'Month', 'CULEX ERRATICUS', 'CULEX PIPIENS',
          'CULEX PIPIENS/RESTUANS', 'CULEX RESTUANS', 'CULEX SALINARIUS', 'CULEX TARSALIS']]

# X = train[['Latitude', 'Longitude', 'PrecipTotal', 'FG', 'Month', 'Sunrise', 'ResultDir', 'Tavg', 'Heat',
#           'Depart', 'Cool', 'Tmin', 'Tmax', 10, 11, 12, 13, 14, 15, 17, 18, 20, 21, 22, 24,
#           25, 27, 28, 29, 30, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46,
#           47, 48, 49, 50, 51, 52, 53, 55, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68,
#           70, 71, 72, 73, 75, 77, 79, 80, 81, 82, 89, 90, 91, 93, 96, 'CULEX ERRATICUS', 'CULEX PIPIENS',
#            'CULEX PIPIENS/RESTUANS', 'CULEX RESTUANS', 'CULEX SALINARIUS', 'CULEX TARSALIS']]

# X = train.loc[:, train.columns != 'WnvPresent']

y = train['WnvPresent']


In [170]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

In [171]:
ss = StandardScaler()

X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [187]:
rus = RandomUnderSampler(random_state=42, ratio=.5)

In [188]:
X_rus, y_rus = rus.fit_sample(X_train, y_train)



In [189]:
print(type(X_rus))

<class 'numpy.ndarray'>


In [190]:
unique, counts = np.unique(y_rus, return_counts=True)

print(np.asarray((unique, counts)))

[[  0   1]
 [834 417]]


In [191]:
y_test.shape

(2627,)

In [192]:
logreg = LogisticRegression()
logreg_params = {
    'penalty':['l1', 'l2'],
    'C':[.1, .3, .5, .7, .9, 1.0]
}
gslog = GridSearchCV(logreg, param_grid=logreg_params, cv=5)
gslog.fit(X_rus, y_rus)
print(gslog.best_params_)
print('Train Score:', gslog.score(X_rus, y_rus))
print('Test Score:', gslog.score(X_test, y_test))
predictions = gslog.predict(X_test)
probs = gslog.predict_proba(X_test)


{'C': 0.7, 'penalty': 'l2'}
Train Score: 0.7521982414068745
Test Score: 0.8241339931480777


In [193]:
cnf_matrix = confusion_matrix(y_test, predictions)

In [194]:
cnf_matrix

array([[2104,  389],
       [  73,   61]])

In [195]:
roc_auc_score(y_test, predictions)

0.6495934886338465

# 2. SVM Under-Sampling

In [196]:
clf = svm.SVC()

gamma_range = np.logspace(-5, 2, 10)
C_range = np.logspace(-3, 2, 10)
kernel_range = ['rbf']

param_grid = dict(gamma=gamma_range, C=C_range, kernel=kernel_range)

grid = GridSearchCV(clf, param_grid, cv=3, scoring='accuracy', verbose=1)
grid.fit(X_rus, y_rus)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:   16.3s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'gamma': array([1.00000e-05, 5.99484e-05, 3.59381e-04, 2.15443e-03, 1.29155e-02,
       7.74264e-02, 4.64159e-01, 2.78256e+00, 1.66810e+01, 1.00000e+02]), 'C': array([1.00000e-03, 3.59381e-03, 1.29155e-02, 4.64159e-02, 1.66810e-01,
       5.99484e-01, 2.15443e+00, 7.74264e+00, 2.78256e+01, 1.00000e+02]), 'kernel': ['rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [197]:
predictions = grid.predict(X_test)

In [198]:
print(grid.best_params_)
print('Train Score:', grid.score(X_rus, y_rus))
print('Test Score:', grid.score(X_test, y_test))

{'C': 0.5994842503189409, 'gamma': 0.0774263682681127, 'kernel': 'rbf'}
Train Score: 0.7857713828936851
Test Score: 0.8519223448800913


In [199]:
cnf_matrix = confusion_matrix(y_test, predictions)

In [200]:
cnf_matrix

array([[2166,  327],
       [  62,   72]])

In [201]:
roc_auc_score(y_test, predictions)

0.7030730822422185

# 3. kNN Under-Sampling

In [202]:
knn = KNeighborsClassifier()

knn_params = {
    'n_neighbors':[5, 10, 15, 20, 25, 30, 35, 40],
    'weights':['uniform', 'distance'],
    'p':[1, 2]
    
}
gsknn = GridSearchCV(knn, param_grid=knn_params, cv=5)
gsknn.fit(X_rus, y_rus)
print(gsknn.best_params_)
print('Train Score:', gsknn.score(X_rus, y_rus))
print('Test Score:', gsknn.score(X_test, y_test))

predictions = gsknn.predict(X_test)

{'n_neighbors': 40, 'p': 1, 'weights': 'uniform'}
Train Score: 0.768185451638689
Test Score: 0.8431671107727445


In [203]:
roc_auc_score(y_test, predictions)

0.6878678209434177

In [204]:
cnf_matrix = confusion_matrix(y_test, predictions)
cnf_matrix
# 2305 188 87 47

array([[2146,  347],
       [  65,   69]])

# 4. Logistic Regression Over-Sampling

In [205]:
ros = RandomOverSampler(random_state=42)

In [206]:
X_ros, y_ros = ros.fit_sample(X_train, y_train)

In [207]:
X_ros.shape, y_ros.shape

((14924, 23), (14924,))

In [208]:
unique, counts = np.unique(y_ros, return_counts=True)

print(np.asarray((unique, counts)))

[[   0    1]
 [7462 7462]]


In [209]:
logreg2 = LogisticRegression()
logreg2_params = {
    'penalty':['l1', 'l2'],
    'C':[.1, .3, .5, .7, .9, 1.0]
}
gslog = GridSearchCV(logreg2, param_grid=logreg2_params, cv=5)
gslog.fit(X_ros, y_ros)
print(gslog.best_params_)
predictions = gslog.predict(X_test)
probs = gslog.predict_proba(X_test)

{'C': 0.9, 'penalty': 'l1'}


In [210]:
print('Train Score:', gslog.score(X_ros, y_ros))
print('Test Score:', gslog.score(X_test, y_test))

Train Score: 0.7302331814526937
Test Score: 0.6874762086029692


In [211]:
cnf_matrix = confusion_matrix(y_test, predictions)

In [212]:
cnf_matrix

array([[1705,  788],
       [  33,  101]])

In [213]:
roc_auc_score(y_test, predictions)

0.7188231525884416

# 5. SVM Over-Sampling

In [None]:
clf = svm.SVC()

gamma_range = np.logspace(-5, 2, 10)
C_range = np.logspace(-3, 2, 10)
kernel_range = ['rbf']

param_grid = dict(gamma=gamma_range, C=C_range, kernel=kernel_range)

grid = GridSearchCV(clf, param_grid, cv=3, scoring='accuracy', verbose=1)
grid.fit(X_ros, y_ros)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [None]:
print(grid.best_params_)
print(grid.score('Train Score:', X_ros, y_ros))
print(grid.score('Test Score:', X_test, y_test))

In [None]:
predictions = grid.predict(X_test)

In [None]:
cnf_matrix = confusion_matrix(y_test, predictions)

In [None]:
cnf_matrix

# 6. kNN Over-Sampling

In [20]:
knn = KNeighborsClassifier()

knn_params = {
    'n_neighbors':[5, 10, 15, 20, 25, 30, 35, 40],
    'weights':['uniform', 'distance'],
    'p':[1, 2]
    
}
gsknn = GridSearchCV(knn, param_grid=knn_params, cv=5)
gsknn.fit(X_ros, y_ros)
print(gsknn.best_params_)

predictions = gsknn.predict(X_test)

{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}


In [21]:
roc_auc_score(y_test, predictions)

0.5877935832270657

In [22]:
print(gsknn.best_params_)
print(gsknn.score(X_ros, y_ros))
print(gsknn.score(X_test, y_test))

{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.9620745108549986
0.9078797106966121


In [23]:
cnf_matrix = confusion_matrix(y_test, predictions)

In [24]:
cnf_matrix

array([[2354,  139],
       [ 103,   31]])