In [1]:
import numpy as np
import pandas as pd

# Training Data

In [2]:
train = pd.read_csv('assets/train.csv')

In [3]:
train["Date"] = pd.to_datetime(train["Date"], infer_datetime_format=True)
train['Address'] = train['Address'].astype('category')
train['Species'] = train['Species'].astype('category')
train['Street'] = train['Street'].astype('category')
train['Trap'] = train['Trap'].astype('category')
train['AddressNumberAndStreet'] = train['AddressNumberAndStreet'].astype('category')

# Weather Data

In [4]:
weather = pd.read_csv('weather-nmo.csv', index_col=0)

In [5]:
# We need to create one line per date:
station1 = weather[weather['Station']==1]
station2 = weather[weather['Station']==2]
station1 = station1.drop('Station', axis=1)
station2 = station2.drop('Station', axis=1)

In [6]:
station1.columns = ['Date', 'st1_Tmax', 'st1_Tmin', 'st1_Tavg', 'st1_DewPoint', 'st1_WetBulb', 'st1_CodeSum',
       'st1_SnowFall', 'st1_PrecipTotal', 'st1_StnPressure', 'st1_SeaLevel', 'st1_ResultSpeed',
       'st1_ResultDir', 'st1_AvgSpeed', 'st1_Lat', 'st1_Long']
station2.columns = ['Date', 'st2_Tmax', 'st2_Tmin', 'st2_Tavg', 'st2_DewPoint', 'st2_WetBulb', 'st2_CodeSum',
       'st2_SnowFall', 'st2_PrecipTotal', 'st2_StnPressure', 'st2_SeaLevel', 'st2_ResultSpeed',
       'st2_ResultDir', 'st2_AvgSpeed', 'st2_Lat', 'st2_Long']

In [7]:
weather = pd.merge(station1, station2, on='Date')
weather["Date"] = pd.to_datetime(weather["Date"], infer_datetime_format=True)

In [8]:
# Feature engineer us some over time weather data
weather = weather.set_index('Date')

In [9]:
weather['precip_avg'] = (weather['st1_PrecipTotal'] + weather['st2_PrecipTotal'])/2
weather['2wk_precip'] = weather['precip_avg'].rolling(14, min_periods=1).sum()
weather['4wk_precip'] = weather['precip_avg'].rolling(28, min_periods=1).sum()
weather['90day_precip'] = weather['precip_avg'].rolling(90, min_periods=1).sum()

In [10]:
weather['temp_avg'] = (weather['st1_Tavg'] + weather['st2_Tavg'])/2
weather['2wk_tavg'] = weather['temp_avg'].rolling(14, min_periods=1).mean()
weather['4wk_tavg'] = weather['temp_avg'].rolling(28, min_periods=1).mean()
weather['90day_tavg'] = weather['temp_avg'].rolling(90, min_periods=1).mean()

In [11]:
weather['tempmin_avg'] = (weather['st1_Tmin'] + weather['st2_Tmin'])/2
weather['2wk_mintemp'] = weather['tempmin_avg'].rolling(14, min_periods=1).min()
weather['4wk_mintemp'] = weather['tempmin_avg'].rolling(28, min_periods=1).min()

In [12]:
weather['dew_avg'] = (weather['st1_DewPoint'] + weather['st2_DewPoint'])/2
weather['2wk_dew'] = weather['dew_avg'].rolling(14, min_periods=1).mean()
weather['4wk_dew'] = weather['dew_avg'].rolling(28, min_periods=1).mean()

In [13]:
weather = weather.reset_index()
train = pd.merge(train, weather, how='left', on='Date')

# Categories

In [14]:
final_df = pd.get_dummies(train, columns=['Species'])

# Time

In [15]:
final_df['Month'] = final_df['Date'].dt.month
final_df["Day"] = final_df['Date'].dt.dayofyear

# Location Info

In [16]:
#our two origins (the locations with the most WNV activity) are Chicago O'Hare and Doty Ave.
#the following values are their latitudes and longitudes
ohare_lon = -87.890615
ohare_lat = 41.974689
doty_lon =-87.599862
doty_lat=41.673408

In [17]:
lat = train.Latitude
lon = train.Longitude

In [18]:
#haversine takes two lat and longs and creates a distance, from the mean, in miles
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    mi = 3956   * c #Radius of earth in miles. Use 6367 for kilometers
    return mi, dlon, dlat

In [19]:
#apply haversine function to training dataset, creating a column called 'dist_from_ohare_MI'
final_df['dist_from_ohare_MI'] = [haversine(y, x, ohare_lon, ohare_lat)[0] for y, x in zip(lon, lat)]
#apply haversine function to training dataset, creating a column called 'dist_from_doty_MI'
final_df['dist_from_doty_MI'] = [haversine(y, x, doty_lon, doty_lat)[0] for y, x in zip(lon, lat)]

In [20]:
final_df.columns

Index(['Date', 'Address', 'Block', 'Street', 'Trap', 'AddressNumberAndStreet',
       'Latitude', 'Longitude', 'AddressAccuracy', 'NumMosquitos',
       'WnvPresent', 'st1_Tmax', 'st1_Tmin', 'st1_Tavg', 'st1_DewPoint',
       'st1_WetBulb', 'st1_CodeSum', 'st1_SnowFall', 'st1_PrecipTotal',
       'st1_StnPressure', 'st1_SeaLevel', 'st1_ResultSpeed', 'st1_ResultDir',
       'st1_AvgSpeed', 'st1_Lat', 'st1_Long', 'st2_Tmax', 'st2_Tmin',
       'st2_Tavg', 'st2_DewPoint', 'st2_WetBulb', 'st2_CodeSum',
       'st2_SnowFall', 'st2_PrecipTotal', 'st2_StnPressure', 'st2_SeaLevel',
       'st2_ResultSpeed', 'st2_ResultDir', 'st2_AvgSpeed', 'st2_Lat',
       'st2_Long', 'precip_avg', '2wk_precip', '4wk_precip', '90day_precip',
       'temp_avg', '2wk_tavg', '4wk_tavg', '90day_tavg', 'tempmin_avg',
       '2wk_mintemp', '4wk_mintemp', 'dew_avg', '2wk_dew', '4wk_dew',
       'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS',
       'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS',
    

In [21]:
test_features = final_df[['Latitude', 'Longitude', 'st1_Tmax', 'st1_Tmin', 'st1_Tavg', 'st1_DewPoint', 'st1_WetBulb', 'st1_SnowFall', 'st1_PrecipTotal', 'st1_StnPressure', 'st1_SeaLevel', 'st1_ResultSpeed', 'st1_ResultDir', 'st1_AvgSpeed', 'st2_Tmax', 'st2_Tmin', 'st2_Tavg', 'st2_DewPoint', 'st2_WetBulb', 'st2_SnowFall', 'st2_PrecipTotal', 'st2_StnPressure', 'st2_SeaLevel', 'st2_ResultSpeed', 'st2_ResultDir', 'st2_AvgSpeed', 'precip_avg', '2wk_precip', '4wk_precip', '90day_precip', 'temp_avg', '2wk_tavg', '4wk_tavg', '90day_tavg', 'tempmin_avg', '2wk_mintemp', '4wk_mintemp', 'dew_avg', '2wk_dew', '4wk_dew', 'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS', 'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS', 'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS', 'Species_CULEX TERRITANS', 'Month', 'Day', 'dist_from_ohare_MI', 'dist_from_doty_MI']]
target = final_df.WnvPresent

# Scale stuff

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
scale = StandardScaler()

In [24]:
test_features = pd.DataFrame(scale.fit_transform(test_features), columns=test_features.columns)

# Model Time

In [25]:
from sklearn.model_selection import train_test_split, cross_val_score

In [26]:
X_train, X_test, y_train, y_test = train_test_split(test_features, target, test_size=0.3, random_state=42)

In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, roc_auc_score

def eval_sklearn_model(y_true, predictions, model=None, X=None):
    """This function takes the true values for y and the predictions made by the model and prints out the confusion matrix along with Accuracy, Precision, and, if model and X provided, Roc_Auc Scores."""
    cnf_matrix = confusion_matrix(y_true, predictions)

    print('True Negative: ', cnf_matrix[0, 0], '| False Positive: ', cnf_matrix[0, 1])
    print('False Negative: ', cnf_matrix[1, 0], '| True Positive: ', cnf_matrix[1, 1], '\n')

    sensitivity = cnf_matrix[1, 1]/ (cnf_matrix[1, 0] + cnf_matrix[1, 1])
    specificity = cnf_matrix[0, 0]/ (cnf_matrix[0, 1] + cnf_matrix[0, 0])

    print('Sensitivity (TP/ TP + FN): ', sensitivity)
    print('Specificity (TN/ TN + FP): ', specificity, '\n')

    print('Accuracy: ', accuracy_score(y_true, predictions, normalize=True))
    print('Precision: ', precision_score(y_true, predictions))
    if model != None:
        print('Roc-Auc: ', roc_auc_score(y_true, [x[1] for x in model.predict_proba(X)]))
    else:
        pass
    print('\n')

# Here are some models and Grid Searches

More are found in the folder titled 'models'.  You can skip down to the Final Model section to run our final model.

## XGBoost

In [28]:
from xgboost import XGBClassifier



In [30]:
xgb = XGBClassifier(scale_pos_weight=(6969/385), objective='binary:logistic')
# make sure to pick the correct objective for the problem
# scale_pos_weight is supposed to help with unbalanced classes; it recommended number of negative cases divided by positive
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=18.101298701298703, seed=0, silent=True,
       subsample=1)

In [31]:
test_predictions = xgb.predict(X_test)
eval_sklearn_model(y_test, test_predictions, model=xgb, X=X_test)

True Negative:  2270 | False Positive:  716
False Negative:  33 | True Positive:  133 

Sensitivity (TP/ TP + FN):  0.801204819277
Specificity (TN/ TN + FP):  0.760214333557 

Accuracy:  0.762373096447
Precision:  0.156654888104
Roc-Auc:  0.847410405184




## Random Forests

In [32]:
from sklearn.ensemble import ExtraTreesClassifier

In [33]:
%%time
etc = ExtraTreesClassifier(class_weight='balanced', max_features='sqrt', min_samples_leaf=5, n_estimators=100, n_jobs=-1)
etc.fit(X_train, y_train)
test_predictions = etc.predict(X_test)
print('Extra Random Forest TEST SCORE:\n')
eval_sklearn_model(y_test, test_predictions,model=etc,X=X_test)

Extra Random Forest TEST SCORE:

True Negative:  2423 | False Positive:  563
False Negative:  52 | True Positive:  114 

Sensitivity (TP/ TP + FN):  0.686746987952
Specificity (TN/ TN + FP):  0.811453449431 

Accuracy:  0.804885786802
Precision:  0.168389955687
Roc-Auc:  0.839361800854


Wall time: 361 ms


In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
%%time
rfc = RandomForestClassifier(class_weight='balanced', max_features='sqrt', min_samples_leaf=5, n_estimators=1000, n_jobs=-1)
rfc.fit(X_train, y_train)
test_predictions = rfc.predict(X_test)
print('Random Forest TEST SCORE:\n')
eval_sklearn_model(y_test, test_predictions,model=etc,X=X_test)

Random Forest TEST SCORE:

True Negative:  2731 | False Positive:  255
False Negative:  90 | True Positive:  76 

Sensitivity (TP/ TP + FN):  0.457831325301
Specificity (TN/ TN + FP):  0.914601473543 

Accuracy:  0.890545685279
Precision:  0.229607250755
Roc-Auc:  0.839361800854


Wall time: 1.85 s


## Grid Search

In [38]:
from sklearn.model_selection import GridSearchCV
import time
import numpy as np

In [53]:
start_time = time.time()

# Pick which estimators you want to test (example is for random forest)
param_grid = dict(n_estimators = [100, 1000],
                 max_features = [10, 20, 30, 'sqrt'],
                 min_samples_leaf = [2, 3, 4, 5, 6],
                 )
# How many cross validation folds do you want?
cross_val=3

# Switch out the model here that you would like to test
model = ExtraTreesClassifier(class_weight='balanced', n_jobs=-1)

grid = GridSearchCV(model, param_grid, cv=cross_val, scoring='roc_auc', verbose=1)

grid.fit(X_train, y_train)

best_results = {'params': list(grid.best_params_.items()), 'score': grid.best_score_}

best_model = grid.best_estimator_

best_model = best_model.fit(X_train, y_train)

score = best_model.score(X_test, y_test)

print('Number of Models Run: ', np.prod([len(param_grid[i]) for i in param_grid]) * cross_val)
print("{} Score: {:0.3}".format('Decision Tree Classifier', score.mean().round(3)), '\n')
print('Elapsed Time: {:0.3}'.format( time.time() - start_time), ' seconds', '\n')
print(grid.best_estimator_, '\n')
print('Best Hyperparameters we tested for', '\n', best_results)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  2.3min finished


Number of Models Run:  120
Decision Tree Classifier Score: 0.798 

Elapsed Time: 1.39e+02  seconds 

ExtraTreesClassifier(bootstrap=False, class_weight='balanced',
           criterion='gini', max_depth=None, max_features='sqrt',
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=6,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=-1, oob_score=False,
           random_state=None, verbose=0, warm_start=False) 

Best Hyperparameters we tested for 
 {'params': [('max_features', 'sqrt'), ('min_samples_leaf', 6), ('n_estimators', 1000)], 'score': 0.79040803737396559}


In [55]:
# Your best model from the grid is already fit and saved as best_model
test_predictions = best_model.predict(X_test)
print('Grid Search TEST SCORE:\n')
# function created above should be run before this cell
eval_sklearn_model(y_test, test_predictions, model=best_model, X=X_test)

Grid Search TEST SCORE:

True Negative:  2398 | False Positive:  588
False Negative:  49 | True Positive:  117 

Sensitivity (TP/ TP + FN):  0.704819277108
Specificity (TN/ TN + FP):  0.803081044876 

Accuracy:  0.797906091371
Precision:  0.165957446809
Roc-Auc:  0.842486826072




In [56]:
start_time = time.time()

# Pick which estimators you want to test (example is for random forest)
param_grid = dict(n_estimators = [1000, 2000],
                 max_features = [10, 20, 30, 'sqrt'],
                 min_samples_leaf = [2, 3, 4, 5, 6],
                 )
# How many cross validation folds do you want?
cross_val=3

# Switch out the model here that you would like to test
model = RandomForestClassifier(class_weight='balanced', n_jobs=-1)

grid = GridSearchCV(model, param_grid, cv=cross_val, scoring='roc_auc', verbose=1)

grid.fit(X_train, y_train)

best_results = {'params': list(grid.best_params_.items()), 'score': grid.best_score_}

best_model = grid.best_estimator_

best_model = best_model.fit(X_train, y_train)

score = best_model.score(X_test, y_test)

print('Number of Models Run: ', np.prod([len(param_grid[i]) for i in param_grid]) * cross_val)
print("{} Score: {:0.3}".format('Decision Tree Classifier', score.mean().round(3)), '\n')
print('Elapsed Time: {:0.3}'.format( time.time() - start_time), ' seconds', '\n')
print(grid.best_estimator_, '\n')
print('Best Hyperparameters we tested for', '\n', best_results)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  6.9min finished


Number of Models Run:  120
Decision Tree Classifier Score: 0.899 

Elapsed Time: 4.21e+02  seconds 

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features=20,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=6,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False) 

Best Hyperparameters we tested for 
 {'params': [('max_features', 20), ('min_samples_leaf', 6), ('n_estimators', 1000)], 'score': 0.80202909231930231}


In [57]:
# Your best model from the grid is already fit and saved as best_model
test_predictions = best_model.predict(X_test)
print('Grid Search TEST SCORE:\n')
# function created above should be run before this cell
eval_sklearn_model(y_test, test_predictions, model=best_model, X=X_test)

Grid Search TEST SCORE:

True Negative:  2762 | False Positive:  224
False Negative:  95 | True Positive:  71 

Sensitivity (TP/ TP + FN):  0.427710843373
Specificity (TN/ TN + FP):  0.924983255191 

Accuracy:  0.898794416244
Precision:  0.240677966102
Roc-Auc:  0.841158337301




### Feature Importances

In [58]:
feature_import = best_model.feature_importances_
features = X_train.columns
pd.DataFrame(columns=['feature', 'import'], data=list(zip(features, feature_import))).sort_values('import',ascending=False)

Unnamed: 0,feature,import
48,Day,0.123397
1,Longitude,0.100915
49,dist_from_ohare_MI,0.086271
39,4wk_dew,0.081232
50,dist_from_doty_MI,0.074891
0,Latitude,0.074716
33,90day_tavg,0.054776
47,Month,0.042208
32,4wk_tavg,0.034593
36,4wk_mintemp,0.025118


# Final Model

Based on grid searching and testing of other models as seen in models found in model folder

In [29]:
from xgboost import XGBClassifier

In [30]:
test_features = test_features.drop(['st1_SnowFall', 'st2_SnowFall', 'Species_CULEX TARSALIS', 'Species_CULEX SALINARIUS', 'Species_CULEX ERRATICUS', 'Species_CULEX TERRITANS', 'st2_DewPoint', 'st2_PrecipTotal', 'st1_DewPoint', 'st1_WetBulb', 'dew_avg', 'st2_WetBulb', 'st1_PrecipTotal', 'precip_avg', 'st1_SeaLevel', 'tempmin_avg', 'st1_Tmin', 'st2_Tavg', 'st2_SeaLevel',], 1)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(test_features, target, test_size=0.3, random_state=42)

In [32]:
xgb = XGBClassifier(scale_pos_weight=(6969/385), objective='binary:logistic', gamma=0.35, learning_rate=0.02, max_depth=3, n_estimators=200)
xgb.fit(X_train, y_train)
test_predictions = xgb.predict(X_test)
print('XGBoost Train/Test Score:\n')
eval_sklearn_model(y_test, test_predictions, model=xgb, X=X_test)

XGBoost Train/Test Score:

True Negative:  2112 | False Positive:  874
False Negative:  30 | True Positive:  136 

Sensitivity (TP/ TP + FN):  0.819277108434
Specificity (TN/ TN + FP):  0.707300736772 

Accuracy:  0.713197969543
Precision:  0.134653465347
Roc-Auc:  0.839673496397




# Set up test data and export

In [33]:
test = pd.read_csv('assets/test.csv')

In [34]:
test["Date"] = pd.to_datetime(test["Date"], infer_datetime_format=True)

In [35]:
test["Date"] = pd.to_datetime(test["Date"], infer_datetime_format=True)
test['Address'] = test['Address'].astype('category')
test['Species'] = test['Species'].astype('category')
test['Street'] = test['Street'].astype('category')
test['Trap'] = test['Trap'].astype('category')
test['AddressNumberAndStreet'] = test['AddressNumberAndStreet'].astype('category')

In [36]:
test = pd.merge(test, weather, how='left', on='Date')

In [37]:
test = pd.get_dummies(test, columns=['Species'])

In [38]:
test['Month'] = test['Date'].dt.month
test["Day"] = test['Date'].dt.dayofyear

In [39]:
lat = test.Latitude
lon = test.Longitude

In [40]:
#apply haversine function to training dataset, creating a column called 'dist_from_ohare_MI'
test['dist_from_ohare_MI'] = [haversine(y, x, ohare_lon, ohare_lat)[0] for y, x in zip(lon, lat)]
#apply haversine function to training dataset, creating a column called 'dist_from_doty_MI'
test['dist_from_doty_MI'] = [haversine(y, x, doty_lon, doty_lat)[0] for y, x in zip(lon, lat)]

In [41]:
# Make match above
pred_features = test[['Latitude', 'Longitude', 'st1_Tmax', 'st1_Tmin', 'st1_Tavg', 'st1_DewPoint', 'st1_WetBulb', 'st1_SnowFall', 'st1_PrecipTotal', 'st1_StnPressure', 'st1_SeaLevel', 'st1_ResultSpeed', 'st1_ResultDir', 'st1_AvgSpeed', 'st2_Tmax', 'st2_Tmin', 'st2_Tavg', 'st2_DewPoint', 'st2_WetBulb', 'st2_SnowFall', 'st2_PrecipTotal', 'st2_StnPressure', 'st2_SeaLevel', 'st2_ResultSpeed', 'st2_ResultDir', 'st2_AvgSpeed', 'precip_avg', '2wk_precip', '4wk_precip', '90day_precip', 'temp_avg', '2wk_tavg', '4wk_tavg', '90day_tavg', 'tempmin_avg', '2wk_mintemp', '4wk_mintemp', 'dew_avg', '2wk_dew', '4wk_dew', 'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS', 'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS', 'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS', 'Species_CULEX TERRITANS', 'Month', 'Day', 'dist_from_ohare_MI', 'dist_from_doty_MI']]
pred_features = pred_features.drop(['st1_SnowFall', 'st2_SnowFall', 'Species_CULEX TARSALIS', 'Species_CULEX SALINARIUS', 'Species_CULEX ERRATICUS', 'Species_CULEX TERRITANS', 'st2_DewPoint', 'st2_PrecipTotal', 'st1_DewPoint', 'st1_WetBulb', 'dew_avg', 'st2_WetBulb', 'st1_PrecipTotal', 'precip_avg', 'st1_SeaLevel', 'tempmin_avg', 'st1_Tmin', 'st2_Tavg', 'st2_SeaLevel',], 1)

In [42]:
# CHECK QUICK
pred_features = pd.DataFrame(scale.fit_transform(pred_features), columns=pred_features.columns)

In [52]:
# Whatever model you decided on:
predictions = xgb.predict(pred_features)

In [44]:
submission = pd.DataFrame(columns=['Id', 'WnvPresent'], data=list(zip(test.Id, predictions)))
submission = submission.set_index('Id')
print('Submission Shape: ', submission.shape, '\n')
print('Submission Results: \n', submission['WnvPresent'].value_counts())

Submission Shape:  (116293, 1) 



# Predict_Proba

For Kaggle submission

In [43]:
# Whatever model you decided on:
predictions = xgb.predict_proba(pred_features)

In [44]:
predictions = xgb.predict_proba(pred_features)
submission = pd.DataFrame(columns=['Id', 'WnvPresent'], data=list(zip(test.Id, predictions)))
print('Submission Shape: ', submission.shape, '\n')

In [50]:
submission.WnvPresent = submission.WnvPresent.apply(lambda x: x[1])

In [51]:
submission.to_csv('submission.csv',)