In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [2]:
#Referenced 6.02-lesson-bagging-rfs for below

In [115]:
#Load the combined + cleaned datasets used for the tree models: train_merge_all.csv, test_merge_all.csv
#The 'Unnamed: *' columns are removed from both frames right after reading
leftover_index_cols = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1']

train = pd.read_csv('train_merge_all.csv').drop(leftover_index_cols, axis = 1)
test = pd.read_csv('test_merge_all.csv').drop(leftover_index_cols, axis = 1)

In [116]:
#compare the feature sets of train + test: neither frame includes target Y = WnvPresent (0/1)
#shapes first, followed by blank lines, then both column indexes separated by a blank line
print(train.shape, test.shape, end = '\n\n\n')
print(train.columns, test.columns, sep = ' \n\n ')

(10506, 28) (116293, 28)


Index(['Date', 'Species', 'Trap', 'Longitude', 'Latitude', 'Tmax', 'Tmin',
       'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Heat', 'Cool', 'Sunrise',
       'Sunset', 'CodeSum', 'Depth', 'Water1', 'SnowFall', 'PrecipTotal',
       'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed',
       'DaysFromSpray', 'LogDays', 'SprayEffect'],
      dtype='object') 

 Index(['Date', 'Species', 'Trap', 'Longitude', 'Latitude', 'Tmax', 'Tmin',
       'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Heat', 'Cool', 'Sunrise',
       'Sunset', 'CodeSum', 'Depth', 'Water1', 'SnowFall', 'PrecipTotal',
       'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed',
       'DaysFromSpray', 'LogDays', 'SprayEffect'],
      dtype='object')


In [109]:
#turn text features (e.g. col CodeSum from weather data) into one-hot categories
#NOTE: pd.get_dummies is called without `columns`, so it encodes every text/categorical
#column it finds, not only CodeSum
train = pd.get_dummies(train)

#encoding test on its own can produce a different set of dummy columns than train
#(categories seen in only one of the two files), which a model fitted on train cannot use.
#Align test to the training columns: dummy columns that exist only in test are dropped,
#columns that exist only in train are added and filled with 0.
test = pd.get_dummies(test).reindex(columns = train.columns, fill_value = 0)
assert list(test.columns) == list(train.columns), 'train/test feature columns differ'

test.head()

Unnamed: 0,Longitude,Latitude,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,"CodeSum_['RA', 'BR', 'HZ', 'TS', 'TSRA']","CodeSum_['RA', 'BR', 'HZ', 'TSRA', 'VCTS']","CodeSum_['RA', 'BR', 'TS', 'TSRA']","CodeSum_['RA', 'SQ', 'BR', 'HZ', 'TS', 'TSRA']","CodeSum_['RA', 'TS', 'TSRA']","CodeSum_['RA', 'TSRA']",CodeSum_['RA'],"CodeSum_['TS', 'RA', 'TSRA']","CodeSum_['VCTS', 'BR', 'RA', 'TSRA']",CodeSum_[]
0,-87.800991,41.95469,82.3,64.7,73.7,7.3,65.2,68.2,0.0,8.7,...,0,0,0,0,0,0,0,0,0,0
1,-87.800991,41.95469,82.3,64.7,73.7,7.3,65.2,68.2,0.0,8.7,...,0,0,0,0,0,0,0,0,0,0
2,-87.800991,41.95469,82.3,64.7,73.7,7.3,65.2,68.2,0.0,8.7,...,0,0,0,0,0,0,0,0,0,0
3,-87.800991,41.95469,82.3,64.7,73.7,7.3,65.2,68.2,0.0,8.7,...,0,0,0,0,0,0,0,0,0,0
4,-87.800991,41.95469,82.3,64.7,73.7,7.3,65.2,68.2,0.0,8.7,...,0,0,0,0,0,0,0,0,0,0


In [49]:
#Features (X) are the encoded training frame; the target (y) is the WnvPresent
#column read from the original train.csv
#NOTE(review): assumes train.csv rows are in the same order as train_merge_all.csv - confirm
X = train
y = pd.read_csv('./datasets/train.csv')['WnvPresent']

print(X.shape, y.shape)

(10506, 308) (10506,)


In [50]:
#Baseline accuracy: share of each class in y (always predicting the larger class scores its share)
class_counts = y.value_counts()
baseline = class_counts / len(y)
print(baseline)

0    0.947554
1    0.052446
Name: WnvPresent, dtype: float64


In [51]:
#train-test-split training set: 70/30 split, stratified on y to preserve the class distribution of the overall training set
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.3,
    stratify = y,
    random_state = 42)

#shapes of the training part, then of the held-out part
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(7354, 308) (7354,)
(3152, 308) (3152,)


In [78]:
#instantiate, fit + score Decision Tree classifier
#random_state is fixed (42, same seed as the split and bagging cells) so a re-run gives the same tree
tree = DecisionTreeClassifier(random_state = 42)

tree.fit(X_train, y_train)
print('train:', tree.score(X_train, y_train))
print('test:', tree.score(X_test, y_test))    #a train score well above the test score points to overfitting (variance), not bias

train: 0.9843622518357357
test: 0.9238578680203046


In [79]:
#bagging: 100 decision trees combined by BaggingClassifier, seeded for repeatability
bag = BaggingClassifier(
    base_estimator = DecisionTreeClassifier(),
    n_estimators = 100,
    random_state = 42)
bag.fit(X_train, y_train)

#accuracy on the data the ensemble was fitted on vs. the held-out split;
#a gap between the two suggests overfitting, hyperparameters such as n_estimators can be tuned
bag_train_acc = bag.score(X_train, y_train)
print('train:', bag_train_acc)
bag_test_acc = bag.score(X_test, y_test)
print('test:', bag_test_acc)

train: 0.984226271416916
test: 0.9359137055837563


In [92]:
#bagging classifier with DecisionTreeClassifier: exhaustive grid search over hyperparameters
bag = BaggingClassifier(random_state=42, base_estimator = DecisionTreeClassifier())
bag_params = {
    'base_estimator__max_depth': range(1, 10),    #iter1: range(2, 4)
    'base_estimator__min_samples_split': range(2, 10),    #iter1: range(2, 5)
    'n_estimators': [1, 2, 5, 7, 9, 10]    #iter1: [10, 20, 50, 75, 100]
}


#no `scoring` is passed, so candidates are ranked with the estimator's own score method
#NOTE(review): with a heavily imbalanced target, accuracy may favour a model that only
#predicts the majority class - consider an explicit `scoring` (e.g. 'roc_auc'); confirm
gs = GridSearchCV(bag, param_grid=bag_params)
gs.fit(X_train, y_train)

#best_score_ is the mean cross-validated score of the best candidate (not a training-set
#score) and best_params_ is the winning parameter combination, so label them as such
print('best cv score:', gs.best_score_)
print('best params:', gs.best_params_)


train: 0.9475115583355996
train: {'base_estimator__max_depth': 1, 'base_estimator__min_samples_split': 2, 'n_estimators': 1}


In [95]:
#gs.best_params_.keys()

In [98]:
#rebuild the bagging classifier from the hyperparameters selected by the grid search (gs)
tuned_tree = DecisionTreeClassifier(
    max_depth = gs.best_params_['base_estimator__max_depth'],
    min_samples_split = gs.best_params_['base_estimator__min_samples_split'])

best = BaggingClassifier(base_estimator = tuned_tree,
                         n_estimators = gs.best_params_['n_estimators'],
                         random_state = 42)
best.fit(X_train, y_train)

#compare accuracy on the fitted data with accuracy on the held-out split
best_train_acc = best.score(X_train, y_train)
print('train:', best_train_acc)
best_test_acc = best.score(X_test, y_test)
print('test:', best_test_acc)    #train and test scores close together: better bias-variance tradeoff

train: 0.9475115583355996
test: 0.9476522842639594


In [82]:
#Random Forest classifier
#random_state is fixed (42, same seed as the split and bagging cells) so a re-run gives the same forest
rf = RandomForestClassifier(random_state = 42)
rf.fit(X_train, y_train)
print('train:', rf.score(X_train, y_train))
print('test:', rf.score(X_test, y_test))    #a train score above the test score points to overfitting (variance), not bias

train: 0.9772912700571118
test: 0.9406725888324873


In [83]:
#ExtraTrees classifier
#random_state is fixed (42, same seed as the split and bagging cells) so a re-run gives the same ensemble
et = ExtraTreesClassifier(random_state = 42)
et.fit(X_train, y_train)
print('train:', et.score(X_train, y_train))
print('test:', et.score(X_test, y_test))    #a train score above the test score points to overfitting (variance), not bias

train: 0.9843622518357357
test: 0.9441624365482234


In [142]:
#sanity check: (rows, columns) of the training split currently held in X_train
X_train.shape

(7354, 308)

In [99]:
#comparing against LogReg model, fitted and scored on the same split as the tree models
#(LogisticRegression is imported with the other sklearn imports in the first cell)
lr = LogisticRegression()
lr.fit(X_train, y_train)
print('train:', lr.score(X_train, y_train))
print('test:', lr.score(X_test, y_test))    #very similar to results from bagged DecisionTree


train: 0.9479194995920588
test: 0.9486040609137056


In [85]:
#Summary so far:
#best test set score for DT: bagging with decisiontree classifier
    #using the hyperparameters found by the gridsearch
#on par with score from LogReg

#both on par with baseline (the per-class shares of y computed earlier), not performing particularly better
baseline

0    0.947554
1    0.052446
Name: WnvPresent, dtype: float64

In [172]:
#Plan: predict target WnvPresent in test_merge_all.csv
    #1. Bagged/tuned DT predict
        #take unique values of CodeSum and one-hot encode: 1 if the value appears in the CodeSum col
    #2. LogReg predict
        #take unique values of CodeSum and one-hot encode: 1 if the value appears in the CodeSum col
    #3. LogReg predict 
        #simplified
        #w/o CodeSum, Date, Trap, + the unspecified mosquito species that appears in test
            #reason: pd.get_dummies returns a different number of cols in train vs test sets


In [170]:
#3. LogReg model: simplified
#features: train_merge_all.csv without the 'Unnamed: *' columns and without Date, CodeSum, Trap;
#whatever text columns remain are one-hot encoded
simple_drop_cols = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Date', 'CodeSum', 'Trap']
X = pd.get_dummies(pd.read_csv('train_merge_all.csv').drop(simple_drop_cols, axis = 1))
print(list(X.columns), X.shape)

#same 70/30 stratified, seeded split as before; y is the target defined in an earlier cell
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.3,
    stratify = y,
    random_state = 42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

#fit on the training part, then report accuracy on both parts
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_train_acc = lr.score(X_train, y_train)
print('train:', lr_train_acc)
lr_test_acc = lr.score(X_test, y_test)
print('test:', lr_test_acc)


['Longitude', 'Latitude', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'Depth', 'Water1', 'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 'DaysFromSpray', 'LogDays', 'SprayEffect', 'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS', 'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS', 'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS', 'Species_CULEX TERRITANS'] (10506, 31)
(7354, 31) (7354,)
(3152, 31) (3152,)
train: 0.9475115583355996
test: 0.9476522842639594


In [189]:
#3. predict target
X = pd.read_csv('test_merge_all.csv')
X = X.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Date', 'CodeSum', 'Trap'], axis = 1)

#Align the encoded test features with the columns of X_train instead of dropping one
#hard-coded dummy ('Species_UNSPECIFIED CULEX'): any dummy column that exists only in test
#is dropped, any training column missing from test is added as all zeros, and the column
#order matches X_train.
#NOTE(review): assumes X_train is still the split that lr was fitted on - the check below
#catches a feature-count mismatch
X = pd.get_dummies(X).reindex(columns = X_train.columns, fill_value = 0)
assert X.shape[1] == lr.coef_.shape[1], 'test features do not match the fitted model'
print(list(X.columns), X.shape)

y_pred = lr.predict(X)

['Longitude', 'Latitude', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'Depth', 'Water1', 'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 'DaysFromSpray', 'LogDays', 'SprayEffect', 'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS', 'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS', 'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS', 'Species_CULEX TERRITANS'] (116293, 31)


In [194]:
#input target values to sample submission format + save to_csv
#Build the submission frame directly from y_pred: a single WnvPresent column with an index
#named 'Id' that starts at 1. Unlike shifting the index of X in place (X.index += 1), this
#gives the same file no matter how many times the cell is re-run.
submission = pd.DataFrame(
    {'WnvPresent': y_pred},
    index = pd.RangeIndex(start = 1, stop = len(y_pred) + 1, name = 'Id'))

submission.to_csv('sample1hc.csv')

#the following cell inspects X['WnvPresent'], so keep X pointing at the submission frame
X = submission


In [197]:
#count how many rows were assigned to each predicted class
X['WnvPresent'].value_counts()
#need to look into this: the recorded output lists only class 0, i.e. no positive predictions
#NOTE(review): presumably a consequence of the class imbalance seen in `baseline` - confirm

0    116293
Name: WnvPresent, dtype: int64

In [195]:
#predict y value for test_merge_all.csv
#train_checker = pd.get_dummies(train['CodeSum']).columns
#test_checker = pd.get_dummies(test['CodeSum']).columns

#i = 0
#for train_col in train_checker:
    #print(train_col)
#    i += 1
#    for test_col in test_checker:
#        print(train_col, test_col)
#        if train_col == test_col:
#            print (i, 'yes')
#        else:
#            print (i, 'no') 
    
#lr.predict(test)