# James' Preliminary Modeling Notebook

In [5]:
# import all relevant packages

import pandas as pd
import pandas_profiling
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Dropout

%matplotlib inline

In [21]:
# define dataframes

train = pd.read_csv('./assets/train.csv')
test = pd.read_csv('./assets/test.csv')
weather = pd.read_csv('./assets/weather.csv')
spray = pd.read_csv('./assets/spray.csv')

In [3]:
train

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0
5,2007-05-29,"1500 West Webster Avenue, Chicago, IL 60614, USA",CULEX RESTUANS,15,W WEBSTER AVE,T045,"1500 W WEBSTER AVE, Chicago, IL",41.921600,-87.666455,8,2,0
6,2007-05-29,"2500 West Grand Avenue, Chicago, IL 60654, USA",CULEX RESTUANS,25,W GRAND AVE,T046,"2500 W GRAND AVE, Chicago, IL",41.891118,-87.654491,8,1,0
7,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX PIPIENS/RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,1,0
8,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,2,0
9,2007-05-29,"1100 West Chicago Avenue, Chicago, IL 60642, USA",CULEX RESTUANS,11,W CHICAGO,T049,"1100 W CHICAGO, Chicago, IL",41.896282,-87.655232,8,1,0


In [22]:
test

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
5,6,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TARSALIS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
6,7,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",UNSPECIFIED CULEX,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
7,8,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX ERRATICUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
8,9,2008-06-11,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX PIPIENS/RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9
9,10,2008-06-11,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9


In [23]:
weather

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.00,29.10,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.00,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,BR,0,M,0.0,0.00,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,BR HZ,M,M,M,0.00,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,,0,M,0.0,0.00,29.39,30.12,11.7,7,11.9
5,2,2007-05-03,67,48,58,M,40,50,7,0,...,HZ,M,M,M,0.00,29.46,30.12,12.9,6,13.2
6,1,2007-05-04,66,49,58,4,41,50,7,0,...,RA,0,M,0.0,T,29.31,30.05,10.4,8,10.8
7,2,2007-05-04,78,51,M,M,42,50,M,M,...,,M,M,M,0.00,29.36,30.04,10.1,7,10.4
8,1,2007-05-05,66,53,60,5,38,49,5,0,...,,0,M,0.0,T,29.40,30.10,11.7,7,12.0
9,2,2007-05-05,66,54,60,M,39,50,5,0,...,,M,M,M,T,29.46,30.09,11.2,7,11.5


In [26]:
spray

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157
3,2011-08-29,6:57:28 PM,42.390637,-88.089158
4,2011-08-29,6:57:38 PM,42.390410,-88.088858
5,2011-08-29,6:57:48 PM,42.390395,-88.088315
6,2011-08-29,6:57:58 PM,42.390673,-88.088002
7,2011-08-29,6:58:08 PM,42.391027,-88.088002
8,2011-08-29,6:58:18 PM,42.391403,-88.088003
9,2011-08-29,6:58:28 PM,42.391718,-88.087995


In [None]:
# courtesy of Brian Collins - copied

def date_separate(df):
    df = df.copy()
    df['Year'] = pd.DatetimeIndex(df['Date']).year
    df['Month'] = pd.DatetimeIndex(df['Date']).month
    df['Day'] = pd.DatetimeIndex(df['Date']).day
    return df

# make all the dataset applied to the function
train=date_separate(train)
weather=date_separate(weather)
spray=date_separate(spray)

In [4]:
pandas_profiling.ProfileReport(train)

0,1
Number of variables,12
Number of observations,10506
Total Missing (%),0.0%
Total size in memory,985.0 KiB
Average record size in memory,96.0 B

0,1
Numeric,5
Categorical,6
Boolean,1
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,138
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0

0,1
"ORD Terminal 5, O'Hare International Airport, Chicago, IL 60666, USA",750
"South Doty Avenue, Chicago, IL, USA",542
"South Stony Island Avenue, Chicago, IL, USA",314
Other values (135),8900

Value,Count,Frequency (%),Unnamed: 3
"ORD Terminal 5, O'Hare International Airport, Chicago, IL 60666, USA",750,7.1%,
"South Doty Avenue, Chicago, IL, USA",542,5.2%,
"South Stony Island Avenue, Chicago, IL, USA",314,3.0%,
"4100 North Oak Park Avenue, Chicago, IL 60634, USA",185,1.8%,
"4200 West 127th Street, Alsip, IL 60803, USA",183,1.7%,
"2200 North Cannon Drive, Chicago, IL 60614, USA",163,1.6%,
"2400 East 105th Street, Chicago, IL 60617, USA",160,1.5%,
"7000 West Armitage Avenue, Chicago, IL 60707, USA",156,1.5%,
"3700 118th Street, Chicago, IL 60617, USA",152,1.4%,
"University of Illinois at Chicago, 1100 South Ashland Avenue, Chicago, IL 60607, USA",151,1.4%,

0,1
Distinct count,4
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,7.8195
Minimum,3
Maximum,9
Zeros (%),0.0%

0,1
Minimum,3
5-th percentile,5
Q1,8
Median,8
Q3,9
95-th percentile,9
Maximum,9
Range,6
Interquartile range,1

0,1
Standard deviation,1.4529
Coef of variation,0.18581
Kurtosis,0.65475
Mean,7.8195
MAD,1.0534
Skewness,-1.3695
Sum,82152
Variance,2.111
Memory size,82.2 KiB

Value,Count,Frequency (%),Unnamed: 3
8,4628,44.1%,
9,3980,37.9%,
5,1807,17.2%,
3,91,0.9%,

Value,Count,Frequency (%),Unnamed: 3
3,91,0.9%,
5,1807,17.2%,
8,4628,44.1%,
9,3980,37.9%,

Value,Count,Frequency (%),Unnamed: 3
3,91,0.9%,
5,1807,17.2%,
8,4628,44.1%,
9,3980,37.9%,

0,1
Distinct count,138
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0

0,1
"1000 W OHARE AIRPORT, Chicago, IL",750
"1200 S DOTY AVE, Chicago, IL",542
"1000 S STONY ISLAND AVE, Chicago, IL",314
Other values (135),8900

Value,Count,Frequency (%),Unnamed: 3
"1000 W OHARE AIRPORT, Chicago, IL",750,7.1%,
"1200 S DOTY AVE, Chicago, IL",542,5.2%,
"1000 S STONY ISLAND AVE, Chicago, IL",314,3.0%,
"4100 N OAK PARK AVE, Chicago, IL",185,1.8%,
"4200 W 127TH PL, Chicago, IL",183,1.7%,
"2200 N CANNON DR, Chicago, IL",163,1.6%,
"2400 E 105TH ST, Chicago, IL",160,1.5%,
"7000 W ARMITAGE AVENUE, Chicago, IL",156,1.5%,
"3700 E 118TH ST, Chicago, IL",152,1.4%,
"1100 S ASHLAND AVE, Chicago, IL",151,1.4%,

0,1
Distinct count,64
Unique (%),0.6%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,35.688
Minimum,10
Maximum,98
Zeros (%),0.0%

0,1
Minimum,10
5-th percentile,10
Q1,12
Median,33
Q3,52
95-th percentile,82
Maximum,98
Range,88
Interquartile range,40

0,1
Standard deviation,24.339
Coef of variation,0.68201
Kurtosis,-0.73761
Mean,35.688
MAD,20.794
Skewness,0.62672
Sum,374936
Variance,592.41
Memory size,82.2 KiB

Value,Count,Frequency (%),Unnamed: 3
10,1722,16.4%,
11,736,7.0%,
12,605,5.8%,
22,500,4.8%,
13,345,3.3%,
37,330,3.1%,
17,305,2.9%,
42,300,2.9%,
70,295,2.8%,
52,277,2.6%,

Value,Count,Frequency (%),Unnamed: 3
10,1722,16.4%,
11,736,7.0%,
12,605,5.8%,
13,345,3.3%,
14,97,0.9%,

Value,Count,Frequency (%),Unnamed: 3
90,77,0.7%,
91,111,1.1%,
93,21,0.2%,
96,31,0.3%,
98,23,0.2%,

0,1
Distinct count,95
Unique (%),0.9%
Missing (%),0.0%
Missing (n),0

0,1
2007-08-01,551
2007-08-15,276
2007-08-21,186
Other values (92),9493

Value,Count,Frequency (%),Unnamed: 3
2007-08-01,551,5.2%,
2007-08-15,276,2.6%,
2007-08-21,186,1.8%,
2013-08-01,186,1.8%,
2007-08-24,186,1.8%,
2007-10-04,185,1.8%,
2007-08-07,184,1.8%,
2013-07-19,182,1.7%,
2013-07-12,182,1.7%,
2013-08-08,181,1.7%,

0,1
Distinct count,138
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,41.841
Minimum,41.645
Maximum,42.017
Zeros (%),0.0%

0,1
Minimum,41.645
5-th percentile,41.673
Q1,41.733
Median,41.846
Q3,41.955
95-th percentile,41.992
Maximum,42.017
Range,0.37282
Interquartile range,0.22171

0,1
Standard deviation,0.11274
Coef of variation,0.0026945
Kurtosis,-1.4402
Mean,41.841
MAD,0.10219
Skewness,-0.08944
Sum,439580
Variance,0.012711
Memory size,82.2 KiB

Value,Count,Frequency (%),Unnamed: 3
41.974689,750,7.1%,
41.673408,542,5.2%,
41.726465000000005,314,3.0%,
41.95469,185,1.8%,
41.662014,183,1.7%,
41.921965,163,1.6%,
41.704572,160,1.5%,
41.916265,156,1.5%,
41.680946,152,1.4%,
41.868077,151,1.4%,

Value,Count,Frequency (%),Unnamed: 3
41.644612,17,0.2%,
41.659112,112,1.1%,
41.662014,183,1.7%,
41.673408,542,5.2%,
41.678618,129,1.2%,

Value,Count,Frequency (%),Unnamed: 3
42.008314,139,1.3%,
42.009876,50,0.5%,
42.010412,63,0.6%,
42.011601,66,0.6%,
42.01743,69,0.7%,

0,1
Distinct count,138
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,-87.7
Minimum,-87.931
Maximum,-87.532
Zeros (%),0.0%

0,1
Minimum,-87.931
5-th percentile,-87.891
Q1,-87.76
Median,-87.695
Q3,-87.628
95-th percentile,-87.547
Maximum,-87.532
Range,0.39936
Interquartile range,0.13227

0,1
Standard deviation,0.096514
Coef of variation,-0.0011005
Kurtosis,-0.43627
Mean,-87.7
MAD,0.077598
Skewness,-0.37028
Sum,-921380
Variance,0.009315
Memory size,82.2 KiB

Value,Count,Frequency (%),Unnamed: 3
-87.890615,750,7.1%,
-87.599862,542,5.2%,
-87.585413,314,3.0%,
-87.800991,185,1.8%,
-87.72460799999999,183,1.7%,
-87.632085,163,1.6%,
-87.56566600000001,160,1.5%,
-87.800515,156,1.5%,
-87.535198,152,1.4%,
-87.666901,151,1.4%,

Value,Count,Frequency (%),Unnamed: 3
-87.930995,140,1.3%,
-87.890615,750,7.1%,
-87.862995,80,0.8%,
-87.832763,133,1.3%,
-87.824812,35,0.3%,

Value,Count,Frequency (%),Unnamed: 3
-87.53869300000001,112,1.1%,
-87.536497,83,0.8%,
-87.535198,152,1.4%,
-87.531657,23,0.2%,
-87.531635,45,0.4%,

0,1
Distinct count,50
Unique (%),0.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,12.854
Minimum,1
Maximum,50
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,1
Q1,2
Median,5
Q3,17
95-th percentile,50
Maximum,50
Range,49
Interquartile range,15

0,1
Standard deviation,16.134
Coef of variation,1.2552
Kurtosis,0.5224
Mean,12.854
MAD,12.778
Skewness,1.4049
Sum,135039
Variance,260.3
Memory size,82.2 KiB

Value,Count,Frequency (%),Unnamed: 3
1,2307,22.0%,
2,1300,12.4%,
50,1019,9.7%,
3,896,8.5%,
4,593,5.6%,
5,489,4.7%,
6,398,3.8%,
7,326,3.1%,
8,244,2.3%,
9,237,2.3%,

Value,Count,Frequency (%),Unnamed: 3
1,2307,22.0%,
2,1300,12.4%,
3,896,8.5%,
4,593,5.6%,
5,489,4.7%,

Value,Count,Frequency (%),Unnamed: 3
46,43,0.4%,
47,37,0.4%,
48,36,0.3%,
49,35,0.3%,
50,1019,9.7%,

0,1
Distinct count,7
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
CULEX PIPIENS/RESTUANS,4752
CULEX RESTUANS,2740
CULEX PIPIENS,2699
Other values (4),315

Value,Count,Frequency (%),Unnamed: 3
CULEX PIPIENS/RESTUANS,4752,45.2%,
CULEX RESTUANS,2740,26.1%,
CULEX PIPIENS,2699,25.7%,
CULEX TERRITANS,222,2.1%,
CULEX SALINARIUS,86,0.8%,
CULEX TARSALIS,6,0.1%,
CULEX ERRATICUS,1,0.0%,

0,1
Distinct count,128
Unique (%),1.2%
Missing (%),0.0%
Missing (n),0

0,1
W OHARE AIRPORT,750
S DOTY AVE,542
S STONY ISLAND AVE,347
Other values (125),8867

Value,Count,Frequency (%),Unnamed: 3
W OHARE AIRPORT,750,7.1%,
S DOTY AVE,542,5.2%,
S STONY ISLAND AVE,347,3.3%,
S ASHLAND AVE,266,2.5%,
N OAK PARK AVE,216,2.1%,
W 51ST ST,185,1.8%,
W 127TH PL,183,1.7%,
N PULASKI RD,173,1.6%,
N CANNON DR,172,1.6%,
E 105TH ST,160,1.5%,

0,1
Distinct count,136
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0

0,1
T900,750
T115,542
T138,314
Other values (133),8900

Value,Count,Frequency (%),Unnamed: 3
T900,750,7.1%,
T115,542,5.2%,
T138,314,3.0%,
T002,185,1.8%,
T135,183,1.7%,
T054,163,1.6%,
T128,160,1.5%,
T151,156,1.5%,
T212,152,1.4%,
T090,151,1.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.052446

0,1
0,9955
1,551

Value,Count,Frequency (%),Unnamed: 3
0,9955,94.8%,
1,551,5.2%,

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0


In [53]:
print("Train set data types:\n")
print(train.dtypes)
print("-----")
print("Weather set data types:\n")
print(weather.dtypes)
print("-----")
print("Spray set data types:\n")
print(spray.dtypes)

Train set data types:

Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
dtype: object
-----
Weather set data types:

Station          int64
Date            object
Tmax             int64
Tmin             int64
Tavg            object
Depart          object
DewPoint         int64
WetBulb         object
Heat            object
Cool            object
Sunrise         object
Sunset          object
CodeSum         object
Depth           object
Water1          object
SnowFall        object
PrecipTotal     object
StnPressure     object
SeaLevel        object
ResultSpeed    float64
ResultDir        int64
AvgSpeed        object
dtype: object
----

In [5]:
train.describe()

Unnamed: 0,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
count,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0
mean,35.687797,41.841139,-87.699908,7.819532,12.853512,0.052446
std,24.339468,0.112742,0.096514,1.452921,16.133816,0.222936
min,10.0,41.644612,-87.930995,3.0,1.0,0.0
25%,12.0,41.732984,-87.76007,8.0,2.0,0.0
50%,33.0,41.846283,-87.694991,8.0,5.0,0.0
75%,52.0,41.95469,-87.627796,9.0,17.0,0.0
max,98.0,42.01743,-87.531635,9.0,50.0,1.0


In [59]:
train['Species'].value_counts()

CULEX PIPIENS/RESTUANS    4752
CULEX RESTUANS            2740
CULEX PIPIENS             2699
CULEX TERRITANS            222
CULEX SALINARIUS            86
CULEX TARSALIS               6
CULEX ERRATICUS              1
Name: Species, dtype: int64

In [25]:
train_weather = train.merge(weather[weather['Station']==2], how='left', on='Date')

In [27]:
train_weather

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,...,BR HZ,M,M,M,0.00,29.44,30.09,5.8,16,7.4
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,...,BR HZ,M,M,M,0.00,29.44,30.09,5.8,16,7.4
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,...,BR HZ,M,M,M,0.00,29.44,30.09,5.8,16,7.4
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,...,BR HZ,M,M,M,0.00,29.44,30.09,5.8,16,7.4
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,...,BR HZ,M,M,M,0.00,29.44,30.09,5.8,16,7.4
5,2007-05-29,"1500 West Webster Avenue, Chicago, IL 60614, USA",CULEX RESTUANS,15,W WEBSTER AVE,T045,"1500 W WEBSTER AVE, Chicago, IL",41.921600,-87.666455,8,...,BR HZ,M,M,M,0.00,29.44,30.09,5.8,16,7.4
6,2007-05-29,"2500 West Grand Avenue, Chicago, IL 60654, USA",CULEX RESTUANS,25,W GRAND AVE,T046,"2500 W GRAND AVE, Chicago, IL",41.891118,-87.654491,8,...,BR HZ,M,M,M,0.00,29.44,30.09,5.8,16,7.4
7,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX PIPIENS/RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,...,BR HZ,M,M,M,0.00,29.44,30.09,5.8,16,7.4
8,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,...,BR HZ,M,M,M,0.00,29.44,30.09,5.8,16,7.4
9,2007-05-29,"1100 West Chicago Avenue, Chicago, IL 60642, USA",CULEX RESTUANS,11,W CHICAGO,T049,"1100 W CHICAGO, Chicago, IL",41.896282,-87.655232,8,...,BR HZ,M,M,M,0.00,29.44,30.09,5.8,16,7.4


In [52]:
train_weather.isnull().sum()

Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
NumMosquitos              0
WnvPresent                0
Station                   0
Tmax                      0
Tmin                      0
Tavg                      0
Depart                    0
DewPoint                  0
WetBulb                   0
Heat                      0
Cool                      0
Sunrise                   0
Sunset                    0
CodeSum                   0
Depth                     0
Water1                    0
SnowFall                  0
PrecipTotal               0
StnPressure               0
SeaLevel                  0
ResultSpeed               0
ResultDir                 0
AvgSpeed                  0
dtype: int64

In [31]:
train_weather.describe()

Unnamed: 0,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,Station,Tmax,Tmin,DewPoint,ResultSpeed,ResultDir
count,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0
mean,35.687797,41.841139,-87.699908,7.819532,12.853512,0.052446,2.0,81.986198,64.578336,59.58814,5.905854,17.758709
std,24.339468,0.112742,0.096514,1.452921,16.133816,0.222936,0.0,8.301924,7.393863,7.87718,2.925563,9.041969
min,10.0,41.644612,-87.930995,3.0,1.0,0.0,2.0,58.0,44.0,38.0,1.1,1.0
25%,12.0,41.732984,-87.76007,8.0,2.0,0.0,2.0,78.0,60.0,54.0,4.1,10.0
50%,33.0,41.846283,-87.694991,8.0,5.0,0.0,2.0,83.0,67.0,60.0,5.5,19.0
75%,52.0,41.95469,-87.627796,9.0,17.0,0.0,2.0,87.0,70.0,66.0,7.8,24.0
max,98.0,42.01743,-87.531635,9.0,50.0,1.0,2.0,97.0,79.0,73.0,15.4,36.0


In [32]:
train_weather.shape

(10506, 33)

In [33]:
train_weather.columns

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent', 'Station', 'Tmax', 'Tmin', 'Tavg',
       'Depart', 'DewPoint', 'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset',
       'CodeSum', 'Depth', 'Water1', 'SnowFall', 'PrecipTotal', 'StnPressure',
       'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed'],
      dtype='object')

In [42]:
train_weather_carry = pd.get_dummies(train_weather['Species'])[['CULEX PIPIENS/RESTUANS','CULEX PIPIENS','CULEX RESTUANS']]

In [56]:
y_Train = train_weather.WnvPresent
 
X_Train = pd.DataFrame()
X_Test = pd.DataFrame()

In [49]:
X_train['Latitude'] = train_weather.Latitude
X_train['Tmin'] = train_weather['Tmin']
X_train['Longitude'] = train_weather.Longitude
X_train['Tmax'] = train_weather.Tmax
X_train['AvgSpeed'] = train_weather['AvgSpeed']
X_train['DewPoint'] = train_weather.DewPoint
X_train['DayOfYear'] = pd.to_datetime(train_weather['Date'])
X_train['CULEX PIPIENS'] = train_weather_carry['CULEX PIPIENS']
X_train['CULEX RESTUANS'] = train_weather_carry['CULEX RESTUANS']
X_train['WetBulb'] = train_weather.WetBulb

X_train.shape

(10506, 10)

In [50]:
X_Train.dtypes

Latitude                 float64
Tmin                       int64
Longitude                float64
Tmax                       int64
DewPoint                   int64
DayOfYear         datetime64[ns]
CULEX PIPIENS              uint8
CULEX RESTUANS             uint8
WetBulb                   object
AvgSpeed                  object
dtype: object

In [61]:
model = GradientBoostingClassifier(max_features = 6, max_depth = 100) 
scores = cross_val_score(model, X_Train, y_Train, cv=3)
print(scores)
print(np.mean(scores))

ValueError: Found input variables with inconsistent numbers of samples: [0, 10506]

In [58]:
model = RandomForestClassifier(max_features = 6, max_depth = 20) 
scores = cross_val_score(model, X_Train, y_Train, cv=3)
print(scores)
print(np.mean(scores))

ValueError: Found input variables with inconsistent numbers of samples: [0, 10506]