In [1]:
# Load the cleaned training data set and print its schema
# (column names, non-null counts and dtypes).
import pandas as pd
train_cleaned=pd.read_csv("train_cleaned.csv")
train_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8610 entries, 0 to 8609
Data columns (total 16 columns):
Unnamed: 0                8610 non-null int64
Date                      8610 non-null object
Address                   8610 non-null object
Species                   8610 non-null object
Block                     8610 non-null int64
Street                    8610 non-null object
Trap                      8610 non-null object
AddressNumberAndStreet    8610 non-null object
Latitude                  8610 non-null float64
Longitude                 8610 non-null float64
AddressAccuracy           8610 non-null int64
WnvPresent                8610 non-null int64
NumMosquitos              8610 non-null int64
year                      8610 non-null int64
month                     8610 non-null int64
day                       8610 non-null int64
dtypes: float64(2), int64(8), object(6)
memory usage: 1.1+ MB


In [15]:
# Load the spray records and print the schema
# (column names, non-null counts and dtypes).
spray = pd.read_csv('spray.csv')
spray.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14835 entries, 0 to 14834
Data columns (total 4 columns):
Date         14835 non-null object
Time         14251 non-null object
Latitude     14835 non-null float64
Longitude    14835 non-null float64
dtypes: float64(2), object(2)
memory usage: 463.7+ KB


In [16]:
# Distinct (Date, Latitude, Longitude) combinations present in the training data
train_long_lat = train_cleaned.loc[:, ['Date', 'Latitude', 'Longitude']].drop_duplicates()
train_long_lat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4616 entries, 0 to 8609
Data columns (total 3 columns):
Date         4616 non-null object
Latitude     4616 non-null float64
Longitude    4616 non-null float64
dtypes: float64(2), object(1)
memory usage: 144.2+ KB


In [17]:
# Distinct (Date, Latitude, Longitude) combinations present in the spray data
spray_long_lat = spray.loc[:, ['Date', 'Latitude', 'Longitude']].drop_duplicates()
spray_long_lat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14294 entries, 0 to 14834
Data columns (total 3 columns):
Date         14294 non-null object
Latitude     14294 non-null float64
Longitude    14294 non-null float64
dtypes: float64(2), object(1)
memory usage: 446.7+ KB


In [18]:
import sqlite3
# In-memory SQLite database used to combine the train and spray locations with SQL.
conn = sqlite3.connect(':memory:')
# Both frames are written without their pandas index.
train_long_lat.to_sql('train_long_lat', conn, index=False)
# NOTE(review): the full `spray` frame is loaded here, not the de-duplicated
# `spray_long_lat`, so de-duplication of spray rows is left to the SQL query.
spray.to_sql('spray', conn, index=False)

In [19]:
# Create distinct combinations of Date,Lat,Long from train and spray.
# Every distinct train (Date, Latitude, Longitude) row is paired with every
# distinct spray (Date, Latitude, Longitude) row (a deliberate cross join).
# Each side is de-duplicated *before* the join: the product of two distinct
# sets is already distinct, so this returns the same rows as applying DISTINCT
# to the full Cartesian product, without de-duplicating tens of millions of
# joined rows. Row order is not guaranteed; sort downstream if it matters.
qry = '''
    select
        train_pts.*,
        spray_pts.Date as spray_date,
        spray_pts.Latitude as spray_Latitude,
        spray_pts.Longitude as spray_Longitude
    from
        (select distinct * from train_long_lat) as train_pts
        cross join
        (select distinct Date, Latitude, Longitude from spray) as spray_pts
    '''

In [None]:
import pandas as pd
# Run the query and materialise the result as a DataFrame.
# NOTE(review): the query pairs every train location with every spray location,
# so the result is very large -- expect a long run time and high memory use.
train_long_lat_1 = pd.read_sql_query(qry, conn)
train_long_lat_1.info()

In [6]:
#Function to calculate distance between 2 geolocations
import numpy as np

def haversine_np(lon1, lat1, lon2, lat2):
    """
    Great-circle (haversine) distance between two sets of points, in miles.

    Parameters
    ----------
    lon1, lat1 : longitude / latitude of the first point(s), decimal degrees.
    lon2, lat2 : longitude / latitude of the second point(s), decimal degrees.

    Inputs are combined element-wise with numpy arithmetic, so array-like
    arguments must have matching lengths.

    Returns
    -------
    The distance computed as ``6367 * c / 1.609`` where ``c`` is the central
    angle in radians (6367 is used as the Earth radius in km, 1.609 as km per
    mile).
    """
    # Work in radians throughout.
    lon1_rad = np.radians(lon1)
    lat1_rad = np.radians(lat1)
    lon2_rad = np.radians(lon2)
    lat2_rad = np.radians(lat2)

    delta_lat = lat2_rad - lat1_rad
    delta_lon = lon2_rad - lon1_rad

    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
    lat_term = np.sin(delta_lat/2.0)**2
    lon_term = np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(delta_lon/2.0)**2
    hav = lat_term + lon_term

    # Central angle, then scale to km and convert to miles.
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return 6367 * central_angle/1.609

In [21]:
# Distance from every train location to the paired spray location
train_long_lat_1['dist'] = haversine_np(
    lon1=train_long_lat_1['Longitude'],
    lat1=train_long_lat_1['Latitude'],
    lon2=train_long_lat_1['spray_Longitude'],
    lat2=train_long_lat_1['spray_Latitude']
)
train_long_lat_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65981104 entries, 0 to 65981103
Data columns (total 7 columns):
Date               object
Latitude           float64
Longitude          float64
spray_date         object
spray_Latitude     float64
spray_Longitude    float64
dist               float64
dtypes: float64(5), object(2)
memory usage: 3.4+ GB


In [22]:
# For every (train date, train location, spray date) keep only the closest
# spray point: order all pairs by distance, then take the first row per group.
train_long_lat_2 = (
    train_long_lat_1
    .sort_values(by='dist')
    .groupby(['Date', 'Latitude', 'Longitude', 'spray_date'])
    .head(1)
)


In [23]:
# Order the nearest-spray rows by train date, train location and spray date
train_long_lat_3 = train_long_lat_2.sort_values(
    by=['Date', 'Latitude', 'Longitude', 'spray_date']
)

In [24]:
# Parse both date columns, then compute the day offset of each spray date
# relative to the train date (spray_date - Date, so a spray that happens after
# the train date gives a positive number).
for date_col in ('spray_date', 'Date'):
    train_long_lat_3[date_col] = pd.to_datetime(train_long_lat_3[date_col])
train_long_lat_3['days_frm_spray'] = (
    train_long_lat_3['spray_date'].sub(train_long_lat_3['Date']).dt.days
)


In [25]:
# Preview the first rows: nearest spray point and day offset per train location and spray date.
train_long_lat_3.head(20)

Unnamed: 0,Date,Latitude,Longitude,spray_date,spray_Latitude,spray_Longitude,dist,days_frm_spray
114357,2007-05-29,41.688324,-87.676709,2011-08-29,42.390395,-88.088315,52.885172,1553
114470,2007-05-29,41.688324,-87.676709,2011-09-07,41.968553,-87.788695,20.193797,1562
117652,2007-05-29,41.688324,-87.676709,2013-07-17,41.714098,-87.659542,1.988063,2241
118597,2007-05-29,41.688324,-87.676709,2013-07-25,41.939342,-87.716252,17.455543,2249
119834,2007-05-29,41.688324,-87.676709,2013-08-08,41.917765,-87.694332,15.872227,2263
122546,2007-05-29,41.688324,-87.676709,2013-08-15,41.887967,-87.745625,14.237648,2270
123777,2007-05-29,41.688324,-87.676709,2013-08-16,41.911435,-87.746675,15.824557,2271
125406,2007-05-29,41.688324,-87.676709,2013-08-22,41.714268,-87.61879,3.482891,2277
125744,2007-05-29,41.688324,-87.676709,2013-08-29,41.759397,-87.694132,4.990095,2284
128438,2007-05-29,41.688324,-87.676709,2013-09-05,41.977485,-87.83951,21.656838,2291


In [26]:
# Reshape to one row per (Date, Latitude, Longitude) with one
# 'dist_<spray date>' column per spray date holding the distance.
train_long_lat_4 = (
    train_long_lat_3
    .pivot_table(values='dist', index=['Date', 'Latitude', 'Longitude'], columns=['spray_date'])
    .add_prefix('dist_')
    .reset_index()
)
train_long_lat_4.head()

spray_date,Date,Latitude,Longitude,dist_2011-08-29 00:00:00,dist_2011-09-07 00:00:00,dist_2013-07-17 00:00:00,dist_2013-07-25 00:00:00,dist_2013-08-08 00:00:00,dist_2013-08-15 00:00:00,dist_2013-08-16 00:00:00,dist_2013-08-22 00:00:00,dist_2013-08-29 00:00:00,dist_2013-09-05 00:00:00
0,2007-05-29,41.688324,-87.676709,52.885172,20.193797,1.988063,17.455543,15.872227,14.237648,15.824557,3.482891,4.990095,21.656838
1,2007-05-29,41.720848,-87.666014,51.06151,18.234962,0.168424,15.310064,13.659711,12.248106,13.802109,2.261182,3.031135,19.845124
2,2007-05-29,41.731922,-87.677512,50.118495,17.31492,0.344506,14.463412,12.86435,11.333221,12.898928,2.850825,2.081839,18.896556
3,2007-05-29,41.732984,-87.649642,50.66984,17.772775,0.009122,14.658389,12.899166,11.789201,13.298065,1.436292,2.929726,19.507599
4,2007-05-29,41.862292,-87.64886,42.860201,10.271628,8.220944,6.326835,4.168725,5.282698,6.067388,3.088242,5.778194,11.877982


In [27]:
# Reshape to one row per (Date, Latitude, Longitude) with one
# 'days_<spray date>' column per spray date holding the day offset.
train_long_lat_5 = (
    train_long_lat_3
    .pivot_table(values='days_frm_spray', index=['Date', 'Latitude', 'Longitude'], columns=['spray_date'])
    .add_prefix('days_')
    .reset_index()
)
train_long_lat_5.head()

spray_date,Date,Latitude,Longitude,days_2011-08-29 00:00:00,days_2011-09-07 00:00:00,days_2013-07-17 00:00:00,days_2013-07-25 00:00:00,days_2013-08-08 00:00:00,days_2013-08-15 00:00:00,days_2013-08-16 00:00:00,days_2013-08-22 00:00:00,days_2013-08-29 00:00:00,days_2013-09-05 00:00:00
0,2007-05-29,41.688324,-87.676709,1553,1562,2241,2249,2263,2270,2271,2277,2284,2291
1,2007-05-29,41.720848,-87.666014,1553,1562,2241,2249,2263,2270,2271,2277,2284,2291
2,2007-05-29,41.731922,-87.677512,1553,1562,2241,2249,2263,2270,2271,2277,2284,2291
3,2007-05-29,41.732984,-87.649642,1553,1562,2241,2249,2263,2270,2271,2277,2284,2291
4,2007-05-29,41.862292,-87.64886,1553,1562,2241,2249,2263,2270,2271,2277,2284,2291


In [28]:
# Put the distance and day-offset columns side by side, keyed on train date and location
train_long_lat_6 = train_long_lat_4.merge(
    train_long_lat_5,
    on=['Date', 'Latitude', 'Longitude']
)
train_long_lat_6.head()

spray_date,Date,Latitude,Longitude,dist_2011-08-29 00:00:00,dist_2011-09-07 00:00:00,dist_2013-07-17 00:00:00,dist_2013-07-25 00:00:00,dist_2013-08-08 00:00:00,dist_2013-08-15 00:00:00,dist_2013-08-16 00:00:00,...,days_2011-08-29 00:00:00,days_2011-09-07 00:00:00,days_2013-07-17 00:00:00,days_2013-07-25 00:00:00,days_2013-08-08 00:00:00,days_2013-08-15 00:00:00,days_2013-08-16 00:00:00,days_2013-08-22 00:00:00,days_2013-08-29 00:00:00,days_2013-09-05 00:00:00
0,2007-05-29,41.688324,-87.676709,52.885172,20.193797,1.988063,17.455543,15.872227,14.237648,15.824557,...,1553,1562,2241,2249,2263,2270,2271,2277,2284,2291
1,2007-05-29,41.720848,-87.666014,51.06151,18.234962,0.168424,15.310064,13.659711,12.248106,13.802109,...,1553,1562,2241,2249,2263,2270,2271,2277,2284,2291
2,2007-05-29,41.731922,-87.677512,50.118495,17.31492,0.344506,14.463412,12.86435,11.333221,12.898928,...,1553,1562,2241,2249,2263,2270,2271,2277,2284,2291
3,2007-05-29,41.732984,-87.649642,50.66984,17.772775,0.009122,14.658389,12.899166,11.789201,13.298065,...,1553,1562,2241,2249,2263,2270,2271,2277,2284,2291
4,2007-05-29,41.862292,-87.64886,42.860201,10.271628,8.220944,6.326835,4.168725,5.282698,6.067388,...,1553,1562,2241,2249,2263,2270,2271,2277,2284,2291


In [None]:
# Save the merged spray distance / day-offset features to CSV without the index.
# The path is relative to the working directory; per the original note the file
# is kept in the clean data folder.
train_long_lat_6.to_csv('train_spray_merged_v1.csv',index=False)

In [2]:
import pandas as pd
# Reload the saved spray features from CSV.
# NOTE(review): float_precision='round_trip' is presumably used so that
# Latitude/Longitude parse back to exactly the values that were written, which
# matters if they are used as exact-match merge keys -- TODO confirm.
train_long_lat_6=pd.read_csv("train_spray_merged_v1.csv",float_precision='round_trip')

In [4]:
# Add the spray distance / day-offset fields to the train data.
# The merge keys are Date plus the float Latitude/Longitude, so a row only
# matches when all three values are exactly equal in both frames.
train_spray_merged = pd.merge(train_cleaned, train_long_lat_6, on=['Date', 'Latitude', 'Longitude'])
# pd.merge defaults to an inner join: unmatched training rows would be dropped
# silently and repeated keys would duplicate rows. Fail loudly instead of
# carrying a truncated or inflated training set forward.
assert len(train_spray_merged) == len(train_cleaned), (
    "merge on Date/Latitude/Longitude changed the number of training rows: "
    "%d -> %d" % (len(train_cleaned), len(train_spray_merged))
)
train_spray_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8610 entries, 0 to 8609
Data columns (total 36 columns):
Unnamed: 0                  8610 non-null int64
Date                        8610 non-null object
Address                     8610 non-null object
Species                     8610 non-null object
Block                       8610 non-null int64
Street                      8610 non-null object
Trap                        8610 non-null object
AddressNumberAndStreet      8610 non-null object
Latitude                    8610 non-null float64
Longitude                   8610 non-null float64
AddressAccuracy             8610 non-null int64
WnvPresent                  8610 non-null int64
NumMosquitos                8610 non-null int64
year                        8610 non-null int64
month                       8610 non-null int64
day                         8610 non-null int64
dist_2011-08-29 00:00:00    8610 non-null float64
dist_2011-09-07 00:00:00    8610 non-null float64
dist_2013-07-17

In [7]:
# Distance from each trap to the two weather stations (Station1 / Station2)
# Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
# Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level
train_spray_merged['trap_dist_st1'] = haversine_np(
    lon1=train_spray_merged['Longitude'],
    lat1=train_spray_merged['Latitude'],
    lon2=-87.933,
    lat2=41.995
)
train_spray_merged['trap_dist_st2'] = haversine_np(
    lon1=train_spray_merged['Longitude'],
    lat1=train_spray_merged['Latitude'],
    lon2=-87.752,
    lat2=41.786
)
train_spray_merged.head()

Unnamed: 0.1,Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,...,days_2013-07-17 00:00:00,days_2013-07-25 00:00:00,days_2013-08-08 00:00:00,days_2013-08-15 00:00:00,days_2013-08-16 00:00:00,days_2013-08-22 00:00:00,days_2013-08-29 00:00:00,days_2013-09-05 00:00:00,trap_dist_st1,trap_dist_st2
0,0,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX PIPIENS/RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,...,2241,2249,2263,2270,2271,2277,2284,2291,16.828145,7.529955
1,1,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,...,2241,2249,2263,2270,2271,2277,2284,2291,16.828145,7.529955
2,2,2007-05-29,"1100 South Peoria Street, Chicago, IL 60608, USA",CULEX RESTUANS,11,S PEORIA ST,T091,"1100 S PEORIA ST, Chicago, IL",41.862292,-87.64886,...,2241,2249,2263,2270,2271,2277,2284,2291,17.23834,7.479372
3,3,2007-05-29,"1100 West Chicago Avenue, Chicago, IL 60642, USA",CULEX RESTUANS,11,W CHICAGO,T049,"1100 W CHICAGO, Chicago, IL",41.896282,-87.655232,...,2241,2249,2263,2270,2271,2277,2284,2291,15.813849,9.099614
4,4,2007-05-29,"1500 North Long Avenue, Chicago, IL 60651, USA",CULEX RESTUANS,15,N LONG AVE,T153,"1500 N LONG AVE, Chicago, IL",41.907645,-87.760886,...,2241,2249,2263,2270,2271,2277,2284,2291,10.70297,8.413806


In [8]:
#Assign closest station to trap
def nearest_station(train_spray_merged):
    """Return the number (1 or 2) of the weather station closest to a trap.

    ``train_spray_merged`` is a single row (any mapping-like object) exposing
    'trap_dist_st1' and 'trap_dist_st2'. Station 2 is chosen only when it is
    strictly closer; ties go to station 1.
    """
    station2_is_closer = (
        train_spray_merged['trap_dist_st2'] < train_spray_merged['trap_dist_st1']
    )
    return 2 if station2_is_closer else 1

In [9]:
# Label every row with the number of its nearest weather station
train_spray_merged = train_spray_merged.assign(
    Station=train_spray_merged.apply(nearest_station, axis=1)
)
train_spray_merged.head()

Unnamed: 0.1,Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,...,days_2013-07-25 00:00:00,days_2013-08-08 00:00:00,days_2013-08-15 00:00:00,days_2013-08-16 00:00:00,days_2013-08-22 00:00:00,days_2013-08-29 00:00:00,days_2013-09-05 00:00:00,trap_dist_st1,trap_dist_st2,Station
0,0,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX PIPIENS/RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,...,2249,2263,2270,2271,2277,2284,2291,16.828145,7.529955,2
1,1,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,...,2249,2263,2270,2271,2277,2284,2291,16.828145,7.529955,2
2,2,2007-05-29,"1100 South Peoria Street, Chicago, IL 60608, USA",CULEX RESTUANS,11,S PEORIA ST,T091,"1100 S PEORIA ST, Chicago, IL",41.862292,-87.64886,...,2249,2263,2270,2271,2277,2284,2291,17.23834,7.479372,2
3,3,2007-05-29,"1100 West Chicago Avenue, Chicago, IL 60642, USA",CULEX RESTUANS,11,W CHICAGO,T049,"1100 W CHICAGO, Chicago, IL",41.896282,-87.655232,...,2249,2263,2270,2271,2277,2284,2291,15.813849,9.099614,2
4,4,2007-05-29,"1500 North Long Avenue, Chicago, IL 60651, USA",CULEX RESTUANS,15,N LONG AVE,T153,"1500 N LONG AVE, Chicago, IL",41.907645,-87.760886,...,2249,2263,2270,2271,2277,2284,2291,10.70297,8.413806,2


In [10]:
# How many rows are assigned to each nearest station
train_spray_merged['Station'].value_counts()

2    6090
1    2520
Name: Station, dtype: int64

In [11]:
import pandas as pd
# Load the cleaned weather variables and print the schema
# (column names, non-null counts and dtypes).
weather_data=pd.read_csv("weather_var_cleaned.csv")
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472 entries, 0 to 1471
Data columns (total 79 columns):
Station_Station1        1472 non-null int64
WeatherLat_Station1     1472 non-null float64
WeatherLon_Station1     1472 non-null float64
Date                    1472 non-null object
Tmax_Station1           1472 non-null int64
Tmin_Station1           1472 non-null int64
Tavg_Station1           1472 non-null int64
Depart_Station1         1472 non-null int64
DewPoint_Station1       1472 non-null int64
WetBulb_Station1        1472 non-null float64
Heat_Station1           1472 non-null int64
Cool_Station1           1472 non-null int64
CodeSum_Station1        1472 non-null object
PrecipTotal_Station1    1472 non-null float64
StnPressure_Station1    1472 non-null float64
SeaLevel_Station1       1472 non-null float64
ResultSpeed_Station1    1472 non-null float64
ResultDir_Station1      1472 non-null int64
AvgSpeed_Station1       1472 non-null float64
Sunrise_Station1        1472 non-null o

In [12]:
# Column names for station 1 and station 2: every column whose name contains
# the station digit, followed by the shared 'Date' column.
st1_cols = [name for name in weather_data.columns if '1' in name] + ['Date']
st2_cols = [name for name in weather_data.columns if '2' in name] + ['Date']

In [13]:
# One frame per station, restricted to that station's columns plus 'Date'
weather_data_st1 = weather_data.filter(items=st1_cols)
weather_data_st2 = weather_data.filter(items=st2_cols)
weather_data_st1.head()

Unnamed: 0,Station_Station1,WeatherLat_Station1,WeatherLon_Station1,Tmax_Station1,Tmin_Station1,Tavg_Station1,Depart_Station1,DewPoint_Station1,WetBulb_Station1,Heat_Station1,...,HZ1,NO_EVENT1,RA1,SN1,SQ1,TS1,TSRA1,VCFG1,VCTS1,Date
0,1,41.995,-87.933,83,50,67,14,51,56.0,0,...,0,1,0,0,0,0,0,0,0,5/1/07
1,1,41.995,-87.933,59,42,51,-3,42,47.0,14,...,0,0,0,0,0,0,0,0,0,5/2/07
2,1,41.995,-87.933,66,46,56,2,40,48.0,9,...,0,1,0,0,0,0,0,0,0,5/3/07
3,1,41.995,-87.933,66,49,58,4,41,50.0,7,...,0,0,1,0,0,0,0,0,0,5/4/07
4,1,41.995,-87.933,66,53,60,5,38,49.0,5,...,0,1,0,0,0,0,0,0,0,5/5/07


In [14]:
# Show the station 1 column names in order
weather_data_st1.columns.tolist()

['Station_Station1',
 'WeatherLat_Station1',
 'WeatherLon_Station1',
 'Tmax_Station1',
 'Tmin_Station1',
 'Tavg_Station1',
 'Depart_Station1',
 'DewPoint_Station1',
 'WetBulb_Station1',
 'Heat_Station1',
 'Cool_Station1',
 'CodeSum_Station1',
 'PrecipTotal_Station1',
 'StnPressure_Station1',
 'SeaLevel_Station1',
 'ResultSpeed_Station1',
 'ResultDir_Station1',
 'AvgSpeed_Station1',
 'Sunrise_Station1',
 'Sunset_Station1',
 'WetBulb_Station1_c',
 'Tavg_Station1_c',
 'rel_hum_station1',
 'BCFG1',
 'BR1',
 'DZ1',
 'FG1',
 'FGplus_1',
 'FU1',
 'GR1',
 'HZ1',
 'NO_EVENT1',
 'RA1',
 'SN1',
 'SQ1',
 'TS1',
 'TSRA1',
 'VCFG1',
 'VCTS1',
 'Date']

In [15]:
# Station-agnostic column names, listed in the order in which they are applied
# to the per-station weather frames ('Date' comes last).
station_cols = [
    'Station', 'WeatherLat_Station', 'WeatherLon_Station',
    'Tmax_Station', 'Tmin_Station', 'Tavg_Station', 'Depart_Station',
    'DewPoint_Station', 'WetBulb_Station', 'Heat_Station', 'Cool_Station',
    'CodeSum_Station', 'PrecipTotal_Station', 'StnPressure_Station',
    'SeaLevel_Station', 'ResultSpeed_Station', 'ResultDir_Station',
    'AvgSpeed_Station', 'Sunrise_Station', 'Sunset_Station',
    'WetBulb_Station_c', 'Tavg_Station_c', 'rel_hum_station',
    'BCFG', 'BR', 'DZ', 'FG', 'FGplus_', 'FU', 'GR', 'HZ', 'NO_EVENT',
    'RA', 'SN', 'SQ', 'TS', 'TSRA', 'VCFG', 'VCTS',
    'Date',
]

In [16]:
# Give both station frames the same generic column names. Names are assigned
# by position, so each frame's column order must match station_cols.
weather_data_st1.columns = list(station_cols)
weather_data_st2.columns = list(station_cols)

In [17]:
# Preview the first rows of the station 2 frame.
weather_data_st2.head()

Unnamed: 0,Station,WeatherLat_Station,WeatherLon_Station,Tmax_Station,Tmin_Station,Tavg_Station,Depart_Station,DewPoint_Station,WetBulb_Station,Heat_Station,...,HZ,NO_EVENT,RA,SN,SQ,TS,TSRA,VCFG,VCTS,Date
0,2,41.786,-87.752,84,52,68,14,51,57.0,0,...,0,1,0,0,0,0,0,0,0,5/1/07
1,2,41.786,-87.752,60,43,52,-3,42,47.0,13,...,0,0,0,0,0,0,0,0,0,5/2/07
2,2,41.786,-87.752,67,48,58,2,40,50.0,7,...,0,1,0,0,0,0,0,0,0,5/3/07
3,2,41.786,-87.752,78,51,64,4,42,50.0,7,...,0,0,1,0,0,0,0,0,0,5/4/07
4,2,41.786,-87.752,66,54,60,5,39,50.0,5,...,0,1,0,0,0,0,0,0,0,5/5/07


In [18]:
# Append station 1 and station 2 data (rows of station 2 follow station 1).
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat stacks the two frames the same way and keeps each frame's own index.
weather_data_st_append = pd.concat([weather_data_st1, weather_data_st2])
weather_data_st_append.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2944 entries, 0 to 1471
Data columns (total 40 columns):
Station                2944 non-null int64
WeatherLat_Station     2944 non-null float64
WeatherLon_Station     2944 non-null float64
Tmax_Station           2944 non-null int64
Tmin_Station           2944 non-null int64
Tavg_Station           2944 non-null int64
Depart_Station         2944 non-null int64
DewPoint_Station       2944 non-null int64
WetBulb_Station        2944 non-null float64
Heat_Station           2944 non-null int64
Cool_Station           2944 non-null int64
CodeSum_Station        2944 non-null object
PrecipTotal_Station    2944 non-null float64
StnPressure_Station    2944 non-null float64
SeaLevel_Station       2944 non-null float64
ResultSpeed_Station    2944 non-null float64
ResultDir_Station      2944 non-null int64
AvgSpeed_Station       2944 non-null float64
Sunrise_Station        2944 non-null object
Sunset_Station         2944 non-null object
WetBulb_Statio

In [21]:
# Merge train and weather: each training row is matched to the weather row with
# the same Date and the same Station number.
# Both Date columns are parsed to datetimes first so the keys compare equal.
weather_data_st_append['Date'] = pd.to_datetime(weather_data_st_append['Date'])
train_spray_merged['Date'] = pd.to_datetime(train_spray_merged['Date'])
train_spray_weather_v1 = pd.merge(train_spray_merged, weather_data_st_append, on=['Date', 'Station'])
# pd.merge defaults to an inner join: training rows without a weather match
# would be dropped silently and repeated (Date, Station) weather rows would
# duplicate training rows. Fail loudly if the row count changes.
assert len(train_spray_weather_v1) == len(train_spray_merged), (
    "merge on Date/Station changed the number of training rows: "
    "%d -> %d" % (len(train_spray_merged), len(train_spray_weather_v1))
)
train_spray_weather_v1.head()

Unnamed: 0.1,Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,...,GR,HZ,NO_EVENT,RA,SN,SQ,TS,TSRA,VCFG,VCTS
0,0,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX PIPIENS/RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,...,0,1,0,0,0,0,0,0,0,0
1,1,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,...,0,1,0,0,0,0,0,0,0,0
2,2,2007-05-29,"1100 South Peoria Street, Chicago, IL 60608, USA",CULEX RESTUANS,11,S PEORIA ST,T091,"1100 S PEORIA ST, Chicago, IL",41.862292,-87.64886,...,0,1,0,0,0,0,0,0,0,0
3,3,2007-05-29,"1100 West Chicago Avenue, Chicago, IL 60642, USA",CULEX RESTUANS,11,W CHICAGO,T049,"1100 W CHICAGO, Chicago, IL",41.896282,-87.655232,...,0,1,0,0,0,0,0,0,0,0
4,4,2007-05-29,"1500 North Long Avenue, Chicago, IL 60651, USA",CULEX RESTUANS,15,N LONG AVE,T153,"1500 N LONG AVE, Chicago, IL",41.907645,-87.760886,...,0,1,0,0,0,0,0,0,0,0


In [22]:
# Remove the unwanted 'Unnamed: 0' column
train_spray_weather_v1 = train_spray_weather_v1.drop('Unnamed: 0', axis='columns')
train_spray_weather_v1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8610 entries, 0 to 8609
Data columns (total 76 columns):
Date                        8610 non-null datetime64[ns]
Address                     8610 non-null object
Species                     8610 non-null object
Block                       8610 non-null int64
Street                      8610 non-null object
Trap                        8610 non-null object
AddressNumberAndStreet      8610 non-null object
Latitude                    8610 non-null float64
Longitude                   8610 non-null float64
AddressAccuracy             8610 non-null int64
WnvPresent                  8610 non-null int64
NumMosquitos                8610 non-null int64
year                        8610 non-null int64
month                       8610 non-null int64
day                         8610 non-null int64
dist_2011-08-29 00:00:00    8610 non-null float64
dist_2011-09-07 00:00:00    8610 non-null float64
dist_2013-07-17 00:00:00    8610 non-null float64
dist_

In [23]:
# Write the combined train + spray + weather table to CSV, without the index
train_spray_weather_v1.to_csv(path_or_buf='train_spray_weather_v1.csv', index=False)