In [1]:
# Import libraries

import numpy as np
import pandas as pd

In [2]:
# Import files

train = pd.read_csv('../assets/train_modified.csv')
test = pd.read_csv('../assets/test_modified.csv')
weather = pd.read_csv('../assets/weather_modified.csv')
spray = pd.read_csv('../assets/spray_modified.csv')

In [3]:
# Set columns and row values to see everything

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 200)

In [4]:
# Get all dates back to DateTime:

train["YMD"] = pd.to_datetime(train["YMD"], format = "%Y-%m-%d")
test["YMD"] = pd.to_datetime(test["YMD"], format = "%Y-%m-%d")
spray["YMD"] = pd.to_datetime(spray["YMD"], format = "%Y-%m-%d")
weather["YMD"] = pd.to_datetime(weather["YMD"], format = "%Y-%m-%d")

In [5]:
# Verify everything is correct now:

# Ritika's EDA function for initial investigation

def eda(dataframe):
    print "missing values \n", dataframe.isnull().sum(), "\n"
    print "dataframe types \n", dataframe.dtypes, "\n"
    print "dataframe shape \n", dataframe.shape, "\n"
    print "dataframe describe \n", dataframe.describe(), "\n"
    for item in dataframe:
        print item
        print dataframe[item].nunique()

# Confirm the clean data before moving forward

In [6]:
train.head(2)

Unnamed: 0,Species,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,YMD
0,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,9,1,0,2007-05-29
1,CULEX RESTUANS,41.95469,-87.800991,9,1,0,2007-05-29


In [7]:
eda(train)

missing values 
Species            0
Latitude           0
Longitude          0
AddressAccuracy    0
NumMosquitos       0
WnvPresent         0
YMD                0
dtype: int64 

dataframe types 
Species                    object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
YMD                datetime64[ns]
dtype: object 

dataframe shape 
(10506, 7) 

dataframe describe 
           Latitude     Longitude  AddressAccuracy  NumMosquitos    WnvPresent
count  10506.000000  10506.000000     10506.000000  10506.000000  10506.000000
mean      41.841139    -87.699908         7.819532     12.853512      0.052446
std        0.112742      0.096514         1.452921     16.133816      0.222936
min       41.644612    -87.930995         3.000000      1.000000      0.000000
25%       41.732984    -87.760070         8.000000      2.000000      0.000000
50%       41.846283    -87.

In [8]:
test.head(2)

Unnamed: 0,Species,Latitude,Longitude,AddressAccuracy,YMD
0,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,9,2008-06-11
1,CULEX RESTUANS,41.95469,-87.800991,9,2008-06-11


In [9]:
eda(test)

missing values 
Species            0
Latitude           0
Longitude          0
AddressAccuracy    0
YMD                0
dtype: int64 

dataframe types 
Species                    object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
YMD                datetime64[ns]
dtype: object 

dataframe shape 
(116293, 5) 

dataframe describe 
            Latitude      Longitude  AddressAccuracy
count  116293.000000  116293.000000    116293.000000
mean       41.849389     -87.693658         7.954357
std         0.106593       0.080699         1.252733
min        41.644612     -87.930995         3.000000
25%        41.753411     -87.750938         8.000000
50%        41.862292     -87.694991         8.000000
75%        41.951866     -87.648860         9.000000
max        42.017430     -87.531635         9.000000 

Species
8
Latitude
151
Longitude
151
AddressAccuracy
4
YMD
95


In [10]:
spray.head(2)

Unnamed: 0,Latitude,Longitude,YMD
0,42.391623,-88.089163,2011-08-29
1,42.391348,-88.089163,2011-08-29


In [11]:
eda(spray)

missing values 
Latitude     0
Longitude    0
YMD          0
dtype: int64 

dataframe types 
Latitude            float64
Longitude           float64
YMD          datetime64[ns]
dtype: object 

dataframe shape 
(14835, 3) 

dataframe describe 
           Latitude     Longitude
count  14835.000000  14835.000000
mean      41.904828    -87.736690
std        0.104381      0.067292
min       41.713925    -88.096468
25%       41.785001    -87.794225
50%       41.940075    -87.727853
75%       41.980978    -87.694108
max       42.395983    -87.586727 

Latitude
12887
Longitude
13007
YMD
10


In [12]:
weather.head(2)

Unnamed: 0,Station,Tmax,Tmin,DewPoint,WetBulb,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Tavg_int,Normal_Temp,Depart_calc,RA,BR,TS,HZ,SN,FG,FG+,FU,DZ,VC,MI,BC,YMD
0,1,83,50,51,56.0,448,1849,0.0,29.1,29.82,1.7,27,9.2,67,53.0,14.0,0,0,0,0,0,0,0,0,0,0,0,0,2007-05-01
1,2,84,52,51,57.0,448,1849,0.0,29.18,29.82,2.7,25,9.6,68,53.0,15.0,0,0,0,0,0,0,0,0,0,0,0,0,2007-05-01


In [13]:
eda(weather)

missing values 
Station        0
Tmax           0
Tmin           0
DewPoint       0
WetBulb        0
Sunrise        0
Sunset         0
PrecipTotal    0
StnPressure    0
SeaLevel       0
ResultSpeed    0
ResultDir      0
AvgSpeed       0
Tavg_int       0
Normal_Temp    0
Depart_calc    0
RA             0
BR             0
TS             0
HZ             0
SN             0
FG             0
FG+            0
FU             0
DZ             0
VC             0
MI             0
BC             0
YMD            0
dtype: int64 

dataframe types 
Station                 int64
Tmax                    int64
Tmin                    int64
DewPoint                int64
WetBulb               float64
Sunrise                 int64
Sunset                  int64
PrecipTotal           float64
StnPressure           float64
SeaLevel              float64
ResultSpeed           float64
ResultDir               int64
AvgSpeed              float64
Tavg_int                int64
Normal_Temp           float64
Depart_ca

## Working on species for model input

In [14]:
# Which species can be studied?
print train.Species.value_counts()
print test.Species.value_counts()

CULEX PIPIENS/RESTUANS    4752
CULEX RESTUANS            2740
CULEX PIPIENS             2699
CULEX TERRITANS            222
CULEX SALINARIUS            86
CULEX TARSALIS               6
CULEX ERRATICUS              1
Name: Species, dtype: int64
CULEX PIPIENS/RESTUANS    15359
CULEX RESTUANS            14670
CULEX PIPIENS             14521
CULEX SALINARIUS          14355
CULEX TERRITANS           14351
CULEX TARSALIS            14347
CULEX ERRATICUS           14345
UNSPECIFIED CULEX         14345
Name: Species, dtype: int64


In [15]:
#Mosquito species present

# Present:
# Pipiens, Restuans, Salinarius, Territans,
# Tarsalis, Erraticus

# Reasonable for analysis based on counts to train with?
# Pipiens, Restuans, Salinarius, Territans
# Ignore:  Tarsalis, Erraticus (7 samples)

# Create a list based on terms of interest to generate dummy variable
species_list = ["PIPIENS", "RESTUANS", "SALINARIUS", "TERRITANS"]

In [16]:
# Create a "Species" function; going back to Weather dummy function from original EDA:


def species_dummy(df_to_check, column_to_check, terms_to_check):

    # Iterate through the weather list types:
    for i in terms_to_check:
        # Iterate through the column:
        inner_list = []
        for j in df_to_check[column_to_check]:
            # Generate dummy if this description is found
            if i in j:
                inner_list.append(1)
            # If not, add a 0 instead
            else:
                inner_list.append(0)
        # Naming the column's existence creates that column
        df_to_check[i] = pd.DataFrame(inner_list)

In [17]:
# This will generate relevant dummy variable if
# species is found; else get a 0.

species_dummy(train, "Species", species_list)
species_dummy(test, "Species", species_list)

In [18]:
train.head(2)

Unnamed: 0,Species,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,YMD,PIPIENS,RESTUANS,SALINARIUS,TERRITANS
0,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,9,1,0,2007-05-29,1,1,0,0
1,CULEX RESTUANS,41.95469,-87.800991,9,1,0,2007-05-29,0,1,0,0


# Finding relevant weather data for an observation

In [19]:
# Station 1: Lat: 41.995 Lon: -87.933
# Station 2: Lat: 41.786 Lon: -87.752

# Find closest station then can do a join

def station_calculator(input_dataframe, latitude_column, longitude_column):

    st1_lat = 41.995
    st1_lon = -87.993
    st2_lat = 41.786
    st2_lon = -87.752

    outer_list = []  # Keep the station values
    
    for i, j in zip(input_dataframe[latitude_column], input_dataframe[longitude_column]):
        st1_lat_diff = abs(i - st1_lat)
        st2_lat_diff = abs(i - st2_lat)
        
        st1_lon_diff = abs(j - st1_lon)
        st2_lon_diff = abs(j - st2_lon)

        st1_tot_diff = st1_lat_diff + st1_lon_diff
        st2_tot_diff = st2_lat_diff + st2_lon_diff

        if st1_tot_diff < st2_tot_diff:
            outer_list.append(1)
        else:
            outer_list.append(2)
            
    input_dataframe["Station"] = outer_list # Common link for merge later

In [20]:
# Find closest station for train
station_calculator(train, "Latitude", "Longitude")

# Can re-use for test and get weather data for it!

In [21]:
# Merge weather data into training data
train_weather_df = pd.merge(train, weather, how='left', on=['YMD', 'Station'])

# Reference:
# http://stackoverflow.com/questions/21786490/pandas-left-outer-join-multiple-dataframes-on-multiple-columns

In [22]:
# Did it work?
train_weather_df.head(3)

Unnamed: 0,Species,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,YMD,PIPIENS,RESTUANS,SALINARIUS,TERRITANS,Station,Tmax,Tmin,DewPoint,WetBulb,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Tavg_int,Normal_Temp,Depart_calc,RA,BR,TS,HZ,SN,FG,FG+,FU,DZ,VC,MI,BC
0,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,9,1,0,2007-05-29,1,1,0,0,2,88,65,59,66.0,421,1917,0.0,29.44,30.09,5.8,16,7.4,77,64.0,13.0,0,1,0,1,0,0,0,0,0,0,0,0
1,CULEX RESTUANS,41.95469,-87.800991,9,1,0,2007-05-29,0,1,0,0,2,88,65,59,66.0,421,1917,0.0,29.44,30.09,5.8,16,7.4,77,64.0,13.0,0,1,0,1,0,0,0,0,0,0,0,0
2,CULEX RESTUANS,41.994991,-87.769279,9,1,0,2007-05-29,0,1,0,0,1,88,60,58,65.0,421,1917,0.0,29.39,30.11,5.8,18,6.5,74,64.0,10.0,0,1,0,1,0,0,0,0,0,0,0,0


In [23]:
# Repeat for test file
station_calculator(test, "Latitude", "Longitude")
test_weather_df = pd.merge(test, weather, how='left', on=['YMD', 'Station'])

In [24]:
test_weather_df.head(3)

Unnamed: 0,Species,Latitude,Longitude,AddressAccuracy,YMD,PIPIENS,RESTUANS,SALINARIUS,TERRITANS,Station,Tmax,Tmin,DewPoint,WetBulb,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Tavg_int,Normal_Temp,Depart_calc,RA,BR,TS,HZ,SN,FG,FG+,FU,DZ,VC,MI,BC
0,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,9,2008-06-11,1,1,0,0,2,86,66,55,64.0,416,1926,0.0,29.34,29.97,9.4,18,10.4,76,67.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0
1,CULEX RESTUANS,41.95469,-87.800991,9,2008-06-11,0,1,0,0,2,86,66,55,64.0,416,1926,0.0,29.34,29.97,9.4,18,10.4,76,67.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0
2,CULEX PIPIENS,41.95469,-87.800991,9,2008-06-11,1,0,0,0,2,86,66,55,64.0,416,1926,0.0,29.34,29.97,9.4,18,10.4,76,67.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0


# Finding relevant spray data for a location

In [25]:
# Make the spray calculation:

def spray_calculator(df_data, df_spray, date_col, 
                     lat_col, long_col):
    
    td_outer_list = []  # Keep the timedeltas to append to df_data
    loc_outer_list = [] # Keep the location information for df_data
    
    # Combine the iterables similar to above
    zip_data = zip(df_data[date_col], df_data[lat_col], df_data[long_col])
    zip_spray = zip(df_spray[date_col], df_spray[lat_col], df_spray[long_col])
    
    for i, j, k in zip_data:    
        
        td_inner_list = []  # Keep the timedeltas for an iteration
        loc_inner_list = [] # Keep the location information for an interation
        min_indicies = [] # Need a list of indexes to grab the smallest distances later
        possible_locations_list = [] # Blank list for locations to find min
        
        for l, m, n in zip_spray:
            
            # Explained by variable names
            time_delta = i - l
            lat_diff = abs(j - m)
            lon_diff = abs(k - n)
            tot_dist = lat_diff + lon_diff
            
            td_inner_list.append(time_delta.days)
            loc_inner_list.append(tot_dist)
            
        # Need greater than 0 days for a time_delta, because the spray may occur
        # after the trap is checked and have no impact on that day's observation.
        # Therefore, find minimum > 0 and its index (if it exists) and use
        # the index to check same positions of location data
        
        # With negative or 0 time_delta, move to except and append a large value
        # as spray data is irrelevant.  Can adjust based on actual maximum days, as it's
        # reasonable to assume the impact is minimal.
         
        
        # Attempt this loop with try for the >0 minimum.  
        # As negative or 0 time_deltas are possible, don't want the loop to throw an error
        
        try:
            smallest_time_delta = min(i for i in td_inner_list if i > 0) # Find smallest_time_delta
            
            for index, time_delta in enumerate(td_inner_list):
                if time_delta == smallest_time_delta: # Check if it's equal to the minimum_value
                    min_indicies.append(index) # Add index to a list to search later

            
            for inner_index, location in enumerate(loc_inner_list):
                if inner_index in min_indicies:
                    possible_locations_list.append(location)

            smallest_loc_diff = (min(possible_locations_list))

        except:
            # On except, found a negative or 0 time_delta throughout the entire list
            # Set these values after running through once and looking at 
            # actual calculated maximum days and distances.  # Are in the same
            # scale but above the actual calculated maximum.
            
            # ~ Somewhat arbitrary, but reasonable to impute these as such
            smallest_time_delta = 720
            smallest_loc_diff = 1.5

    
        td_outer_list.append(smallest_time_delta)
        loc_outer_list.append(smallest_loc_diff)
    
    df_data["Days_Since_Spray"] = td_outer_list
    df_data["Dist_to_Closest_Spray"] = loc_outer_list
    

# Reference for finding minimum:
# http://stackoverflow.com/questions/15098642/python-getting-all-the-min-elements-and-its-indices-from-a-list

# Reference for finishing off index lookup:
# http://stackoverflow.com/questions/29452735/find-the-indices-at-which-any-element-of-one-list-occurs-in-another

In [26]:
train_weather_df.head()
test_weather_df.head()

Unnamed: 0,Species,Latitude,Longitude,AddressAccuracy,YMD,PIPIENS,RESTUANS,SALINARIUS,TERRITANS,Station,Tmax,Tmin,DewPoint,WetBulb,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Tavg_int,Normal_Temp,Depart_calc,RA,BR,TS,HZ,SN,FG,FG+,FU,DZ,VC,MI,BC
0,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,9,2008-06-11,1,1,0,0,2,86,66,55,64.0,416,1926,0.0,29.34,29.97,9.4,18,10.4,76,67.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0
1,CULEX RESTUANS,41.95469,-87.800991,9,2008-06-11,0,1,0,0,2,86,66,55,64.0,416,1926,0.0,29.34,29.97,9.4,18,10.4,76,67.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0
2,CULEX PIPIENS,41.95469,-87.800991,9,2008-06-11,1,0,0,0,2,86,66,55,64.0,416,1926,0.0,29.34,29.97,9.4,18,10.4,76,67.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0
3,CULEX SALINARIUS,41.95469,-87.800991,9,2008-06-11,0,0,1,0,2,86,66,55,64.0,416,1926,0.0,29.34,29.97,9.4,18,10.4,76,67.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0
4,CULEX TERRITANS,41.95469,-87.800991,9,2008-06-11,0,0,0,1,2,86,66,55,64.0,416,1926,0.0,29.34,29.97,9.4,18,10.4,76,67.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0


# Create the final DataFrames for Models

In [27]:
# Set up the DataFrame for Train
spray_calculator(train_weather_df, spray, "YMD", "Latitude", "Longitude")
print train_weather_df.head()
print train_weather_df.tail()

                  Species   Latitude  Longitude  AddressAccuracy  \
0  CULEX PIPIENS/RESTUANS  41.954690 -87.800991                9   
1          CULEX RESTUANS  41.954690 -87.800991                9   
2          CULEX RESTUANS  41.994991 -87.769279                9   
3  CULEX PIPIENS/RESTUANS  41.974089 -87.824812                8   
4          CULEX RESTUANS  41.974089 -87.824812                8   

   NumMosquitos  WnvPresent        YMD  PIPIENS  RESTUANS  SALINARIUS  \
0             1           0 2007-05-29        1         1           0   
1             1           0 2007-05-29        0         1           0   
2             1           0 2007-05-29        0         1           0   
3             1           0 2007-05-29        1         1           0   
4             4           0 2007-05-29        0         1           0   

   TERRITANS  Station  Tmax  Tmin  DewPoint  WetBulb  Sunrise  Sunset  \
0          0        2    88    65        59     66.0      421    1917   
1     

In [28]:
# Write out as final weather-joined, spray-added, completed DataFrame:

train_completed = train_weather_df.to_csv("../assets/train_complete.csv", index = False, encoding = "utf-8")

In [None]:
# Repeat for Test
spray_calculator(test_weather_df, spray, "YMD", "Latitude", "Longitude")
print test_weather_df.head()
print test_weather_df.tail()

In [None]:
# Repeat for Test
test_completed = test_weather_df.to_csv("../assets/test_complete.csv", index = False, encoding = "utf-8")