# Kaggle Challenge: West Nile Virus Prediction
https://www.kaggle.com/c/predict-west-nile-virus/overview

In [1]:
%matplotlib inline

import os
from datetime import datetime, time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Clean and Format Data

In [2]:
def fill_missing_data(df, drop_cols, missing_vals=['M','-'], fill_val=np.nan):
    '''Replace missing-data markers with a fill value, then drop columns.

    The work is done on a copy, so the frame passed in is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    drop_cols : list
        Column labels to remove from the result.
    missing_vals : list
        Cell values that mark missing data.
    fill_val : float, int, str, etc
        Value written into every cell that equals one of ``missing_vals``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the markers replaced and ``drop_cols`` removed.
    '''
    cleaned = df.copy()
    for marker in missing_vals:
        for name in cleaned.columns:
            # Boolean mask of the cells in this column that hold the marker.
            is_marker = cleaned[name].eq(marker)
            cleaned.loc[is_marker, name] = fill_val
    return cleaned.drop(drop_cols, axis=1)

def fill_trace_data(df, trace_cols=['SnowFall', 'PrecipTotal'], fill_vals=[0.001, 0.0001]):
    '''Substitute numeric values for trace ('T') entries.

    For every column named in ``trace_cols`` the entries are stripped of
    surrounding spaces, and each entry equal to 'T' is replaced by the value
    found at the same position in ``fill_vals``. Operates on a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns listed in ``trace_cols`` (string-valued).
    trace_cols : list
        Column labels to process.
    fill_vals : list
        Replacement values, one per entry of ``trace_cols``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the trace markers replaced.
    '''
    result = df.copy()
    for position, name in enumerate(trace_cols):
        trimmed = result[name].str.strip(' ')
        result[name] = trimmed
        # Pick the fill value that belongs to this column.
        result.loc[trimmed == 'T', name] = np.array(fill_vals)[position]
    return result

In [3]:
def find_closest_station(df):
    '''
    Use latitude and longitude data to find the closest weather station.

    Distances are plain Euclidean norms of the (latitude, longitude)
    differences, i.e. they are expressed in degrees, not in kilometres.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric 'Latitude' and 'Longitude' columns.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with columns 'ClosestStation' (1 or 2),
        'DistStation1' and 'DistStation2'. Station 2 is chosen on ties.
    '''
    # Station 1: Chicago O'Hare International Airport, latitude & longitude
    station1 = np.array([41.995, -87.933])
    # Station 2: Chicago Midway International Airport, latitude & longitude
    station2 = np.array([41.786, -87.752])

    # Compute every distance in one vectorised step instead of writing cell by
    # cell; the former chained `out_df.ClosestStation.loc[row] = ...` does not
    # update the frame when pandas Copy-on-Write is active.
    locations = df[['Latitude', 'Longitude']].to_numpy(dtype=float)
    dist_station1 = np.linalg.norm(locations - station1, axis=1)
    dist_station2 = np.linalg.norm(locations - station2, axis=1)

    return pd.DataFrame(
        {
            'ClosestStation': np.where(dist_station1 < dist_station2, 1, 2),
            'DistStation1': dist_station1,
            'DistStation2': dist_station2,
        },
        index=df.index,
        columns=['ClosestStation', 'DistStation1', 'DistStation2'],
    )

In [4]:
def make_match_weather_df(df, weather1_df, weather2_df, shift_days=0):
    '''
    Use the date and closest station to get the appropriate weather data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Date' column (parseable by ``pd.to_datetime``) and a
        'ClosestStation' column holding 1 or 2.
    weather1_df : pandas.DataFrame
        Data from weather station 1, indexed by 'YYYY-MM-DD' date strings.
    weather2_df : pandas.DataFrame
        Data from weather station 2, indexed by 'YYYY-MM-DD' date strings.
    shift_days : int
        Number of days to shift weather data backwards. Values <= 0 mean no
        shift.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with the columns of ``weather1_df``. Rows whose
        'ClosestStation' is neither 1 nor 2 are left as NaN.

    Raises
    ------
    KeyError
        If a (shifted) date is not present in the selected weather frame.
    '''
    out_df = pd.DataFrame(index=df.index, columns=weather1_df.columns)

    # `datetime` is imported in this notebook as the class, so
    # `datetime.timedelta` does not exist; use pandas' own Timedelta instead.
    lag = pd.Timedelta(days=max(int(shift_days), 0))
    date_strs = (pd.to_datetime(df['Date']) - lag).dt.strftime('%Y-%m-%d')

    weather_by_station = {1: weather1_df, 2: weather2_df}
    for ind, date_str, station in zip(df.index, date_strs, df['ClosestStation']):
        weather = weather_by_station.get(station)
        if weather is not None:
            out_df.loc[ind] = weather.loc[date_str]
    return out_df

In [5]:
def get_unique_codes(codes, separator=' '):
    '''
    Identify the unique codes from a list of codes.

    Parameters
    ----------
    codes : list or array-like of str
        Each entry may hold several codes delimited by ``separator``.
    separator : str
        String that separates the codes inside an entry.

    Returns
    -------
    list of str
        The distinct, non-empty codes in sorted order.
    '''
    # Join everything into one string, then split it back into single codes.
    tokens = separator.join(codes).split(separator)
    # Blank entries and repeated separators produce '' tokens; discard them
    # without failing when there are none.
    return sorted(set(tokens) - {''})

def add_weather_codes(df):
    '''
    Add a column for each code indicating presence of weather event.

    A code counts as present when it occurs anywhere inside the row's
    'CodeSum' string (substring test), so e.g. 'RA' is also flagged for a
    row containing 'TSRA'.
    NOTE(review): confirm that substring matching, rather than exact-token
    matching, is the intended behaviour.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string-valued 'CodeSum' column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one extra integer 0/1 column per unique code.
    '''
    codes = get_unique_codes(df.CodeSum)
    code_sums = df['CodeSum'].tolist()
    # Build each indicator column in one pass rather than writing single cells
    # with .loc, which is slow and leaves the columns as object dtype.
    flags = {
        code: [int(code in entry) for entry in code_sums]
        for code in codes
    }
    out_df = pd.DataFrame(flags, index=df.index, columns=codes)
    return pd.concat([df, out_df], axis=1)

In [6]:
# Define directory information.
# The project root is machine-specific: set the WNV_PROJECT_DIR environment
# variable to point somewhere else instead of editing this cell.
projdir = os.environ.get(
    'WNV_PROJECT_DIR',
    '/Users/klarnemann/Documents/Insight/predict-west-nile-virus',
)
datadir = '%s/data' % (projdir)

In [7]:
# Read the four competition CSV files (spray, test, train, weather) from datadir
spray_df, test_df, train_df, weather_df = [
    pd.read_csv('%s/%s.csv' % (datadir, stem))
    for stem in ('spray', 'test', 'train', 'weather')
]

In [8]:
# Expand the weather codes (i.e. "CodeSum") into one indicator column per code,
# then discard the raw "CodeSum" column
weather_df = add_weather_codes(weather_df).drop(['CodeSum'], axis=1)

In [9]:
# Find the closest weather station for each index in train_df
# (station_df is indexed like train_df, with the columns ClosestStation,
# DistStation1 and DistStation2)
station_df = find_closest_station(train_df)

In [10]:
# Add information on the closest weather station to train_df.
# Station columns left over from an earlier run of this cell are dropped first,
# so re-running the cell does not append duplicate columns.
train_df = pd.concat(
    [train_df.drop(columns=station_df.columns, errors='ignore'), station_df],
    axis=1,
)

In [11]:
# Station 1 weather: replace missing markers, drop unused columns, fill trace
# values, and index the rows by their date string
weather1_df = (
    weather_df[weather_df.Station == 1]
    .pipe(fill_missing_data, drop_cols=['Water1', 'Station'])
    .pipe(fill_trace_data)
    .set_index('Date', drop=False)
)

  result = method(y)


In [12]:
# Station 2 weather: replace missing markers, drop unused columns, fill trace
# values, and index the rows by their date string
weather2_df = (
    weather_df[weather_df.Station == 2]
    .pipe(fill_missing_data, drop_cols=['Water1', 'Station'])
    .pipe(fill_trace_data)
    .set_index('Date', drop=False)
)

In [13]:
# Select the weather data from the closest station for each entry in train_df
# (shift_days=0: use the weather recorded on the same date as the entry)
match_weather_df = make_match_weather_df(train_df, weather1_df, weather2_df, shift_days=0)

# Analyze Data

Given weather, location, testing, and spraying data, this competition asks you to predict when and where different species of mosquitos will test positive for West Nile virus. A more accurate method of predicting outbreaks of West Nile virus in mosquitos will help the City of Chicago and the Chicago Department of Public Health (CDPH) allocate resources more efficiently and effectively towards preventing transmission of this potentially deadly virus.

In [14]:
# Columns available in train_df after the station information was added
train_df.columns

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent', 'ClosestStation', 'DistStation1',
       'DistStation2'],
      dtype='object')

In [15]:
# Weather columns (measurements plus one indicator per weather code) matched to each train_df row
match_weather_df.columns

Index(['Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Heat',
       'Cool', 'Sunrise', 'Sunset', 'Depth', 'SnowFall', 'PrecipTotal',
       'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed',
       'BCFG', 'BR', 'DZ', 'FG', 'FG+', 'FU', 'GR', 'HZ', 'MIFG', 'RA', 'SN',
       'SQ', 'TS', 'TSRA', 'VCFG', 'VCTS'],
      dtype='object')

In [16]:
# Distinct species labels that occur in train_df
np.unique(train_df.Species)

array(['CULEX ERRATICUS', 'CULEX PIPIENS', 'CULEX PIPIENS/RESTUANS',
       'CULEX RESTUANS', 'CULEX SALINARIUS', 'CULEX TARSALIS',
       'CULEX TERRITANS'], dtype=object)

In [26]:
# Ratio of summed WnvPresent to summed NumMosquitos, by species.
# This is a fraction, not a percentage.
# NOTE(review): WnvPresent looks like a per-record flag rather than a count of
# infected mosquitos, so this is not a per-mosquito infection rate - confirm.
# Columns are selected with a list: pandas 2.x rejects a bare tuple of keys.
tmp_df = train_df.groupby('Species')[['NumMosquitos', 'WnvPresent']].sum()
tmp_df['WnvPresent'] / tmp_df['NumMosquitos']

Species
CULEX ERRATICUS           0.000000
CULEX PIPIENS             0.005373
CULEX PIPIENS/RESTUANS    0.003954
CULEX RESTUANS            0.002091
CULEX SALINARIUS          0.000000
CULEX TARSALIS            0.000000
CULEX TERRITANS           0.000000
dtype: float64