In [20]:
"""
This file performs preprocessing on the relevant data, to prepare for model building.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from typing import Union, List
import xgboost

from sklearn import preprocessing, tree, ensemble, linear_model, metrics, model_selection, svm


In [21]:
def twentyfour_hour_to_float(HHMM: str) -> float:
    """
    Turn a 24-hour clock string into fractional hours.

    Parameters
    ----------
    HHMM : str
        Clock time written as hour digits followed by minute digits
        (e.g., '1230' for 12:30). The placeholder '-' marks a missing time.

    Returns
    -------
    float
        Hours plus minutes/60 (e.g., '1230' -> 12.5), or NaN when HHMM is '-'.
    """

    # Placeholder for "no value recorded": propagate as NaN.
    if HHMM == '-':
        return np.nan

    # First two characters are the hour, everything after is the minutes.
    hour_part, minute_part = HHMM[:2], HHMM[2:]
    return float(hour_part) + float(minute_part) / 60
    

def avg_wind_direction(S1: float, S2: float) -> float:
    """
    Calculates the average wind direction from two stations, accounting for
    the circular nature of wind direction.

    Parameters
    ----------
    S1 : float
        Wind direction from Station 1 (in 10-degree increments, 0-35 scale).
        May be NaN/None when the reading is missing.
    S2 : float
        Wind direction from Station 2 (in 10-degree increments, 0-35 scale).
        May be NaN/None when the reading is missing.

    Returns
    -------
    float
        Averaged wind direction. If exactly one reading is missing the other
        reading is returned; if both are missing the result is NaN.
    """

    # NaN never compares equal to anything (not even itself), so `x == np.nan`
    # is always False. pd.isna is the reliable missing-value test.
    s1_missing = pd.isna(S1)
    s2_missing = pd.isna(S2)

    if s1_missing and s2_missing:
        return np.nan

    if s1_missing:
        avg = float(S2)
    elif s2_missing:
        avg = float(S1)
    else:
        low, high = min(S1, S2), max(S1, S2)
        # When the readings are more than half a circle apart, the short way
        # round crosses the 0/36 boundary: lift the smaller one by a full turn.
        if high - low > 18:
            low += 36
        avg = (low + high) / 2.0

    # Wrap back onto the 0-36 scale after the circular adjustment.
    if avg >= 36:
        avg -= 36

    return avg


def combine_conditions(S1: str, S2: str) -> Union[str, List[str]]:
    """
    Merge the weather condition codes reported by two stations.

    Parameters
    ----------
    S1 : str
        Space-separated condition codes from Station 1; a single space
        means the station reported no conditions.
    S2 : str
        Space-separated condition codes from Station 2; a single space
        means the station reported no conditions.

    Returns
    -------
    str or list
        - "" when neither station reported anything,
        - the other station's string, unchanged, when only one is blank,
        - otherwise a list of the distinct codes from both stations
          (order is whatever the underlying set yields).
    """

    s1_blank = S1 == " "
    s2_blank = S2 == " "

    # At least one station is blank: fall back to the other (or to "").
    if s1_blank:
        return "" if s2_blank else S2
    if s2_blank:
        return S1

    # Both stations reported codes: de-duplicate across the two lists.
    return list(set(S1.split(" ") + S2.split(" ")))


def avg_col(S1: float, S2: float) -> float:
    """
    Computes the average of two numeric values, handling missing data.

    Parameters
    ----------
    S1 : float
        Value from Station 1. May be NaN/None when the reading is missing.
    S2 : float
        Value from Station 2. May be NaN/None when the reading is missing.

    Returns
    -------
    float
        Average of S1 and S2, the non-missing value if exactly one is
        missing, or NaN if both are missing.
    """

    # NaN never compares equal to anything (not even itself), so `x == np.nan`
    # is always False. pd.isna is the reliable missing-value test.
    s1_missing = pd.isna(S1)
    s2_missing = pd.isna(S2)

    if s1_missing and s2_missing:
        return np.nan
    if s1_missing:
        return float(S2)
    if s2_missing:
        return float(S1)

    return (S1 + S2) / 2.0


In [22]:
# Load the three pickled input tables. Paths are relative, so they resolve
# against the kernel's current working directory.
# NOTE: unpickling can execute arbitrary code; only load pickle files that come
# from a trusted source.
spray = pd.read_pickle("../data/spray.pkl")
weather = pd.read_pickle("../data/weather.pkl")
mosquito = pd.read_pickle("../data/mosquito_data.pkl")

In [23]:
# Parse the Date column of every table into datetime values
# (the strings are expected in year-month-day form).
for df in (spray, weather, mosquito):
    df["Date"] = pd.to_datetime(df["Date"], format='%Y-%m-%d')

In [24]:
# Several weather columns are stored as objects because some observations hold
# placeholder text (such as "M" for missing values). Coerce them to numeric;
# anything that cannot be parsed becomes NaN.
num_vars = [
    'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Heat', 'Cool', 'PrecipTotal',
    'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed',
]
weather[num_vars] = weather[num_vars].apply(pd.to_numeric, errors='coerce')

In [25]:
# Convert the sunrise/sunset clock strings into fractional hours
# (the '-' placeholder becomes NaN).
sunrise_hours = weather["Sunrise"].apply(twentyfour_hour_to_float)
sunset_hours = weather["Sunset"].apply(twentyfour_hour_to_float)
weather["Sunrise"] = sunrise_hours
weather["Sunset"] = sunset_hours

# Length of daylight, in hours, between sunrise and sunset
weather["hours_in_day"] = sunset_hours - sunrise_hours

In [26]:
# Remove columns that are almost entirely NaN and therefore not useful.
weather.drop(columns=['Depth', 'Water1', 'SnowFall'], inplace=True)

In [None]:
# For simplicity, the readings of the two weather stations are merged into a
# single table with one row per pair of station rows (paired by position).

# Per-station slices of the weather table
s1 = weather[weather.Station == 1]
s2 = weather[weather.Station == 2]

# Table that receives the merged columns
combined_weather = pd.DataFrame()

# Columns taken from Station 1 as-is (only Station 1 has them, or both agree)
station1_cols = ['Date', 'Depart', 'Sunrise', 'Sunset', 'hours_in_day']
for col in station1_cols:
    combined_weather[col] = s1[col].values

# Columns reported by both stations: use the NaN-aware pairwise average
averaged_cols = [
    'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'Heat', 'Cool',
    'StnPressure', 'SeaLevel', 'ResultSpeed', 'AvgSpeed',
]
for col in averaged_cols:
    combined_weather[col] = list(map(avg_col, s1[col].values, s2[col].values))

# Columns that need their own combination rule:
# circular average for wind direction, code union for conditions
combined_weather['ResultDir'] = list(
    map(avg_wind_direction, s1['ResultDir'].values, s2['ResultDir'].values)
)
combined_weather['CodeSum'] = list(
    map(combine_conditions, s1['CodeSum'].values, s2['CodeSum'].values)
)

# PrecipTotal differs too much between stations to average, so keep one column per station.
combined_weather['PrecipTotal_station1'] = s1['PrecipTotal'].values
combined_weather['PrecipTotal_station2'] = s2['PrecipTotal'].values

In [None]:
# The precipitation totals have too many missing values to impute reliably,
# so both per-station columns are removed.
combined_weather.drop(columns=['PrecipTotal_station1', 'PrecipTotal_station2'], inplace=True)

In [None]:
# Impute missing values for WetBulb and StnPressure, which have a significant
# number of gaps. Instead of a constant fill, a small model is trained on the
# fully observed rows to predict each column from the other numeric columns.
#
# Values are written back with .loc on the frame itself: chained assignment
# such as `frame.col[mask] = values` may operate on a temporary copy and leave
# the frame unchanged.

# --- WetBulb ---------------------------------------------------------------
# Date and CodeSum are not numeric features; WetBulb is the target.
wetbulb_features = combined_weather.columns.drop(['Date', 'WetBulb', 'CodeSum'])

# Train only on rows with no missing values at all.
complete_rows = combined_weather.dropna()
X = complete_rows[wetbulb_features]
y = complete_rows['WetBulb']

# Fixed seed so that re-running the notebook reproduces the same imputed values.
dt_WetBulb = ensemble.RandomForestRegressor(n_estimators=100, random_state=42)
dt_WetBulb.fit(X, y)

# Impute only rows where WetBulb is missing and every other column is present;
# rows with additional gaps are left as NaN. The same mask selects the
# prediction inputs and the assignment target, so their lengths always match.
subset = combined_weather.columns.drop('WetBulb')
wetbulb_missing = (
    combined_weather['WetBulb'].isnull()
    & combined_weather[subset].notnull().all(axis=1)
)

predicted_WetBulb = np.array([])
if wetbulb_missing.any():  # nothing to predict on an empty selection
    predicted_WetBulb = dt_WetBulb.predict(
        combined_weather.loc[wetbulb_missing, wetbulb_features]
    )
    combined_weather.loc[wetbulb_missing, 'WetBulb'] = predicted_WetBulb

# --- StnPressure -----------------------------------------------------------
stnpressure_features = combined_weather.columns.drop(['Date', 'StnPressure', 'CodeSum'])

# Features and target must come from the same set of rows.
complete_rows = combined_weather.dropna()
X = complete_rows[stnpressure_features]
y = complete_rows['StnPressure']

xb_StnPressure = xgboost.XGBRegressor()
xb_StnPressure.fit(X, y)

# Every row with a missing StnPressure is predicted, including rows whose
# feature columns still contain NaN.
stnpressure_missing = combined_weather['StnPressure'].isnull()

predicted_StnPressure = np.array([])
if stnpressure_missing.any():  # nothing to predict on an empty selection
    predicted_StnPressure = xb_StnPressure.predict(
        combined_weather.loc[stnpressure_missing, stnpressure_features]
    )
    combined_weather.loc[stnpressure_missing, 'StnPressure'] = predicted_StnPressure