In [None]:
import pandas as pd
import datetime as dt
import numpy as np

In [None]:
def convert_F_to_C(temp):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    Only subtraction and division are applied to `temp`, so scalars and
    element-wise containers (NumPy arrays, pandas Series) both work.
    """
    degrees_above_freezing = temp - 32
    return degrees_above_freezing / 1.8

In [None]:
weather = pd.read_csv("./atlanta_weather_summary_1999-2019.csv")
# Keep only the rows reported by station USW00013874
# (Atlanta Hartsfield International Airport).
airport_weather = weather[weather['STATION'].eq("USW00013874")]

In [None]:
# Number of missing precipitation readings before any gap filling.
airport_weather["PRCP"].isna().sum()

In [None]:
# Work on an explicit copy so later column assignments never write into a
# view of the full `weather` table.
airport_weather = airport_weather[['DATE','TMIN','TMAX','PRCP']].copy()
# Linearly interpolate the numeric measurements to fill gaps (the original
# note says PRCP has 1 missing value). DATE is still a string column at this
# point and is left out on purpose: interpolating object-dtype columns is
# deprecated in recent pandas releases.
measurement_cols = ['TMIN','TMAX','PRCP']
airport_weather[measurement_cols] = airport_weather[measurement_cols].interpolate()
# Sanity check: should display 0 once the gap has been filled.
airport_weather.PRCP.isna().sum()

In [None]:
# Daily extremes in Celsius (assumes TMAX/TMIN are recorded in Fahrenheit
# -- TODO confirm against the data source).
airport_weather["TCMAX"] = convert_F_to_C(airport_weather["TMAX"])
airport_weather["TCMIN"] = convert_F_to_C(airport_weather["TMIN"])
# Growing Degree Days: a measure of heat accumulation used by
# horticulturists, gardeners and farmers. Computed here as the mean of the
# daily max and min temperature (in Celsius) minus a base of 10.
airport_weather["GDD"] = (airport_weather["TCMAX"] + airport_weather["TCMIN"]) / 2 - 10

In [None]:
# Days colder than the base temperature contribute no growing degrees:
# replace negative GDD values with 0.
airport_weather["GDD"] = airport_weather["GDD"].mask(airport_weather["GDD"] < 0, 0)

In [None]:
# Parse DATE into datetime64. Use plain column assignment rather than
# `.loc[:, "DATE"] = ...`: since pandas 2.0 the `.loc[:, col]` form tries to
# write into the existing column in place, which leaves an object-dtype
# column of Timestamps and makes the later `.dt` accessor fail.
airport_weather["DATE"] = pd.to_datetime(airport_weather["DATE"])

In [None]:
def add_gdd_sum(df, year):
    """Return the rows of `df` for one calendar year with a running GDD total.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like ``DATE`` column and a numeric ``GDD``
        column.
    year : int
        Calendar year to select.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows whose ``DATE`` falls in `year`,
        with an added ``GDDSUM`` column: the cumulative sum of ``GDD`` in row
        order (i.e. starting at Jan 1 when the rows are chronological --
        the function does not sort them). `df` itself is not modified.
    """
    in_year = df.loc[:, "DATE"].dt.year == year
    # Take an explicit copy so adding GDDSUM never writes into a slice of the
    # caller's frame (avoids SettingWithCopyWarning).
    year_df = df.loc[in_year, :].copy()
    year_df["GDDSUM"] = year_df["GDD"].cumsum()
    return year_df
# Build the per-year cumulative GDD frames for 2000-2019 and stack them once.
# `DataFrame.append` was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration; a single `pd.concat`
# over a list gives the same result.
train_weather = pd.concat(
    [add_gdd_sum(airport_weather, year) for year in range(2000, 2020)]
)

# Quick visual check: the running total should reset at the start of each year.
train_weather.GDDSUM.plot()

In [None]:
def add_shifted_column(df,colname,shift_num):
    """Add a lagged copy of column `colname` to `df` and return `df`.

    The new column is named ``<colname>_shift<shift_num>`` and holds the
    values of `colname` moved down by `shift_num` rows; the positions
    vacated at the start are filled with 0. `df` is modified in place.
    """
    lagged_values = df[colname].shift(periods=shift_num, fill_value=0)
    lagged_name = colname + "_shift" + str(shift_num)
    df.loc[:, lagged_name] = lagged_values
    return df

def add_shifted_group(train_weather, shift_num):
    """Add `shift_num`-row lagged copies of every weather feature column.

    Calls `add_shifted_column` once each for PRCP, GDDSUM, GDD, TMIN and
    TMAX (in that order) and returns the resulting frame.
    """
    for feature in ("PRCP", "GDDSUM", "GDD", "TMIN", "TMAX"):
        train_weather = add_shifted_column(train_weather, feature, shift_num)
    return train_weather
    
# Add lagged feature columns at offsets of 7, 14, 30 and 60 rows
# (days, assuming one row per day -- TODO confirm there are no date gaps).
for lag in (7, 14, 30, 60):
    train_weather = add_shifted_group(train_weather, lag)

# Write Weather Training Data to train_weather.csv

In [None]:
# Persist the weather feature table (GDD, cumulative GDD and lagged columns).
# The DataFrame index is written as well, since `index` is left at its default.
train_weather.to_csv("./train_weather.csv")

# Pollen Data Processing

In [None]:
# Load the raw pollen history and normalise the column names
# (`date` -> `Date`, `pcount` -> `Count`).
pollen = (
    pd.read_csv("./atlanta_pollen_history.csv")
    .rename(columns={'date': 'Date', 'pcount': 'Count'})
)
# Parse the dates, then keep only the two columns used downstream.
pollen['Date'] = pd.to_datetime(pollen['Date'])
pollen = pollen[['Date', 'Count']]

In [None]:
# Derive calendar features from the parsed dates: day of year and year.
pollen = pollen.assign(
    Doy=pollen["Date"].dt.dayofyear,
    Year=pollen["Date"].dt.year,
)

In [None]:
# Trim the pollen data to the study window: years 2000-2019, ending on
# 2019-06-19 inclusive.
pollen = pollen[(pollen["Date"].dt.year >= 2000) & (pollen["Date"].dt.year <= 2019)]
pollen = pollen[pollen["Date"] <= pd.Timestamp("2019-06-19")]
# Fill gaps in Count by linear interpolation, computed once (the previous
# version ran the same interpolation twice). Default interpolation only fills
# forward, so any missing values at the very start remain NaN; those rows
# are dropped.
pollen_interpolated = pollen.interpolate().dropna(subset=["Count"])


# Write pollen and pollen interpolated to csv files

In [None]:
# Save both variants: the trimmed counts as recorded (gaps left as NaN) and
# the gap-filled version produced by interpolation.
pollen.to_csv("./pollen.csv")
pollen_interpolated.to_csv("./pollen_interpolated.csv")

# Create final data frames for analysis

In [None]:
# Join each pollen table to the weather features on calendar date
# (pollen `Date` == weather `DATE`). The default inner join keeps only
# dates present in both tables.
final_data = pollen.merge(train_weather, left_on='Date', right_on='DATE')
final_data_interpolated = pollen_interpolated.merge(
    train_weather, left_on='Date', right_on='DATE'
)

# Write final data frames to csv

In [None]:
# Persist the merged pollen + weather tables for the analysis step:
# one built from the recorded counts, one from the interpolated counts.
final_data.to_csv("./final_data.csv")
final_data_interpolated.to_csv("./final_data_interpolated.csv")