In [2]:
%matplotlib inline
import os
import importlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Airport geography: keep only the US airports that appear as a departure
# airport in the training data.
init_Data = data = pd.read_csv(os.path.join('data', 'train.csv.bz2'))
airportname = init_Data['Departure'].unique()

airport = pd.read_csv("airport_data/airport-codes.txt", sep=",")
in_training_set = airport['local_code'].isin(airportname)
in_us = airport['iso_country'] == 'US'
# reset_index() (without drop) keeps the old row labels in an 'index' column.
airport = airport[in_training_set & in_us].reset_index()

# iso_region looks like "<country>-<state>"; keep the part after the dash.
region_parts = airport['iso_region'].str.split('-', expand=True)
airport['StateCodes'] = region_parts.iloc[:, 1]

airport = airport.rename(columns={'municipality': 'City'})
# Replace the exact value 'Dallas-Fort Worth' with 'Dallas' wherever it occurs.
airport = airport.replace('Dallas-Fort Worth', 'Dallas')

In [4]:
#https://www.feiertagskalender.ch/ferien.php?geo=3537&jahr=2012&klasse=0&hl=en
#The holiday helpers below work with datetime.date objects,
#so convert datetime columns to dates first: df[col].dt.date
import datetime as dt
import holidays

# US public holidays for the study period, as a list of datetime.date.
# One slice is enough: the former additional 2012-01-01..2012-12-31 slice lay
# entirely inside this range, so it only appended duplicate dates to the list
# (and built a second calendar object to do so).
Holidays_US = holidays.US()[dt.date(2011, 7, 1):dt.date(2013, 6, 5)]

# School-break calendar: one row per break with 'start' and 'end' columns,
# both converted to datetime.date so they compare against plain dates.
school_break = pd.read_csv('schoolholiday.csv')
school_break.loc[:, 'start'] = pd.to_datetime(school_break.loc[:, 'start']).dt.date
school_break.loc[:, 'end'] = pd.to_datetime(school_break.loc[:, 'end']).dt.date

def nextworkday(date):
    """Return the number of days from ``date`` to the next working day.

    The search starts on the day after ``date`` and skips every day whose
    ``weekday()`` is in ``holidays.WEEKEND`` or that is listed in
    ``Holidays_US``. The result is therefore always at least 1.
    """
    step = dt.timedelta(days=1)
    candidate = date + step
    while True:
        day_off = candidate.weekday() in holidays.WEEKEND or candidate in Holidays_US
        if not day_off:
            return (candidate - date).days
        candidate += step
    
def lastworkday(date):
    """Return the number of days from the previous working day to ``date``.

    The search starts on the day before ``date`` and walks backwards past
    every day whose ``weekday()`` is in ``holidays.WEEKEND`` or that is listed
    in ``Holidays_US``. The result is therefore always at least 1.
    """
    step = dt.timedelta(days=1)
    candidate = date - step
    while True:
        day_off = candidate.weekday() in holidays.WEEKEND or candidate in Holidays_US
        if not day_off:
            return (date - candidate).days
        candidate -= step

def schoolbreak(date):
    """Return ``(lastschoolday, nextschoolday)`` day counts for ``date``.

    Both values start as the plain working-day distances from
    ``lastworkday(date)`` and ``nextworkday(date)``. They are then overridden
    from the module-level ``school_break`` table:

    * ``date`` inside a break: both distances are measured to the days just
      outside that break;
    * ``date`` is the day right after a break: ``lastschoolday`` spans the
      whole break;
    * ``date`` is the day right before a break: ``nextschoolday`` spans the
      whole break.

    Parameters
    ----------
    date : datetime.date
        Day to evaluate.

    Returns
    -------
    tuple of (int, int)
        Days since the last school day, days until the next school day.
    """
    one_day = dt.timedelta(days=1)
    lastschoolday = lastworkday(date)
    nextschoolday = nextworkday(date)
    # Walk over every break. The previous ``range(len(...) - 1)`` loop never
    # looked at the final row, so the last break in the file was ignored.
    for start, end in zip(school_break['start'], school_break['end']):
        # Rows with a missing bound (e.g. a blank trailing line in the CSV)
        # cannot describe a break, so skip them instead of comparing with NaT.
        if pd.isna(start) or pd.isna(end):
            continue
        if start <= date <= end:
            lastschoolday = (date - start + one_day).days
            nextschoolday = (end + one_day - date).days
        elif date == end + one_day:
            lastschoolday = (date - start + one_day).days
        elif date == start - one_day:
            nextschoolday = (end + one_day - date).days
    return lastschoolday, nextschoolday


In [5]:
#census, geo and eco data of state
#https://www.kaggle.com/lislejoem/us_energy_census_gdp_10-14
steco = pd.read_csv('state-eco.csv')
# NOTE: the former bare ``steco.set_index(['StateCodes'])`` call was removed.
# Its result was discarded, so it never changed ``steco``; the code below
# relies on 'StateCodes' still being a regular column.

# Discard the yearly columns for 2010 and 2014.
steco = steco.drop(
    columns=[name for name in steco.columns if '2010' in name or '2014' in name]
)

# Per-state geography descriptors.
stgeo = steco[['StateCodes', 'State', 'Region', 'Division', 'Coast', 'Great Lakes']]

# Per-state census columns: every column whose name contains one of these
# tags, plus the StateCodes key, which then becomes the index.
col = [
    name for name in steco.columns
    if any(tag in name for tag in ('POP', 'RBIRTH', 'RDEATH', 'StateCodes', 'RNETMIG'))
]
stcensus = steco[col].set_index(['StateCodes'])


In [6]:
# census, eco data of city
#https://apps.bea.gov/regional/histdata/releases/0615rpi/index.cfm
#https://apps.bea.gov/itable/iTable.cfm?ReqID=70&step=1#

def _load_city_table(path, columns, renames, cities):
    """Load one per-city CSV restricted to ``cities``.

    Parameters
    ----------
    path : str
        CSV file to read.
    columns : list of str
        Columns to keep, in order; the first one holds the city name.
    renames : dict
        Mapping of original column name to final column name.
    cities : array-like
        City names to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. Returning a new frame from
        ``rename`` avoids the in-place rename on a column slice that the
        copy-pasted version performed (SettingWithCopyWarning).
    """
    table = pd.read_csv(path)
    table = table[table[columns[0]].isin(cities)]
    return table[columns].rename(columns=renames).reset_index(drop=True)


# Cities of the airports kept earlier in the notebook.
city = airport['City'].unique()

citygdp = _load_city_table(
    'citygdp.csv',
    ['GeoName', "GDP2011", "GDP2012", "GDP2013"],
    {"GeoName": 'City'},
    city,
)

cityincome = _load_city_table(
    'cityincome.csv',
    ['GeoName', "RPI2011", "RPI2012", "RPI2013"],
    {"GeoName": 'City'},
    city,
)

citycensus = _load_city_table(
    'citycensus.csv',
    ['city', "2011", "2012", "2013"],
    {'city': 'City', "2011": 'POP2011', "2012": 'POP2012', "2013": 'POP2013'},
    city,
)

In [7]:
# Left-join the state-level and then the city-level tables onto the airports,
# one after another, each on its own key column.
df_merged = airport
for lookup, key in (
    (stgeo, 'StateCodes'),
    (stcensus, 'StateCodes'),
    (citygdp, 'City'),
    (citycensus, 'City'),
    (cityincome, 'City'),
):
    df_merged = pd.merge(df_merged, lookup, how='left', on=[key], sort=False)


In [8]:
def _single_year_frame(frame, year, all_years=(2011, 2012, 2013)):
    """Return ``frame`` reduced to the yearly columns of ``year``.

    Columns whose name contains any other year of ``all_years`` are dropped,
    a constant ``Year`` column is appended, and the year-suffixed columns are
    renamed to year-free names.
    """
    other_years = [str(other) for other in all_years if other != year]
    unwanted = [
        name for name in frame.columns
        if any(tag in name for tag in other_years)
    ]
    single = frame.drop(unwanted, axis=1)
    single['Year'] = year
    return single.rename(columns={
        f'GDP{year}': 'GDP',
        f'POP{year}': 'POP',
        f'RPI{year}': 'RPI',
        f'POPESTIMATE{year}': 'StPOP',
        f'RBIRTH{year}': 'StRBirth',
        f'RDEATH{year}': 'StRDeath',
        f'RNETMIG{year}': 'StRMig',
    })


# One frame per year, then stacked so each airport has one row per year.
df1 = _single_year_frame(df_merged, 2011)
df2 = _single_year_frame(df_merged, 2012)
df3 = _single_year_frame(df_merged, 2013)

df_merged = pd.concat([df1, df2, df3], axis=0)

In [9]:
# Remove the 'continent' column.
df_merged = df_merged.drop(columns='continent')

In [10]:
# Name the IATA code column 'AirPort', the key used for the later merge.
df_merged = df_merged.rename(columns={'iata_code': 'AirPort'})

In [11]:
# Drop the leftover index and identifier columns.
# 'local_code' was listed twice in the original label list; once is enough.
df_merged = df_merged.drop(
    columns=['index', 'ident', 'type', 'name', 'iso_country', 'gps_code', 'local_code']
)

In [12]:
# Load the delay table, drop rows without a Facility, then parse Date.
# The drop happens before the date parsing, as in the original cell order.
delay1 = (
    pd.read_csv("./airport_data/delay.csv")
    .dropna(subset=['Facility'])
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

In [13]:
# Display the delay table loaded above (the cell output follows).
delay1

Unnamed: 0,Date,Facility,Total_ops,Total Delays,TMI_to,Dep,Abrn,TMI_from_local,TMI_from_non_local,Total_occ_at,...,Class_AT,Class_GA,Class_Mil,Cause_Wx,Cause_Vol,Cause_Equip,Cause_Rwy,Cause_Other,Avg_delay_time,Total_delay_time
0,2011-01-01,ATL,2101,23,1,22,0,0,3,25,...,0,0,0,22,1,0,0,0,19.0,437
1,2011-01-01,BOS,696,0,0,0,0,1,5,6,...,0,0,0,0,0,0,0,0,0.0,0
2,2011-01-01,CLT,1183,9,9,0,0,0,4,4,...,1,0,0,1,8,0,0,0,28.0,252
3,2011-01-01,DEN,1606,0,0,0,0,0,9,9,...,0,0,0,0,0,0,0,0,0.0,0
4,2011-01-01,DFW,1511,0,0,0,0,0,6,6,...,0,0,0,0,0,0,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23029,2014-01-01,ORD,1546,52,0,52,0,0,6,58,...,0,0,0,52,0,0,0,0,23.0,1196
23030,2014-01-01,PHL,989,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0
23031,2014-01-01,PHX,1113,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0
23032,2014-01-01,SEA,775,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0.0,0


In [14]:
# Keep the columns that are later merged into external_data and rename the
# airport key to 'AirPort'. Selecting and renaming in one chain returns a new
# frame, which avoids the in-place rename on a column slice
# (SettingWithCopyWarning) that the two-statement version performed.
delay1 = (
    delay1[['Date', 'Facility', 'Total_ops', 'Total Delays',
            'Avg_delay_time', 'Total_delay_time']]
    .rename(columns={'Facility': 'AirPort'})
)

In [15]:
# Load the second delay table and parse its Date column.
delay2 = pd.read_csv("airport_data/delays.csv")
delay2 = delay2.assign(Date=pd.to_datetime(delay2['Date']))

In [16]:
# Keep the columns that are later merged into external_data and rename the
# airport key to 'AirPort'. Selecting and renaming in one chain returns a new
# frame, which avoids the in-place rename on a column slice
# (SettingWithCopyWarning) that the two-statement version performed.
delay2 = (
    delay2[['Date', 'Facility', 'Actual Departures', 'Actual Arrivals',
            'Departure Cancellations', 'Arrival Cancellations',
            'Delayed Arrivals', 'Average Delay Per Delayed Arrival']]
    .rename(columns={'Facility': 'AirPort'})
)

In [17]:
# Load the oil price series, parse Date, and expose Price again under the
# name 'oil_price' (the original 'Price' column is kept as well).
oil = pd.read_csv("airport_data/oil_series.csv")
oil = oil.assign(Date=pd.to_datetime(oil['Date']), oil_price=oil['Price'])

In [18]:
# Build the path with os.path.join so it resolves on every OS; the former raw
# backslash-separated string only worked on Windows.
external = pd.read_csv(
    os.path.join('submissions', 'use_external_data', 'external_data.csv'),
    header=0,
)

In [21]:
external['Date'] = pd.to_datetime(external['Date'])

# Enrich the external table step by step with left joins: both delay tables
# (per day and airport), the oil series (per day), and the yearly airport
# attributes (per year and airport).
external_data = (
    external
    .merge(delay1, how='left', on=['Date', 'AirPort'], sort=False)
    .merge(delay2, how='left', on=['Date', 'AirPort'], sort=False)
    .merge(oil, how='left', on=['Date'], sort=False)
    .assign(Year=lambda frame: frame['Date'].dt.year)
    .merge(df_merged, how='left', on=['Year', 'AirPort'], sort=False)
)

In [24]:
# List the columns of the merged table (the cell output follows).
external_data.columns

Index(['Date', 'AirPort', 'Max TemperatureC', 'Mean TemperatureC',
       'Min TemperatureC', 'Dew PointC', 'MeanDew PointC', 'Min DewpointC',
       'Max Humidity', 'Mean Humidity', 'Min Humidity',
       'Max Sea Level PressurehPa', 'Mean Sea Level PressurehPa',
       'Min Sea Level PressurehPa', 'Max VisibilityKm', 'Mean VisibilityKm',
       'Min VisibilitykM', 'Max Wind SpeedKm/h', 'Mean Wind SpeedKm/h',
       'Max Gust SpeedKm/h', 'Precipitationmm', 'CloudCover', 'Events',
       'WindDirDegrees', 'Total_ops', 'Total Delays', 'Avg_delay_time',
       'Total_delay_time', 'Actual Departures', 'Actual Arrivals',
       'Departure Cancellations', 'Arrival Cancellations', 'Delayed Arrivals',
       'Average Delay Per Delayed Arrival', 'Unnamed: 0', 'Price', 'oil_price',
       'Year', 'elevation_ft', 'iso_region', 'City', 'coordinates',
       'StateCodes', 'State', 'Region', 'Division', 'Coast', 'Great Lakes',
       'StPOP', 'StRBirth', 'StRDeath', 'StRMig', 'GDP', 'POP', 'RPI'],


In [26]:
# Split the "a,b" coordinates string at the first comma into two float columns.
# NOTE(review): the first component is stored as 'Latitude' and the second as
# 'longitude' — confirm this matches the component order used in
# airport-codes.txt. The column names (including their inconsistent
# capitalisation) are left unchanged for downstream consumers.
d = external_data.coordinates.str.split(',', n=1, expand=True)
external_data['Latitude'] = d[0].astype(float)
external_data['longitude'] = d[1].astype(float)

# Build the path with os.path.join so it resolves on every OS; the former raw
# backslash-separated string only worked on Windows.
external_data.to_csv(
    os.path.join('submissions', 'use_external_data', 'external_data_mod.csv')
)