In [1]:
%matplotlib inline
import os
import importlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Airport geography: keep the US airports that appear as a departure in the training data.
init_Data = data = pd.read_csv(os.path.join('data', 'train.csv.bz2'))
airportname = init_Data['Departure'].unique()

airport = pd.read_csv("airport_data/airport-codes.txt", sep=",")
# One combined mask: local code seen in the training data AND located in the US.
_is_wanted = airport['local_code'].isin(airportname) & (airport['iso_country'] == 'US')
airport = (
    airport[_is_wanted]
    .reset_index()
    # The part of iso_region after the dash (e.g. 'US-GA' -> 'GA').
    .assign(StateCodes=lambda frame: frame['iso_region'].str.split('-', expand=True).iloc[:, 1])
    .rename(columns={'municipality': 'City'})
    # Frame-wide value replacement, exactly as wide as the original call.
    .replace('Dallas-Fort Worth', 'Dallas')
)

In [3]:
#https://www.feiertagskalender.ch/ferien.php?geo=3537&jahr=2012&klasse=0&hl=en
# Dates are handled as datetime.date objects.
# pandas datetime columns must be converted to date first: df[''].dt.date
import datetime as dt
import holidays

# Holiday lookup: two date-range slices of holidays.US(), combined with `+`.
# NOTE(review): the 2012 range lies entirely inside the first range - confirm
# whether the second slice is actually needed.
_holidays_full_span = holidays.US()[dt.date(2011, 7, 1):dt.date(2013, 6, 5)]
_holidays_2012 = holidays.US()[dt.date(2012, 1, 1):dt.date(2012, 12, 31)]
Holidays_US = _holidays_full_span + _holidays_2012

# School breaks: one row per break, with 'start' and 'end' turned into datetime.date.
school_break = pd.read_csv('schoolholiday.csv')
for _bound in ('start', 'end'):
    school_break.loc[:, _bound] = pd.to_datetime(school_break.loc[:, _bound]).dt.date

def nextworkday(date):
    """Return the number of days from ``date`` to the next working day.

    A candidate day is skipped while its ``weekday()`` is in
    ``holidays.WEEKEND`` or the day itself is in ``Holidays_US``.
    The search starts at the day after ``date``, so the result is at least 1.
    """
    step = dt.timedelta(days=1)
    candidate = date + step
    while True:
        on_weekend = candidate.weekday() in holidays.WEEKEND
        if not (on_weekend or candidate in Holidays_US):
            break
        candidate = candidate + step
    return (candidate - date).days
    
def lastworkday(date):
    """Return the number of days from the previous working day to ``date``.

    Walks backwards from the day before ``date``, skipping every day whose
    ``weekday()`` is in ``holidays.WEEKEND`` or that is in ``Holidays_US``.
    The result is at least 1.
    """
    step = dt.timedelta(days=1)
    candidate = date - step
    while True:
        on_weekend = candidate.weekday() in holidays.WEEKEND
        if not (on_weekend or candidate in Holidays_US):
            break
        candidate = candidate - step
    return (date - candidate).days

def schoolbreak(date):
    """Return ``(lastschoolday, nextschoolday)`` day counts for ``date``.

    Both counts start from ``lastworkday(date)`` and ``nextworkday(date)``.
    They are then overridden from the rows of ``school_break``:

    * ``date`` inside a break (start and end inclusive): both counts are
      measured to the day before ``start`` and the day after ``end``.
    * ``date`` is the day right after a break: only ``lastschoolday`` is
      measured back to the day before ``start``.
    * ``date`` is the day right before a break: only ``nextschoolday`` is
      measured forward to the day after ``end``.

    Every row of ``school_break`` is examined, including the final one.
    Rows whose ``start`` or ``end`` is missing are skipped.

    Parameters
    ----------
    date : datetime.date
        Day to evaluate; compared directly with the ``start``/``end`` values.

    Returns
    -------
    tuple of (int, int)
        ``(lastschoolday, nextschoolday)``.
    """
    one_day = dt.timedelta(days=1)
    lastschoolday = lastworkday(date)
    nextschoolday = nextworkday(date)
    # Pair the columns positionally so the loop does not depend on the frame's
    # index labels and covers all rows, the last break included.
    for start, end in zip(school_break['start'], school_break['end']):
        if pd.isna(start) or pd.isna(end):
            # An unparsable or empty row cannot describe a break.
            continue
        if date >= start and date <= end:
            lastschoolday = (date - start + one_day).days
            nextschoolday = (end + one_day - date).days
        elif date == end + one_day:
            lastschoolday = (date - start + one_day).days
        elif date == start - one_day:
            nextschoolday = (end + one_day - date).days
    return lastschoolday, nextschoolday


In [4]:
# Census, geography and economy data per state
# https://www.kaggle.com/lislejoem/us_energy_census_gdp_10-14
steco = pd.read_csv('state-eco.csv')
# 'StateCodes' stays a regular column here: stgeo selects it by name below and
# stcensus sets it as its index explicitly. (A bare steco.set_index(...) call
# returns a new frame and has no effect unless its result is assigned.)
# Drop every column whose name mentions 2010 or 2014.
steco = steco.drop(
    columns=[name for name in steco.columns if '2010' in name or '2014' in name]
)

# Geographic descriptors of each state.
stgeo = steco[['StateCodes', 'State', 'Region', 'Division', 'Coast', 'Great Lakes']]

# Census columns: population, birth/death rates and net migration, plus the key.
_census_tags = ('POP', 'RBIRTH', 'RDEATH', 'StateCodes', 'RNETMIG')
col = [name for name in steco.columns if any(tag in name for tag in _census_tags)]
stcensus = steco[col].set_index(['StateCodes'])


In [5]:
# City-level census and economy data
#https://apps.bea.gov/regional/histdata/releases/0615rpi/index.cfm
#https://apps.bea.gov/itable/iTable.cfm?ReqID=70&step=1#

city = airport['City'].unique()


def _city_table(path, name_col, value_cols, renames):
    """Load one city-level CSV restricted to the airport cities.

    Reads ``path``, keeps the rows whose ``name_col`` value is in ``city``,
    resets the index, selects ``name_col`` plus ``value_cols`` and renames
    the columns according to ``renames``.
    """
    table = pd.read_csv(path)
    table = table[table[name_col].isin(city)].reset_index()
    table = table[[name_col] + list(value_cols)]
    return table.rename(columns=renames)


citygdp = _city_table(
    'citygdp.csv', 'GeoName', ["GDP2011", "GDP2012", "GDP2013"],
    {"GeoName": 'City'},
)

cityincome = _city_table(
    'cityincome.csv', 'GeoName', ["RPI2011", "RPI2012", "RPI2013"],
    {"GeoName": 'City'},
)

citycensus = _city_table(
    'citycensus.csv', 'city', ["2011", "2012", "2013"],
    {'city': 'City', "2011": 'POP2011', "2012": 'POP2012', "2013": 'POP2013'},
)

In [6]:
# Left-join the state tables (on 'StateCodes') and then the city tables
# (on 'City') onto the airport frame, one after another.
_join_plan = [
    (stgeo, 'StateCodes'),
    (stcensus, 'StateCodes'),
    (citygdp, 'City'),
    (citycensus, 'City'),
    (cityincome, 'City'),
]
df_merged = airport
for _right, _key in _join_plan:
    df_merged = pd.merge(df_merged, _right, how='left', on=[_key], sort=False)


In [7]:
def _year_slice(frame, year, all_years=(2011, 2012, 2013)):
    """Return a copy of ``frame`` reduced to the columns of one ``year``.

    Columns whose name contains any other year from ``all_years`` are dropped,
    a ``Year`` column holding ``year`` is appended, and the remaining
    year-suffixed columns are renamed to their year-free names.
    """
    other_years = [str(y) for y in all_years if y != year]
    sliced = frame.drop(
        [name for name in frame.columns if any(tag in name for tag in other_years)],
        axis=1,
    )
    sliced['Year'] = year
    return sliced.rename(columns={
        f'GDP{year}': 'GDP',
        f'POP{year}': 'POP',
        f'RPI{year}': 'RPI',
        f'POPESTIMATE{year}': 'StPOP',
        f'RBIRTH{year}': 'StRBirth',
        f'RDEATH{year}': 'StRDeath',
        f'RNETMIG{year}': 'StRMig',
    })


# One frame per year, then stacked row-wise (original row labels are kept).
df1, df2, df3 = [_year_slice(df_merged, _year) for _year in (2011, 2012, 2013)]

df_merged = pd.concat([df1, df2, df3], axis=0)

In [8]:
# Display the frame obtained by stacking the 2011, 2012 and 2013 slices.
df_merged

Unnamed: 0,index,ident,type,name,elevation_ft,continent,iso_country,iso_region,City,gps_code,iata_code,local_code,coordinates,StateCodes,State,Region,Division,Coast,Great Lakes,StPOP,StRBirth,StRDeath,StRMig,GDP,POP,RPI,Year
0,26134,KATL,large_airport,Hartsfield Jackson Atlanta International Airport,1026.0,,US,US-GA,Atlanta,KATL,ATL,ATL,"33.6367, -84.428101",GA,Georgia,3.0,5.0,1.0,0.0,9813201,13.58401,7.280748,3.375007,286108,437812,40064,2011
1,26260,KBOS,large_airport,General Edward Lawrence Logan International Ai...,20.0,,US,US-MA,Boston,KBOS,BOS,BOS,"42.36429977, -71.00520325",MA,Massachusetts,1.0,1.0,1.0,0.0,6612270,11.107938,8.198178,4.789037,341225,630505,50687,2011
2,26421,KCLT,large_airport,Charlotte Douglas International Airport,748.0,,US,US-NC,Charlotte,KCLT,CLT,CLT,"35.2140007019043, -80.94309997558594",NC,North Carolina,3.0,5.0,1.0,0.0,9651502,12.601329,8.323153,5.047944,124937,754829,40103,2011
3,26563,KDEN,large_airport,Denver International Airport,5431.0,,US,US-CO,Denver,KDEN,DEN,DEN,"39.861698150635, -104.672996521",CO,Colorado,4.0,8.0,0.0,0.0,5119661,12.965474,6.297848,6.933159,160492,620530,45280,2011
4,26568,KDFW,large_airport,Dallas Fort Worth International Airport,607.0,,US,US-TX,Dallas,KDFW,DFW,DFW,"32.896801, -97.038002",TX,Texas,3.0,7.0,1.0,0.0,25657477,14.998902,6.602926,7.675275,402824,1218282,42646,2011
5,26616,KDTW,large_airport,Detroit Metropolitan Wayne County Airport,645.0,,US,US-MI,Detroit,KDTW,DTW,DTW,"42.212398529052734, -83.35340118408203",MI,Michigan,2.0,3.0,0.0,1.0,9875736,11.541682,9.100641,-2.546041,206508,705118,40025,2011
6,27090,KEWR,large_airport,Newark Liberty International Airport,18.0,,US,US-NJ,Newark,KEWR,EWR,EWR,"40.692501068115234, -74.168701171875",NJ,New Jersey,1.0,2.0,1.0,0.0,8842614,12.091446,8.054088,0.454262,1368438,277574,44793,2011
7,27600,KIAH,large_airport,George Bush Intercontinental Houston Airport,97.0,,US,US-TX,Houston,KIAH,IAH,IAH,"29.984399795532227, -95.34140014648438",TX,Texas,3.0,7.0,1.0,0.0,25657477,14.998902,6.602926,7.675275,441736,2126032,46307,2011
8,27683,KJFK,large_airport,John F Kennedy International Airport,13.0,,US,US-NY,New York,KJFK,JFK,JFK,"40.63980103, -73.77890015",NY,New York,1.0,2.0,1.0,1.0,19521745,12.491813,7.694653,1.49435,1368438,8272948,44793,2011
9,27797,KLAS,large_airport,McCarran International Airport,2181.0,,US,US-NV,Las Vegas,KLAS,LAS,LAS,"36.08010101, -115.1520004",NV,Nevada,4.0,8.0,0.0,0.0,2718586,13.221128,7.45323,-0.080781,85173,586606,34324,2011
