In [14]:
import pandas as pd
import psycopg2
import pandas.io.sql as sqlio
import requests
import pickle

import config as cfg

In [90]:
# Read the two inputs from the shared data folder: the airport reference
# table and the sampled flights table.
df_air = pd.read_csv('../data/airports_usa.csv')
df = pd.read_csv('../data/flights_samp.csv')

In [91]:
# Confirm the dimensions (rows, columns) of the flight sample just loaded.
df.shape

(5000, 43)

In [3]:
# Pull the database connection settings from the local config module.
host, port, user, pwd, database = (
    cfg.db[field] for field in ('host', 'port', 'user', 'pwd', 'database')
)

# Open the connection. psycopg2.connect raises when it cannot connect, so the
# message below is only reached on success.
con = psycopg2.connect(
    host=host,
    port=port,
    user=user,
    password=pwd,
    database=database,
)
print("Database opened successfully")

Database opened successfully


In [29]:
# Query selecting every column of `flights` for flight dates from 2018-12-27
# through 2019-01-12 (both bounds inclusive).
sql = '''
SELECT * from flights 
where fl_date >= '2018-12-27' and fl_date <= '2019-01-12';
'''

# One-off extraction, kept commented out: it runs `sql` over `con`, saves the
# full result, then saves a seeded 5,000-row sample of it.
# `reset_index()` is called without drop=True, so the sampled rows keep their
# original row number in an extra `index` column.
# NOTE(review): the sample is written to 'flights_samp.csv' here, while the
# loading cell reads '../data/flights_samp.csv' -- confirm where the file is
# expected to live.
# data = sqlio.read_sql_query(sql, con)
# data.to_csv('flights.csv', index=False)
# data_samp = data.sample(n = 5000, random_state=0).reset_index()
# data_samp.to_csv('flights_samp.csv', index=False)

In [74]:
def to_hhmmss(df_time_col):
    '''
    Change the format of the time in hhmm
    to hh:mm:ss, where ss is 00 in this case.

    Times stored as numbers lose their leading zeros (00:05 is stored as 5,
    09:30 as 930), so every value is left-padded with zeros to four digits
    before it is split into hours and minutes. Padding uniformly avoids
    guessing whether a short value means hours or minutes.

    Input: `df_time_col`: a Pandas Series of hhmm values
        (assumed to be whole numbers with no missing entries -- TODO confirm)
    Return: a list of `hh:mm:ss` strings, one per input value, in input order
    '''
    # Restore the leading zeros that the numeric representation dropped.
    padded = (raw.zfill(4) for raw in df_time_col.astype('str'))
    # First two characters are the hour; the remainder is the minutes.
    return [digits[:2] + ':' + digits[2:] + ':00' for digits in padded]

In [75]:
# Store an hh:mm:ss rendering of the `crs_dep_time` column next to the original.
df['crs_dep_time_hhmm'] = to_hhmmss(df['crs_dep_time'])

In [None]:
def to_city_state(city_state_col):
    '''
    Change the format of `city1/city2, state`
    to `city1,state` (no space after the comma).

    The result keeps the index of the input, so it can be assigned straight
    back onto the frame the column came from, even after filtering.
    Entries that are not strings (e.g. missing values) are returned unchanged.

    Input: A Pandas Series
    Output: A Pandas Series with the same index as the input
    '''
    def _first_city(value):
        # Keep only the first city of a `/`-separated group, plus the state.
        if not isinstance(value, str):
            return value
        parts = value.split(',')
        return parts[0].split('/')[0] + ',' + parts[-1].strip()

    return city_state_col.map(_first_city)

In [92]:
# Keep only the airport code and its coordinates, and rename the code column
# to `origin` so it matches the join key used on the flights table.
df_air = (
    df_air.loc[:, ['IATA_CODE', 'LATITUDE', 'LONGITUDE']]
          .rename(columns={'IATA_CODE': 'origin'})
)

In [93]:
# Peek at the first row to check the trimmed and renamed airport columns.
df_air.head(1)

Unnamed: 0,origin,LATITUDE,LONGITUDE
0,ABE,40.65236,-75.4404


In [94]:
# Codes of the 20 origin airports for which weather is looked up.
# NOTE(review): the ranking behind "top 20" is not shown here -- confirm the source.
top20_airport_code = [
    'LAX', 'ORD', 'EWR', 'SFO', 'LGA',
    'DFW', 'LAS', 'CLT', 'DEN', 'PHL',
    'IAH', 'SEA', 'ATL', 'PHX', 'MCO',
    'DTW', 'SLC', 'BOS', 'JFK', 'MSP',
]

In [95]:
# Attach each flight's origin-airport coordinates (a left join keeps every flight).
df = df.merge(df_air, how='left', on='origin')
# df_air_origin = df_air[df_air.origin.isin(top20_airport_code)]
# Collapse the coordinates into one "latitude,longitude" string column ...
df['ll'] = df['LATITUDE'].astype('str') + ',' + df['LONGITUDE'].astype('str')
# ... and discard the two separate coordinate columns.
df = df.drop(columns=['LATITUDE', 'LONGITUDE'])

In [96]:
# Unique (flight date, location) pairs for flights leaving the selected
# airports: one row per combination, renumbered from 0.
df_origin = (
    df.loc[df['origin'].isin(top20_airport_code), ['fl_date', 'll']]
      .drop_duplicates()
      .reset_index(drop=True)
)

In [83]:
def get_weather(df, timeout=30):
    '''
    Request weather information, one API call per row of `df`.

    Each row is queried at 13:00:00 on its date. The value stored is the
    `currently.summary` field of the response. Whenever a lookup cannot
    produce that value (network failure or timeout, non-200 status, response
    that is not JSON, or JSON without the expected fields) the placeholder
    'NA' is stored instead, so the result always has one entry per input row,
    in row order.

    input:
        - df: a data frame in which
            - first column: date, as a string
            - second column: location in "latitude,longitude"
        - timeout: seconds to wait for each request before giving up
    output: a dictionary {'weather': [summary or 'NA', ...]}
    '''
    key = cfg.rapid_api['key']
    base_url = "https://dark-sky.p.rapidapi.com"
    headers = { 'x-rapidapi-host': "dark-sky.p.rapidapi.com",
                'x-rapidapi-key': key }
    weather_dict = { 'weather':[] }

    for count, row in enumerate(df.values, start=1):
        print('Query Count: ', count)
        date, ll = row[0], row[1]
        url = base_url + '/' + ll + ',' + date + 'T' + '13:00:00'

        # Start from the placeholder; it is only replaced on a fully successful lookup.
        weather = 'NA'
        try:
            # Without a timeout a stalled connection would block the loop forever.
            res = requests.get(url, headers=headers, timeout=timeout)
            if res.status_code == 200:
                weather = res.json()['currently']['summary']
        except requests.RequestException:
            # Network-level failure: keep the placeholder rather than losing
            # the results already collected for earlier rows.
            pass
        except (ValueError, KeyError, TypeError):
            # Body was not JSON, or did not have the expected structure.
            pass
        weather_dict['weather'].append(weather)
    return weather_dict

In [101]:
def merge_df_dict(df_time_ll, dic, df_full=None):
    '''
    Make a new dataframe with weather column,
    then merge the new dataframe with original dataframe

    input:
        - df_time_ll: the dataframe where the `dic` is going to be
          concatenated on; must contain the `fl_date` and `ll` columns
        - dic: a dictionary of column name -> list of values, with one value
          per row of `df_time_ll`, in the same row order
        - df_full: the dataframe to join the result onto; when omitted, the
          notebook-level `df` is used
    return: The joined version of original dataframe (a new dataframe; the
        input dataframes are not modified)
    '''
    if df_full is None:
        # Read the notebook-level frame. It must not be assigned to inside this
        # function: that would make `df` a local name and fail on first use.
        df_full = df

    # The values in `dic` are matched to rows by position, so the key frame is
    # renumbered from 0 to line up with the frame built from the dictionary.
    lookup = pd.concat(
        [df_time_ll.reset_index(drop=True), pd.DataFrame.from_dict(dic)],
        axis=1,
    )
    return df_full.merge(lookup, on=['fl_date', 'll'], how='left')

In [104]:
# Inspect the first rows of the flights table.
# NOTE(review): the saved output includes a `weather` column, which no visible
# cell assigns to `df` -- presumably `df` was replaced by the result of
# merge_df_dict in a cell that is no longer in the notebook; confirm.
df.head()

Unnamed: 0,index,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,ll,weather
0,244649,2019-01-07,AA,AA,AA,362,AA,N161AA,362,12892,...,,,,,,,,,"33.94254,-118.40807",Overcast
1,190528,2019-01-04,UA,UA_CODESHARE,UA,3788,ZW,N437AW,3788,13930,...,,,,,,,,,"41.9796,-87.90446",Partly Cloudy
2,50595,2018-12-29,WN,WN,WN,5741,WN,N7738A,5741,13871,...,,,,,,,,,"41.30252,-95.89417",
3,116214,2019-01-01,WN,WN,WN,1641,WN,N423WN,1641,15304,...,,,,,,,,,"27.97547,-82.53325",
4,134474,2019-01-02,UA,UA_CODESHARE,UA,4233,EV,N14558,4233,11618,...,0.0,22.0,0.0,0.0,,,,,"40.6925,-74.16866",Mostly Cloudy


In [103]:
# Save the weather data to disk so it can be reloaded without new API calls.
# NOTE(review): `weather` is not assigned in any cell shown here -- presumably
# it is the dictionary returned by get_weather(df_origin); confirm before
# running on a fresh kernel.
# The `with` block closes the file even if pickling fails, so the data is
# flushed and the handle is not leaked.
with open('weather.pkl', 'wb') as file:
    pickle.dump(weather, file)