In [35]:
import numpy as np
import pandas as pd

In [37]:
import pandas as pd
import psycopg2
import pandas.io.sql as sqlio
import requests
import pickle
import config as cfg

class Data_Gathering:
    '''
    Helpers that pull data from the project's PostgreSQL database and weather
    summaries from the Dark Sky API exposed through RapidAPI.

    Connection settings and the API key are read from the `cfg` module each
    time a method is called.
    '''

    # Value recorded whenever a weather summary cannot be obtained.
    _MISSING = 'NA'

    def __init__(self, df):
        '''
        keep a reference to `df` on the instance
        (the methods of this class do not read it)
        '''
        self.df = df

    def get_table(self, sql_str):
        '''
        query the database and return a dataframe with given sql query string
        input: `sql_str`, query string (sent to the server unchanged)
        return: a dataframe
        '''
        db = cfg.db
        con = psycopg2.connect(database=db['database'], user=db['user'],
                               password=db['pwd'], host=db['host'], port=db['port'])

        print("Database opened successfully")

        # The query is passed as-is: wrapping it in extra quote characters
        # makes the server see one string literal instead of a statement.
        try:
            return sqlio.read_sql_query(sql_str, con)
        finally:
            # Release the connection even when the query fails.
            con.close()

    def get_weather(self, df, timeout=30):
        '''
        request weather information, one API call per row of `df`
        input:
            `df`, a data frame in which
                - first column: date
                - second column: location in "latitude,longitude"
                - third column: time of day
              (the three values are joined into the request path as
               "<location>,<date>T<time>")
            `timeout`, seconds to wait for each HTTP response
        output: a dictionary {'weather': [...]} holding one summary per row,
            in row order; 'NA' marks rows whose request failed, returned a
            non-200 status, or whose response carried no summary
        '''
        key = cfg.rapid_api['key']
        base_url = "https://dark-sky.p.rapidapi.com"
        headers = { 'x-rapidapi-host': "dark-sky.p.rapidapi.com",
                    'x-rapidapi-key': key }

        summaries = []
        for count, row in enumerate(df.values, start=1):
            print('Query Count: ', count)
            date, ll, time = row[0], row[1], row[2]
            url = f"{base_url}/{ll},{date}T{time}"
            summaries.append(self._fetch_summary(url, headers, timeout))
        return { 'weather': summaries }

    def _fetch_summary(self, url, headers, timeout):
        '''
        fetch one weather summary string, or 'NA' when it is not available
        '''
        try:
            res = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            # One failed request must not discard the rows already collected.
            return self._MISSING

        if res.status_code != 200:
            return self._MISSING

        try:
            return res.json()['currently']['summary']
        except (KeyError, TypeError, ValueError):
            # Body was not JSON, or lacks the 'currently' / 'summary' fields.
            return self._MISSING


In [38]:
# Load the sampled flights table and the US airports table, in that order.
df, df_air = map(
    pd.read_csv,
    ['../../data/flights_samp.csv', '../../data/airports_usa.csv'],
)

In [10]:
# Keep only the airport code and its coordinates.
# NOTE: the code column keeps the name IATA_CODE here; it is not renamed to `origin`.
df_air = df_air.loc[:, ['IATA_CODE', 'LATITUDE', 'LONGITUDE']]
df_air

Unnamed: 0,IATA_CODE,LATITUDE,LONGITUDE
0,ABE,40.65236,-75.44040
1,ABI,32.41132,-99.68190
2,ABQ,35.04022,-106.60919
3,ABR,45.44906,-98.42183
4,ABY,31.53552,-84.19447
...,...,...,...
317,WRG,56.48433,-132.36982
318,WYS,44.68840,-111.11764
319,XNA,36.28187,-94.30681
320,YAK,59.50336,-139.66023


In [11]:
# Airport codes used below to filter flights by their `origin` column.
top20_airport_code = [
    'LAX', 'ORD', 'EWR', 'SFO', 'LGA',
    'DFW', 'LAS', 'CLT', 'DEN', 'PHL',
    'IAH', 'SEA', 'ATL', 'PHX', 'MCO',
    'DTW', 'SLC', 'BOS', 'JFK', 'MSP',
]

In [15]:
import help_functions as hf

In [95]:
# Attach the origin airport's coordinates to every flight.
# The airport table keys its rows by IATA_CODE, so that column is renamed to
# `origin` for the join; the rename is a no-op if the column already carries
# that name, which keeps this cell runnable on a fresh kernel either way.
df = df.merge(df_air.rename(columns={'IATA_CODE': 'origin'}), on='origin', how='left')

# Location as a single "latitude,longitude" string. Flights whose origin has
# no match in the airport table end up with "nan,nan".
df['ll'] = df.LATITUDE.astype('str') + ',' + df.LONGITUDE.astype('str')

# Reassign instead of dropping in place so the cell has no hidden mutation.
df = df.drop(columns=['LATITUDE', 'LONGITUDE'])

In [96]:
# Distinct (flight date, location) pairs for flights leaving the listed airports.
df_origin = (
    df.loc[df.origin.isin(top20_airport_code), ['fl_date', 'll']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [116]:
def get_avg_delay(df, col_list):
    '''
    average each column of `col_list` per destination
    input:
        `df`, a data frame with a `dest` column and the columns in `col_list`
        `col_list`, list of column names to average
    return: a dataframe with one row per distinct `dest` (in order of first
        appearance) and one `avg_<column>` column per entry of `col_list`
    side effect: missing values in the `col_list` columns of `df` are
        replaced by 0 in the frame that was passed in, before averaging
    '''
    # Missing values count as zero; this writes into the caller's frame.
    df.loc[:, col_list] = df.loc[:, col_list].fillna(0)

    result = pd.DataFrame(df.dest.unique(), columns=['dest'])
    by_dest = df.groupby('dest')
    for delay_col in col_list:
        col_mean = by_dest[delay_col].mean().rename('avg_' + delay_col)
        # Left join keeps the row order of `result`.
        result = result.merge(col_mean.to_frame(), on='dest', how='left')
    return result

In [117]:
# Delay columns that are averaged per destination.
delay_col_list = [
    'carrier_delay',
    'weather_delay',
    'nas_delay',
    'security_delay',
    'late_aircraft_delay',
]

In [118]:
# Per-destination averages of every delay column (also fills NaNs in `df`).
get_avg_delay(df, col_list=delay_col_list)

Unnamed: 0,dest,avg_carrier_delay,avg_weather_delay,avg_nas_delay,avg_security_delay,avg_late_aircraft_delay
0,ORD,5.937238,0.158996,3.987448,0.037657,7.288703
1,AZO,0.000000,16.000000,0.000000,0.000000,3.333333
2,PHX,3.953271,0.654206,1.859813,0.457944,6.775701
3,MSY,9.085714,0.000000,2.600000,0.000000,8.542857
4,DCA,4.323077,0.123077,6.384615,0.000000,4.507692
...,...,...,...,...,...,...
274,ABY,0.000000,0.000000,0.000000,0.000000,0.000000
275,PGV,0.000000,0.000000,0.000000,0.000000,0.000000
276,PQI,0.000000,0.000000,0.000000,0.000000,0.000000
277,AVP,0.000000,0.000000,0.000000,0.000000,0.000000


In [90]:
# Mean weather delay per destination, as a named Series and as a one-column frame.
s = df.groupby('dest')['weather_delay'].mean().rename('avg_weather_delay')
df2 = s.to_frame()

In [91]:
# Show the first row only.
df2.iloc[:1]

Unnamed: 0_level_0,avg_weather_delay
dest,Unnamed: 1_level_1
ABE,0.0


In [93]:
# NOTE(review): `df1` is not defined anywhere in the notebook as saved, so this
# cell raises NameError on a fresh kernel. The displayed result has an
# `avg_carrier_delay` column indexed by `dest`, so `df1` is rebuilt here the
# same way `df2` is built for `weather_delay` - confirm this matches the
# frame that was originally used.
df1 = df.groupby('dest')['carrier_delay'].mean().rename('avg_carrier_delay').to_frame()
df1.merge(df2, on='dest')

Unnamed: 0_level_0,avg_carrier_delay,avg_weather_delay
dest,Unnamed: 1_level_1,Unnamed: 2_level_1
ABE,8.000000,0.000000
ABI,0.000000,0.000000
ABQ,2.307692,0.615385
ABR,0.000000,0.000000
ABY,0.000000,0.000000
...,...,...
TYS,4.277778,0.000000
USA,0.000000,0.000000
VPS,0.000000,0.000000
XNA,0.000000,54.714286
