In [5]:
import pandas as pd
import psycopg2
import pandas.io.sql as sqlio
import requests
import pickle
import config as cfg

class Data_Gathering:
    '''
    Helpers for pulling a table from PostgreSQL and weather summaries from the
    Dark Sky endpoint on RapidAPI.

    Connection settings and the API key are read from the project `config`
    module (imported as `cfg`): `cfg.db` and `cfg.rapid_api`.
    '''

    def __init__(self, df):
        '''
        input: `df`, stored as `self.df`; the methods of this class take their
               inputs as explicit arguments and do not read it
        '''
        self.df = df

    def get_table(self, sql_str):
        '''
        query the database and return a dataframe with given sql query string
        input: `sql_str`, query string
        return: a dataframe
        '''
        db = cfg.db
        con = psycopg2.connect(database=db['database'], user=db['user'],
                               password=db['pwd'], host=db['host'],
                               port=db['port'])
        try:
            print("Database opened successfully")
            # The query is sent exactly as given. Wrapping it in literal
            # triple quotes would make the server see a quoted string
            # constant instead of the statement itself.
            return sqlio.read_sql_query(sql_str, con)
        finally:
            # Always release the connection, including when the query fails.
            con.close()

    def get_weather(self, df, timeout=30):
        '''
        request weather information
        input: a data frame in which
            - first column: date
            - second column: location in "latitude, longitude"
               `timeout`: seconds to wait for each HTTP request (default 30)
        output: a dictionary {'weather': [...]} holding one summary per row of
                `df`, in row order; 'NA' marks rows whose summary could not be
                retrieved (request error, non-200 status, or unexpected body)
        '''
        key = cfg.rapid_api['key']
        base_url = "https://dark-sky.p.rapidapi.com"
        headers = { 'x-rapidapi-host': "dark-sky.p.rapidapi.com",
                    'x-rapidapi-key': key }

        summaries = []
        for count, row in enumerate(df.values, start=1):
            print('Query Count: ', count)
            date = row[0]
            ll = row[1]
            # Every request targets 13:00:00 on the row's date.
            url = base_url + '/' + ll + ',' + date + 'T' + '13:00:00'
            summaries.append(self._fetch_summary(url, headers, timeout))
        return { 'weather': summaries }

    def _fetch_summary(self, url, headers, timeout):
        '''
        GET `url` and return the `currently.summary` field of the JSON body,
        or 'NA' when it cannot be obtained.
        '''
        try:
            res = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            # A failed request is treated like a non-200 answer so that one
            # bad call does not discard the results gathered so far.
            return 'NA'
        if res.status_code != 200:
            return 'NA'
        try:
            return res.json()['currently']['summary']
        except (KeyError, TypeError, ValueError):
            # Body is not JSON or lacks the expected fields.
            return 'NA'


In [2]:
# Load the flights sample and the US airports table from the data folder.
df, df_air = (
    pd.read_csv('../data/flights_samp.csv'),
    pd.read_csv('../data/airports_usa.csv'),
)

In [92]:
# Keep only the airport code and its coordinates, and rename the code column
# to `origin` so it matches the key used when merging with `df`.
df_air = (
    df_air
    .loc[:, ['IATA_CODE', 'LATITUDE', 'LONGITUDE']]
    .rename(columns={'IATA_CODE': 'origin'})
)

In [94]:
# The 20 airport codes used to filter `df.origin` when building `df_origin`.
top20_airport_code = [
    'LAX', 'ORD', 'EWR', 'SFO', 'LGA',
    'DFW', 'LAS', 'CLT', 'DEN', 'PHL',
    'IAH', 'SEA', 'ATL', 'PHX', 'MCO',
    'DTW', 'SLC', 'BOS', 'JFK', 'MSP',
]

In [95]:
# Attach each flight's origin coordinates, fold them into one
# "latitude,longitude" string column `ll`, then drop the separate columns.
df = (
    df
    .merge(df_air, on='origin', how='left')
    .assign(ll=lambda merged: merged.LATITUDE.astype('str') + ',' + merged.LONGITUDE.astype('str'))
    .drop(columns=['LATITUDE', 'LONGITUDE'])
)

In [96]:
# Unique (date, location) pairs for flights whose origin is one of the
# selected airports, re-indexed from 0.
df_origin = (
    df
    .loc[df.origin.isin(top20_airport_code), ['fl_date', 'll']]
    .drop_duplicates()
    .reset_index(drop=True)
)