In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from sklearn.preprocessing import StandardScaler
import config as cfg

In [2]:
# Load the flights CSV into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/flights_samp.csv')

In [18]:
def split_numeric_categorical(df, numeric=True):
    '''
    Select the columns of one kind from a data frame.

    input: df - a data frame
           numeric - default True; when truthy the numeric columns are
                     returned, otherwise the columns of dtype `category`
    output: a data frame holding only the selected columns
    '''
    if numeric:
        return df.select_dtypes(include=[np.number])
    return df.select_dtypes(include='category')

In [82]:
def make_categorical(df, cols):
    '''
    Cast the columns named in `cols` to the pandas `category` dtype.
    input: df - data frame, cols - a list of column names to convert
    output: a new data frame with those columns converted; the data frame
            passed in is left as it was
    '''
    return df.astype(dict.fromkeys(cols, 'category'))

In [79]:
# Column names handed to make_categorical in the next cell.
cols = ['mkt_carrier', 'tail_num']

In [80]:
# `df1` receives the converted frame returned by make_categorical; `df` keeps its original dtypes.
df1 = make_categorical(df, cols)

In [83]:
def to_hhmmss(df_time_col):
    '''
    Change the format of the time in hhmm
    to hh:mm:ss, where ss is 00 in this case.

    hhmm values stored as numbers lose their leading zeros (00:05 is stored
    as 5, 00:15 as 15, 09:30 as 930), so every value is left-padded with
    zeros to four digits before it is split into hours and minutes.

    Input: `df_time_col`: a Pandas Series (or any sequence) of hhmm values
        given as integers, floats or digit strings
    Return: a list of 'hh:mm:00' strings, one per input value and in the
        same order; a missing or non-numeric value yields None
    '''
    # Coerce to numbers first so floats ('1430.0') and NaN are handled
    # instead of being sliced as raw text.
    numeric_times = pd.to_numeric(df_time_col, errors='coerce')

    hhmm = []
    for value in numeric_times:
        if pd.isna(value):
            hhmm.append(None)
            continue
        digits = str(int(value)).zfill(4)
        hhmm.append(digits[:2] + ':' + digits[2:] + ':00')
    return hhmm

In [84]:
# Store crs_dep_time, reformatted by to_hhmmss, in a new column of `df`.
df['crs_dep_time_hhmm'] = to_hhmmss(df['crs_dep_time'])

In [85]:
def to_city_state(city_state_col):
    '''
    Change the format of `city1/city2, state`
    to `city1,state` (first city only, no space after the comma).

    Input: A Pandas Series of strings
    Output: A Pandas Series with the same index as the input, so it can be
        assigned back to the originating data frame without misalignment.
        Missing values stay missing.
    '''
    parts = city_state_col.str.split(',')
    # Text before the first comma, cut at the first '/'.
    first_city = parts.str[0].str.split('/').str[0]
    # Text after the last comma, without surrounding whitespace.
    state = parts.str[-1].str.strip()
    return first_city + ',' + state

In [86]:
# Overwrite origin_city_name with the value returned by to_city_state.
df['origin_city_name'] = to_city_state(df['origin_city_name'])

In [90]:
def request_weather(df, timeout=30):
    '''
    Request weather information from the Visual Crossing timeline API,
    one HTTP request per row of `df`.

    input: a data frame in which
        - first column: time
        - second column: location
        - third column: date
      timeout: seconds to wait for each HTTP response (default 30)
    output: a dictionary {'conditions': [...]} with exactly one entry per row
        of `df`, in row order. 'NA' marks a row whose request failed, returned
        a non-200 status, or whose response did not contain a condition.
    '''
    unit_group = "us"
    api_key = cfg.visual_crossing['api_key']
    requestUrl = 'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline'
    weather_dict = { 'conditions': [] }
    for count, row in enumerate(df.values, start=1):
        print('Query Count: ', count)
        time = row[0]
        loc = row[1]
        date = row[2]
        url = requestUrl + '/' + loc + '/' + date + 'T' + time
        # Query arguments go through `params` so requests URL-encodes them.
        query = {'key': api_key, 'unitGroup': unit_group}

        try:
            res = requests.get(url, params=query, timeout=timeout)
        except requests.RequestException as err:
            # A network failure on one row must not discard earlier results.
            print('Error: ', err)
            condition = 'NA'
        else:
            if res.status_code == 200:
                try:
                    condition = res.json()['days'][0]['conditions']
                except (ValueError, KeyError, IndexError, TypeError):
                    # Body was not JSON or lacked the expected structure.
                    condition = 'NA'
            else:
                print('Error: ', res.status_code)
                condition = 'NA'

        # Append on every path so the list stays aligned with the rows of df.
        weather_dict['conditions'].append(condition)
    return weather_dict

In [29]:
def to_dummies(df, col_array):
    '''
    One-hot encode the columns listed in `col_array` with pd.get_dummies.
    The first level of each encoded column is dropped (drop_first=True).
    input: df - data frame, col_array - list of column names to encode
    output: the data frame returned by pd.get_dummies
    '''
    encoded = pd.get_dummies(data=df, columns=col_array, drop_first=True)
    return encoded

In [55]:
def to_scale(df, col_array):
    '''
    Standardize the columns in `col_array` with a StandardScaler.
    The columns of `df` are overwritten in place.

    input: df - data frame, col_array - list of numeric column names
    output: the fitted StandardScaler, so the same scaling can later be
        undone with inverse_scale(df, col_array, sc)
    '''
    sc = StandardScaler()
    # Assign the transformed array directly. Wrapping it in pd.DataFrame
    # would give it a fresh 0..n-1 index, and the assignment would then align
    # on that index and misplace rows whenever df has a different index.
    df[col_array] = sc.fit_transform(df[col_array])
    return sc

In [56]:
def inverse_scale(df, col_array, sc):
    '''
    Undo a scaling of the columns in `col_array`.
    The columns of `df` are overwritten in place.

    input: df - data frame, col_array - list of scaled column names,
        sc - a fitted scaler object providing `inverse_transform`
    output: None
    '''
    # Assign the array directly. Wrapping it in pd.DataFrame would give it a
    # fresh 0..n-1 index, and the assignment would then align on that index
    # and misplace rows whenever df has a different index.
    df[col_array] = sc.inverse_transform(df[col_array])

In [57]:
# Columns passed to to_scale further down.
col_array = ['mkt_carrier_fl_num', 'op_carrier_fl_num']

In [58]:
# Show the column names currently present in `df`.
df.columns

Index(['index', 'fl_date', 'mkt_unique_carrier', 'branded_code_share',
       'mkt_carrier', 'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime', 'no_name'],
      dtype='object')

In [59]:
# Scale the columns named in `col_array`; the call overwrites those columns of `df`.
to_scale(df, col_array)

In [60]:
# Display `df` to inspect the columns after scaling.
df

Unnamed: 0,index,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,244649,2019-01-07,AA,AA,AA,-1.303816,AA,N161AA,-1.303748,12892,...,1744.0,,,,,,,,,
1,190528,2019-01-04,UA,UA_CODESHARE,UA,0.498127,ZW,N437AW,0.497896,13930,...,122.0,,,,,,,,,
2,50595,2018-12-29,WN,WN,WN,1.525329,WN,N7738A,1.524927,13871,...,1037.0,,,,,,,,,
3,116214,2019-01-01,WN,WN,WN,-0.631112,WN,N423WN,-0.631155,15304,...,488.0,,,,,,,,,
4,134474,2019-01-02,UA,UA_CODESHARE,UA,0.732179,EV,N14558,0.731910,11618,...,199.0,0.0,0.0,22.0,0.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,118168,2019-01-01,DL,DL,DL,-0.351301,DL,N535US,-0.351390,12892,...,1535.0,,,,,,,,,
4996,306574,2019-01-10,G4,G4,G4,-0.648469,G4,253NV,-0.648509,14082,...,963.0,,,,,,,,,
4997,303601,2019-01-10,DL,DL_CODESHARE,DL,0.235672,9E,N294PQ,0.235485,11003,...,694.0,,,,,,,,,
4998,346537,2019-01-12,AA,AA,AA,-0.016789,AA,N338RS,-0.016935,12953,...,1096.0,,,,,,,,,
