In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from sklearn.preprocessing import StandardScaler
import config as cfg
import modules.help_functions as hf

In [4]:
df = pd.read_csv('../data/flights_samp.csv')

In [18]:
def split_numeric_categorical(df, numeric=True):
    '''    
    Return either numerical columns, or categorical columns in a data frame
    numeric: default True, return dataframe of all numerical columns    
    input: a data frame
    output: a data frame    
    '''
    numeric_data = df.select_dtypes(include=[np.number])
    categorical_data = df.select_dtypes(include='category')
    if numeric:
        return numeric_data
    else:
        return categorical_data

In [82]:
def make_categorical(df, cols):
    '''
    Convert columns in `cols` to type `categorical`
    input: df - data frame, cols - a list of columns
    output: a copy of the data frame with the converted columns
    '''
    col_dict = {col:'category' for col in cols}
    df = df.astype(col_dict)
    return df

In [None]:
def to_hhmmss(df_time_col):
    '''
    Change the format of the time in hhmm
    to hh:mm:ss, where ss is 00 in this case
    Input: `df_time_col`: a Pandas Series
    Return: a Pandas Series
    '''
    
    hhmm = []
    crs_hm = df_time_col.astype('str')
    for t in crs_hm:
        if len(t) == 1:
            hhmm.append('0' + t + ':00:00')
        elif (len(t) == 2) & (t < '24'):
            hhmm.append(t + ':00:00')
        elif (len(t) == 2) & (t > '24'):
            hhmm.append('00:' + t + ':00')
        elif len(t) == 3:
            hhmm.append('0' + t[0] + ':' + t[1:] + ':00')
        else:
            hhmm.append(t[:2] + ':' + t[2:] + ":00")
    return hhmm

In [None]:
def to_city_state(city_state_col):
    '''
    Change the format of `city1/city2, state`
    to `city1, state` 
    Input: A Pandas Series
    Output: A Pandas Series
    '''
    return pd.Series(map(lambda x: x[0].split('/')[0] + ',' + x[-1].strip() ,
                    city_state_col.str.split(',')))

In [83]:
def to_hhmmss(df_time_col):
    '''
    Change the format of the time in hhmm
    to hh:mm:ss, where ss is 00 in this case
    Input: `df_time_col`: a Pandas Series
    Return: a Pandas Series
    '''
    
    hhmm = []
    crs_hm = df_time_col.astype('str')
    for t in crs_hm:
        if len(t) == 1:
            hhmm.append('0' + t + ':00:00')
        elif (len(t) == 2) & (t < '24'):
            hhmm.append(t + ':00:00')
        elif (len(t) == 2) & (t > '24'):
            hhmm.append('00:' + t + ':00')
        elif len(t) == 3:
            hhmm.append('0' + t[0] + ':' + t[1:] + ':00')
        else:
            hhmm.append(t[:2] + ':' + t[2:] + ":00")
    return hhmm

In [None]:
def merge_df_dict(df_time_ll, dic):
    '''
    Make a new dataframe with weather column,
    then merge the new dataframe with original dataframe
    input: df_time_ll, the dataframe where the `dic` is going to be concatenated on
    return: The joined version of original dataframe
    '''
    df_time_ll = pd.concat([df_time_ll, pd.DataFrame.from_dict(dic)], axis=1)
    df = df.merge(df_time_ll, on=['fl_date', 'll'], how='left')
    return df

In [29]:
def to_dummies(df, col_array):
    '''
    change the given columns into dummy variables
    and return the dataframe
    '''
    return pd.get_dummies(df, columns= col_array, drop_first=True)

In [55]:
def to_scale(df, col_array):
    '''
    scale the numeric variables to center around 0
    and standard deviation of 1
    '''
    sc = StandardScaler()
    df[col_array] = pd.DataFrame(sc.fit_transform(df[col_array]))   

In [56]:
def inverse_scale(df, col_array, sc):
    '''
    inverse scale back to the original numeric values
    '''
    df[col_array] = pd.DataFrame(sc.inverse_transform(df[col_array]))

In [70]:
def make_qbin_column(df, col_name, n_bin_range):
    '''
    Convert a numeric column to a ordinal column based on quantiles.
    Assumption: the column that is going to be binned
    must be positive numeric numbers
    input:
      - df: data frame
      - col_name: column in string
      - q_list: a list of quantiles to be binned, 
        eg., [0, 0.25, 0.50, 0.75, 1] for 4-quantiles
    return: a data frame with the newly binned column  
    '''
    # make bins and bin labels
    bin_names = list(range(1, len(q_list)))
    
    # perform the binning
    new_col_name = col_name + '_bin'
    df[new_col_name] = pd.qcut(np.array(df[col_name]), 
                               q=n_bin_range,
                               labels=bin_names, 
                               duplicates='drop')
    return df


In [81]:
def make_col_value_qbins(df, col_name, new_col_bin_name, n_bin_range):
    type_count = df[col_name].value_counts()
    df1 = pd.DataFrame(type_count).reset_index()
    df1 = df1.rename(columns={'index': 'bin_on', col_name: 'count'})       
    df1 = make_qbin_column(df1, 'count', n_bin_range)
    df = df.merge(df1, left_on=col_name, right_on='bin_on', how='left')
    df = df.rename(columns ={'count_bin': new_col_bin_name})
    df.drop(columns=['count', col_name], inplace=True)
    return df

In [83]:
pd.qcut(df1['count'], q=6, duplicates='drop')

0       (28.0, 227.0]
1       (28.0, 227.0]
2       (28.0, 227.0]
3       (28.0, 227.0]
4       (28.0, 227.0]
            ...      
271    (0.999, 2.667]
272    (0.999, 2.667]
273    (0.999, 2.667]
274    (0.999, 2.667]
275    (0.999, 2.667]
Name: count, Length: 276, dtype: category
Categories (5, interval[float64, right]): [(0.999, 2.667] < (2.667, 4.0] < (4.0, 9.0] < (9.0, 28.0] < (28.0, 227.0]]

In [21]:
def get_avg_dep_delay(df, col_list):
    df.loc[:, col_list] = df.loc[:, col_list].fillna(0)
    df_avg_delay = pd.DataFrame(df.dest.unique(), columns=['origin'])
    for col in col_list:
        s = df.groupby('origin')[col].mean()
        s.name = 'avg_' + col
        df_avg_delay = df_avg_delay.merge(s.to_frame(), on='origin', how='left')
    return df_avg_delay

In [22]:
df.columns

Index(['index', 'fl_date', 'mkt_unique_carrier', 'branded_code_share',
       'mkt_carrier', 'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime', 'no_name'],
      dtype='object')

In [23]:
get_avg_dep_delay(df, ['dep_delay'])

Unnamed: 0,origin,avg_dep_delay
0,ORD,10.775120
1,AZO,4.000000
2,PHX,5.223214
3,MSY,1.952381
4,DCA,10.344828
...,...,...
274,ABY,
275,PGV,-10.000000
276,PQI,
277,AVP,-3.333333
