## Encode Data To Model
This is a cumulation of our data analysis and feature engineering.
* Used to generate a .csv file that can be used directly to split for model testing.
* Data we gathered from sql is striped of extra columns and formated in the way that the testing data was given so that we can replicate the steps when given the testing data. 
* The various features were tweaked, added, one hot encoded, binned and dropped as a result of testing on a baseline linear regression model and feature analysis. 
* Lastly the encoded data is saved as a .csv and stored in the data folder.

In [231]:
import numpy as np
import pandas as pd

import modules.help_functions as hf

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold

from sklearn.linear_model import LinearRegression

import os

In [232]:
os.getcwd()

'/Users/lilakelland/Desktop/lighthouse_lab_midterm_project/src'

In [233]:
# read in files
df = pd.read_csv('../../data/flights.csv')
df_dep_delays = hf.get_avg_dep_delay(df, ['dep_delay'])
df_dest_delays = hf.get_avg_dest_delay(df, ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay'])

In [198]:
def change_format_to_test_data(df):
    """ Convert our testing data to be in the same format as the data to test (drop columns and reformat date)"""
    
    #convert date to datetime with 0's
    df.fl_date = (df.fl_date + ' 00:00:00')
    pd.to_datetime(df['fl_date'])
    
    #drop columns not present in test format
    df.drop(columns=['dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'arr_time', 'cancelled',
       'cancellation_code', 'diverted', 'actual_elapsed_time', 'air_time', 
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime','no_name'], inplace = True)
    return df


In [210]:
def add_feature_engineering(df):
    """Adds in columns for model fitting and converts to numeric/ encoded categorical for ML model"""
    
    # Split crs_arr_time and crs_dep_time into hour of day (local)
    df = hf.split_time_of_day_departure(df)
    df = hf.split_time_of_day_arrival(df)
    df.drop(columns=['crs_dep_time', 'crs_arr_time'], inplace=True)
    
    # encode hour of day departure
    f = lambda x: str(int(np.floor(x)))

    df['dep_hour_enc'] = df['dep_hour'].apply(f)
    df['arr_hour_enc'] = df['arr_hour'].apply(f)
    df.rename({'dep_hour_enc': 'dep_hour', 'arr_hour_enc': 'arr_hour'})
    df = hf.encode_and_bind(df, 'dep_hour')
    df = hf.encode_and_bind(df, 'arr_hour')
 
    # Convert fl_date into day of week  
    df = hf.add_weekday(df)
    df.drop(columns=['fl_date'], inplace=True)
    df = hf.encode_and_bind(df, 'weekday')
    
    # Add average delays (weather, carrier, NAS, departure)
    df = df.merge(df_dest_delays, on='dest', how='left')
    df = df.merge(df_dep_delays, on='origin', how='left')
    
    # Split city and state 
    hf.split_origin_city_state(df)
    hf.split_dest_city_state(df)
    df.drop(columns=['dest_city_name', 'origin_city_name'], inplace=True)
    
    # Encode top 10 cities in terms of traffic
    city_list = ['Chicago','Atlanta','New York','Dallas/Fort Worth','Denver','Charlotte','Houston','Washington','Los Angeles','Seattle']    
    df.dest_city = np.where(df.dest_city.isin(city_list),df.dest_city, '0')
    df.origin_city = np.where(df.origin_city.isin(city_list),df.origin_city, '0')
    df = hf.encode_and_bind(df, 'origin_city')
    df = hf.encode_and_bind(df, 'dest_city')
    df.drop(columns=['dest_city', 'origin_city'], inplace=True)
    
    #Top 20 airport codes - in terms of number of flights
    top20_airport_code = ['LAX', 'ORD', 'EWR', 'SFO', 'LGA', 'DFW', 'LAS', 'CLT', 'DEN',
                      'PHL', 'IAH', 'SEA', 'ATL', 'PHX', 'MCO', 'DTW', 'SLC', 'BOS',
                      'JFK', 'MSP']
    df.dest = np.where(df.dest.isin(top20_airport_code),df.dest, '0')
    df.origin = np.where(df.origin.isin(top20_airport_code),df.origin, '0')
    df = hf.encode_and_bind(df, 'dest')
    df = hf.encode_and_bind(df, 'origin')
    
    # Remove negative targets (arr_delay - set to zero)  # kept this in here as it had an interesting effect - both MAE and R2 decreased
#     df.arr_delay = np.where(df.arr_delay >0,df.arr_delay, 0)
    
    # State - Encode (based on # flights)
    state_list = ['CA','TX', 'FL', 'IL', 'NY', 'GA', 'NC', 'CO', 'PA', 'WA']
    df.dest_state = np.where(df.dest_state.isin(state_list),df.dest_state, '0')
    df.origin_state = np.where(df.origin_state.isin(state_list),df.origin_state, '0')
    df = hf.encode_and_bind(df, 'origin_state')
    df = hf.encode_and_bind(df, 'dest_state')
    df.drop(columns=['dest_state', 'origin_state'], inplace=True)
          
    # Convert Airline Carrier - Encode 
    df = hf.encode_and_bind(df, 'mkt_unique_carrier')
    df.drop(columns = ['mkt_unique_carrier'], inplace=True)

    # Origin Airport Bussiness - Encode top 10 (rest in 'other') OR BIN according to passenger or flight volume
    df = hf.make_col_value_bins(df, 'origin', 'origin_airport_fl_amt_bin', 7) 
    
    # Dest Airport Bussiness- Encode top 10 or bin according to passenger of flight volume 
    df = hf.make_col_value_bins(df, 'dest', 'dest_airport_fl_amt_bin', 7) 
   
    # Flight number - drop for now
    df.drop(columns = ['mkt_carrier_fl_num'], inplace=True)
    
    # crs_elapsed # USE LONG HAUL SHORT HAUL
    df['log_crs_elapsed_time'] = np.log(df.crs_elapsed_time)
    df = hf.make_bin_column(df, 'log_crs_elapsed_time', 20) 
    df = hf.make_categorical(df, ['log_crs_elapsed_time_bin'])
    df = hf.encode_and_bind(df, 'log_crs_elapsed_time_bin')
    df.drop(columns = ['crs_elapsed_time','log_crs_elapsed_time', 'log_crs_elapsed_time_bin' ], inplace=True)

    # Drop rest
    df.drop(columns=['branded_code_share', 'mkt_carrier','op_unique_carrier', 'tail_num', 'distance'
                     'op_carrier_fl_num', 'dep_hour', 'origin_airport_id', 'arr_hour','dest_airport_id',  'dup', 'flights'], errors='ignore', inplace = True)

    # Drop features from LDA 
    df.drop(columns=['origin_airport_fl_amt_bin', 
                'dest_airport_fl_amt_bin', 
                'dest_0', 
                'weekday_0', 'weekday_1', 'weekday_2', 
                'weekday_3', 'weekday_4', 
                'weekday_5', 'weekday_6', 
                'origin_city_0', 
                'origin_state_0'], errors='ignore', inplace = True)

    return df

In [211]:
def remove_na_outliers(df):
    """ read data, convert to the format the given test data is in and add / format columns per feature engineering """
    # remove non-landing flights
    df.dropna(subset=['arr_delay'], inplace=True)
    
    # remove outliers 
    cols = ['arr_delay'] 
    Q1 = df[cols].quantile(0.25)
    Q3 = df[cols].quantile(0.75)
    IQR = Q3 - Q1
    df = df[~((df[cols] < (Q1 - 1.5 * IQR)) |(df[cols] > (Q3 + 1.5 * IQR))).any(axis=1)]

   
    return df

In [212]:
def scale(df):
    # scaler = MinMaxScaler()
    scaler = StandardScaler()

    # apply scaler() to all the numeric columns 
    numeric_vars = ['avg_carrier_delay', 
                     'avg_weather_delay', 
                     'avg_nas_delay', 
                     'avg_security_delay', 
                     'avg_late_aircraft_delay', 'avg_dep_delay'
                     ]
    df[numeric_vars] = scaler.fit_transform(df[numeric_vars])
    return df

In [213]:
# FOUND THAT THIS HAD NEGATIVE IMPACT AND NO LONGER USING
#  def remove_small_variance(x, variance_threshold = 0.1):
#     # Assumptions - target variable removed, df is numeric

#     vt = VarianceThreshold(variance_threshold)
#     x_transformed = vt.fit_transform(x)
#     selected_columns = x.columns[vt.get_support()]
#     x_transformed = pd.DataFrame(x_transformed, columns = selected_columns)
#     return(x_transformed)

In [214]:
def remove_highly_correlated_features(df, correlation_threshold=0.8):
    """ Any pair above the correlation threshold, one feature will be removed """
    
    df_corr = df.corr().abs()

    indices = np.where(df_corr > correlation_threshold)
    indices = [(df_corr.index[x], df_corr.columns[y])
    for x, y in zip(*indices)
        if x != y and x < y]

    for idx in indices: #each pair
        try:
            df.drop(idx[1], axis = 1, inplace=True)
        except KeyError:
            pass
    return df


In [215]:
# Transform df to encoded training data
# df = pd.read_csv('../data/flights.csv')
# df_dep_delays = hf.get_avg_dep_delay(df, ['dep_delay'])
# df_dest_delays = hf.get_avg_dest_delay(df, ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay'])

df = change_format_to_test_data(df)
df = add_feature_engineering(df)
df = data_to_model_format(df)
df = remove_na_outliers(df)
df = scale(df)
df = remove_highly_correlated_features(df)


In [206]:
# Save to data folder
df.to_csv("../../data/encoded_training_data.csv")

#### Linear Regression

In [216]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=88)

lr_baseline = LinearRegression()
lr_baseline.fit(X_train, y_train)

# Save pickle file
# model = lr_baseline
# filename = '../model/linear_regression_all_features_except_neg_target.pkl'
# pickle.dump(model, open(filename, 'wb'))


y_pred = lr_baseline.predict(X_test)

r2_baseline = r2_score(y_test, y_pred)
MSE_baseline = mean_squared_error(y_test,y_pred) 
RMSE_baseline = mean_squared_error(y_test,y_pred,squared=False)
MAE_baseline = mean_absolute_error(y_test,y_pred)

print(f' r2: {r2_baseline}\n MSE: {MSE_baseline}\n MAE: {MAE_baseline}\n')

 r2: 0.07588064956536089
 MSE: 271.8742627386297
 MAE: 12.848907195090016



In [None]:
 r2: 0.08013667354836995
 MSE: 270.622148082623
 MAE: 12.81701225015653