## Predict Test Data
After running a variety of models, the best one has been selected and will be used to predict the flight delays of the flights one week in advance. (From the past time for the first week of January) 


In [13]:
import numpy as np
import pandas as pd

import modules.help_functions as hf

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold

# from sklearn.linear_model import LinearRegression


In [50]:
# Load and form dataframes

df_training = pd.read_csv('../data/flights.csv')
df_dep_delays = hf.get_avg_dep_delay(df_training, ['dep_delay'])
df_dest_delays = hf.get_avg_dest_delay(df_training, ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay'])

df = pd.read_csv('../data/flights_test.csv')

In [51]:
# encode arrival and departure hours

# Split crs_arr_time and crs_dep_time into hour of day (local)
df = hf.split_time_of_day_departure(df)
df = hf.split_time_of_day_arrival(df)
df.drop(columns=['crs_dep_time', 'crs_arr_time'], inplace=True)

# encode hour of day departure
f = lambda x: str(int(np.floor(x)))

df['dep_hour'] = df['dep_hour'].apply(f)
df['arr_hour'] = df['arr_hour'].apply(f)
df.rename({'dep_hour_enc': 'dep_hour', 'arr_hour_enc': 'arr_hour'})
df = hf.encode_and_bind(df, 'dep_hour')
df = hf.encode_and_bind(df, 'arr_hour')

In [53]:
# encode weekday

# Convert fl_date into day of week  
df = hf.add_weekday(df)
#     df.drop(columns=['fl_date'], inplace=True) # drop at end
df = hf.make_categorical(df, ['weekday'])
df = hf.encode_and_bind(df, 'weekday')
df.drop(columns=['weekday'], inplace=True)
    

In [55]:
#  add  average delays

# Add average delays (weather, carrier, NAS, late aircraft)
df = df.merge(df_dest_delays, on='dest', how='left')
df = df.merge(df_dep_delays, on='origin', how='left')


In [56]:
#  encode top states (for number of flights)

# Split city and state 
hf.split_origin_city_state(df)
hf.split_dest_city_state(df)
df.drop(columns=['dest_city_name', 'origin_city_name'], inplace=True)

In [57]:
# Encode top 10 cities in terms of traffic 

city_list = ['Chicago','Atlanta','New York','Dallas/Fort Worth','Denver','Charlotte','Houston','Washington','Los Angeles','Seattle']    
df.dest_city = np.where(df.dest_city.isin(city_list),df.dest_city, '0')
df.origin_city = np.where(df.origin_city.isin(city_list),df.origin_city, '0')
df = hf.encode_and_bind(df, 'origin_city')
df = hf.encode_and_bind(df, 'dest_city')
df.drop(columns=['dest_city', 'origin_city'], inplace=True)

In [58]:
# encode top 20 airport codes - in terms of number of flights

top20_airport_code = ['LAX', 'ORD', 'EWR', 'SFO', 'LGA', 'DFW', 'LAS', 'CLT', 'DEN',
                  'PHL', 'IAH', 'SEA', 'ATL', 'PHX', 'MCO', 'DTW', 'SLC', 'BOS',
                  'JFK', 'MSP']
df.dest = np.where(df.dest.isin(top20_airport_code),df.dest, '0')
df.origin = np.where(df.origin.isin(top20_airport_code),df.origin, '0')
df = hf.encode_and_bind(df, 'dest')
df = hf.encode_and_bind(df, 'origin')

In [59]:
# State - Encode (based on # flights)

state_list = ['CA','TX', 'FL', 'IL', 'NY', 'GA', 'NC', 'CO', 'PA', 'WA']
df.dest_state = np.where(df.dest_state.isin(state_list),df.dest_state, '0')
df.origin_state = np.where(df.origin_state.isin(state_list),df.origin_state, '0')
df = hf.encode_and_bind(df, 'origin_state')
df = hf.encode_and_bind(df, 'dest_state')
df.drop(columns=['dest_state', 'origin_state'], inplace=True)

In [60]:
# Convert Airline Carrier - Encode 

df = hf.encode_and_bind(df, 'mkt_unique_carrier')
df.drop(columns = ['mkt_unique_carrier'], inplace=True)

In [61]:
# Origin Airport Busyness - Encode top 10 (rest in 'other') OR BIN according to passenger or flight volume
df = hf.make_col_value_bins(df, 'origin', 'origin_airport_fl_amt_bin', 7) 

# Dest Airport Busyness- Encode top 10 or bin according to passenger of flight volume 
df = hf.make_col_value_bins(df, 'dest', 'dest_airport_fl_amt_bin', 7) 

In [64]:
# Flight number

df_fl_num_delay = hf.fl_arr_delay(df_training)
df = df.merge(df_fl_num_delay, on="mkt_carrier_fl_num", how='left')

# deal with nulls
arr_delay_all_flights_median = -8.0 # this is median of whole sample
df.fl_num_delay.fillna(arr_delay_all_flights_median, inplace=True)
df.drop(columns = ['mkt_carrier_fl_num'], inplace=True)


In [65]:
# scheduled flight time - encode log of crs_elapsed 

df['log_crs_elapsed_time'] = np.log(df.crs_elapsed_time.abs())
df = hf.make_bin_column(df, 'log_crs_elapsed_time', 20) 
df = hf.make_categorical(df, ['log_crs_elapsed_time_bin'])
df = hf.encode_and_bind(df, 'log_crs_elapsed_time_bin')
df.drop(columns = ['crs_elapsed_time','log_crs_elapsed_time', 'log_crs_elapsed_time_bin' ], inplace=True)

In [66]:

# Drop rest that (that didn't plan to use)
df.drop(columns=['branded_code_share', 'mkt_carrier','op_unique_carrier', 'tail_num', 'op_carrier_fl_num',
                  'origin_airport_id', 'dest_airport_id', 'dep_hour','arr_hour', 'dup', 'flights', 'fl_date'], errors='ignore', inplace = True)


# Drop features from LDA 
df.drop(columns=['origin_airport_fl_amt_bin', 
            'dest_airport_fl_amt_bin', 
            'dest_0', 
#             'weekday_0', 'weekday_1', 'weekday_2', 
#             'weekday_3', 'weekday_4', 
#             'weekday_5', 'weekday_6', 
            'origin_city_0', 
            'origin_state_0','origin_state_PA', 'dest_state_PA'], errors='ignore', inplace = True)



In [67]:
def scale(df):
#     scaler = MinMaxScaler()
    scaler = StandardScaler()

    # apply scaler() to all the numeric columns 
    numeric_vars = ['avg_carrier_delay', 
                     'avg_weather_delay', 
                     'avg_nas_delay', 
                     'avg_security_delay', 
                     'avg_late_aircraft_delay', 'avg_dep_delay', 'distance', 'fl_num_delay'
                     ]
    df[numeric_vars] = scaler.fit_transform(df[numeric_vars])
    return df

In [68]:
def remove_highly_correlated_features(df, correlation_threshold=0.8):
    """ Any pair above the correlation threshold, one feature will be removed """
    
    df_corr = df.corr().abs()

    indices = np.where(df_corr > correlation_threshold)
    indices = [(df_corr.index[x], df_corr.columns[y])
    for x, y in zip(*indices)
        if x != y and x < y]

    for idx in indices: #each pair
        try:
            df.drop(idx[1], axis = 1, inplace=True)
        except KeyError:
            pass
    return df


In [69]:
df.columns

Index(['distance', 'dep_hour_1', 'dep_hour_10', 'dep_hour_11', 'dep_hour_12',
       'dep_hour_13', 'dep_hour_14', 'dep_hour_15', 'dep_hour_16',
       'dep_hour_17',
       ...
       'log_crs_elapsed_time_bin_11', 'log_crs_elapsed_time_bin_12',
       'log_crs_elapsed_time_bin_13', 'log_crs_elapsed_time_bin_14',
       'log_crs_elapsed_time_bin_15', 'log_crs_elapsed_time_bin_16',
       'log_crs_elapsed_time_bin_17', 'log_crs_elapsed_time_bin_18',
       'log_crs_elapsed_time_bin_19', 'log_crs_elapsed_time_bin_20'],
      dtype='object', length=169)

In [70]:
df = scale(df)
df = remove_highly_correlated_features(df)

In [72]:
# Save to data folder
df.to_csv("../data/testing_test_data_v2.csv", index=False)

In [71]:
df.columns

Index(['distance', 'dep_hour_1', 'dep_hour_10', 'dep_hour_11', 'dep_hour_12',
       'dep_hour_13', 'dep_hour_14', 'dep_hour_15', 'dep_hour_16',
       'dep_hour_17',
       ...
       'log_crs_elapsed_time_bin_11', 'log_crs_elapsed_time_bin_12',
       'log_crs_elapsed_time_bin_13', 'log_crs_elapsed_time_bin_14',
       'log_crs_elapsed_time_bin_15', 'log_crs_elapsed_time_bin_16',
       'log_crs_elapsed_time_bin_17', 'log_crs_elapsed_time_bin_18',
       'log_crs_elapsed_time_bin_19', 'log_crs_elapsed_time_bin_20'],
      dtype='object', length=141)

#### Fit Model

In [73]:
X_test = df.to_numpy()

# Load model from pickle file

# Save pickle file
# model = lr_baseline
# filename = '../model/linear_regression_all_features_except_neg_target.pkl'
# pickle.dump(model, open(filename, 'wb'))

predicted_delay = model.predict(X_test)

# ***MERGE y_pred to original test_df, drop all the non-delayed flights?

df_pred = df[['fl_date', 'mkt_carrier', 'mkt_carrier_fl_num', 'origin', 'dest', 'predicted_delay']]

# save to file 
# Save to data folder
df_pred.to_csv("../data/submission.csv", index=False)

NameError: name 'model' is not defined

In [None]:
 r2: 0.07588064956536089
 MSE: 271.8742627386297
 MAE: 12.848907195090016 
    
    r2: 0.08013667354836995
 MSE: 270.622148082623
 MAE: 12.81701225015653

In [None]:
# Predict test delays

df_t = pd.read_csv('../data/testing_test_data_v2.csv')
df_t.drop(columns=['origin_state_PA', 'dest_state_PA'], inplace=True, errors='ignore')
test_data = df_t.to_numpy()
y_pred = xgb_reg.predict(test_data)
df_flights_test = pd.read_csv('../data/flights_test.csv')
s_pred = pd.Series(y_pred, name='predicted_delay')
df_results = pd.concat([df_flights_test, s_pred], axis=1)
df_results.to_csv('../data/submission.csv', index=False)