### Read in .csv files to begin removing/adding any model-specific columns

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [2]:
# Function to calculate VIF
def calculate_vif(data):
    import statsmodels.api as sm
    vif_df = pd.DataFrame(columns = ['Var', 'Vif'])
    x_var_names = data.columns
    for i in range(0, x_var_names.shape[0]):
        y = data[x_var_names[i]]
        x = data[x_var_names.drop([x_var_names[i]])]
        r_squared = sm.OLS(y,x).fit().rsquared
        vif = round(1/(1-r_squared),2)
        vif_df.loc[i] = [x_var_names[i], vif]
    return vif_df.sort_values(by = 'Vif', axis = 0, ascending=False, inplace=False)

In [3]:
flights = pd.read_csv('flights_.csv', index_col=0) # the latest flight data, with feature_engineered columns
flights_test = pd.read_csv('test_flights_.csv', index_col=0) # the first 7 days of january 2020, with feature_engineered columns - more feature engineering will take place in this notebook
#passengers = pd.read_csv('passengers_jan-dec_18-19.csv') # the latest passenger data

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
flights.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,crs_elapsed_time,actual_elapsed_time,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,day_of_the_week,origin_wind_speed,origin_visibility,origin_conditions,dest_wind_speed,dest_visibility,dest_conditions,holiday,year,month,scheduled_flight_hour_of_day
60659,2018-01-01,UA,UA,UA,745,UA,N848UA,745,12953,LGA,new york,11292,DEN,denver,1453,1440.0,-13.0,15.0,1455.0,1643.0,7.0,1726,1650.0,-36.0,0.0,,0.0,273.0,250.0,228.0,1620.0,,,,,,,,,Monday,14.8,9.9,Clear,12.3,9.1,Partially cloudy,True,2018,1,14
23125,2018-01-01,UA,UA_CODESHARE,UA,3582,YX,N649RW,3582,11298,DFW,dallas/fort worth,11292,DEN,denver,2016,2005.0,-11.0,13.0,2018.0,2102.0,7.0,2130,2109.0,-21.0,0.0,,0.0,134.0,124.0,104.0,641.0,,,,,,,,,Monday,15.0,9.9,Partially cloudy,12.3,9.1,Partially cloudy,True,2018,1,20
3646,2018-01-01,UA,UA,UA,1591,UA,N87527,1591,11298,DFW,dallas/fort worth,12892,LAX,los angeles,545,539.0,-6.0,17.0,556.0,656.0,24.0,734,720.0,-14.0,0.0,,0.0,229.0,221.0,180.0,1235.0,,,,,,,,,Monday,15.0,9.9,Partially cloudy,5.6,4.2,Clear,True,2018,1,5
73558,2018-01-01,WN,WN,WN,5500,WN,N8685B,5500,11292,DEN,denver,10397,ATL,atlanta,1820,1818.0,-2.0,9.0,1827.0,2239.0,8.0,2310,2247.0,-23.0,0.0,,0.0,170.0,149.0,132.0,1199.0,,,,,,,,,Monday,12.3,9.1,Partially cloudy,14.6,9.9,Partially cloudy,True,2018,1,18
77601,2018-01-01,WN,WN,WN,415,WN,N8690A,415,12892,LAX,los angeles,13232,MDW,chicago,1225,1226.0,1.0,7.0,1233.0,1758.0,4.0,1820,1802.0,-18.0,0.0,,0.0,235.0,216.0,205.0,1750.0,,,,,,,,,Monday,5.6,4.2,Clear,12.7,9.9,Clear,True,2018,1,12


### One-Hot encoding for:
- weather
- origin_city_name and dest_city_name
- day_of_week

In [5]:
flights.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'distance', 'carrier_delay',
       'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay',
       'first_dep_time', 'total_add_gtime', 'longest_add_gtime',
       'day_of_the_week', 'origin_wind_speed', 'origin_visibility',
       'origin_conditions', 'dest_wind_speed', 'dest_visibility',
       'dest_conditions', 'holiday', 'year', 'month',
       'scheduled_flight_hour_of_day'],
      dtype='object')

### Key model columns

In [6]:
flights['holiday'] = flights['holiday'].astype(int)
    
dest_conditions = pd.get_dummies(flights['dest_conditions'], prefix='dest')
origin_conditions = pd.get_dummies(flights['origin_conditions'], prefix='origin')
   
origin_airport_id = pd.get_dummies(flights['origin_airport_id'], prefix='origin')
dest_airport_id = pd.get_dummies(flights['dest_airport_id'], prefix='dest')
    
day_of_week = pd.get_dummies(flights['day_of_the_week'])
df_flights = pd.concat((
    day_of_week, 
    origin_airport_id, 
    origin_conditions, 
    flights['origin_visibility'], flights['origin_wind_speed'], 
    dest_airport_id, 
    dest_conditions, 
    flights['dest_visibility'], flights['dest_wind_speed'], 
    flights['holiday'], 
    flights['scheduled_flight_hour_of_day'], 
    flights['distance'],
    flights['arr_delay'] # target variable        
), axis=1)

df_flights = df_flights.dropna()

In [7]:
df_flights.shape

(204676, 61)

In [8]:
# flights['holiday'] = flights['holiday'].astype(int)

# dest_conditions = pd.get_dummies(flights['dest_conditions'], prefix='dest')
# origin_conditions = pd.get_dummies(flights['origin_conditions'], prefix='origin')

# origin_airport_id = pd.get_dummies(flights['origin_airport_id'], prefix='origin')
# dest_airport_id = pd.get_dummies(flights['dest_airport_id'], prefix='dest')

# day_of_week = pd.get_dummies(flights['day_of_the_week'])

# df_flights = pd.concat((
#     day_of_week, 
#     origin_airport_id, 
#     origin_conditions, 
#     flights['origin_visibility'], flights['origin_wind_speed'], 
#     dest_airport_id, 
#     dest_conditions, 
#     flights['dest_visibility'], flights['dest_wind_speed'], 
#     flights['holiday'], 
#     flights['scheduled_flight_hour_of_day'], 
#     flights['distance'],
#     flights['arr_delay'] # target variable
# ), axis=1)

# df_flights = df_flights.dropna()

In [9]:
# normalize arr_delay, 
# remove some weather columns, 
# change origin_city_name to airport_id, 
# add in hour of day, 
# remove year 
# remove month
# add distance
#
#

In [10]:
# the top 100 values in calue_counts sums up to roughly 97% of all data, which is 3 standard deviations worth
# drop anything else

# this change did NOT help the model, it made it significantly worse
#flights = flights.drop(flights[flights.arr_delay < 82].index)

# Linear Regression
### Model-Specific

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample,shuffle
from sklearn import preprocessing

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score

import xgboost as xgb
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor

In [12]:
X = df_flights.drop(columns=['arr_delay'])
y = df_flights['arr_delay']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

### Baseline model

In [13]:
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression()

In [14]:
y_preds = clf.predict(X_test)

In [15]:
clf.score(X_test, y_test)

0.04587233022949233

In [16]:
# can only go up from here

In [17]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear',  random_state = 101)

In [18]:
xg_reg.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=16, num_parallel_tree=1,
             objective='reg:linear', random_state=101, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [19]:
y_preds = xg_reg.predict(X_test)

In [20]:
xg_reg.score(X_test, y_test)

0.16338748480016008

In [21]:
rmse = np.sqrt(mean_squared_error(y_test, y_preds))
print("RMSE: %f" % (rmse))

RMSE: 44.721290


### Some tweaking

In [22]:
def run_xg_reg():
    '''
    This function calls the (currently) best XGB model to generate results.
    This function helps speed up the tuning process.
    '''
    
    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.3, max_depth = 6, reg_alpha = 0.0001, n_estimators = 100, reg_lambda = 5, random_state = 101)
    xg_reg.fit(X_train, y_train)
    y_preds = xg_reg.predict(X_test)
    print(f'R^2: {xg_reg.score(X_test, y_test)}')
    rmse = np.sqrt(mean_squared_error(y_test, y_preds))
    print("RMSE: %f" % (rmse))
    return y_preds
y_preds = run_xg_reg()

R^2: 0.18396905433901267
RMSE: 44.167768


In [23]:
#calculate_vif(df_flights)

### Dealing with multicollinearity:
https://www.analyticsvidhya.com/blog/2020/03/one-hot-encoding-vs-label-encoding-using-scikit-learn/

- Dropping one of the one-hot labels to try and avoid "inf." for variance inflation 

In [24]:
columns_to_drop = [
    'Sunday', 
    'origin_10397', 
    'dest_10397', 
    'dest_Clear', 
    'origin_Clear' 
]

In [25]:
df_flights = df_flights.drop(columns=columns_to_drop)

In [26]:
#calculate_vif(df_flights)

In [27]:
X = df_flights.drop(columns=['arr_delay'])
y = df_flights['arr_delay']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

In [28]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.3, max_depth = 6, reg_alpha = 0.0001, n_estimators = 100, reg_lambda = 5, random_state = 101)

In [29]:
xg_reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.3, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=16, num_parallel_tree=1, random_state=101,
             reg_alpha=0.0001, reg_lambda=5, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [30]:
y_preds = xg_reg.predict(X_test)

In [31]:
print(f'R^2: {xg_reg.score(X_test, y_test)}')

R^2: 0.17516517635140028


In [32]:
rmse = np.sqrt(mean_squared_error(y_test, y_preds))
print("RMSE: %f" % (rmse))

RMSE: 44.405385


### Let's try dropping some other columns instead...

In [33]:
df_flights = pd.concat((
    day_of_week, 
    origin_airport_id, 
    origin_conditions, 
    flights['origin_visibility'], flights['origin_wind_speed'], 
    dest_airport_id, 
    dest_conditions, 
    flights['dest_visibility'], flights['dest_wind_speed'], 
    flights['holiday'], 
    flights['scheduled_flight_hour_of_day'], 
    flights['distance'],
    flights['arr_delay'] # target variable
), axis=1)

df_flights = df_flights.dropna()

In [34]:
columns_to_drop = [
    'Tuesday', 
    'origin_11057', 
    'dest_11057', 
    'origin_Partially cloudy', 
    'dest_Partially cloudy' 
]

In [35]:
df_flights = df_flights.drop(columns=columns_to_drop)

In [36]:
df_flights.shape

(204676, 56)

In [37]:
#calculate_vif(df_flights)

In [38]:
X = df_flights.drop(columns=['arr_delay'])
y = df_flights['arr_delay']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

In [39]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.3, max_depth = 6, reg_alpha = 0.0001, n_estimators = 100, reg_lambda = 5, random_state = 101)
xg_reg.fit(X_train, y_train)
y_preds = xg_reg.predict(X_test)
print(f'R^2: {xg_reg.score(X_test, y_test)}')
rmse = np.sqrt(mean_squared_error(y_test, y_preds))
print("RMSE: %f" % (rmse))

R^2: 0.18411488307938428
RMSE: 44.163822


# Predict the test set

In [40]:
flights_test.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,origin_wind_speed,origin_visibility,origin_conditions,dest_wind_speed,dest_visibility,dest_conditions,holiday,day_of_the_week,year,month,scheduled_flight_hour_of_day
0,2020-01-01,WN,WN,WN,3947,WN,N8580Z,3947,14107,PHX,phoenix,10397,ATL,atlanta,815,1350,N,215,1,1587,8.2,9.9,Partially cloudy,12.0,9.9,Clear,True,Wednesday,2020,1,8
1037,2020-01-01,AA,AA,AA,1678,AA,N274AY,1678,11057,CLT,charlotte,14107,PHX,phoenix,733,1024,N,291,1,1773,8.6,9.9,Partially cloudy,8.2,9.9,Partially cloudy,True,Wednesday,2020,1,7
1036,2020-01-01,AA,AA,AA,1676,AA,N991AN,1676,13930,ORD,chicago,11057,CLT,charlotte,1158,1455,N,117,1,599,19.6,9.9,"Snow, Partially cloudy",8.6,9.9,Partially cloudy,True,Wednesday,2020,1,11
1035,2020-01-01,AA,AA,AA,1674,AA,N982VJ,1674,11057,CLT,charlotte,12892,LAX,los angeles,801,1040,N,339,1,2125,8.6,9.9,Partially cloudy,7.5,9.5,Clear,True,Wednesday,2020,1,8
1034,2020-01-01,AA,AA,AA,1668,AA,N901NN,1668,11057,CLT,charlotte,13930,ORD,chicago,724,842,N,138,1,599,8.6,9.9,Partially cloudy,19.6,9.9,"Snow, Partially cloudy",True,Wednesday,2020,1,7


In [41]:
flights_test['holiday'] = flights_test['holiday'].astype(int)
    
dest_conditions = pd.get_dummies(flights_test['dest_conditions'], prefix='dest')
origin_conditions = pd.get_dummies(flights_test['origin_conditions'], prefix='origin')
   
origin_airport_id = pd.get_dummies(flights_test['origin_airport_id'], prefix='origin')
dest_airport_id = pd.get_dummies(flights_test['dest_airport_id'], prefix='dest')
    
day_of_week = pd.get_dummies(flights_test['day_of_the_week'])
df_flights_test = pd.concat((
    day_of_week, 
    origin_airport_id, 
    origin_conditions, 
    flights_test['origin_visibility'], flights_test['origin_wind_speed'], 
    dest_airport_id, 
    dest_conditions, 
    flights_test['dest_visibility'], flights_test['dest_wind_speed'], 
    flights_test['holiday'], 
    flights_test['scheduled_flight_hour_of_day'], 
    flights_test['distance'],
    #flights_test['arr_delay'] # target variable - NOT IN TEST SET       
), axis=1)

df_flights_test = df_flights_test.dropna()

In [42]:
columns_to_drop = [
    'Tuesday', 
    'origin_11057', 
    'dest_11057', 
    'origin_Partially cloudy', 
    'dest_Partially cloudy' 
]

In [43]:
df_flights_test = df_flights_test.drop(columns=columns_to_drop)

In [44]:
# NOTE************************
# I comment this out because this step takes a while, 
# I believe there's a better way to implement this but for now this works
#

route_info = []
sep = '_'

# iterate over the entire dataframe, constructing the route_info
for i in range(len(flights_test[['origin_airport_id', 'dest_airport_id']])):
    route_info.append(str(flights_test.iloc[i].origin_airport_id) + sep + str(flights_test.iloc[i].dest_airport_id))

flights_test['route_info'] = pd.DataFrame(route_info)

In [45]:
# CREATE LIST OF ALL ROUTES (IN OUR DATASET)
route_info_list = []

for i in flights_test['route_info'].unique():
    route_info_list.append(i)

#print(route_info_list)
print(f'There are {len(route_info_list)} routes total.')

There are 156 routes total.


### Need to fix flights_test data to have consistent airports as original training set.

In [46]:
flights_test.origin_city_name.value_counts()

chicago              1579
new york             1462
atlanta              1339
los angeles          1277
houston              1135
washington           1125
denver               1101
dallas/fort worth    1050
phoenix               876
charlotte             795
Name: origin_city_name, dtype: int64

In [47]:
flights.origin_city_name.value_counts()

chicago              28872
new york             28729
atlanta              23996
los angeles          23132
washington           21214
houston              18727
dallas/fort worth    18451
denver               17935
charlotte            14050
seattle              12166
Name: origin_city_name, dtype: int64