### Read in .csv files to begin removing/adding any model-specific columns

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

pd.options.display.max_rows = 100
pd.options.display.max_columns = 100


In [2]:
# Function to calculate VIF
def calculate_vif(data):
    import statsmodels.api as sm
    vif_df = pd.DataFrame(columns = ['Var', 'Vif'])
    x_var_names = data.columns
    for i in range(0, x_var_names.shape[0]):
        y = data[x_var_names[i]]
        x = data[x_var_names.drop([x_var_names[i]])]
        r_squared = sm.OLS(y,x).fit().rsquared
        vif = round(1/(1-r_squared),2)
        vif_df.loc[i] = [x_var_names[i], vif]
    return vif_df.sort_values(by = 'Vif', axis = 0, ascending=False, inplace=False)

In [3]:
flights = pd.read_csv('delta_flights_.csv', index_col=0) # the latest flight data, with feature_engineered columns
flights_test = pd.read_csv('delta_flights_test.csv', index_col=0) # the first 7 days of january 2020, with feature_engineered columns - more feature engineering will take place in this notebook
#passengers = pd.read_csv('passengers_jan-dec_18-19.csv') # the latest passenger data

In [4]:
flights.head()

Unnamed: 0,fl_date,mkt_carrier,origin_airport_id,origin_city_name,dest_airport_id,dest_city_name,crs_dep_time,crs_arr_time,dep_time,arr_time,dep_delay,arr_delay,crs_elapsed_time,actual_elapsed_time,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,origin_wind_speed,origin_visibility,origin_conditions,dest_wind_speed,dest_visibility,dest_conditions,day_of_the_week,holiday,year,month,scheduled_flight_hour_of_day
305935,2018-01-01,DL,13930,chicago,10397,atlanta,705,1010,704.0,948.0,-1.0,-22.0,125.0,104.0,85.0,606.0,,,,,,12.7,9.9,Clear,14.6,9.9,Partially cloudy,Monday,True,2018,1,7
305763,2018-01-01,DL,12892,los angeles,11298,dallas/fort worth,950,1504,946.0,1438.0,-4.0,-26.0,194.0,172.0,146.0,1235.0,,,,,,5.6,4.4,Clear,15.0,9.9,Partially cloudy,Monday,True,2018,1,9
305762,2018-01-01,DL,11298,dallas/fort worth,12892,los angeles,705,845,702.0,823.0,-3.0,-22.0,220.0,201.0,171.0,1235.0,,,,,,15.0,9.9,Partially cloudy,5.6,4.4,Clear,Monday,True,2018,1,7
305761,2018-01-01,DL,14747,seattle,11292,denver,1355,1746,1355.0,1732.0,0.0,-14.0,171.0,157.0,126.0,1024.0,,,,,,7.0,9.8,Partially cloudy,12.3,9.1,Partially cloudy,Monday,True,2018,1,13
305760,2018-01-01,DL,14747,seattle,11292,denver,1906,2256,1930.0,2300.0,24.0,4.0,170.0,150.0,123.0,1024.0,,,,,,7.0,9.8,Partially cloudy,12.3,9.1,Partially cloudy,Monday,True,2018,1,19


In [5]:
flights_test.head()

Unnamed: 0,fl_date,mkt_carrier,origin_airport_id,origin_city_name,dest_airport_id,dest_city_name,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,origin_wind_speed,origin_visibility,origin_conditions,dest_wind_speed,dest_visibility,dest_conditions,day_of_the_week,holiday,year,month,scheduled_flight_hour_of_day
2883,1/1/2020,DL,12953,new york,11278,washington,1900,2022,82,214,15.1,9.9,Overcast,14.7,9.9,Partially cloudy,Wednesday,True,2020,1,19
2715,1/1/2020,DL,13232,chicago,10397,atlanta,600,901,121,591,19.6,9.9,"Snow, Partially cloudy",12.0,9.9,Clear,Wednesday,True,2020,1,6
2714,1/1/2020,DL,10397,atlanta,14747,seattle,1125,1400,335,2182,12.0,9.9,Clear,16.4,9.9,Overcast,Wednesday,True,2020,1,11
2713,1/1/2020,DL,10397,atlanta,12892,los angeles,1925,2120,295,1947,12.0,9.9,Clear,7.5,9.5,Clear,Wednesday,True,2020,1,19
2712,1/1/2020,DL,11278,washington,12478,new york,600,718,78,213,14.7,9.9,Partially cloudy,15.1,9.9,Overcast,Wednesday,True,2020,1,6


In [6]:
flights.head()

Unnamed: 0,fl_date,mkt_carrier,origin_airport_id,origin_city_name,dest_airport_id,dest_city_name,crs_dep_time,crs_arr_time,dep_time,arr_time,dep_delay,arr_delay,crs_elapsed_time,actual_elapsed_time,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,origin_wind_speed,origin_visibility,origin_conditions,dest_wind_speed,dest_visibility,dest_conditions,day_of_the_week,holiday,year,month,scheduled_flight_hour_of_day
305935,2018-01-01,DL,13930,chicago,10397,atlanta,705,1010,704.0,948.0,-1.0,-22.0,125.0,104.0,85.0,606.0,,,,,,12.7,9.9,Clear,14.6,9.9,Partially cloudy,Monday,True,2018,1,7
305763,2018-01-01,DL,12892,los angeles,11298,dallas/fort worth,950,1504,946.0,1438.0,-4.0,-26.0,194.0,172.0,146.0,1235.0,,,,,,5.6,4.4,Clear,15.0,9.9,Partially cloudy,Monday,True,2018,1,9
305762,2018-01-01,DL,11298,dallas/fort worth,12892,los angeles,705,845,702.0,823.0,-3.0,-22.0,220.0,201.0,171.0,1235.0,,,,,,15.0,9.9,Partially cloudy,5.6,4.4,Clear,Monday,True,2018,1,7
305761,2018-01-01,DL,14747,seattle,11292,denver,1355,1746,1355.0,1732.0,0.0,-14.0,171.0,157.0,126.0,1024.0,,,,,,7.0,9.8,Partially cloudy,12.3,9.1,Partially cloudy,Monday,True,2018,1,13
305760,2018-01-01,DL,14747,seattle,11292,denver,1906,2256,1930.0,2300.0,24.0,4.0,170.0,150.0,123.0,1024.0,,,,,,7.0,9.8,Partially cloudy,12.3,9.1,Partially cloudy,Monday,True,2018,1,19


In [7]:
flights['dest_city_name'].unique()

array(['atlanta', 'dallas/fort worth', 'los angeles', 'denver', 'seattle',
       'new york', 'houston', 'charlotte', 'chicago', 'washington'],
      dtype=object)

### One-Hot encoding for:
- weather
- origin_city_name and dest_city_name
- day_of_week

In [8]:
flights.columns

Index(['fl_date', 'mkt_carrier', 'origin_airport_id', 'origin_city_name',
       'dest_airport_id', 'dest_city_name', 'crs_dep_time', 'crs_arr_time',
       'dep_time', 'arr_time', 'dep_delay', 'arr_delay', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'distance', 'carrier_delay',
       'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay',
       'origin_wind_speed', 'origin_visibility', 'origin_conditions',
       'dest_wind_speed', 'dest_visibility', 'dest_conditions',
       'day_of_the_week', 'holiday', 'year', 'month',
       'scheduled_flight_hour_of_day'],
      dtype='object')

In [9]:
# We want to change "Rain, Overcast" to Rain = 1, Cloudy = 1
flights['origin_rain'] = 0
flights['origin_cloudy'] = 0
flights['origin_snow'] = 0
flights['dest_rain'] = 0
flights['dest_cloudy'] = 0
flights['dest_snow'] = 0

# go through each condition, setting all 3 relevant columns
# there is probably a better way but i know this will work for now...

flights.loc[flights['origin_conditions'] == 'Partially cloudy', 'origin_rain'] = 0
flights.loc[flights['origin_conditions'] == 'Partially cloudy', 'origin_cloudy'] = 0.5
flights.loc[flights['origin_conditions'] == 'Partially cloudy', 'origin_snow'] = 0
flights.loc[flights['dest_conditions'] == 'Partially cloudy', 'dest_rain'] = 0
flights.loc[flights['dest_conditions'] == 'Partially cloudy', 'dest_cloudy'] = 0.5
flights.loc[flights['dest_conditions'] == 'Partially cloudy', 'dest_snow'] = 0

flights.loc[flights['origin_conditions'] == 'Clear', 'origin_rain'] = 0
flights.loc[flights['origin_conditions'] == 'Clear', 'origin_cloudy'] = 0
flights.loc[flights['origin_conditions'] == 'Clear', 'origin_snow'] = 0
flights.loc[flights['dest_conditions'] == 'Clear', 'dest_rain'] = 0
flights.loc[flights['dest_conditions'] == 'Clear', 'dest_cloudy'] = 0
flights.loc[flights['dest_conditions'] == 'Clear', 'dest_snow'] = 0

flights.loc[flights['origin_conditions'] == 'Rain, Overcast', 'origin_rain'] = 1
flights.loc[flights['origin_conditions'] == 'Rain, Overcast', 'origin_cloudy'] = 1
flights.loc[flights['origin_conditions'] == 'Rain, Overcast', 'origin_snow'] = 0
flights.loc[flights['dest_conditions'] == 'Rain, Overcast', 'dest_rain'] = 1
flights.loc[flights['dest_conditions'] == 'Rain, Overcast', 'dest_cloudy'] = 1
flights.loc[flights['dest_conditions'] == 'Rain, Overcast', 'dest_snow'] = 0

flights.loc[flights['origin_conditions'] == 'Rain, Partially cloudy', 'origin_rain'] = 1
flights.loc[flights['origin_conditions'] == 'Rain, Partially cloudy', 'origin_cloudy'] = 0.5
flights.loc[flights['origin_conditions'] == 'Rain, Partially cloudy', 'origin_snow'] = 0
flights.loc[flights['dest_conditions'] == 'Rain, Partially cloudy', 'dest_rain'] = 1
flights.loc[flights['dest_conditions'] == 'Rain, Partially cloudy', 'dest_cloudy'] = 0.5
flights.loc[flights['dest_conditions'] == 'Rain, Partially cloudy', 'dest_snow'] = 0

flights.loc[flights['origin_conditions'] == 'Overcast', 'origin_rain'] = 0
flights.loc[flights['origin_conditions'] == 'Overcast', 'origin_cloudy'] = 1
flights.loc[flights['origin_conditions'] == 'Overcast', 'origin_snow'] = 0
flights.loc[flights['dest_conditions'] == 'Overcast', 'dest_rain'] = 0
flights.loc[flights['dest_conditions'] == 'Overcast', 'dest_cloudy'] = 1
flights.loc[flights['dest_conditions'] == 'Overcast', 'dest_snow'] = 0

flights.loc[flights['origin_conditions'] == 'Snow, Partially cloudy', 'origin_rain'] = 0
flights.loc[flights['origin_conditions'] == 'Snow, Partially cloudy', 'origin_cloudy'] = 0.5
flights.loc[flights['origin_conditions'] == 'Snow, Partially cloudy', 'origin_snow'] = 1
flights.loc[flights['dest_conditions'] == 'Snow, Partially cloudy', 'dest_rain'] = 0
flights.loc[flights['dest_conditions'] == 'Snow, Partially cloudy', 'dest_cloudy'] = 0.5
flights.loc[flights['dest_conditions'] == 'Snow, Partially cloudy', 'dest_snow'] = 1

flights.loc[flights['origin_conditions'] == 'Rain', 'origin_rain'] = 1
flights.loc[flights['origin_conditions'] == 'Rain', 'origin_cloudy'] = 0
flights.loc[flights['origin_conditions'] == 'Rain', 'origin_snow'] = 0
flights.loc[flights['dest_conditions'] == 'Rain', 'dest_rain'] = 1
flights.loc[flights['dest_conditions'] == 'Rain', 'dest_cloudy'] = 0
flights.loc[flights['dest_conditions'] == 'Rain', 'dest_snow'] = 0

flights.loc[flights['origin_conditions'] == 'Snow, Overcast', 'origin_rain'] = 0
flights.loc[flights['origin_conditions'] == 'Snow, Overcast', 'origin_cloudy'] = 1
flights.loc[flights['origin_conditions'] == 'Snow, Overcast', 'origin_snow'] = 1
flights.loc[flights['dest_conditions'] == 'Snow, Overcast', 'dest_rain'] = 0
flights.loc[flights['dest_conditions'] == 'Snow, Overcast', 'dest_cloudy'] = 1
flights.loc[flights['dest_conditions'] == 'Snow, Overcast', 'dest_snow'] = 1

flights.loc[flights['origin_conditions'] == 'Snow', 'origin_rain'] = 0
flights.loc[flights['origin_conditions'] == 'Snow', 'origin_cloudy'] = 0
flights.loc[flights['origin_conditions'] == 'Snow', 'origin_snow'] = 1
flights.loc[flights['dest_conditions'] == 'Snow', 'dest_rain'] = 0
flights.loc[flights['dest_conditions'] == 'Snow', 'dest_cloudy'] = 0
flights.loc[flights['dest_conditions'] == 'Snow', 'dest_snow'] = 1

flights = flights.drop(columns=['origin_conditions', 'dest_conditions'])

### Key model columns

In [39]:
# include NEW weather columns & historical_arr_delay
df_flights = pd.concat((
    #day_of_week, 
    #origin_airport_id, 
    #origin_conditions, 
    flights['origin_rain'], 
    flights['origin_cloudy'], 
    flights['origin_snow'], 
    flights['origin_visibility'], 
    flights['origin_wind_speed'],
    #dest_airport_id, 
    #dest_conditions, 
    flights['dest_rain'], 
    flights['dest_cloudy'], 
    flights['dest_snow'], 
    flights['dest_visibility'], 
    flights['dest_wind_speed'], 
    flights['holiday'], 
    flights['scheduled_flight_hour_of_day'], 
    flights['distance'],
    #flights_19['historical_arr_delay'],
    flights['arr_delay'] # target variable
), axis=1)

df_flights = df_flights.dropna()

In [11]:
df_flights.shape

(301892, 43)

In [12]:
# flights['holiday'] = flights['holiday'].astype(int)

# dest_conditions = pd.get_dummies(flights['dest_conditions'], prefix='dest')
# origin_conditions = pd.get_dummies(flights['origin_conditions'], prefix='origin')

# origin_airport_id = pd.get_dummies(flights['origin_airport_id'], prefix='origin')
# dest_airport_id = pd.get_dummies(flights['dest_airport_id'], prefix='dest')

# day_of_week = pd.get_dummies(flights['day_of_the_week'])

# df_flights = pd.concat((
#     day_of_week, 
#     origin_airport_id, 
#     origin_conditions, 
#     flights['origin_visibility'], flights['origin_wind_speed'], 
#     dest_airport_id, 
#     dest_conditions, 
#     flights['dest_visibility'], flights['dest_wind_speed'], 
#     flights['holiday'], 
#     flights['scheduled_flight_hour_of_day'], 
#     flights['distance'],
#     flights['arr_delay'] # target variable
# ), axis=1)

# df_flights = df_flights.dropna()

In [13]:
# normalize arr_delay, 
# remove some weather columns, 
# change origin_city_name to airport_id, 
# add in hour of day, 
# remove year 
# remove month
# add distance
#
#

In [14]:
# the top 100 values in calue_counts sums up to roughly 97% of all data, which is 3 standard deviations worth
# drop anything else

# this change did NOT help the model, it made it significantly worse
#flights = flights.drop(flights[flights.arr_delay < 82].index)

# Linear Regression
### Model-Specific

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample,shuffle
from sklearn import preprocessing

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score

import xgboost as xgb
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor

In [16]:
X = df_flights.drop(columns=['arr_delay'])
y = df_flights['arr_delay']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

### Baseline model

In [17]:
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression()

In [18]:
y_preds = clf.predict(X_test)

In [19]:
clf.score(X_test, y_test)

0.04850216960483533

In [20]:
# can only go up from here

In [21]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear',  random_state = 101)

In [22]:
xg_reg.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=16, num_parallel_tree=1,
             objective='reg:linear', random_state=101, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [23]:
y_preds = xg_reg.predict(X_test)

In [24]:
xg_reg.score(X_test, y_test)

0.1655118293791209

In [25]:
rmse = np.sqrt(mean_squared_error(y_test, y_preds))
print("RMSE: %f" % (rmse))

RMSE: 52.719433


### Some tweaking

In [26]:
def run_xg_reg():
    '''
    This function calls the (currently) best XGB model to generate results.
    This function helps speed up the tuning process.
    '''
    
    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.3, max_depth = 6, reg_alpha = 0.0001, n_estimators = 100, reg_lambda = 5, random_state = 101)
    xg_reg.fit(X_train, y_train)
    y_preds = xg_reg.predict(X_test)
    print(f'R^2: {xg_reg.score(X_test, y_test)}')
    rmse = np.sqrt(mean_squared_error(y_test, y_preds))
    print("RMSE: %f" % (rmse))
    return y_preds
y_preds = run_xg_reg()

R^2: 0.16390256360056243
RMSE: 52.770241


In [27]:
#calculate_vif(df_flights)

### Dealing with multicollinearity:
https://www.analyticsvidhya.com/blog/2020/03/one-hot-encoding-vs-label-encoding-using-scikit-learn/

- Dropping one of the one-hot labels to try and avoid "inf." for variance inflation 

In [121]:
# include NEW weather columns & historical_arr_delay
df_flights = pd.concat((
    day_of_week,
    flights['month'],
    flights['year'],
    origin_airport_id, 
    #origin_conditions, 
    flights['origin_rain'], 
    flights['origin_cloudy'], 
    flights['origin_snow'], 
    flights['origin_visibility'], 
    flights['origin_wind_speed'],
    dest_airport_id, 
    #dest_conditions, 
    flights['dest_rain'], 
    flights['dest_cloudy'], 
    flights['dest_snow'], 
    flights['dest_visibility'], 
    flights['dest_wind_speed'], 
    flights['holiday'], 
    flights['scheduled_flight_hour_of_day'], 
    flights['distance'],
    #flights_19['historical_arr_delay'],
    flights['arr_delay'] # target variable
), axis=1)

df_flights = df_flights.dropna()

In [122]:
columns_to_drop = [
    'Tuesday', 
    'origin_12478', 
    'dest_11278', 
    #'dest_Clear', 
    #'origin_Clear' 
]

In [123]:
df_flights = df_flights.drop(columns=columns_to_drop)

In [131]:
#calculate_vif(df_flights)

In [125]:
X = df_flights.drop(columns=['arr_delay'])
y = df_flights['arr_delay']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

In [130]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.3, max_depth = 6, reg_alpha = 0.0001, n_estimators = 100, reg_lambda = 5, random_state = 101)
xg_reg.fit(X_train, y_train)
y_preds = xg_reg.predict(X_test)
print(f'R^2: {xg_reg.score(X_test, y_test)}')
rmse = np.sqrt(mean_squared_error(y_test, y_preds))
print("RMSE: %f" % (rmse))

R^2: 0.19507181644052363
RMSE: 51.777276


# RandomForestRegressor

In [142]:
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['sqrt'],
    'max_depth' : [16, 32],
    'criterion' :['mse'],
    'random_state' : [101]
}

In [143]:
regr = RandomForestRegressor(n_jobs=-1, random_state=101)

In [144]:
CV_regr = GridSearchCV(estimator=regr, param_grid=param_grid, cv=5, verbose=2)
CV_regr.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END criterion=mse, max_depth=16, max_features=sqrt, n_estimators=200, random_state=101; total time=   6.8s
[CV] END criterion=mse, max_depth=16, max_features=sqrt, n_estimators=200, random_state=101; total time=   7.0s
[CV] END criterion=mse, max_depth=16, max_features=sqrt, n_estimators=200, random_state=101; total time=   6.6s
[CV] END criterion=mse, max_depth=16, max_features=sqrt, n_estimators=200, random_state=101; total time=   6.6s
[CV] END criterion=mse, max_depth=16, max_features=sqrt, n_estimators=200, random_state=101; total time=   6.5s
[CV] END criterion=mse, max_depth=16, max_features=sqrt, n_estimators=500, random_state=101; total time=  16.3s
[CV] END criterion=mse, max_depth=16, max_features=sqrt, n_estimators=500, random_state=101; total time=  16.1s
[CV] END criterion=mse, max_depth=16, max_features=sqrt, n_estimators=500, random_state=101; total time=  16.6s
[CV] END criterion=mse, max_depth=16, max_fe

GridSearchCV(cv=5, estimator=RandomForestRegressor(n_jobs=-1, random_state=101),
             param_grid={'criterion': ['mse'], 'max_depth': [16, 32],
                         'max_features': ['sqrt'], 'n_estimators': [200, 500],
                         'random_state': [101]},
             verbose=2)

In [145]:
cv_results = pd.DataFrame(CV_regr.cv_results_)
cv_results.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_n_estimators,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,29.737879,0.234309,1.072731,0.034907,mse,32,sqrt,500,101,"{'criterion': 'mse', 'max_depth': 32, 'max_fea...",0.203792,0.197249,0.204533,0.195719,0.195574,0.199373,0.003961,1
2,12.402435,0.232486,0.49348,0.043365,mse,32,sqrt,200,101,"{'criterion': 'mse', 'max_depth': 32, 'max_fea...",0.198991,0.196514,0.201598,0.193721,0.193679,0.1969,0.003068,2
1,16.508455,0.621284,0.344279,0.001596,mse,16,sqrt,500,101,"{'criterion': 'mse', 'max_depth': 16, 'max_fea...",0.197851,0.189235,0.185857,0.181526,0.184889,0.187872,0.005562,3
0,6.639446,0.171743,0.143416,0.038505,mse,16,sqrt,200,101,"{'criterion': 'mse', 'max_depth': 16, 'max_fea...",0.196845,0.188756,0.183801,0.179512,0.184135,0.18661,0.005895,4


In [149]:
print(f'Best Estimator: {CV_regr.best_estimator_}')
print(f'Best Params: {CV_regr.best_params_}')
print(f'Best Score: {CV_regr.best_score_}')

Best Estimator: RandomForestRegressor(max_depth=32, max_features='sqrt', n_estimators=500,
                      n_jobs=-1, random_state=101)
Best Params: {'criterion': 'mse', 'max_depth': 32, 'max_features': 'sqrt', 'n_estimators': 500, 'random_state': 101}
Best Score: 0.19937346146295978


In [146]:
y_preds = CV_regr.predict(X_test)
print(f'R^2: {CV_regr.score(X_test, y_test)}')
rmse = np.sqrt(mean_squared_error(y_test, y_preds))
print("RMSE: %f" % (rmse))

R^2: 0.21480924760843623
RMSE: 51.138528


# Predict the test set

In [None]:
flights_test.head()

In [None]:
flights_test['holiday'] = flights_test['holiday'].astype(int)
    
dest_conditions = pd.get_dummies(flights_test['dest_conditions'], prefix='dest')
origin_conditions = pd.get_dummies(flights_test['origin_conditions'], prefix='origin')
   
origin_airport_id = pd.get_dummies(flights_test['origin_airport_id'], prefix='origin')
dest_airport_id = pd.get_dummies(flights_test['dest_airport_id'], prefix='dest')
    
day_of_week = pd.get_dummies(flights_test['day_of_the_week'])
df_flights_test = pd.concat((
    day_of_week, 
    origin_airport_id, 
    origin_conditions, 
    flights_test['origin_visibility'], flights_test['origin_wind_speed'], 
    dest_airport_id, 
    dest_conditions, 
    flights_test['dest_visibility'], flights_test['dest_wind_speed'], 
    flights_test['holiday'], 
    flights_test['scheduled_flight_hour_of_day'], 
    flights_test['distance'],
    #flights_test['arr_delay'] # target variable - NOT IN TEST SET       
), axis=1)

df_flights_test = df_flights_test.dropna()

In [None]:
columns_to_drop = [
    'Tuesday', 
    'origin_11057', 
    'dest_11057', 
    'origin_Partially cloudy', 
    'dest_Partially cloudy' 
]

In [None]:
df_flights_test = df_flights_test.drop(columns=columns_to_drop)

In [None]:
# NOTE************************
# I comment this out because this step takes a while, 
# I believe there's a better way to implement this but for now this works
#

route_info = []
sep = '_'

# iterate over the entire dataframe, constructing the route_info
for i in range(len(flights_test[['origin_airport_id', 'dest_airport_id']])):
    route_info.append(str(flights_test.iloc[i].origin_airport_id) + sep + str(flights_test.iloc[i].dest_airport_id))

flights_test['route_info'] = pd.DataFrame(route_info)

In [None]:
# CREATE LIST OF ALL ROUTES (IN OUR DATASET)
route_info_list = []

for i in flights_test['route_info'].unique():
    route_info_list.append(i)

#print(route_info_list)
print(f'There are {len(route_info_list)} routes total.')

### Need to fix flights_test data to have consistent airports as original training set.

In [None]:
flights_test.origin_city_name.value_counts()

In [None]:
flights.origin_city_name.value_counts()