## Random Forest

In [19]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import pandas as pd
import numpy as np

import pickle

In [11]:
# Load the v5 encoded training data (path is relative to the notebook's directory).
df = pd.read_csv('../data/encoded_training_data_v5.csv')

In [13]:
# `arr_delay` is the regression target; every other column is a feature.
X = df.drop(columns=['arr_delay']).to_numpy()
y = df['arr_delay'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=88,
)

In [14]:
# Display (n_rows, n_features) of the training split.
X_train.shape

(257219, 133)

In [17]:
# Initial run: a single depth-limited forest with default n_estimators,
# used as a baseline before any grid search.
regr = RandomForestRegressor(max_depth=5, random_state=0)
regr.fit(X_train, y_train)

# # Save pickle file
# model = regr
# filename = '../model/random_forest.pkl'
# with open(filename, 'wb') as pkl_file:
#     pickle.dump(model, pkl_file)

# Score the fitted model on the held-out test split.
y_pred = regr.predict(X_test)

r2 = r2_score(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
# Derive RMSE from MSE instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

print(f' r2: {r2}\n MSE: {MSE}\n MAE: {MAE}\n')

 r2: 0.05335998580832646
 MSE: 278.49980180179136
 MAE: 12.986750698906787



In [6]:
# Grid Search 1: first sweep over forest size and tree depth.
param_grid = {'n_estimators': [10,500,300], 'max_depth': [3,5,7]}

# Notes on choosing the grid:
# - n_estimators: no more than 500 max
# - max_depth: 10 max, ideally 3-5
# - give a combination of low and high values, then see where the best result lies
# - two-step process

regr = RandomForestRegressor(random_state=88)
# 3-fold cross-validation on the training split; n_jobs=-1 runs fits in parallel
# on all available cores.
grid_search = GridSearchCV(regr, param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Save pickle file. The context manager closes the file handle even if
# pickling raises, instead of leaving it to the garbage collector.
model = grid_search
filename = '../model/random_forest_gs.pkl'
with open(filename, 'wb') as pkl_file:
    pickle.dump(model, pkl_file)

# Last expression of the cell: displays the winning estimator.
best_result = grid_search.best_estimator_
best_result

RandomForestRegressor(max_depth=7, n_estimators=500, random_state=88)

In [7]:
# Evaluate Grid Search 1 on the held-out test split. GridSearchCV.predict
# delegates to the refit best estimator.
y_pred = grid_search.predict(X_test)

r2 = r2_score(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
# Derive RMSE from MSE instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

print(f' r2: {r2}\n MSE: {MSE}\n MAE: {MAE}\n')

 r2: 0.06639414547984201
 MSE: 274.6651753009566
 MAE: 12.900575969228385



In [8]:
# Grid Search 2: larger forests and deeper trees than Grid Search 1.
param_grid = {'n_estimators': [400,700, 800], 'max_depth': [7, 10, 13]}

regr = RandomForestRegressor(random_state=88)
# 3-fold cross-validation on the training split, parallelised over all cores.
grid_search2 = GridSearchCV(regr, param_grid, cv=3, n_jobs=-1)
grid_search2.fit(X_train, y_train)

# Save pickle file. The context manager closes the file handle even if
# pickling raises, instead of leaving it to the garbage collector.
model = grid_search2
filename = '../model/random_forest_gs2.pkl'
with open(filename, 'wb') as pkl_file:
    pickle.dump(model, pkl_file)

# Last expression of the cell: displays the winning estimator.
best_result2 = grid_search2.best_estimator_
best_result2

RandomForestRegressor(max_depth=13, n_estimators=800, random_state=88)

In [9]:
# Evaluate Grid Search 2 on the held-out test split. GridSearchCV.predict
# delegates to the refit best estimator.
y_pred = grid_search2.predict(X_test)

r2 = r2_score(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
# Derive RMSE from MSE instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

print(f' r2: {r2}\n MSE: {MSE}\n MAE: {MAE}\n')

 r2: 0.0999446144084124
 MSE: 264.7946872517665
 MAE: 12.655664459528753



In [18]:
# Grid Search 3
# (this was a grid search, but the parameters were then reduced to a single
# combination so it would run faster)
param_grid = {'n_estimators': [800], 'max_depth': [20]}

regr = RandomForestRegressor(random_state=88)
# With a single candidate, GridSearchCV only yields its 3-fold CV score
# before refitting on the full training split.
grid_search3 = GridSearchCV(regr, param_grid, cv=3, n_jobs=-1)
grid_search3.fit(X_train, y_train)

# Save pickle file. The context manager closes the file handle even if
# pickling raises, instead of leaving it to the garbage collector.
model = grid_search3
filename = '../model/random_forest_gs3.pkl'
with open(filename, 'wb') as pkl_file:
    pickle.dump(model, pkl_file)

# Last expression of the cell: displays the fitted estimator.
best_result3 = grid_search3.best_estimator_
best_result3

RandomForestRegressor(max_depth=20, n_estimators=800, random_state=88)

In [20]:
# Evaluate Grid Search 3 on the held-out test split. GridSearchCV.predict
# delegates to the refit best estimator.
y_pred = grid_search3.predict(X_test)

r2 = r2_score(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
# Derive RMSE from MSE instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

print(f' r2: {r2}\n MSE: {MSE}\n MAE: {MAE}\n')

 r2: 0.09580751587853209
 MSE: 266.0118142518226
 MAE: 12.642115045289376



### Added flight-number-specific delays (v6 data)

In [21]:
# Read the v6 encoded training data; this rebinds `df` and the split
# variables that the v5 cells defined.
df = pd.read_csv('../data/encoded_training_data_v6.csv')

# `arr_delay` is the regression target; every other column is a feature.
X = df.drop(columns=['arr_delay']).to_numpy()
y = df['arr_delay'].to_numpy()

# Hold out 20% of the rows for testing, with the same seed as the v5 split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=88,
)

In [22]:
# Display (n_rows, n_features) of the v6 training split.
X_train.shape

(257219, 134)

In [23]:
# Initial run with v6 data: same depth-limited baseline forest as the v5 run.
regr = RandomForestRegressor(max_depth=5, random_state=0)
regr.fit(X_train, y_train)

# # Save pickle file
# model = regr
# filename = '../model/random_forest.pkl'
# with open(filename, 'wb') as pkl_file:
#     pickle.dump(model, pkl_file)

# Score the fitted model on the held-out test split.
y_pred = regr.predict(X_test)

r2 = r2_score(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
# Derive RMSE from MSE instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

print(f' r2: {r2}\n MSE: {MSE}\n MAE: {MAE}\n')

 r2: 0.05336051916626028
 MSE: 278.4996448888396
 MAE: 12.986762937297009



In [24]:
# Grid search 2.1 (v6 data): same parameter grid as Grid Search 1.
param_grid = {'n_estimators': [10,500,300], 'max_depth': [3,5,7]}

regr = RandomForestRegressor(random_state=88)
# 3-fold cross-validation on the training split, parallelised over all cores.
grid_search2_1 = GridSearchCV(regr, param_grid, cv=3, n_jobs=-1)
grid_search2_1.fit(X_train, y_train)

# Save pickle file. The context manager closes the file handle even if
# pickling raises, instead of leaving it to the garbage collector.
model = grid_search2_1
filename = '../model/random_forest_gs2_1.pkl'
with open(filename, 'wb') as pkl_file:
    pickle.dump(model, pkl_file)

# A bare `best_result2_1` expression here would display nothing because it is
# not the last statement of the cell, so it has been dropped.
best_result2_1 = grid_search2_1.best_estimator_

# Evaluate on the held-out test split.
y_pred = grid_search2_1.predict(X_test)

r2 = r2_score(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
# Derive RMSE from MSE instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

print(f' r2: {r2}\n MSE: {MSE}\n MAE: {MAE}\n')

 r2: 0.0664023870570919
 MSE: 274.6627506436463
 MAE: 12.900544099589858



In [25]:
# Show the hyperparameters of the estimator selected by grid search 2.1.
print(best_result2_1)

RandomForestRegressor(max_depth=7, n_estimators=500, random_state=88)


In [26]:
# Grid search 2.2 (v6 data): same parameter grid as Grid Search 2.
param_grid = {'n_estimators': [400,700, 800], 'max_depth': [7, 10, 13]}

regr = RandomForestRegressor(random_state=88)
# 3-fold cross-validation on the training split, parallelised over all cores.
grid_search2_2 = GridSearchCV(regr, param_grid, cv=3, n_jobs=-1)
grid_search2_2.fit(X_train, y_train)

# Save pickle file. The context manager closes the file handle even if
# pickling raises, instead of leaving it to the garbage collector.
model = grid_search2_2
filename = '../model/random_forest_gs2_2.pkl'
with open(filename, 'wb') as pkl_file:
    pickle.dump(model, pkl_file)

best_result2_2 = grid_search2_2.best_estimator_
print(best_result2_2)

# Evaluate on the held-out test split.
y_pred = grid_search2_2.predict(X_test)

r2 = r2_score(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
# Derive RMSE from MSE instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

print(f' r2: {r2}\n MSE: {MSE}\n MAE: {MAE}\n')

RandomForestRegressor(max_depth=13, n_estimators=800, random_state=88)
 r2: 0.0997773626549685
 MSE: 264.84389241897566
 MAE: 12.656864802677841



In [27]:
# Grid search 2.3 (v6 data): larger n_estimators and max_depth values than
# grid search 2.2.
param_grid = {'n_estimators': [900,1000], 'max_depth': [13, 14]}

regr = RandomForestRegressor(random_state=88)
# 3-fold cross-validation on the training split, parallelised over all cores.
grid_search2_3 = GridSearchCV(regr, param_grid, cv=3, n_jobs=-1)
grid_search2_3.fit(X_train, y_train)

# Save pickle file. The context manager closes the file handle even if
# pickling raises, instead of leaving it to the garbage collector.
model = grid_search2_3
filename = '../model/random_forest_gs2_3.pkl'
with open(filename, 'wb') as pkl_file:
    pickle.dump(model, pkl_file)

best_result2_3 = grid_search2_3.best_estimator_
print(best_result2_3)

# Evaluate on the held-out test split.
y_pred = grid_search2_3.predict(X_test)

r2 = r2_score(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
# Derive RMSE from MSE instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

print(f' r2: {r2}\n MSE: {MSE}\n MAE: {MAE}\n')

RandomForestRegressor(max_depth=14, n_estimators=1000, random_state=88)
 r2: 0.10294572483173192
 MSE: 263.9117659241767
 MAE: 12.630414823098759



In [None]:
# Grid search 2.4 (v6 data): fixed n_estimators=1200, sweeping deeper trees.
param_grid = {'n_estimators': [1200], 'max_depth': [15, 17, 20]}

regr = RandomForestRegressor(random_state=88)
# 3-fold cross-validation on the training split, parallelised over all cores.
grid_search2_4 = GridSearchCV(regr, param_grid, cv=3, n_jobs=-1)
grid_search2_4.fit(X_train, y_train)

# Save pickle file. The context manager closes the file handle even if
# pickling raises, instead of leaving it to the garbage collector.
model = grid_search2_4
filename = '../model/random_forest_gs2_4.pkl'
with open(filename, 'wb') as pkl_file:
    pickle.dump(model, pkl_file)

best_result2_4 = grid_search2_4.best_estimator_
print(best_result2_4)

# Evaluate on the held-out test split.
y_pred = grid_search2_4.predict(X_test)

r2 = r2_score(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
# Derive RMSE from MSE instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

print(f' r2: {r2}\n MSE: {MSE}\n MAE: {MAE}\n')