In [8]:
def IFPE_Regression(X_train, y_train, X_test, y_test, regressor, deg=None):
    """Fit a single regression model and score it on a held-out test set.

    Parameters
    ----------
    X_train, X_test : 2-D array-like, shape (n_samples, n_features)
        Feature matrices for fitting and for evaluation.
    y_train, y_test : 1-D array-like
        Target values matching the rows of ``X_train`` / ``X_test``.
    regressor : str
        Which model to build. One of ``'linear'``, ``'ridge'``, ``'lasso'``,
        ``'polynomial'``, ``'sgd'``, ``'svm'``, ``'dt'``, ``'rf'``, ``'gb'``,
        ``'vr'`` or ``'xgb'``.
    deg : int, optional
        Degree of the polynomial feature expansion. Required when
        ``regressor == 'polynomial'``; ignored by every other model.

    Returns
    -------
    model_comparison : pandas.DataFrame
        One row, indexed by ``str(regressor)``, with the columns
        'Mean Absolute Error', 'Root Mean Square Error', 'R Squared' and
        'Adjusted R Squared', all computed on the test set.
    reg : estimator
        The fitted model. For ``'polynomial'`` this is the LinearRegression
        fitted on the expanded features; the expander itself is not returned.

    Raises
    ------
    ValueError
        If ``regressor`` is not a supported name, or if ``'polynomial'`` is
        requested without ``deg``.
    """

    ### INSTANTIATE THE MODEL
    # Imports stay inside the branches so that an optional package (xgboost)
    # is only needed when that particular model is requested.

    ## LINEAR MODELS

    # ordinary least squares
    if regressor == 'linear':
        from sklearn.linear_model import LinearRegression
        reg = LinearRegression()

    # ridge regression
    elif regressor == 'ridge':
        from sklearn.linear_model import Ridge
        reg = Ridge(random_state=0)

    # lasso regression
    elif regressor == 'lasso':
        from sklearn.linear_model import Lasso
        reg = Lasso(random_state=0)

    # polynomial linear regression - need to add polynomial features first
    # https://www.geeksforgeeks.org/python-implementation-of-polynomial-regression/
    # https://scikit-learn.org/stable/modules/linear_model.html#polynomial-regression-extending-linear-models-with-basis-functions
    elif regressor == 'polynomial':
        if deg is None:
            raise ValueError("regressor='polynomial' requires `deg` (the polynomial degree)")
        from sklearn.linear_model import LinearRegression
        from sklearn.preprocessing import PolynomialFeatures
        poly = PolynomialFeatures(degree=deg)
        # Fit the expander on the training data only and re-use it for the
        # test data, so no preprocessing step is ever fitted on the test set.
        X_train = poly.fit_transform(X_train)
        X_test = poly.transform(X_test)
        reg = LinearRegression()

    # stochastic gradient descent
    elif regressor == 'sgd':
        from sklearn.linear_model import SGDRegressor
        reg = SGDRegressor(random_state=0)

    ## SUPPORT VECTOR MACHINES

    # SVM regression https://scikit-learn.org/stable/modules/svm.html#regression
    elif regressor == 'svm':
        from sklearn import svm
        reg = svm.SVR()

    ## DECISION TREES

    # decision tree
    elif regressor == 'dt':
        from sklearn import tree
        reg = tree.DecisionTreeRegressor(random_state=0)

    ## ENSEMBLE METHODS

    # random forest
    elif regressor == 'rf':
        from sklearn.ensemble import RandomForestRegressor
        reg = RandomForestRegressor(max_depth=2, random_state=0)

    # gradient boost
    elif regressor == 'gb':
        from sklearn.ensemble import GradientBoostingRegressor
        reg = GradientBoostingRegressor(random_state=0)

    # voting regressor (averages gradient boosting, random forest and OLS)
    elif regressor == 'vr':
        from sklearn.ensemble import GradientBoostingRegressor
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.linear_model import LinearRegression
        from sklearn.ensemble import VotingRegressor
        reg1 = GradientBoostingRegressor(random_state=1)
        reg2 = RandomForestRegressor(random_state=1)
        reg3 = LinearRegression()
        reg = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2), ('lr', reg3)])

    # xgboost
    elif regressor == 'xgb':
        import xgboost as xgb
        reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=0)

    # Fail loudly on a typo instead of hitting an undefined `reg` further down.
    else:
        raise ValueError(
            "Unknown regressor {!r}; expected one of 'linear', 'ridge', 'lasso', "
            "'polynomial', 'sgd', 'svm', 'dt', 'rf', 'gb', 'vr', 'xgb'".format(regressor)
        )

    ### FIT THE MODEL

    reg.fit(X_train, y_train)

    ### PREDICT THE RESPONSE FOR THE TEST DATASET

    y_pred = reg.predict(X_test)

    ### EVALUATE THE MODEL

    import numpy as np
    import pandas as pd
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    # Mean Absolute Error
    MAE = mean_absolute_error(y_test, y_pred)

    # Root Mean Square Error. Taking the square root ourselves avoids the
    # `squared=False` keyword, which newer scikit-learn releases no longer
    # accept; for a single target the value is the same.
    RMSE = mean_squared_error(y_test, y_pred) ** 0.5

    # r2
    r2 = r2_score(y_test, y_pred)

    # Adjusted r2: n = test samples, p = columns the model was actually fed.
    # For 'polynomial', p counts the expanded columns (including the constant
    # column PolynomialFeatures adds by default).
    n, p = np.shape(X_test)
    residual_dof = n - p - 1
    if residual_dof > 0:
        Adj_r2 = 1 - (1 - r2) * (n - 1) / residual_dof
    else:
        # Undefined when there are no residual degrees of freedom.
        Adj_r2 = float('nan')

    ### COMPARE MODELS

    model_comparison = pd.DataFrame({
        'Mean Absolute Error': MAE,
        'Root Mean Square Error': RMSE,
        'R Squared': r2,
        'Adjusted R Squared': Adj_r2
    }, index=[str(regressor)])

    return model_comparison, reg

# Iteration 1

In [2]:
import pickle
from pathlib import Path

# Build the path from its parts so the separator is correct on every OS
# (the hard-coded backslashes only resolve on Windows).
train_pickle_path = Path('..', '..', 'data', 'v2', 'df_train_step3_subset1.pickle')

# NOTE: pickle.load can execute arbitrary code -- only open files this project produced.
with train_pickle_path.open('rb') as flight_data_file:
    df_flights_TRAIN = pickle.load(flight_data_file)

In [3]:
import pickle
from pathlib import Path

# Build the path from its parts so the separator is correct on every OS
# (the hard-coded backslashes only resolve on Windows).
test_pickle_path = Path('..', '..', 'data', 'v2', 'df_test_step3_subset1.pickle')

# NOTE: pickle.load can execute arbitrary code -- only open files this project produced.
with test_pickle_path.open('rb') as flight_data_file:
    df_flights_TEST = pickle.load(flight_data_file)

In [4]:
# (rows, columns) of the training frame; the column count still includes the 'arr_delay' target.
df_flights_TRAIN.shape

(73395, 34)

In [5]:
# (rows, columns) of the test frame; the column count still includes the 'arr_delay' target.
df_flights_TEST.shape

(24444, 34)

In [6]:
# Split each frame into a feature matrix (every column except the target)
# and a target vector ('arr_delay'), both as NumPy arrays.
X_train, y_train = (
    df_flights_TRAIN.drop(columns='arr_delay').to_numpy(),
    df_flights_TRAIN['arr_delay'].to_numpy(),
)
X_test, y_test = (
    df_flights_TEST.drop(columns='arr_delay').to_numpy(),
    df_flights_TEST['arr_delay'].to_numpy(),
)

In [7]:
# Show the shape of every split, in the order: X_train, X_test, y_train, y_test.
for split_array in (X_train, X_test, y_train, y_test):
    print(split_array.shape)

(73395, 33)
(24444, 33)
(73395,)
(24444,)


In [8]:
# Ordinary least squares: fit on the training split, score on the test split,
# then display the one-row metrics table.
model_comparison_linear, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='linear'
)
model_comparison_linear

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
linear,23.956809,48.678319,0.01337,0.012036


In [9]:
# Ridge regression: fit, score on the test split, display the metrics row.
model_comparison_ridge, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='ridge'
)
model_comparison_ridge

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
ridge,23.955426,48.677943,0.013385,0.012051


In [10]:
# Lasso regression: fit, score on the test split, display the metrics row.
model_comparison_lasso, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='lasso'
)
model_comparison_lasso

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
lasso,24.082465,48.883307,0.005043,0.003698


In [11]:
# Degree-2 polynomial regression: fit, score on the test split, display the metrics row.
# NOTE(review): the recorded test-set R Squared for this fit is hugely negative --
# the expansion looks numerically unstable on these features; confirm before relying on it.
model_comparison_poly2, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='polynomial', deg=2
)
model_comparison_poly2

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
polynomial,1682346000.0,185988600000.0,-1.44031e+19,-1.476245e+19


In [12]:
# model_comparison_poly3 = IFPE_Regression(X_train, y_train, X_test, y_test, regressor='polynomial', deg=3)
# model_comparison_poly3

In [13]:
# Stochastic gradient descent regressor: fit, score on the test split, display the metrics row.
model_comparison_sgd, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='sgd'
)
model_comparison_sgd

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
sgd,23.919875,48.676221,0.013455,0.012121


In [14]:
# model_comparison_svm = IFPE_Regression(X_train, y_train, X_test, y_test, regressor='svm', deg=None)
# model_comparison_svm

In [15]:
# Decision tree regressor: fit, score on the test split, display the metrics row.
model_comparison_dt, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='dt'
)
model_comparison_dt

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
dt,25.1137,50.340876,-0.055176,-0.056602


In [16]:
# Random forest regressor: fit, score on the test split, display the metrics row.
model_comparison_rf, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='rf'
)
model_comparison_rf

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
rf,24.043896,48.785907,0.009004,0.007664


In [17]:
# Gradient boosting regressor: the fitted model is kept under its own name
# (reg_gb) because it is pickled in a later cell.
model_comparison_gb, reg_gb = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='gb'
)
model_comparison_gb

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
gb,23.823479,48.591112,0.016902,0.015573


In [18]:
from pathlib import Path

# Persist the fitted gradient-boosting model. The path is assembled from its
# parts so the separator is correct on every OS (the hard-coded backslashes
# only resolve on Windows).
model_pickle_path = Path('..', '..', 'data', 'v2', 'reg_gb_iteration1.pickle')
with model_pickle_path.open('wb') as flight_data_file:
    pickle.dump(reg_gb, flight_data_file)

In [19]:
# Voting regressor: fit, score on the test split, display the metrics row.
model_comparison_vr, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='vr'
)
model_comparison_vr

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
vr,23.941649,48.716293,0.01183,0.010494


In [20]:
# XGBoost regressor: fit, score on the test split, display the metrics row.
model_comparison_xgb, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='xgb'
)
model_comparison_xgb

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
xgb,24.202509,49.054208,-0.001926,-0.003281


In [21]:
import pandas as pd

# Stack the one-row metric tables into a single comparison table.
# NOTE(review): the polynomial and SGD results computed earlier are not
# included here -- confirm the omission is intentional.
model_comparison = pd.concat([
    model_comparison_linear,
    model_comparison_ridge,
    model_comparison_lasso,
    model_comparison_dt,
    model_comparison_rf,
    model_comparison_gb,
    model_comparison_vr,
    model_comparison_xgb,
])


In [22]:
# Rank the models from highest to lowest test-set R Squared.
model_comparison.sort_values('R Squared', ascending=False)

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
gb,23.823479,48.591112,0.016902,0.015573
ridge,23.955426,48.677943,0.013385,0.012051
linear,23.956809,48.678319,0.01337,0.012036
vr,23.941649,48.716293,0.01183,0.010494
rf,24.043896,48.785907,0.009004,0.007664
lasso,24.082465,48.883307,0.005043,0.003698
xgb,24.202509,49.054208,-0.001926,-0.003281
dt,25.1137,50.340876,-0.055176,-0.056602


# Iteration 2

In [1]:
import pickle
from pathlib import Path

# Build the path from its parts so the separator is correct on every OS
# (the hard-coded backslashes only resolve on Windows).
train_pickle_path = Path('..', '..', 'data', 'v2', 'df_train_step3_subset2.pickle')

# NOTE: pickle.load can execute arbitrary code -- only open files this project produced.
with train_pickle_path.open('rb') as flight_data_file:
    df_flights_TRAIN = pickle.load(flight_data_file)

In [2]:
import pickle
from pathlib import Path

# Build the path from its parts so the separator is correct on every OS
# (the hard-coded backslashes only resolve on Windows).
test_pickle_path = Path('..', '..', 'data', 'v2', 'df_test_step3_subset2.pickle')

# NOTE: pickle.load can execute arbitrary code -- only open files this project produced.
with test_pickle_path.open('rb') as flight_data_file:
    df_flights_TEST = pickle.load(flight_data_file)

In [3]:
# (rows, columns) of the training frame; the column count still includes the 'arr_delay' target.
df_flights_TRAIN.shape

(73395, 36)

In [4]:
# (rows, columns) of the test frame; the column count still includes the 'arr_delay' target.
df_flights_TEST.shape

(24444, 36)

In [5]:
# Split each frame into a feature matrix (every column except the target)
# and a target vector ('arr_delay'), both as NumPy arrays.
X_train, y_train = (
    df_flights_TRAIN.drop(columns='arr_delay').to_numpy(),
    df_flights_TRAIN['arr_delay'].to_numpy(),
)
X_test, y_test = (
    df_flights_TEST.drop(columns='arr_delay').to_numpy(),
    df_flights_TEST['arr_delay'].to_numpy(),
)

In [6]:
# Show the shape of every split, in the order: X_train, X_test, y_train, y_test.
for split_array in (X_train, X_test, y_train, y_test):
    print(split_array.shape)

(73395, 35)
(24444, 35)
(73395,)
(24444,)


In [9]:
# Ordinary least squares: fit on the training split, score on the test split,
# then display the one-row metrics table.
model_comparison_linear, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='linear'
)
model_comparison_linear

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
linear,23.969509,48.589604,0.016963,0.015553


In [10]:
# Ridge regression: fit, score on the test split, display the metrics row.
model_comparison_ridge, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='ridge'
)
model_comparison_ridge

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
ridge,23.962125,48.588495,0.017008,0.015598


In [11]:
# Lasso regression: fit, score on the test split, display the metrics row.
model_comparison_lasso, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='lasso'
)
model_comparison_lasso

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
lasso,24.042614,48.782128,0.009157,0.007736


In [12]:
# Degree-2 polynomial regression: fit, score on the test split, display the metrics row.
# NOTE(review): the recorded test-set R Squared for this fit is hugely negative --
# the expansion looks numerically unstable on these features; confirm before relying on it.
model_comparison_poly2, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='polynomial', deg=2
)
model_comparison_poly2

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
polynomial,51417.101124,5681681.0,-13441170000.0,-13817660000.0


In [13]:
# model_comparison_poly3 = IFPE_Regression(X_train, y_train, X_test, y_test, regressor='polynomial', deg=3)
# model_comparison_poly3

In [14]:
# Stochastic gradient descent regressor: fit, score on the test split, display the metrics row.
model_comparison_sgd, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='sgd'
)
model_comparison_sgd

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
sgd,23.863371,48.616012,0.015894,0.014483


In [15]:
# model_comparison_svm = IFPE_Regression(X_train, y_train, X_test, y_test, regressor='svm', deg=None)
# model_comparison_svm

In [16]:
# Decision tree regressor: fit, score on the test split, display the metrics row.
model_comparison_dt, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='dt'
)
model_comparison_dt

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
dt,28.822237,59.917053,-0.494804,-0.496947


In [17]:
# Random forest regressor: fit, score on the test split, display the metrics row.
model_comparison_rf, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='rf'
)
model_comparison_rf

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
rf,24.025411,48.749677,0.010475,0.009056


In [18]:
# Gradient boosting regressor: the fitted model is kept under its own name
# (reg_gb) because it is pickled in a later cell.
model_comparison_gb, reg_gb = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='gb'
)
model_comparison_gb

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
gb,23.744873,48.432294,0.023318,0.021917


In [19]:
from pathlib import Path

# Persist the fitted gradient-boosting model. The path is assembled from its
# parts so the separator is correct on every OS (the hard-coded backslashes
# only resolve on Windows).
model_pickle_path = Path('..', '..', 'data', 'v2', 'reg_gb_iteration2.pickle')
with model_pickle_path.open('wb') as flight_data_file:
    pickle.dump(reg_gb, flight_data_file)

In [21]:
# Voting regressor: fit, score on the test split, display the metrics row.
model_comparison_vr, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='vr'
)
model_comparison_vr

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
vr,24.032086,48.740535,0.010846,0.009428


In [22]:
# XGBoost regressor: fit, score on the test split, display the metrics row.
model_comparison_xgb, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='xgb'
)
model_comparison_xgb

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
xgb,24.057127,48.90537,0.004144,0.002716


In [23]:
import pandas as pd

# Stack the one-row metric tables into a single comparison table.
# NOTE(review): the polynomial and SGD results computed earlier are not
# included here -- confirm the omission is intentional.
model_comparison = pd.concat([
    model_comparison_linear,
    model_comparison_ridge,
    model_comparison_lasso,
    model_comparison_dt,
    model_comparison_rf,
    model_comparison_gb,
    model_comparison_vr,
    model_comparison_xgb,
])


In [24]:
# Rank the models from highest to lowest test-set R Squared.
model_comparison.sort_values('R Squared', ascending=False)

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
gb,23.744873,48.432294,0.023318,0.021917
ridge,23.962125,48.588495,0.017008,0.015598
linear,23.969509,48.589604,0.016963,0.015553
vr,24.032086,48.740535,0.010846,0.009428
rf,24.025411,48.749677,0.010475,0.009056
lasso,24.042614,48.782128,0.009157,0.007736
xgb,24.057127,48.90537,0.004144,0.002716
dt,28.822237,59.917053,-0.494804,-0.496947


# Iteration 3

In [44]:
import pickle
from pathlib import Path

# Build the path from its parts so the separator is correct on every OS
# (the hard-coded backslashes only resolve on Windows).
train_pickle_path = Path('..', '..', 'data', 'v2', 'df_train_step3_subset3.pickle')

# NOTE: pickle.load can execute arbitrary code -- only open files this project produced.
with train_pickle_path.open('rb') as flight_data_file:
    df_flights_TRAIN = pickle.load(flight_data_file)

In [45]:
import pickle
from pathlib import Path

# Build the path from its parts so the separator is correct on every OS
# (the hard-coded backslashes only resolve on Windows).
test_pickle_path = Path('..', '..', 'data', 'v2', 'df_test_step3_subset3.pickle')

# NOTE: pickle.load can execute arbitrary code -- only open files this project produced.
with test_pickle_path.open('rb') as flight_data_file:
    df_flights_TEST = pickle.load(flight_data_file)

In [46]:
# (rows, columns) of the training frame; the column count still includes the 'arr_delay' target.
df_flights_TRAIN.shape

(73395, 37)

In [47]:
# (rows, columns) of the test frame; the column count still includes the 'arr_delay' target.
df_flights_TEST.shape

(24444, 37)

In [48]:
# Split each frame into a feature matrix (every column except the target)
# and a target vector ('arr_delay'), both as NumPy arrays.
X_train, y_train = (
    df_flights_TRAIN.drop(columns='arr_delay').to_numpy(),
    df_flights_TRAIN['arr_delay'].to_numpy(),
)
X_test, y_test = (
    df_flights_TEST.drop(columns='arr_delay').to_numpy(),
    df_flights_TEST['arr_delay'].to_numpy(),
)

In [49]:
# Show the shape of every split, in the order: X_train, X_test, y_train, y_test.
for split_array in (X_train, X_test, y_train, y_test):
    print(split_array.shape)

(73395, 36)
(24444, 36)
(73395,)
(24444,)


In [50]:
# Ordinary least squares: fit on the training split, score on the test split,
# then display the one-row metrics table.
model_comparison_linear, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='linear'
)
model_comparison_linear

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
linear,25.364801,50.333921,-0.054884,-0.05644


In [51]:
# Ridge regression: fit, score on the test split, display the metrics row.
model_comparison_ridge, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='ridge'
)
model_comparison_ridge

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
ridge,25.356243,50.334136,-0.054893,-0.056449


In [52]:
# Lasso regression: fit, score on the test split, display the metrics row.
model_comparison_lasso, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='lasso'
)
model_comparison_lasso

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
lasso,25.43727,50.548007,-0.063877,-0.065446


In [53]:
# Degree-2 polynomial regression: fit, score on the test split, display the metrics row.
# NOTE(review): the recorded test-set R Squared for this fit is hugely negative --
# the expansion looks numerically unstable on these features; confirm before relying on it.
model_comparison_poly2, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='polynomial', deg=2
)
model_comparison_poly2

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
polynomial,116.546241,10095.907408,-42438.864657,-43695.613808


In [54]:
# model_comparison_poly3 = IFPE_Regression(X_train, y_train, X_test, y_test, regressor='polynomial', deg=3)
# model_comparison_poly3

In [55]:
# Stochastic gradient descent regressor: fit, score on the test split, display the metrics row.
model_comparison_sgd, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='sgd'
)
model_comparison_sgd

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
sgd,24.556834,50.703545,-0.070434,-0.072013


In [56]:
# model_comparison_svm = IFPE_Regression(X_train, y_train, X_test, y_test, regressor='svm', deg=None)
# model_comparison_svm

In [57]:
# Decision tree regressor: fit, score on the test split, display the metrics row.
model_comparison_dt, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='dt'
)
model_comparison_dt

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
dt,34.261506,71.285261,-1.11584,-1.118961


In [58]:
# Random forest regressor: fit, score on the test split, display the metrics row.
model_comparison_rf, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='rf'
)
model_comparison_rf

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
rf,25.152021,50.058089,-0.043354,-0.044893


In [59]:
# Gradient boosting regressor: the fitted model is kept under its own name (reg_gb).
model_comparison_gb, reg_gb = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='gb'
)
model_comparison_gb

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
gb,25.066499,50.313753,-0.054039,-0.055594


In [None]:
# with open(r'..\..\data\v2\reg_gb_iteration3.pickle','wb') as flight_data_file:
#      pickle.dump(reg_gb, flight_data_file)

In [60]:
# Voting regressor: fit, score on the test split, display the metrics row.
model_comparison_vr, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='vr'
)
model_comparison_vr

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
vr,25.523122,50.690409,-0.069879,-0.071458


In [61]:
# XGBoost regressor: fit, score on the test split, display the metrics row.
model_comparison_xgb, reg = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='xgb'
)
model_comparison_xgb

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
xgb,25.278288,52.106404,-0.130487,-0.132154


In [62]:
import pandas as pd

# Stack the one-row metric tables into a single comparison table.
# NOTE(review): the polynomial and SGD results computed earlier are not
# included here -- confirm the omission is intentional.
model_comparison = pd.concat([
    model_comparison_linear,
    model_comparison_ridge,
    model_comparison_lasso,
    model_comparison_dt,
    model_comparison_rf,
    model_comparison_gb,
    model_comparison_vr,
    model_comparison_xgb,
])


In [63]:
# Rank the models from highest to lowest test-set R Squared.
model_comparison.sort_values('R Squared', ascending=False)

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
rf,25.152021,50.058089,-0.043354,-0.044893
gb,25.066499,50.313753,-0.054039,-0.055594
linear,25.364801,50.333921,-0.054884,-0.05644
ridge,25.356243,50.334136,-0.054893,-0.056449
lasso,25.43727,50.548007,-0.063877,-0.065446
vr,25.523122,50.690409,-0.069879,-0.071458
xgb,25.278288,52.106404,-0.130487,-0.132154
dt,34.261506,71.285261,-1.11584,-1.118961
