In [60]:
def IFPE_Regression(X_train, y_train, X_test, y_test, regressor, deg=None):
    """Fit one regressor on the training data and score it on the test data.

    Parameters
    ----------
    X_train, X_test : 2-D array-like, shape (n_samples, n_features)
        Training and test feature matrices.
    y_train, y_test : 1-D array-like
        Training and test targets.
    regressor : str
        Which model to build. One of 'linear', 'ridge', 'lasso',
        'polynomial', 'sgd', 'svm', 'dt', 'rf', 'gb', 'vr', 'xgb'.
    deg : int, optional
        Polynomial degree. Required when ``regressor == 'polynomial'``;
        ignored otherwise.

    Returns
    -------
    pandas.DataFrame
        A single row, indexed by ``str(regressor)``, with the columns
        'Mean Absolute Error', 'Root Mean Square Error', 'R Squared' and
        'Adjusted R Squared'.

    Raises
    ------
    ValueError
        If ``regressor`` is not a supported name, or if
        ``regressor == 'polynomial'`` and ``deg`` is None.
    """

    ### VALIDATE THE REQUEST
    # Fail early with a clear message instead of reaching `reg.fit` with an
    # undefined `reg` (unknown name) or passing degree=None to sklearn.
    supported = ('linear', 'ridge', 'lasso', 'polynomial', 'sgd', 'svm',
                 'dt', 'rf', 'gb', 'vr', 'xgb')
    if regressor not in supported:
        raise ValueError(
            f"Unknown regressor {regressor!r}; expected one of {supported}"
        )
    if regressor == 'polynomial' and deg is None:
        raise ValueError("deg must be given when regressor == 'polynomial'")

    ### INSTANTIATE THE MODEL

    ## LINEAR MODELS

    # ordinary least squares
    if regressor == 'linear':
        from sklearn.linear_model import LinearRegression
        reg = LinearRegression()

    # ridge regression
    elif regressor == 'ridge':
        from sklearn.linear_model import Ridge
        reg = Ridge(random_state=0)

    # lasso regression
    elif regressor == 'lasso':
        from sklearn.linear_model import Lasso
        reg = Lasso(random_state=0)

    # polynomial linear regression - need to add polynomial features first
    # https://scikit-learn.org/stable/modules/linear_model.html#polynomial-regression-extending-linear-models-with-basis-functions
    elif regressor == 'polynomial':
        from sklearn.linear_model import LinearRegression
        from sklearn.preprocessing import PolynomialFeatures
        poly = PolynomialFeatures(degree=deg)
        # Fit the transformer on the training data only, then apply the
        # already-fitted transformer to the test data.
        X_train = poly.fit_transform(X_train)
        X_test = poly.transform(X_test)
        reg = LinearRegression()

    # stochastic gradient descent
    elif regressor == 'sgd':
        from sklearn.linear_model import SGDRegressor
        reg = SGDRegressor(random_state=0)

    ## SUPPORT VECTOR MACHINES

    # SVM regression https://scikit-learn.org/stable/modules/svm.html#regression
    elif regressor == 'svm':
        from sklearn import svm
        reg = svm.SVR()

    ## DECISION TREES

    # decision tree
    elif regressor == 'dt':
        from sklearn import tree
        reg = tree.DecisionTreeRegressor(random_state=0)

    ## ENSEMBLE METHODS

    # random forest
    elif regressor == 'rf':
        from sklearn.ensemble import RandomForestRegressor
        reg = RandomForestRegressor(max_depth=2, random_state=0)

    # gradient boost
    elif regressor == 'gb':
        from sklearn.ensemble import GradientBoostingRegressor
        reg = GradientBoostingRegressor(random_state=0)

    # voting regressor (averages gradient boosting, random forest and OLS)
    elif regressor == 'vr':
        from sklearn.ensemble import GradientBoostingRegressor
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.linear_model import LinearRegression
        from sklearn.ensemble import VotingRegressor
        reg1 = GradientBoostingRegressor(random_state=1)
        reg2 = RandomForestRegressor(random_state=1)
        reg3 = LinearRegression()
        reg = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2), ('lr', reg3)])

    # xgboost
    elif regressor == 'xgb':
        import xgboost as xgb
        reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=0)

    ### FIT THE MODEL

    reg.fit(X_train, y_train)

    ### PREDICT THE RESPONSE FOR THE TEST DATASET

    y_pred = reg.predict(X_test)

    ### EVALUATE THE MODEL

    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    # Mean Absolute Error
    MAE = mean_absolute_error(y_test, y_pred)

    # Root Mean Square Error
    # `squared=False` is gone from newer scikit-learn releases; prefer the
    # dedicated function and fall back only where it does not exist yet.
    try:
        from sklearn.metrics import root_mean_squared_error
        RMSE = root_mean_squared_error(y_test, y_pred)
    except ImportError:
        RMSE = mean_squared_error(y_test, y_pred, squared=False)

    # r2
    r2 = r2_score(y_test, y_pred)

    # Adjusted r2
    # np.shape works for ndarrays and DataFrames alike. For the polynomial
    # model, p counts the expanded feature columns (X_test was reassigned).
    import numpy as np
    n, p = np.shape(X_test)
    dof = n - p - 1
    # The adjustment is undefined when there are no residual degrees of
    # freedom; report NaN rather than dividing by zero or a negative number.
    Adj_r2 = 1 - (1 - r2) * (n - 1) / dof if dof > 0 else float('nan')

    ### COMPARE MODELS

    import pandas as pd
    model_comparison = pd.DataFrame({
        'Mean Absolute Error': MAE,
        'Root Mean Square Error': RMSE,
        'R Squared': r2,
        'Adjusted R Squared': Adj_r2
    }, index=[str(regressor)])

    return model_comparison

In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
sns.set() # Apply seaborn's default styling to every plot, including plain matplotlib ones
pd.set_option('display.max_rows', None) # No row limit when a DataFrame is displayed

In [23]:
# Load the cleaned flight data from the parent directory. A forward slash is
# used because a backslash separator only resolves on Windows, while '/'
# resolves on Windows, Linux and macOS alike.
df_flights = pd.read_csv('../flight_data_clean.csv', sep=',')
# 'Unnamed: 0' is the row index that was written into the CSV; drop it.
df_flights = df_flights.drop('Unnamed: 0', axis=1)
df_flights.head()

Unnamed: 0,mkt_carrier,mkt_carrier_fl_num,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,year,month,day
0,9,5431,1253,5431,12889,13851,1930,2350,140,986,19.0,2018,1,1
1,9,989,1238,989,12889,13871,2125,155,150,1099,46.0,2018,1,1
2,9,1664,3796,1664,12889,13871,650,1120,150,1099,-9.0,2018,1,1
3,9,1106,3562,1106,12889,13891,735,830,55,197,-3.0,2018,1,1
4,9,1559,2065,1559,12889,13891,1740,1835,55,197,37.0,2018,1,1


In [24]:
# Target vector: the arrival delay column.
y = df_flights['arr_delay'].to_numpy()
# Feature matrix: every remaining column.
X = df_flights.drop(columns='arr_delay').to_numpy()

In [25]:
def split_data(data, target=None, percent_test=0.3, dtype='array'):
    """Split data into train and test subsets and print the resulting shapes.

    Parameters
    ----------
    data : array-like or DataFrame
        Features (``dtype='array'``) or the whole table (``dtype='df'``).
    target : array-like, optional
        Targets aligned with ``data``. Required when ``dtype='array'``;
        not used when ``dtype='df'``.
    percent_test : float, default 0.3
        Passed to ``train_test_split`` as ``test_size``.
    dtype : {'array', 'df'}, default 'array'
        'array' splits ``data`` and ``target`` together; 'df' splits ``data``
        alone.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` for 'array', or
        ``(df_train, df_test)`` for 'df'.

    Raises
    ------
    ValueError
        If ``dtype`` is not 'array' or 'df', or if ``dtype='array'`` and no
        ``target`` is given.
    """
    # Validate first so a bad call fails with a clear message instead of
    # silently returning None or failing inside the split.
    if dtype not in ('array', 'df'):
        raise ValueError(f"dtype must be 'array' or 'df', got {dtype!r}")
    if dtype == 'array' and target is None:
        raise ValueError("target is required when dtype == 'array'")

    # Import train_test_split function
    from sklearn.model_selection import train_test_split

    # Split dataset into training set and test set (fixed seed for repeatability)
    if dtype == 'array':
        X_train, X_test, y_train, y_test = train_test_split(
            data, target, test_size=percent_test, random_state=10
        )

        print(f'X_train shape: {X_train.shape}')
        print(f'X_test shape: {X_test.shape}')
        print(f'y_train shape: {y_train.shape}')
        print(f'y_test shape: {y_test.shape}')

        return X_train, X_test, y_train, y_test

    # dtype == 'df'
    df_train, df_test = train_test_split(data, test_size=percent_test, random_state=10)

    print(f'df_train shape: {df_train.shape}')
    print(f'df_test shape: {df_test.shape}')

    return df_train, df_test

In [26]:
# 70/30 train/test split of the feature matrix and target (split_data defaults).
X_train, X_test, y_train, y_test = split_data(X, target=y)

X_train shape: (69377, 13)
X_test shape: (29733, 13)
y_train shape: (69377,)
y_test shape: (29733,)


In [35]:
# Ordinary least squares.
model_comparison_linear = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='linear'
)
model_comparison_linear

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
linear,24.238396,48.88619,0.008231,0.007797


In [36]:
# Ridge regression with scikit-learn's default regularisation strength.
model_comparison_ridge = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='ridge'
)
model_comparison_ridge

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
ridge,24.238396,48.88619,0.008231,0.007797


In [48]:
# Lasso regression with scikit-learn's default regularisation strength.
model_comparison_lasso = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='lasso'
)
model_comparison_lasso

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
lasso,24.226039,48.883017,0.008359,0.007926


In [53]:
# Linear regression on degree-2 polynomial features.
model_comparison_poly2 = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='polynomial', deg=2
)
model_comparison_poly2

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
polynomial,24.257671,49.305122,-0.00884,-0.012416


In [54]:
# Linear regression on degree-3 polynomial features.
# NOTE(review): the recorded metrics for this run are enormous; presumably the
# cubic expansion of unscaled features is numerically unstable - TODO confirm.
model_comparison_poly3 = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='polynomial', deg=3
)
model_comparison_poly3

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
polynomial,29331440.0,2862203000.0,-3399689000000000.0,-3464951000000000.0


In [55]:
# Stochastic gradient descent regressor.
# NOTE(review): the recorded errors are around 1e16, which looks like
# divergence; presumably because the features are not standardised before
# fitting - TODO confirm by scaling the inputs.
model_comparison_sgd = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='sgd'
)
model_comparison_sgd

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
sgd,2.971017e+16,2.996987e+16,-3.7274189999999995e+29,-3.7290489999999995e+29


In [61]:
# Support vector regression.
# NOTE(review): the recorded run ended in KeyboardInterrupt, so no result
# exists for this model; presumably SVR was too slow on the full training
# set - consider fitting on a subsample.
model_comparison_svm = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='svm'
)
model_comparison_svm

KeyboardInterrupt: 

In [62]:
# Single decision tree with no depth limit set.
model_comparison_dt = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='dt'
)
model_comparison_dt

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
dt,35.270474,73.538842,-1.244254,-1.245235


In [63]:
# Random forest (IFPE_Regression fixes max_depth=2).
model_comparison_rf = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='rf'
)
model_comparison_rf

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
rf,24.201394,48.845569,0.009878,0.009445


In [64]:
# Gradient boosting.
model_comparison_gb = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='gb'
)
model_comparison_gb

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
gb,23.995052,48.603529,0.019666,0.019237


In [65]:
# Voting ensemble of gradient boosting, random forest and linear regression.
model_comparison_vr = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='vr'
)
model_comparison_vr

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
vr,24.308494,48.46062,0.025423,0.024996


In [66]:
# XGBoost regressor with a squared-error objective.
model_comparison_xgb = IFPE_Regression(
    X_train, y_train, X_test, y_test, regressor='xgb'
)
model_comparison_xgb

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
xgb,24.369545,49.28845,-0.008158,-0.008599


In [67]:
# Stack the per-model metric rows into one table.
# The SVM result is not included here. The two polynomial fits both carry the
# index label 'polynomial', so they are told apart only by row order.
all_model_rows = [
    model_comparison_linear,
    model_comparison_ridge,
    model_comparison_lasso,
    model_comparison_poly2,
    model_comparison_poly3,
    model_comparison_sgd,
    model_comparison_dt,
    model_comparison_rf,
    model_comparison_gb,
    model_comparison_vr,
    model_comparison_xgb,
]
model_comparison = pd.concat(all_model_rows)


In [70]:
# Rank the models from highest to lowest test-set R squared.
model_comparison.sort_values(by='R Squared', ascending=False)

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
vr,24.30849,48.46062,0.02542273,0.02499642
gb,23.99505,48.60353,0.01966628,0.01923745
rf,24.20139,48.84557,0.00987805,0.00944494
lasso,24.22604,48.88302,0.008359303,0.007925529
ridge,24.2384,48.88619,0.008230557,0.007796726
linear,24.2384,48.88619,0.008230551,0.007796721
xgb,24.36955,49.28845,-0.00815813,-0.008599129
polynomial,24.25767,49.30512,-0.008840289,-0.01241568
dt,35.27047,73.53884,-1.244254,-1.245235
polynomial,29331440.0,2862203000.0,-3399689000000000.0,-3464951000000000.0


In [41]:
# Regressor names (as accepted by IFPE_Regression) to iterate over.
regressor_list = ['linear', 'ridge', 'lasso']

In [43]:
# Build the list of result-variable names in one pass. Creating the list
# inside the loop body would reset it on every iteration and keep only the
# last name.
df_list = [f'model_comparison_{regressor}' for regressor in regressor_list]
print(df_list)

['model_comparison_linear']
['model_comparison_ridge']
['model_comparison_lasso']


In [47]:
# One result name per regressor; the list is built once, not reset per iteration.
df_list = [f'model_comparison_{regressor}' for regressor in regressor_list]

# Fit each regressor exactly once and keep its metrics table under its name,
# so the results are still available after the loop.
model_comparisons = {}
for name, regressor in zip(df_list, regressor_list):
    model = IFPE_Regression(X_train, y_train, X_test, y_test, regressor=regressor, deg=None)
    model_comparisons[name] = model
    print(model)

Unnamed: 0,Mean Absolute Error,Root Mean Square Error,R Squared,Adjusted R Squared
lasso,24.226039,48.883017,0.008359,0.007926
