# Model Filtering

## Load Modules

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import modules.help_functions as hf
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE

import pickle

from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import seaborn as sns

## Load the .csv File

In [None]:
# Read the encoded training set into `df`; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv(filepath_or_buffer='../data/encoded_training_data_v7.csv')

## AdaBoost Regressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

# Target is the `arr_delay` column; every other column of `df` is a feature.
y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()

# Hold out 20% of the rows for evaluation. The split is seeded (same seed as
# the SVM and Random Forest cells) so the reported metrics are repeatable
# from run to run instead of changing with every execution of the cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=88
)

# AdaBoost with default hyperparameters; the estimator is seeded as well
# because its fitting procedure is randomized.
ada = AdaBoostRegressor(random_state=88)

ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)

# Report the error metrics on the held-out rows only.
print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}',
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

## SVM

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

# Feature matrix: every column except the target; target: `arr_delay`.
X = df.drop('arr_delay', axis=1).to_numpy()
y = df['arr_delay'].to_numpy()

# Seeded 75/25 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=88,
)

# Support vector regressor with default hyperparameters.
svr = SVR()
svr.fit(X_train, y_train)

y_pred = svr.predict(X_test)

# Held-out metrics, printed on a single line separated by spaces.
print(
    f'MSE(test): {mean_squared_error(y_test, y_pred)}',
    f'R2(test): {r2_score(y_test, y_pred)}',
    f'MAE(test): {mean_absolute_error(y_test, y_pred)}',
)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

# Target is the `arr_delay` column; every other column of `df` is a feature.
y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()

# Seeded 75/25 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=88
)

# The split above is already seeded; seed the forest too, otherwise its
# bootstrap sampling still makes the reported metrics differ between runs.
rfr = RandomForestRegressor(random_state=88)

rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

# Report the error metrics on the held-out rows only.
print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}',
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

## Bagging

In [None]:
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

# Target is the `arr_delay` column; every other column of `df` is a feature.
y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()

# Hold out 20% of the rows for evaluation. The split is seeded (same seed as
# the SVM and Random Forest cells) so the reported metrics are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=88
)

# Bagging with default hyperparameters; seeded because the ensemble draws
# random subsets of the training data for each base estimator.
bag = BaggingRegressor(random_state=88)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)

# Report the error metrics on the held-out rows only.
print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}',
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

# Target is the `arr_delay` column; every other column of `df` is a feature.
y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()

# Hold out 20% of the rows for evaluation. The split is seeded (same seed as
# the SVM and Random Forest cells) so the reported metrics are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=88
)

# Gradient boosting with default hyperparameters; the estimator is seeded so
# that refitting on the same split yields the same model.
gb_reg = GradientBoostingRegressor(random_state=88)

gb_reg.fit(X_train, y_train)
y_pred = gb_reg.predict(X_test)

# Report the error metrics on the held-out rows only.
print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}',
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

## Hist Gradient Boosting

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

# Target is the `arr_delay` column; every other column of `df` is a feature.
y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()

# Hold out 20% of the rows for evaluation. The split is seeded (same seed as
# the SVM and Random Forest cells) so the reported metrics are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=88
)

# Histogram-based gradient boosting with default hyperparameters; seeded so
# that any internal subsampling is repeatable as well.
hgb_reg = HistGradientBoostingRegressor(random_state=88)

hgb_reg.fit(X_train, y_train)
y_pred = hgb_reg.predict(X_test)

# Report the error metrics on the held-out rows only.
print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}',
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

## Voting

In [None]:
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    VotingRegressor,
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

# Target is the `arr_delay` column; every other column of `df` is a feature.
y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()

# Hold out 20% of the rows for evaluation. The split is seeded (same seed as
# the SVM and Random Forest cells) so the reported metrics are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=88
)

# VotingRegressor has no default base models: `estimators` is a required
# argument, so calling it with no arguments raises TypeError. Combine one
# linear model with two tree ensembles, each with default hyperparameters.
v_reg = VotingRegressor(
    estimators=[
        ('lr', LinearRegression()),
        ('rf', RandomForestRegressor(random_state=88)),
        ('gb', GradientBoostingRegressor(random_state=88)),
    ]
)

v_reg.fit(X_train, y_train)
y_pred = v_reg.predict(X_test)

# Report the error metrics on the held-out rows only.
print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}',
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')

## Stacking

In [None]:
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

# Target is the `arr_delay` column; every other column of `df` is a feature.
y = df.arr_delay.to_numpy()
X = df.drop(columns=['arr_delay']).to_numpy()

# Hold out 20% of the rows for evaluation. The split is seeded (same seed as
# the SVM and Random Forest cells) so the reported metrics are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=88
)

# NOTE: the previous `xgb.DMatrix(...)` line was removed. `xgb` is not
# imported in this notebook (NameError) and the resulting `data_dmatrix`
# was not used by the stacking model in this cell.

# StackingRegressor requires `estimators`; calling it with no arguments
# raises TypeError. Two tree ensembles are used as base models and a linear
# regression combines their predictions.
st_reg = StackingRegressor(
    estimators=[
        ('rf', RandomForestRegressor(random_state=88)),
        ('gb', GradientBoostingRegressor(random_state=88)),
    ],
    final_estimator=LinearRegression(),
)

st_reg.fit(X_train, y_train)
y_pred = st_reg.predict(X_test)

# Report the error metrics on the held-out rows only.
print(f'MSE(test): {mean_squared_error(y_test, y_pred)}',
      f'R2(test): {r2_score(y_test, y_pred)}',
      f'MAE(test): {mean_absolute_error(y_test, y_pred)}')