# AT3 - XGBoost Lag

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from joblib import dump
from catboost import CatBoostRegressor
from imblearn.over_sampling import SMOTE
from hyperopt import fmin, tpe, hp, Trials
from hyperopt import STATUS_OK
from sklearn.model_selection import train_test_split
from prophet import Prophet
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-split feature frames (X_*) and targets (y_*) for the train,
# validation and test partitions from the processed-data folder.
X_train, y_train = (
    pd.read_csv("../data/processed/X_train.csv"),
    pd.read_csv("../data/processed/y_train.csv"),
)
X_val, y_val = (
    pd.read_csv("../data/processed/X_val.csv"),
    pd.read_csv("../data/processed/y_val.csv"),
)
X_test, y_test = (
    pd.read_csv("../data/processed/X_test.csv"),
    pd.read_csv("../data/processed/y_test.csv"),
)

In [3]:
# Preview the first rows of the training features to check which columns are available.
X_train.head()

Unnamed: 0,startingAirport,destinationAirport,cabin_type,year,month,day,hour,minute
0,LAX,PHL,coach,2022,4,17,0,4
1,LAX,BOS,coach,2022,4,17,0,4
2,LAX,DFW,coach,2022,4,17,0,4
3,LAX,LGA,coach,2022,4,17,0,4
4,LAX,ATL,coach,2022,4,17,0,4


In [36]:
# Feature Engineering
def add_time_features(X):
    """Build the model's feature frame from one raw data split.

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``year``, ``month``, ``day``, ``hour``, ``minute``,
        ``startingAirport``, ``destinationAirport`` and ``cabin_type``.

    Returns
    -------
    pandas.DataFrame
        ``X`` without the five raw date/time columns, with ``day_of_week``
        (0 = Monday) and ``is_weekend`` (1 for Saturday/Sunday) added, and
        the three categorical columns one-hot encoded with the first level
        dropped. Every other input column is passed through unchanged.

    Notes
    -----
    * ``month``, ``day`` and ``hour`` are NOT part of the output. The
      previous version re-derived them from the timestamp and then dropped
      them again; those dead assignments were removed, the output is the same.
    * NOTE(review): the ``Unnamed: 0`` column visible in ``X_train.head()``
      is not dropped here, so it reaches the model as a feature - confirm
      this is intended.
    * NOTE(review): the dummy columns depend on the categories present in
      ``X``; frames encoded separately can end up with different columns.
    """
    # `assign` returns a new frame, so the caller's data is never mutated.
    frame = X.assign(
        hour=X['hour'].astype(int),
        minute=X['minute'].astype(int),
    )

    # Assemble a timestamp per row from the component columns; it is only
    # needed to derive the calendar features, so it is kept as a local
    # Series instead of a temporary column.
    timestamps = pd.to_datetime(frame[['year', 'month', 'day', 'hour', 'minute']])

    # Calendar features
    frame['day_of_week'] = timestamps.dt.dayofweek
    frame['is_weekend'] = (frame['day_of_week'] >= 5).astype(int)  # 1 for weekend, 0 for weekday

    # The raw date/time components are not used as model features.
    frame = frame.drop(columns=['year', 'month', 'day', 'hour', 'minute'])

    # One-hot encoding for categorical features
    return pd.get_dummies(
        frame,
        columns=['startingAirport', 'destinationAirport', 'cabin_type'],
        drop_first=True,
    )

# Add time features to X (same transformation for every split)
X_train_transformed, X_val_transformed, X_test_transformed = (
    add_time_features(split) for split in (X_train, X_val, X_test)
)

# Each split is one-hot encoded on its own, so a category missing from one
# split would silently change its columns. Fail early with a clear message
# instead of hitting a feature mismatch inside the model.
assert list(X_val_transformed.columns) == list(X_train_transformed.columns), \
    "Validation features do not match the training feature columns"
assert list(X_test_transformed.columns) == list(X_train_transformed.columns), \
    "Test features do not match the training feature columns"

# Initialize the XGBoost Regressor (library defaults apart from the objective)
model = XGBRegressor(objective='reg:squarederror')

# Fit the model
model.fit(X_train_transformed, y_train)

# Predictions for every split
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(features)
    for features in (X_train_transformed, X_val_transformed, X_test_transformed)
)

# Calculate RMSE and MAE
def calculate_metrics(y_true, y_pred):
    """Return ``(rmse, mae)`` for the given true and predicted values.

    RMSE is the square root of ``mean_squared_error``; MAE comes from
    ``mean_absolute_error``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse), mean_absolute_error(y_true, y_pred)

# Score every split with the same helper: (rmse, mae) per split
(train_rmse, train_mae), (val_rmse, val_mae), (test_rmse, test_mae) = (
    calculate_metrics(actual, predicted)
    for actual, predicted in (
        (y_train, y_train_pred),
        (y_val, y_val_pred),
        (y_test, y_test_pred),
    )
)

# Report the scores at full precision
print(f"Training RMSE: {train_rmse}, MAE: {train_mae}")
print(f"Validation RMSE: {val_rmse}, MAE: {val_mae}")
print(f"Test RMSE: {test_rmse}, MAE: {test_mae}")

Training RMSE: 155.01796690459514, MAE: 106.87343943058659
Validation RMSE: 167.0188748544038, MAE: 114.52210598911832
Test RMSE: 156.2169339139503, MAE: 112.48162877246561


In [4]:
# Feature Engineering
def add_time_features(X):
    """Build the model's feature frame from one raw data split.

    The input frame is not modified; a new DataFrame is returned.

    NOTE(review): an earlier cell already defines ``add_time_features``;
    this definition shadows it. Keep a single definition in the notebook.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``year``, ``month``, ``day``, ``hour``, ``minute``,
        ``startingAirport``, ``destinationAirport`` and ``cabin_type``.

    Returns
    -------
    pandas.DataFrame
        ``X`` without the five raw date/time columns, with ``day_of_week``
        (0 = Monday) and ``is_weekend`` (1 for Saturday/Sunday) added, and
        the three categorical columns one-hot encoded with the first level
        dropped. Every other input column is passed through unchanged.

    Notes
    -----
    * ``month``, ``day`` and ``hour`` are NOT part of the output. The
      previous version re-derived them from the timestamp and then dropped
      them again; those dead assignments were removed, the output is the same.
    * NOTE(review): the ``Unnamed: 0`` column visible in ``X_train.head()``
      is not dropped here, so it reaches the model as a feature - confirm
      this is intended.
    * NOTE(review): the dummy columns depend on the categories present in
      ``X``; frames encoded separately can end up with different columns.
    """
    # `assign` returns a new frame, so the caller's data is never mutated.
    frame = X.assign(
        hour=X['hour'].astype(int),
        minute=X['minute'].astype(int),
    )

    # Assemble a timestamp per row from the component columns; it is only
    # needed to derive the calendar features, so it is kept as a local
    # Series instead of a temporary column.
    timestamps = pd.to_datetime(frame[['year', 'month', 'day', 'hour', 'minute']])

    # Calendar features
    frame['day_of_week'] = timestamps.dt.dayofweek
    frame['is_weekend'] = (frame['day_of_week'] >= 5).astype(int)  # 1 for weekend, 0 for weekday

    # The raw date/time components are not used as model features.
    frame = frame.drop(columns=['year', 'month', 'day', 'hour', 'minute'])

    # One-hot encoding for categorical features
    return pd.get_dummies(
        frame,
        columns=['startingAirport', 'destinationAirport', 'cabin_type'],
        drop_first=True,
    )

# Add time features to X (same transformation for every split)
X_train_transformed, X_val_transformed, X_test_transformed = (
    add_time_features(split) for split in (X_train, X_val, X_test)
)

# Each split is one-hot encoded on its own, so a category missing from one
# split would silently change its columns. Fail early with a clear message
# instead of hitting a feature mismatch inside the model.
assert list(X_val_transformed.columns) == list(X_train_transformed.columns), \
    "Validation features do not match the training feature columns"
assert list(X_test_transformed.columns) == list(X_train_transformed.columns), \
    "Test features do not match the training feature columns"

# Initialize the XGBoost Regressor with the specified hyperparameters.
# NOTE(review): the fractional values look like the output of a
# hyperparameter search - record where they came from.
model = XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.9112046961874785,
    learning_rate=0.12635038902243934,
    max_depth=11,
    n_estimators=167,
    subsample=0.8805835345269344,
    random_state=42  # For reproducibility
)

# Fit the model
model.fit(X_train_transformed, y_train)

# Predictions for every split
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(features)
    for features in (X_train_transformed, X_val_transformed, X_test_transformed)
)

# Calculate RMSE and MAE
def calculate_metrics(y_true, y_pred):
    """Return ``(rmse, mae)`` for the given true and predicted values.

    RMSE is the square root of ``mean_squared_error``; MAE comes from
    ``mean_absolute_error``.

    NOTE(review): an earlier cell already defines ``calculate_metrics``;
    this definition shadows it. Keep a single definition in the notebook.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse), mean_absolute_error(y_true, y_pred)

# Score every split with the same helper: (rmse, mae) per split
(train_rmse, train_mae), (val_rmse, val_mae), (test_rmse, test_mae) = (
    calculate_metrics(actual, predicted)
    for actual, predicted in (
        (y_train, y_train_pred),
        (y_val, y_val_pred),
        (y_test, y_test_pred),
    )
)

# Report the scores rounded to two decimals
print(f"Training RMSE: {train_rmse:.2f}, MAE: {train_mae:.2f}")
print(f"Validation RMSE: {val_rmse:.2f}, MAE: {val_mae:.2f}")
print(f"Test RMSE: {test_rmse:.2f}, MAE: {test_mae:.2f}")

Training RMSE: 154.14, MAE: 106.05
Validation RMSE: 166.90, MAE: 114.28
Test RMSE: 156.17, MAE: 112.31
