# AT3 - Linear Regression

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from joblib import dump
from catboost import CatBoostRegressor
from imblearn.over_sampling import SMOTE
from hyperopt import fmin, tpe, hp, Trials
from hyperopt import STATUS_OK
from sklearn.model_selection import train_test_split
from prophet import Prophet
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The processed CSVs appear to have been written together with their pandas
# index: read_csv brings it back as an "Unnamed: 0" column holding the row
# number (see the X_train preview). Left in place, that column is forwarded by
# the ColumnTransformer's remainder='passthrough' and the model fits on the row
# number as if it were a feature. If the target files carry it too, it would
# also become a second regression target. Drop it on load, tolerating files
# that do not have it.
DATA_DIR = "../data/processed"
INDEX_ARTIFACT_COLUMN = "Unnamed: 0"


def load_split(name):
    """Read one processed split and discard the saved-index column if present.

    Parameters
    ----------
    name : str
        File stem of the split, e.g. ``"X_train"``; the file read is
        ``<DATA_DIR>/<name>.csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents without the ``"Unnamed: 0"`` column.
    """
    split = pd.read_csv(f"{DATA_DIR}/{name}.csv")
    # errors="ignore" keeps this safe for files saved without the index.
    return split.drop(columns=[INDEX_ARTIFACT_COLUMN], errors="ignore")


X_train = load_split("X_train")
y_train = load_split("y_train")
X_val = load_split("X_val")
y_val = load_split("y_val")
X_test = load_split("X_test")
y_test = load_split("y_test")

In [3]:
# Preview the first five rows of the training features.
X_train.head()

Unnamed: 0,startingAirport,destinationAirport,cabin_type,year,month,day,hour,minute
0,LAX,PHL,coach,2022,4,17,0,4
1,LAX,BOS,coach,2022,4,17,0,4
2,LAX,DFW,coach,2022,4,17,0,4
3,LAX,LGA,coach,2022,4,17,0,4
4,LAX,ATL,coach,2022,4,17,0,4


In [5]:
# Cyclical encoding for time features (hour and month)
def encode_cyclical_features(df):
    """Replace the raw datetime columns with sine/cosine encodings.

    ``hour`` is projected onto a 24-step circle and ``month`` onto a 12-step
    circle, producing ``hour_sin``, ``hour_cos``, ``month_sin`` and
    ``month_cos``. The raw ``year``, ``month``, ``day``, ``hour`` and
    ``minute`` columns are then removed, so year, day and minute do not
    appear in the result in any form.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``year``, ``month``, ``day``, ``hour`` and
        ``minute``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the four encoded columns appended after any
        remaining original columns.
    """
    raw_datetime_columns = ['year', 'month', 'day', 'hour', 'minute']
    # assign() works on a copy and appends new columns in keyword order.
    with_encodings = df.assign(
        hour_sin=np.sin(2 * np.pi * df['hour'] / 24),
        hour_cos=np.cos(2 * np.pi * df['hour'] / 24),
        month_sin=np.sin(2 * np.pi * df['month'] / 12),
        month_cos=np.cos(2 * np.pi * df['month'] / 12),
    )
    # The raw datetime fields are discarded once the encodings exist.
    return with_encodings.drop(columns=raw_datetime_columns)

# Columns routed to each preprocessing branch.
categorical_columns = ['startingAirport', 'destinationAirport', 'cabin_type']
datetime_columns = ['year', 'month', 'day', 'hour', 'minute']

# Preprocessing: one-hot encode the categorical columns (categories unseen at
# fit time are ignored at predict time) and turn the datetime columns into
# cyclical features. remainder='passthrough' forwards every column not listed
# above to the model unchanged.
one_hot_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
cyclical_step = ('cyclical', FunctionTransformer(encode_cyclical_features), datetime_columns)
preprocessor = ColumnTransformer(
    transformers=[one_hot_step, cyclical_step],
    remainder='passthrough',
)

# Ordinary least-squares regressor on top of the preprocessed features.
model = LinearRegression()

# Chain preprocessing and model so both are fitted on the training split only.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('linear', model)])
pipeline.fit(X_train, y_train)

# Predict every split with the fitted pipeline (train, then val, then test).
train_pred, val_pred, test_pred = (
    pipeline.predict(features) for features in (X_train, X_val, X_test)
)

# Calculate and print RMSE and MAE
def calculate_metrics(y_true, y_pred):
    """Return the root-mean-squared error and mean absolute error.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(rmse, mae)`` where ``rmse`` is the square root of scikit-learn's
        ``mean_squared_error`` and ``mae`` is its ``mean_absolute_error``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    absolute_error = mean_absolute_error(y_true, y_pred)
    return np.sqrt(squared_error), absolute_error

# Score each split in turn (train, validation, test); every call returns an
# (rmse, mae) pair that is unpacked into its own pair of names.
(train_rmse, train_mae), (val_rmse, val_mae), (test_rmse, test_mae) = (
    calculate_metrics(targets, predictions)
    for targets, predictions in (
        (y_train, train_pred),
        (y_val, val_pred),
        (y_test, test_pred),
    )
)

# One line per split, two decimal places.
print(
    f"Training Set - RMSE: {train_rmse:.2f}, MAE: {train_mae:.2f}",
    f"Validation Set - RMSE: {val_rmse:.2f}, MAE: {val_mae:.2f}",
    f"Test Set - RMSE: {test_rmse:.2f}, MAE: {test_mae:.2f}",
    sep="\n",
)

Training Set - RMSE: 179.98, MAE: 127.60
Validation Set - RMSE: 194.48, MAE: 139.07
Test Set - RMSE: 182.99, MAE: 134.78
