In [29]:
import pandas as pd
import argparse
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import math
import random 
from sklearn.linear_model import Ridge, Lasso

In [None]:
# Load dataset
df = pd.read_csv("space_mountain_with_holiday_weather_lag_suite.csv")

# One-hot encode categorical features
df = pd.get_dummies(df, columns=['Day of Week']) 

# train on 2014 (December), 2015, 2016, 2017, 2018, 2019, 2022 (excluding COVID years)
# test on 2023, 2024, 2025 (January - March)
train_years = ['14', '15', '16', '17', '18', '19', '22']
test_years = ['23', '24', '25']

df['Year'] = df['Date'].apply(lambda x: x.split('/')[-1])  # Extract year as a string
df = df[df['Time of Day'] >= 450]
df['Time_sq'] = df['Time of Day'] ** 2
df['Time_sin'] = np.sin(2 * np.pi * df['Time of Day'] / 1440)
df['Time_cos'] = np.cos(2 * np.pi * df['Time of Day'] / 1440)

train_df = df[df['Year'].isin(train_years)]  # Filter rows where Year is in train_years
test_df = df[df['Year'].isin(test_years)]  # Filter rows where Year is in train_years

# getting datasets
y_train = train_df['Wait Time'].values
X_train = train_df.drop(columns=['Wait Time', 'Date', 'Time', 'Year'])
y_test = test_df['Wait Time'].values
X_test = test_df.drop(columns=['Wait Time', 'Date', 'Time', 'Year'])

print("X_train:")
print(X_train.head())

print("\ny_train (first 10):")
print(y_train[:10])

print("\nX_test:")
print(X_test.head())

print("\ny_test (first 10):")
print(y_test[:10])

KeyError: "None of [Index(['Day of Week'], dtype='object')] are in the [columns]"

In [None]:
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=1.0, max_iter=10000)

ridge.fit(X_train, y_train)
lasso.fit(X_train, y_train)

# Predictions
ridge_pred = ridge.predict(X_test)
lasso_pred = lasso.predict(X_test)

# RMSE
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))

print(f"Ridge RMSE: {ridge_rmse:.2f}")
print(f"Lasso RMSE: {lasso_rmse:.2f}")

Ridge RMSE: 16.70
Lasso RMSE: 16.57


In [None]:
print("Ridge RMSE by alpha:")
for alpha in [0.01, 0.1, 1.0, 10.0, 100.0]:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    pred = ridge.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    print(f"  alpha={alpha:<6}: RMSE = {rmse:.2f}")

Ridge RMSE by alpha:
  alpha=0.01  : RMSE = 16.70
  alpha=0.1   : RMSE = 16.70
  alpha=1.0   : RMSE = 16.70
  alpha=10.0  : RMSE = 16.69
  alpha=100.0 : RMSE = 16.68


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [None]:
print("Lasso RMSE by alpha:")
for alpha in [0.01, 0.1, 1.0, 10.0]:
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X_train, y_train)
    pred = lasso.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    print(f"  alpha={alpha:<6}: RMSE = {rmse:.2f}")

Lasso RMSE by alpha:


  model = cd_fast.enet_coordinate_descent(


  alpha=0.01  : RMSE = 16.69
  alpha=0.1   : RMSE = 16.66
  alpha=1.0   : RMSE = 16.57
  alpha=10.0  : RMSE = 16.56


In [None]:
lasso_best = Lasso(alpha=1.0, max_iter=10000)
lasso_best.fit(X_train, y_train)

# Display non-zero feature coefficients
coef_df = pd.DataFrame({
    "Feature": X_train.columns,
    "Coefficient": lasso_best.coef_
})
print(coef_df[coef_df["Coefficient"] != 0].sort_values(by="Coefficient", key=abs, ascending=False))


                 Feature  Coefficient
12  Day of Week_Saturday     2.139519
6         Lag1_Wait_Time     0.300419
4            Temperature    -0.246405
7         Lag2_Wait_Time     0.178028
0                  Month     0.177673
1            Time of Day     0.121100
8         Lag3_Wait_Time     0.097192
9       RollingMean_Lag3     0.015950
17               Time_sq    -0.000059
