# Waiting Time Prediction

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from datetime import timedelta
from sklearn.model_selection import GridSearchCV

## Data Extraction

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read the Parquet file
df = pd.read_parquet("/content/drive/MyDrive/DataInMinute.parquet", engine='pyarrow')  # You can also use engine='fastparquet'
df_nocov = pd.read_parquet("/content/drive/MyDrive/mins_data.parquet", engine='pyarrow')  # You can also use engine='fastparquet'

# Now df is a Pandas DataFrame containing the data from your Parquet file.
print(df.head())

    WORK_DATE                 DEB_TIME  DEB_TIME_HOUR  \
0  2018-01-01  2018-01-01 21:00:00.000             21   
1  2018-01-01  2018-01-01 19:30:00.000             19   
2  2018-01-01  2018-01-01 22:30:00.000             22   
3  2018-01-01  2018-01-01 12:45:00.000             12   
4  2018-01-01  2018-01-01 17:00:00.000             17   

                  FIN_TIME ENTITY_DESCRIPTION_SHORT  WAIT_TIME_MAX  NB_UNITS  \
0  2018-01-01 21:15:00.000           Roller Coaster              0       2.0   
1  2018-01-01 19:45:00.000              Bumper Cars              5      18.0   
2  2018-01-01 22:45:00.000              Rapids Ride              0       1.0   
3  2018-01-01 13:00:00.000              Crazy Dance              5       1.0   
4  2018-01-01 17:15:00.000                   Skyway              5      15.0   

   GUEST_CARRIED  CAPACITY  ADJUST_CAPACITY  ...  wind_gust  rain_1h  rain_3h  \
0            0.0     0.000             0.00  ...        NaN      NaN      NaN   
1          148

In [None]:
df.head()

Unnamed: 0,WORK_DATE,DEB_TIME,DEB_TIME_HOUR,FIN_TIME,ENTITY_DESCRIPTION_SHORT,WAIT_TIME_MAX,NB_UNITS,GUEST_CARRIED,CAPACITY,ADJUST_CAPACITY,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,2018-01-01,2018-01-01 21:00:00.000,21,2018-01-01 21:15:00.000,Roller Coaster,0,2.0,0.0,0.0,0.0,...,,,,,,39.0,802.0,Clouds,scattered clouds,03n
1,2018-01-01,2018-01-01 19:30:00.000,19,2018-01-01 19:45:00.000,Bumper Cars,5,18.0,148.0,254.749,254.75,...,,,,,,23.0,801.0,Clouds,few clouds,02n
2,2018-01-01,2018-01-01 22:30:00.000,22,2018-01-01 22:45:00.000,Rapids Ride,0,1.0,0.0,0.0,0.0,...,,,,,,51.0,803.0,Clouds,broken clouds,04n
3,2018-01-01,2018-01-01 12:45:00.000,12,2018-01-01 13:00:00.000,Crazy Dance,5,1.0,46.0,250.001,250.0,...,,0.92,,,,99.0,500.0,Rain,light rain,10d
4,2018-01-01,2018-01-01 17:00:00.000,17,2018-01-01 17:15:00.000,Skyway,5,15.0,92.0,211.5,198.25,...,,0.25,,,,100.0,500.0,Rain,light rain,10n


## Data Analysis

In [None]:
df['ENTITY_DESCRIPTION_SHORT'].unique()

array(['Roller Coaster', 'Bumper Cars', 'Rapids Ride', 'Crazy Dance',
       'Skyway', 'Free Fall', 'Monorail', 'Swing Ride', 'Crazy Bus',
       'Drop Tower', 'Spinning Coaster', 'Scooby Doo', 'Superman Ride',
       'Spiral Slide', 'Inverted Coaster', 'Water Ride', 'Power Tower',
       'Top Spin', 'Log Flume', 'Oz Theatre', 'Circus Train',
       'Giant Wheel', 'Kiddie Coaster', 'Bungee Jump', 'Zipline',
       'Aeroplane Ride', 'Haunted House', 'Reverse Bungee', 'Go-Karts',
       'Dizzy Dropper', 'Merry Go Round', 'Flying Coaster', 'Gondola',
       'Pirate Ship', 'Giga Coaster', 'Himalaya Ride', 'Sling Shot',
       'Vertical Drop', 'Tilt-A-Whirl'], dtype=object)

In [4]:
attractions = ['Roller Coaster', 'Bumper Cars', 'Rapids Ride', 'Crazy Dance',
       'Skyway', 'Free Fall', 'Monorail', 'Swing Ride', 'Crazy Bus',
       'Drop Tower', 'Spinning Coaster', 'Scooby Doo', 'Superman Ride',
       'Spiral Slide', 'Inverted Coaster', 'Water Ride', 'Power Tower',
       'Top Spin', 'Log Flume', 'Oz Theatre', 'Circus Train',
       'Giant Wheel', 'Kiddie Coaster', 'Bungee Jump', 'Zipline',
       'Aeroplane Ride', 'Haunted House', 'Reverse Bungee', 'Go-Karts',
       'Dizzy Dropper', 'Merry Go Round', 'Flying Coaster', 'Gondola',
       'Pirate Ship', 'Giga Coaster', 'Himalaya Ride', 'Sling Shot',
       'Vertical Drop', 'Tilt-A-Whirl']
attractions2 = ['Roller Coaster', 'Bumper Cars']

## Data Processing

In [5]:
# Assuming your dataframe is named df
df['WORK_DATE'] = pd.to_datetime(df['WORK_DATE'])
df['DEB_TIME'] = pd.to_datetime(df['DEB_TIME'])
df['FIN_TIME'] = pd.to_datetime(df['FIN_TIME'])

# Sort the dataframe by WORK_DATE to ensure correct lag calculation
df.sort_values(by='DEB_TIME', inplace=True)

# Adding day of the week as a feature
df['day_of_week'] = df['WORK_DATE'].dt.dayofweek

In [6]:
# Assuming your dataframe is named df
df_nocov['WORK_DATE'] = pd.to_datetime(df_nocov['WORK_DATE'])
df_nocov['DEB_TIME'] = pd.to_datetime(df_nocov['DEB_TIME'])
df_nocov['FIN_TIME'] = pd.to_datetime(df_nocov['FIN_TIME'])

# Sort the dataframe by WORK_DATE to ensure correct lag calculation
df_nocov.sort_values(by='DEB_TIME', inplace=True)

# Adding day of the week as a feature
df_nocov['day_of_week'] = df_nocov['WORK_DATE'].dt.dayofweek

In [7]:
max_date_nocov = df_nocov['DEB_TIME'].max()

df_new = df[df['DEB_TIME'] <= max_date_nocov]

columns_to_drop = ['DEB_TIME', 'FIN_TIME', 'city_name', 'dt_iso', 'weather_icon', 'visibility', 'sea_level', 'grnd_level', 'snow_3h']
df_new = df_new.drop(columns=columns_to_drop)
df_new = pd.get_dummies(df_new, columns=['weather_main', 'weather_description'])

columns_to_fill = ['wind_gust', 'rain_1h', 'rain_3h', 'snow_1h']
df_new[columns_to_fill] = df_new[columns_to_fill].fillna(0)

one_hot_encoded_entity = pd.get_dummies(df_new['ENTITY_DESCRIPTION_SHORT'])
one_hot_encoded_entity['WORK_DATE'] = df_new['WORK_DATE']

validation_start_date = df_new['WORK_DATE'].max() - timedelta(weeks=2)
train_df = df_new[df_new['WORK_DATE'] < validation_start_date]
validation_df = df_new[df_new['WORK_DATE'] >= validation_start_date]

one_hot_encoded_entity_train = one_hot_encoded_entity[one_hot_encoded_entity['WORK_DATE'] < validation_start_date]
one_hot_encoded_entity_test = one_hot_encoded_entity[one_hot_encoded_entity['WORK_DATE'] >= validation_start_date]

one_hot_encoded_entity_train = one_hot_encoded_entity_train.drop(columns=['WORK_DATE'])
one_hot_encoded_entity_test = one_hot_encoded_entity_test.drop(columns=['WORK_DATE'])

train_df = train_df.drop(columns=['WORK_DATE'])
validation_df = validation_df.drop(columns=['WORK_DATE'])

In [8]:
columns_to_drop_nocov = ['DEB_TIME', 'FIN_TIME', 'snow_3h', 'START_TIME', 'DEB_TIME_x', 'FIN_TIME_x', 'DEB_TIME_y', 'FIN_TIME_y']

df_nocov_new = df_nocov.drop(columns=columns_to_drop_nocov)

df_nocov_new[columns_to_fill] = df_nocov_new[columns_to_fill].fillna(0)

one_hot_encoded_entity_nocov = pd.get_dummies(df_nocov_new['Attraction'])
one_hot_encoded_entity_nocov['WORK_DATE'] = df_nocov_new['WORK_DATE']

validation_start_date = df_nocov_new['WORK_DATE'].max() - timedelta(weeks=2)
train_df_nocov = df_nocov_new[df_nocov_new['WORK_DATE'] < validation_start_date]
validation_df_nocov = df_nocov_new[df_nocov_new['WORK_DATE'] >= validation_start_date]

one_hot_encoded_entity_train_nocov = one_hot_encoded_entity_nocov[one_hot_encoded_entity_nocov['WORK_DATE'] < validation_start_date]
one_hot_encoded_entity_test_nocov  = one_hot_encoded_entity_nocov[one_hot_encoded_entity_nocov['WORK_DATE'] >= validation_start_date]

one_hot_encoded_entity_train_nocov = one_hot_encoded_entity_train_nocov.drop(columns=['WORK_DATE'])
one_hot_encoded_entity_test_nocov = one_hot_encoded_entity_test_nocov.drop(columns=['WORK_DATE'])

train_df_nocov = train_df_nocov.drop(columns=['WORK_DATE'])
validation_df_nocov = validation_df_nocov.drop(columns=['WORK_DATE'])

## Data Modelling

In [9]:
import tensorflow as tf

if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Please select GPU from the Runtime menu.")

Default GPU Device: /device:GPU:0


In [10]:
import warnings

# Suppress all warnings (not recommended)
warnings.filterwarnings('ignore')

### General Model

In [None]:
train_df_mod = pd.concat([train_df, one_hot_encoded_entity_train], axis=1)
validation_df_mod = pd.concat([validation_df, one_hot_encoded_entity_test], axis=1)

X_train = train_df_mod.drop(columns=['ENTITY_DESCRIPTION_SHORT', 'WAIT_TIME_MAX'])
y_train = train_df_mod['WAIT_TIME_MAX']

X_test = validation_df_mod.drop(columns=['ENTITY_DESCRIPTION_SHORT', 'WAIT_TIME_MAX'])
y_test = validation_df_mod['WAIT_TIME_MAX']

model = XGBRegressor(tree_method='hist', device="cuda")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate the Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE for all rides: {rmse}")
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean Average Percentage Error : {mape}%")

RMSE for all rides: 12.685695570324336
Mean Average Percentage Error : 3688984163424636.0%


In [None]:
train_df_mod = pd.concat([train_df_nocov, one_hot_encoded_entity_train_nocov], axis=1)
validation_df_mod = pd.concat([validation_df_nocov, one_hot_encoded_entity_test_nocov], axis=1)

X_train = train_df_mod.drop(columns=['Attraction', 'WAIT_TIME_MAX'])
y_train = train_df_mod['WAIT_TIME_MAX']

X_test = validation_df_mod.drop(columns=['Attraction', 'WAIT_TIME_MAX'])
y_test = validation_df_mod['WAIT_TIME_MAX']

model = XGBRegressor(tree_method='hist', device="cuda")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate the Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE for all rides: {rmse}")

mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean Average Percentage Error : {mape}%")

RMSE for all rides: 11.543302903932162
Mean Average Percentage Error : 3634133140330222.5%


In [11]:
train_df_mod = pd.concat([train_df_nocov, one_hot_encoded_entity_train_nocov], axis=1)
validation_df_mod = pd.concat([validation_df_nocov, one_hot_encoded_entity_test_nocov], axis=1)

X_train = train_df_mod.drop(columns=['Attraction', 'WAIT_TIME_MAX'])
y_train = train_df_mod['WAIT_TIME_MAX']

X_test = validation_df_mod.drop(columns=['Attraction', 'WAIT_TIME_MAX'])
y_test = validation_df_mod['WAIT_TIME_MAX']

param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [3, 6, 12],         # Maximum depth of the trees
    'learning_rate': [0.01, 0.1, 0.2] # Step size shrinkage used in update to prevent overfitting
}

# Initialize the XGBRegressor with fixed parameters
model = XGBRegressor(tree_method='hist', device="gpu")  # Use 'gpu' if your setup supports it, else use 'auto'

# Setup the grid search with cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, scoring='neg_root_mean_squared_error', verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters found
print(f"Best parameters found: {grid_search.best_params_}")

# Use the best model to make predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate and print the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE for all rides: {rmse}")

# Calculate and print the MAPE
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean Average Percentage Error: {mape}%")

Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   8.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   8.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   9.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   8.3s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=300; total time=  10.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=300; total time=  10.5s
[CV] END ..learning_rate=0.01, max_depth=6, n_estimators=100; total time=   8.4s
[CV] END ..learning_rate=0.01, max_depth=6, n_estimators=100; total time=   7.8s
[CV] END ..learning_rate=0.01, max_depth=6, n_estimators=200; total time=  10.4s
[CV] END ..learning_rate=0.01, max_depth=6, n_estimators=200; total time=   9.5s
[CV] END ..learning_rate=0.01, max_depth=6, n_estimators=300; total time=  11.6s
[CV] END ..learning_rate=0.01, max_depth=6, n_es

In [12]:
train_df_mod = pd.concat([train_df_nocov, one_hot_encoded_entity_train_nocov], axis=1)
validation_df_mod = pd.concat([validation_df_nocov, one_hot_encoded_entity_test_nocov], axis=1)

mse = []
error_percentage = []
n_obs = []
y_true_total = []
y_pred_total = []

for i in range(7):
      train_df_mod_per_dow = train_df_mod[train_df_mod['day_of_week']==i]
      validation_df_mod_per_dow = validation_df_mod[validation_df_mod['day_of_week']==i]

      X_train = train_df_mod_per_dow.drop(columns=['Attraction', 'WAIT_TIME_MAX'])
      y_train = train_df_mod_per_dow['WAIT_TIME_MAX']

      X_test = validation_df_mod_per_dow.drop(columns=['Attraction', 'WAIT_TIME_MAX'])
      y_test = validation_df_mod_per_dow['WAIT_TIME_MAX']


      model = XGBRegressor(tree_method='hist', device="cuda")
      model.fit(X_train, y_train)

      y_pred = model.predict(X_test)

      # Calculate the Root Mean Squared Error (RMSE)
      mse.append(mean_squared_error(y_test, y_pred))
      n_obs.append(len(y_test))

      y_true_total.extend(y_test.tolist())
      y_pred_total.extend(y_pred.tolist())

# Convert lists to NumPy arrays for calculation
mse = np.array(mse)
n_obs = np.array(n_obs)

# Calculate weighted MSE
weighted_mse = np.sum(mse * n_obs) / np.sum(n_obs)

# Calculate RMSE for all rides
total_rmse = np.sqrt(weighted_mse)

# Calculate average error percentage
mape = mean_absolute_percentage_error(y_true_total, y_pred_total)

print(f"RMSE for all rides: {total_rmse}")
print(f"Mean Average Percentage Error : {mape}%")

RMSE for all rides: 12.439537989644322
Mean Average Percentage Error : 4778540608577874.0%


### Model for each attraction

In [None]:
mse = []
error_percentage = []
n_obs = []

for ride in attractions:
    train_df_filtered = train_df[train_df['ENTITY_DESCRIPTION_SHORT'] == ride]
    validation_df_filtered = validation_df[validation_df['ENTITY_DESCRIPTION_SHORT'] == ride]

    # Ensure 'ENTITY_DESCRIPTION_SHORT' is dropped after filtering to avoid future errors
    X_train = train_df_filtered.drop(columns=['ENTITY_DESCRIPTION_SHORT', 'WAIT_TIME_MAX'])
    y_train = train_df_filtered['WAIT_TIME_MAX']

    X_test = validation_df_filtered.drop(columns=['ENTITY_DESCRIPTION_SHORT', 'WAIT_TIME_MAX'])
    y_test = validation_df_filtered['WAIT_TIME_MAX']

    model = XGBRegressor(tree_method='gpu_hist', device="cuda")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mse.append(mean_squared_error(y_test, y_pred))
    n_obs.append(len(y_test))
    error_percentage.append((mean_absolute_error(y_test, y_pred) / np.mean(y_test)) * 100)

# Convert lists to NumPy arrays for calculation
mse = np.array(mse)
n_obs = np.array(n_obs)

# Calculate weighted MSE
weighted_mse = np.sum(mse * n_obs) / np.sum(n_obs)

# Calculate RMSE for all rides
total_rmse = np.sqrt(weighted_mse)

# Calculate average error percentage
avg_error_percentage = np.mean(error_percentage)

print(f"RMSE for all rides: {total_rmse}")
print(f"Average Error Percentage: {avg_error_percentage}%")


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

  error_percentage.append((mean_absolute_error(y_test, y_pred) / np.mean(y_test)) * 100)

    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist

RMSE for all rides: 13.105798163164362
Average Error Percentage: inf%



    E.g. tree_method = "hist", device = "cuda"

