# Imports and load data

In [2]:
import py7zr
import pandas as pd
import numpy as np
import os
from datetime import datetime

In [3]:
# Folder that holds the input files, followed by the file name of each dataset
data_dir = "final_data/"
attendance_file = "attendance.csv"
entity_file = "entity_schedule.csv"
link_attraction_file = "link_attraction_park.csv"
parade_file = "parade_night_show.xlsx"
waiting_file = "waiting_times.csv"
weather_file = "weather_data.csv"


def _data_path(file_name):
    """Return the path of `file_name` inside `data_dir`."""
    return os.path.join(data_dir, file_name)


# Unpack the archive only when the data folder does not exist yet
# (presumably the archive unpacks into `data_dir` — verify if the layout changes)
if not os.path.exists(data_dir):
    with py7zr.SevenZipFile("final_data.7z", mode="r") as archive:
        archive.extractall()

# Read every dataset into its own DataFrame
# (the attraction/park link file is ';'-separated, the parade file is an Excel sheet)
attendance = pd.read_csv(_data_path(attendance_file))
entity_schedule = pd.read_csv(_data_path(entity_file))
link_attraction_park = pd.read_csv(_data_path(link_attraction_file), sep=";")
parades = pd.read_excel(_data_path(parade_file))
waiting_times = pd.read_csv(_data_path(waiting_file))
weather = pd.read_csv(_data_path(weather_file))

# Filter to the future dates we want to predict (2022-08-01 to 2022-08-18)

In [4]:
# Parse WORK_DATE, then keep only the rows dated 2022-08-01 through 2022-08-18
# (both bounds included)
waiting_times["WORK_DATE"] = pd.to_datetime(waiting_times["WORK_DATE"])
in_target_window = waiting_times["WORK_DATE"].between('2022-08-01', '2022-08-18')
waiting_times = waiting_times[in_target_window]

# Filter to relevant rides and convert time columns

In [5]:
# Keep only PortAventura World rides
ride_names = link_attraction_park.loc[
    link_attraction_park["PARK"] == "PortAventura World", "ATTRACTION"
]
is_park_ride = waiting_times["ENTITY_DESCRIPTION_SHORT"].isin(ride_names)
# Take an explicit copy: columns are assigned on this frame just below, and
# assigning to a boolean-filtered slice raises SettingWithCopyWarning.
waiting_times = waiting_times.loc[is_park_ride].copy()

# Convert schedule columns to datetime
entity_schedule["DEB_TIME"] = pd.to_datetime(entity_schedule["DEB_TIME"])
entity_schedule["FIN_TIME"] = pd.to_datetime(entity_schedule["FIN_TIME"])

# Convert waiting times columns to datetime
waiting_times["DEB_TIME"] = pd.to_datetime(waiting_times["DEB_TIME"])
waiting_times["FIN_TIME"] = pd.to_datetime(waiting_times["FIN_TIME"])

# Exclude Tivoli Gardens from the schedule
entity_schedule = entity_schedule[
    entity_schedule["ENTITY_DESCRIPTION_SHORT"] != "Tivoli Gardens"
]

# Prepare and merge weather data

In [6]:
# Weather columns to carry over (dt_iso is only used to derive the join keys)
weather_cols = [
    "dt_iso", "temp", "humidity", "wind_speed",
    "clouds_all", "rain_1h", "snow_1h"
]

# Parse dt_iso without its "+..." timezone suffix and shift it by one hour.
# The shift is applied to the full timestamp (not to the hour number) so that
# 23:00 rolls over to 00:00 of the next day instead of becoming an hour "24"
# that no waiting-time row can ever match.
# NOTE(review): the offset is a fixed +1h — confirm it is the intended
# conversion to the clock used by DEB_TIME (e.g. under daylight-saving time).
weather_hour_offset = pd.Timedelta(hours=1)
dt_weather = pd.to_datetime(
    weather["dt_iso"].str.split("+").str[0]
) + weather_hour_offset

# Build the weather table keyed by [date, hour]; missing readings become 0
relevant_weather_data = (
    weather.loc[:, weather_cols]
    .drop(columns="dt_iso")
    .assign(
        date=dt_weather.dt.normalize(),  # floor to day
        hour=dt_weather.dt.hour,
    )
    .fillna(0)
)

# Derive the same keys (plus the minute) from the start of each waiting-time
# interval. `assign` returns a new frame, so nothing is written into a
# possibly filtered slice of the original data.
dt_wait = pd.to_datetime(waiting_times["DEB_TIME"])
waiting_times = waiting_times.assign(
    date=dt_wait.dt.normalize(),
    hour=dt_wait.dt.hour,
    minute=dt_wait.dt.minute,
)

# Merge weather data on [date, hour]; rows without a matching weather record
# keep NaN in the weather columns (left join)
waiting_times = waiting_times.merge(
    relevant_weather_data, on=["date", "hour"], how="left"
)

# Adding parade data

In [7]:
# NOTE: data_merged is the same object as waiting_times (no copy is taken), so
# the conversions below also modify waiting_times.
data_merged = waiting_times

# Make sure the interval bounds are datetimes, and reduce WORK_DATE to a plain
# date (dropping any time component) so it can serve as the merge key.
data_merged["DEB_TIME"] = pd.to_datetime(data_merged["DEB_TIME"])
data_merged["FIN_TIME"] = pd.to_datetime(data_merged["FIN_TIME"])
data_merged["WORK_DATE"] = pd.to_datetime(data_merged["WORK_DATE"]).dt.date

# Same key type on the parade side
parades["WORK_DATE"] = pd.to_datetime(parades["WORK_DATE"]).dt.date

# Events to flag, and how far the window extends around each event time
events = ["NIGHT_SHOW", "PARADE_1", "PARADE_2"]
window_before = pd.Timedelta(minutes=15)
window_after = pd.Timedelta(minutes=30)

for event in events:
    # Combine WORK_DATE and the event time into a full datetime; a missing
    # event time gives NaT
    parades[event + "_DT"] = parades.apply(
        lambda row: pd.to_datetime(f"{row['WORK_DATE']} {row[event]}") if pd.notnull(row[event]) else pd.NaT,
        axis=1
    )
    # Event window: 15 minutes before to 30 minutes after the event time
    parades[event + "_start"] = parades[event + "_DT"] - window_before
    parades[event + "_end"] = parades[event + "_DT"] + window_after

# Window columns, in the order NIGHT_SHOW, PARADE_1, PARADE_2 (start then end)
cols_to_drop = [event + suffix for event in events for suffix in ("_start", "_end")]
merge_cols = ["WORK_DATE"] + cols_to_drop

# Attach the event windows of the same day to every waiting-time row
merged = data_merged.merge(parades[merge_cols], on="WORK_DATE", how="left")

# Flag rows whose interval [DEB_TIME, FIN_TIME] overlaps an event window:
# FIN_TIME >= event_start and DEB_TIME <= event_end.
# Comparisons against NaT are False, so rows without that event get False.
for event in events:
    merged["in_" + event.lower()] = (
        (merged["FIN_TIME"] >= merged[event + "_start"])
        & (merged["DEB_TIME"] <= merged[event + "_end"])
    )

# The window columns were only needed to compute the flags
merged_final = merged.drop(columns=cols_to_drop)

# Remove rows for which we have no information about the events:
# keep WORK_DATE between start_date and end_date (inclusive)
start_date = pd.to_datetime("2018-10-01").date()
end_date   = pd.to_datetime("2022-08-18").date()
has_event_info = (
    (merged_final["WORK_DATE"] >= start_date) & (merged_final["WORK_DATE"] <= end_date)
)
# Explicit copy: later cells add and drop columns on merged_final, which would
# otherwise operate on a filtered slice and raise SettingWithCopyWarning.
merged_final = merged_final[has_event_info].copy()

merged_final.head()


Unnamed: 0,WORK_DATE,DEB_TIME,DEB_TIME_HOUR,FIN_TIME,ENTITY_DESCRIPTION_SHORT,WAIT_TIME_MAX,NB_UNITS,GUEST_CARRIED,CAPACITY,ADJUST_CAPACITY,...,minute,temp,humidity,wind_speed,clouds_all,rain_1h,snow_1h,in_night_show,in_parade_1,in_parade_2
0,2022-08-01,2022-08-01 09:30:00,9,2022-08-01 09:45:00,Haunted House,5,9.0,158.0,225.0,225.0,...,30,21.48,80,2.23,99,0.0,0.0,False,False,False
1,2022-08-01,2022-08-01 18:00:00,18,2022-08-01 18:15:00,Dizzy Dropper,15,88.0,124.0,213.251,213.25,...,0,26.44,52,2.0,48,0.0,0.0,False,True,False
2,2022-08-01,2022-08-01 15:45:00,15,2022-08-01 16:00:00,Water Ride,30,9.0,112.0,247.001,247.0,...,45,26.08,59,2.37,99,0.0,0.0,False,False,False
3,2022-08-01,2022-08-01 20:45:00,20,2022-08-01 21:00:00,Flying Coaster,15,25.0,477.0,756.0,756.0,...,45,24.98,64,1.19,30,0.0,0.0,False,False,False
4,2022-08-01,2022-08-01 13:15:00,13,2022-08-01 13:30:00,Dizzy Dropper,10,88.0,110.0,213.251,213.25,...,15,24.47,62,2.35,100,0.0,0.0,False,False,False


# Adding covid boolean

In [8]:
# First and last day (both included) of the period flagged as Covid
covid_start = pd.to_datetime("2020-03-14").date()
covid_end = pd.to_datetime("2021-06-30").date()

# Add the boolean 'covid' column. `assign` returns a new frame, so the column
# is not written into a filtered slice (which raises SettingWithCopyWarning).
merged_final = merged_final.assign(
    covid=merged_final["WORK_DATE"].between(covid_start, covid_end)
)


In [9]:
# Drop the WAIT_TIME_MAX column. Rebinding the result (instead of inplace=True)
# avoids mutating a frame that may still be a filtered slice.
merged_final = merged_final.drop(columns="WAIT_TIME_MAX")

In [10]:
# Write the prepared table next to the input files, without the index column.
# The path is built from data_dir so it follows the folder used for loading.
merged_final.to_csv(os.path.join(data_dir, "data_future.csv"), index=False)