# **Data Cleaning & Pre-Processing**

Environment Set-Up

In [1]:
!pip install py7zr
import py7zr

with py7zr.SevenZipFile('/content/final_data.7z', mode='r') as z:
    z.extractall(path="<output path>")

Collecting py7zr
  Downloading py7zr-0.22.0-py3-none-any.whl.metadata (16 kB)
Collecting texttable (from py7zr)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr)
  Downloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting pyzstd>=0.15.9 (from py7zr)
  Downloading pyzstd-0.16.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.4 kB)
Collecting pyppmd<1.2.0,>=1.1.0 (from py7zr)
  Downloading pyppmd-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting pybcj<1.1.0,>=1.0.0 (from py7zr)
  Downloading pybcj-1.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting multivolumefile>=0.2.3 (from py7zr)
  Downloading multivolumefile-0.2.3-py3-none-any.whl.metadata (6.3 kB)
Collecting inflate64<1.1.0,>=1.0.0 (from py7zr)
  Downloading inflate64-1.0.1-cp311-cp311-manylinux_2_17_

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from datetime import datetime

%load_ext autoreload
%autoreload 2

In [3]:
data_dir = "<output path>/final_data/"

attendance_file = "attendance.csv"
entity_file = "entity_schedule.csv"
link_attraction_file = "link_attraction_park.csv"
parade_file = "parade_night_show.xlsx"
waiting_file = "waiting_times.csv"
weather_file = "weather_data.csv"

In [4]:
attendance = pd.read_csv(os.path.join(data_dir, attendance_file))
schedule = pd.read_csv(os.path.join(data_dir, entity_file))
link_attraction = pd.read_csv(os.path.join(data_dir, link_attraction_file), sep=";")
parade = pd.read_excel(os.path.join(data_dir, parade_file))
waiting = pd.read_csv(os.path.join(data_dir, waiting_file))
weather = pd.read_csv(os.path.join(data_dir, weather_file))

**1. Attendance Data**

In [5]:
attendance.head()

Unnamed: 0,USAGE_DATE,FACILITY_NAME,attendance
0,2018-06-01,PortAventura World,46804
1,2018-06-01,Tivoli Gardens,20420
2,2018-06-02,PortAventura World,57940
3,2018-06-02,Tivoli Gardens,29110
4,2018-06-03,PortAventura World,44365


For this dataset, we filter on the PortAventura World park only and execute the following data cleaning:


*   Converting the date to a date format
*   Converting the negative values to zero
* Dropping the unecessary column



In [6]:
attendance["USAGE_DATE"] = pd.to_datetime(attendance["USAGE_DATE"], format="%Y-%m-%d")
attendance = attendance[attendance["FACILITY_NAME"] == "PortAventura World"]
attendance.loc[attendance['attendance'] < 0, 'attendance'] = 0
attendance

Unnamed: 0,USAGE_DATE,FACILITY_NAME,attendance
0,2018-06-01,PortAventura World,46804
2,2018-06-02,PortAventura World,57940
4,2018-06-03,PortAventura World,44365
6,2018-06-04,PortAventura World,37617
8,2018-06-05,PortAventura World,32438
...,...,...,...
2357,2022-07-22,PortAventura World,49586
2359,2022-07-23,PortAventura World,51748
2361,2022-07-24,PortAventura World,45261
2363,2022-07-25,PortAventura World,53764


**2. Schedule Data**

This is data for each time the park and its attractions had to be close.

In [7]:
schedule.head()

Unnamed: 0,REF_CLOSING_DESCRIPTION,ENTITY_DESCRIPTION_SHORT,ENTITY_TYPE,DEB_TIME,FIN_TIME,UPDATE_TIME,WORK_DATE
0,,Tivoli Gardens,PARK,2018-11-19 10:00:00.000,2018-11-19 18:00:00.000,2018-11-20 08:24:32.000,2018-11-19
1,,Dizzy Dropper,ATTR,2022-04-07 08:30:00.000,2022-04-07 22:04:00.000,2022-04-08 08:00:30.000,2022-04-07
2,,Sling Shot,ATTR,2018-03-28 08:37:00.000,2018-03-28 18:12:00.000,2018-03-29 08:24:37.000,2018-03-28
3,,Gondola,ATTR,2019-04-11 09:55:00.000,2019-04-11 20:19:00.000,2019-04-12 08:59:29.000,2019-04-11
4,,Monorail,ATTR,2019-06-29 08:30:00.000,2019-06-29 20:35:00.000,2019-06-30 08:14:16.000,2019-06-29


For this part, we will merge with our dataset containing the name of all the attractions and filter only on PortAventura.

In [8]:
# Merging the name of the attractions
schedule = schedule.merge(
    link_attraction,
    how="left",
    left_on="ENTITY_DESCRIPTION_SHORT",
    right_on="ATTRACTION",
).drop(columns=["ATTRACTION"])
schedule = schedule[schedule["ENTITY_DESCRIPTION_SHORT"] != "Tivoli Gardens"]
schedule = schedule[schedule["PARK"] != "Tivoli Gardens"]
schedule = schedule.drop(columns=['PARK'])

# Formating the date
schedule['WORK_DATE'] = pd.to_datetime(schedule['WORK_DATE'])
schedule.sort_values(by="WORK_DATE")
schedule.head()

Unnamed: 0,REF_CLOSING_DESCRIPTION,ENTITY_DESCRIPTION_SHORT,ENTITY_TYPE,DEB_TIME,FIN_TIME,UPDATE_TIME,WORK_DATE
1,,Dizzy Dropper,ATTR,2022-04-07 08:30:00.000,2022-04-07 22:04:00.000,2022-04-08 08:00:30.000,2022-04-07
6,,PortAventura World,PARK,2018-07-17 08:30:00.000,2018-07-17 09:30:00.000,2018-07-18 07:48:41.000,2018-07-17
11,,PortAventura World,PARK,2019-02-28 08:30:00.000,2019-02-28 09:30:00.000,2019-03-01 09:23:40.000,2019-02-28
12,Fermeture Réhab,Kiddie Coaster,ATTR,2018-02-21 23:59:00.000,2018-02-21 23:59:00.000,2018-02-22 08:36:23.000,2018-02-21
13,,PortAventura World,PARK,2019-08-18 08:30:00.000,2019-08-18 09:30:00.000,2019-08-19 07:34:17.000,2019-08-18


In [9]:
schedule.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25215 entries, 1 to 38306
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   REF_CLOSING_DESCRIPTION   2769 non-null   object        
 1   ENTITY_DESCRIPTION_SHORT  25215 non-null  object        
 2   ENTITY_TYPE               25215 non-null  object        
 3   DEB_TIME                  25215 non-null  object        
 4   FIN_TIME                  25215 non-null  object        
 5   UPDATE_TIME               25215 non-null  object        
 6   WORK_DATE                 25215 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(6)
memory usage: 2.0+ MB


**3. Attractions Data**

This dataframe hold the name of all the attractions of the park within our business scope.

In [10]:
link_attraction.head()

Unnamed: 0,ATTRACTION,PARK
0,Aeroplane Ride,Tivoli Gardens
1,Bumper Cars,PortAventura World
2,Bungee Jump,PortAventura World
3,Circus Train,PortAventura World
4,Crazy Bus,Tivoli Gardens


In [11]:
link_attraction=link_attraction[link_attraction["PARK"] == "PortAventura World"]
link_attraction

Unnamed: 0,ATTRACTION,PARK
1,Bumper Cars,PortAventura World
2,Bungee Jump,PortAventura World
3,Circus Train,PortAventura World
5,Crazy Dance,PortAventura World
6,Dizzy Dropper,PortAventura World
7,Drop Tower,PortAventura World
8,Flying Coaster,PortAventura World
9,Free Fall,PortAventura World
10,Giant Wheel,PortAventura World
11,Giga Coaster,PortAventura World


**4. Parade Data**

For each day, data of the different parade taking place.

In [12]:
parade.head()

Unnamed: 0.1,Unnamed: 0,WORK_DATE,NIGHT_SHOW,PARADE_1,PARADE_2
0,0,2018-10-01,20:00:00,17:30:00,12:10:00
1,1,2018-10-02,20:00:00,17:30:00,12:10:00
2,2,2018-10-03,20:00:00,17:30:00,12:10:00
3,3,2018-10-04,20:00:00,17:30:00,12:10:00
4,4,2018-10-05,20:00:00,17:30:00,12:10:00


In [13]:
# Dropping unecessary column
parade.drop(columns=["Unnamed: 0"], inplace=True)

# Converting to a time format and adding it to the work date
time_columns = ["NIGHT_SHOW", "PARADE_1", "PARADE_2"]
for col in time_columns:
    parade[col] = pd.to_datetime(parade[col], format="%H:%M:%S").dt.time
    parade[col] = parade["WORK_DATE"] + pd.to_timedelta(
        parade[col].astype(str)
    )

parade[time_columns] = parade[time_columns].apply(pd.to_datetime)
parade

Unnamed: 0,WORK_DATE,NIGHT_SHOW,PARADE_1,PARADE_2
0,2018-10-01,2018-10-01 20:00:00,2018-10-01 17:30:00,2018-10-01 12:10:00
1,2018-10-02,2018-10-02 20:00:00,2018-10-02 17:30:00,2018-10-02 12:10:00
2,2018-10-03,2018-10-03 20:00:00,2018-10-03 17:30:00,2018-10-03 12:10:00
3,2018-10-04,2018-10-04 20:00:00,2018-10-04 17:30:00,2018-10-04 12:10:00
4,2018-10-05,2018-10-05 20:00:00,2018-10-05 17:30:00,2018-10-05 12:10:00
...,...,...,...,...
666,2022-08-14,2022-08-14 23:00:00,2022-08-14 17:30:00,NaT
667,2022-08-15,2022-08-15 23:00:00,2022-08-15 17:30:00,NaT
668,2022-08-16,2022-08-16 23:00:00,2022-08-16 17:30:00,NaT
669,2022-08-17,2022-08-17 23:00:00,2022-08-17 17:30:00,NaT


**5. Waiting Time Data**

For this dataset, we performed the following:


*   Converting to date time format
*   Filtering on our business scope's park





In [14]:
waiting["WORK_DATE"] = pd.to_datetime(waiting["WORK_DATE"], format="%Y-%m-%d")
waiting["FIN_TIME"] = pd.to_datetime(waiting["FIN_TIME"], format="%Y-%m-%d %H:%M:%S.%f")
waiting["DEB_TIME"] = pd.to_datetime(waiting["DEB_TIME"], format="%Y-%m-%d %H:%M:%S.%f")
waiting.head()

Unnamed: 0,WORK_DATE,DEB_TIME,DEB_TIME_HOUR,FIN_TIME,ENTITY_DESCRIPTION_SHORT,WAIT_TIME_MAX,NB_UNITS,GUEST_CARRIED,CAPACITY,ADJUST_CAPACITY,OPEN_TIME,UP_TIME,DOWNTIME,NB_MAX_UNIT
0,2018-01-01,2018-01-01 21:00:00,21,2018-01-01 21:15:00,Roller Coaster,0,2.0,0.0,0.0,0.0,0,0,0,2.0
1,2018-01-01,2018-01-01 19:30:00,19,2018-01-01 19:45:00,Bumper Cars,5,18.0,148.0,254.749,254.75,15,15,0,18.0
2,2018-01-01,2018-01-01 22:30:00,22,2018-01-01 22:45:00,Rapids Ride,0,1.0,0.0,0.0,0.0,0,0,0,2.0
3,2018-01-01,2018-01-01 12:45:00,12,2018-01-01 13:00:00,Crazy Dance,5,1.0,46.0,250.001,250.0,15,15,0,1.0
4,2018-01-01,2018-01-01 17:00:00,17,2018-01-01 17:15:00,Skyway,5,15.0,92.0,211.5,198.25,15,15,0,16.0


In [15]:
waiting = waiting.merge(
    link_attraction,
    how="left",
    left_on="ENTITY_DESCRIPTION_SHORT",
    right_on="ATTRACTION",
).drop(columns=["ATTRACTION"])
waiting = waiting[waiting["PARK"] == "PortAventura World"]
waiting

Unnamed: 0,WORK_DATE,DEB_TIME,DEB_TIME_HOUR,FIN_TIME,ENTITY_DESCRIPTION_SHORT,WAIT_TIME_MAX,NB_UNITS,GUEST_CARRIED,CAPACITY,ADJUST_CAPACITY,OPEN_TIME,UP_TIME,DOWNTIME,NB_MAX_UNIT,PARK
0,2018-01-01,2018-01-01 21:00:00,21,2018-01-01 21:15:00,Roller Coaster,0,2.0,0.0,0.000,0.00,0,0,0,2.0,PortAventura World
1,2018-01-01,2018-01-01 19:30:00,19,2018-01-01 19:45:00,Bumper Cars,5,18.0,148.0,254.749,254.75,15,15,0,18.0,PortAventura World
2,2018-01-01,2018-01-01 22:30:00,22,2018-01-01 22:45:00,Rapids Ride,0,1.0,0.0,0.000,0.00,0,0,0,2.0,PortAventura World
3,2018-01-01,2018-01-01 12:45:00,12,2018-01-01 13:00:00,Crazy Dance,5,1.0,46.0,250.001,250.00,15,15,0,1.0,PortAventura World
5,2018-01-01,2018-01-01 18:15:00,18,2018-01-01 18:30:00,Free Fall,50,3.0,0.0,0.000,0.00,0,0,0,3.0,PortAventura World
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3509319,2022-08-18,2022-08-18 18:45:00,18,2022-08-18 19:00:00,Himalaya Ride,0,0.0,0.0,0.000,0.00,0,0,0,1.0,PortAventura World
3509320,2022-08-18,2022-08-18 10:15:00,10,2022-08-18 10:30:00,Crazy Dance,0,0.0,0.0,0.000,0.00,0,0,0,1.0,PortAventura World
3509321,2022-08-18,2022-08-18 09:15:00,9,2022-08-18 09:30:00,Crazy Dance,0,0.0,0.0,0.000,0.00,0,0,0,1.0,PortAventura World
3509322,2022-08-18,2022-08-18 20:30:00,20,2022-08-18 20:45:00,Giga Coaster,0,0.0,0.0,0.000,0.00,0,0,0,24.0,PortAventura World


**6. Weather Data**

For this dataset, we duplicated each row into 15' intervals.

In [16]:
weather.head()

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,915148800,1999-01-01 00:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,8.33,,3.39,5.28,...,,,,,,8,800,Clear,sky is clear,01n
1,915152400,1999-01-01 01:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,8.08,,3.54,5.18,...,,,,,,6,800,Clear,sky is clear,01n
2,915156000,1999-01-01 02:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,8.08,,4.11,5.38,...,,,,,,14,801,Clouds,few clouds,02n
3,915159600,1999-01-01 03:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,7.31,,3.73,4.42,...,,,,,,39,802,Clouds,scattered clouds,03n
4,915163200,1999-01-01 04:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,6.91,,3.53,4.0,...,,,,,,52,803,Clouds,broken clouds,04n


In [17]:
weather['dt_iso'] = pd.to_datetime(weather['dt_iso'].str.slice(0,13), format='%Y-%m-%d %H')
weather = weather[weather['timezone'] == 3600]
weather['dt_iso+1'] = weather['dt_iso'].apply(lambda row: row + pd.Timedelta(hours=1))
weather['QUARTER'] = weather.apply(lambda row: pd.date_range(row['dt_iso'], row['dt_iso+1'], freq='15min'), axis=1)
weather = weather.explode('QUARTER')
weather = weather.drop(columns=['dt_iso', 'dt_iso+1', 'timezone'])
weather = weather.rename(columns={'QUARTER': 'date'})
weather = weather.drop_duplicates(subset='date', keep='first')
weather

Unnamed: 0,dt,city_name,lat,lon,temp,visibility,dew_point,feels_like,temp_min,temp_max,...,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon,date
0,915148800,Custom location,48.873492,2.295104,8.33,,3.39,5.28,8.14,9.32,...,,,,,8,800,Clear,sky is clear,01n,1999-01-01 00:00:00
0,915148800,Custom location,48.873492,2.295104,8.33,,3.39,5.28,8.14,9.32,...,,,,,8,800,Clear,sky is clear,01n,1999-01-01 00:15:00
0,915148800,Custom location,48.873492,2.295104,8.33,,3.39,5.28,8.14,9.32,...,,,,,8,800,Clear,sky is clear,01n,1999-01-01 00:30:00
0,915148800,Custom location,48.873492,2.295104,8.33,,3.39,5.28,8.14,9.32,...,,,,,8,800,Clear,sky is clear,01n,1999-01-01 00:45:00
0,915148800,Custom location,48.873492,2.295104,8.33,,3.39,5.28,8.14,9.32,...,,,,,8,800,Clear,sky is clear,01n,1999-01-01 01:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203663,1648335600,Custom location,48.873492,2.295104,10.41,,2.27,8.99,8.74,12.88,...,,,,,0,800,Clear,sky is clear,01n,2022-03-27 00:00:00
203664,1648339200,Custom location,48.873492,2.295104,9.50,,2.36,7.49,7.74,12.31,...,,,,,0,800,Clear,sky is clear,01n,2022-03-27 00:15:00
203664,1648339200,Custom location,48.873492,2.295104,9.50,,2.36,7.49,7.74,12.31,...,,,,,0,800,Clear,sky is clear,01n,2022-03-27 00:30:00
203664,1648339200,Custom location,48.873492,2.295104,9.50,,2.36,7.49,7.74,12.31,...,,,,,0,800,Clear,sky is clear,01n,2022-03-27 00:45:00


**7. Closing Time Extraction**

This part allows us to extract the time where the entire park was closed.

In [18]:
closing = pd.read_csv(os.path.join(data_dir, entity_file))

# Filtering only on the entire park
closing = closing[closing["ENTITY_DESCRIPTION_SHORT"] != "Tivoli Gardens"]
closing = closing.merge(
    link_attraction,
    how="left",
    left_on="ENTITY_DESCRIPTION_SHORT",
    right_on="ATTRACTION",
).drop(columns=["ATTRACTION"])
closing = closing[closing["PARK"] != "Tivoli Gardens"]
closing = closing[closing["ENTITY_TYPE"] == "PARK"]
closing.drop(columns=["PARK"], inplace=True)
closing.drop(columns=["UPDATE_TIME"], inplace=True)

# Cleaning the date
closing["DEB_TIME"] = pd.to_datetime(closing["DEB_TIME"])
closing["FIN_TIME"] = pd.to_datetime(closing["FIN_TIME"])
closing["WORK_DATE"] = pd.to_datetime(closing["WORK_DATE"])
closing.sort_values(by="WORK_DATE")

Unnamed: 0,REF_CLOSING_DESCRIPTION,ENTITY_DESCRIPTION_SHORT,ENTITY_TYPE,DEB_TIME,FIN_TIME,WORK_DATE
9194,,PortAventura World,PARK,2018-01-01 10:00:00,2018-01-01 22:00:00,2018-01-01
9584,,PortAventura World,PARK,2018-01-01 08:30:00,2018-01-01 09:30:00,2018-01-01
8777,,PortAventura World,PARK,2018-01-02 09:00:00,2018-01-02 22:00:00,2018-01-02
10065,,PortAventura World,PARK,2018-01-02 08:00:00,2018-01-02 09:00:00,2018-01-02
23702,,PortAventura World,PARK,2018-01-03 09:00:00,2018-01-03 22:00:00,2018-01-03
...,...,...,...,...,...,...
36336,,PortAventura World,PARK,2022-08-20 09:30:00,2022-08-20 23:00:00,2022-08-20
36240,,PortAventura World,PARK,2022-08-21 09:30:00,2022-08-21 23:00:00,2022-08-21
22559,,PortAventura World,PARK,2022-08-21 08:30:00,2022-08-21 09:30:00,2022-08-21
36217,,PortAventura World,PARK,2022-08-22 09:30:00,2022-08-22 23:00:00,2022-08-22


In [19]:
# Computing the closing time
closing["TIME CLOSED"] = (
    closing["FIN_TIME"] - closing["DEB_TIME"]
).dt.total_seconds() / 3600
closing.sort_values(by="WORK_DATE")

Unnamed: 0,REF_CLOSING_DESCRIPTION,ENTITY_DESCRIPTION_SHORT,ENTITY_TYPE,DEB_TIME,FIN_TIME,WORK_DATE,TIME CLOSED
9194,,PortAventura World,PARK,2018-01-01 10:00:00,2018-01-01 22:00:00,2018-01-01,12.0
9584,,PortAventura World,PARK,2018-01-01 08:30:00,2018-01-01 09:30:00,2018-01-01,1.0
8777,,PortAventura World,PARK,2018-01-02 09:00:00,2018-01-02 22:00:00,2018-01-02,13.0
10065,,PortAventura World,PARK,2018-01-02 08:00:00,2018-01-02 09:00:00,2018-01-02,1.0
23702,,PortAventura World,PARK,2018-01-03 09:00:00,2018-01-03 22:00:00,2018-01-03,13.0
...,...,...,...,...,...,...,...
36336,,PortAventura World,PARK,2022-08-20 09:30:00,2022-08-20 23:00:00,2022-08-20,13.5
36240,,PortAventura World,PARK,2022-08-21 09:30:00,2022-08-21 23:00:00,2022-08-21,13.5
22559,,PortAventura World,PARK,2022-08-21 08:30:00,2022-08-21 09:30:00,2022-08-21,1.0
36217,,PortAventura World,PARK,2022-08-22 09:30:00,2022-08-22 23:00:00,2022-08-22,13.5


In [20]:
# Grouping by day
unique_closing = closing.groupby("WORK_DATE")["TIME CLOSED"].sum().reset_index()
unique_closing["min_DEB_TIME"] = (
    closing.groupby("WORK_DATE")["DEB_TIME"]
    .min()
    .reset_index(name="min_DEB_TIME")["min_DEB_TIME"]
)
unique_closing["max_FIN_TIME"] = (
    closing.groupby("WORK_DATE")["FIN_TIME"]
    .max()
    .reset_index(name="max_FIN_TIME")["max_FIN_TIME"]
)
unique_closing.sort_values(by="TIME CLOSED")
unique_closing

Unnamed: 0,WORK_DATE,TIME CLOSED,min_DEB_TIME,max_FIN_TIME
0,2018-01-01,13.0,2018-01-01 08:30:00,2018-01-01 22:00:00
1,2018-01-02,14.0,2018-01-02 08:00:00,2018-01-02 22:00:00
2,2018-01-03,14.0,2018-01-03 08:00:00,2018-01-03 22:00:00
3,2018-01-04,14.0,2018-01-04 08:00:00,2018-01-04 22:00:00
4,2018-01-05,14.0,2018-01-05 08:00:00,2018-01-05 22:00:00
...,...,...,...,...
927,2022-08-18,14.5,2022-08-18 08:30:00,2022-08-18 23:00:00
928,2022-08-19,14.5,2022-08-19 08:30:00,2022-08-19 23:00:00
929,2022-08-20,14.5,2022-08-20 08:30:00,2022-08-20 23:00:00
930,2022-08-21,14.5,2022-08-21 08:30:00,2022-08-21 23:00:00


**8. Merging into a dataframe**

In [21]:
# Matching the date format
unique_closing["WORK_DATE"] = pd.to_datetime(unique_closing["WORK_DATE"])
attendance["USAGE_DATE"] = pd.to_datetime(attendance["USAGE_DATE"])

# Merging
df = attendance.merge(
    unique_closing, how="left", left_on="USAGE_DATE", right_on="WORK_DATE"
)
df.drop(columns=['WORK_DATE'], inplace=True)
df = df.fillna('0')

attendance=df
attendance

Unnamed: 0,USAGE_DATE,FACILITY_NAME,attendance,TIME CLOSED,min_DEB_TIME,max_FIN_TIME
0,2018-06-01,PortAventura World,46804,14.0,2018-06-01 08:30:00,2018-06-01 23:00:00
1,2018-06-02,PortAventura World,57940,14.0,2018-06-02 08:30:00,2018-06-02 23:00:00
2,2018-06-03,PortAventura World,44365,14.0,2018-06-03 08:30:00,2018-06-03 23:00:00
3,2018-06-04,PortAventura World,37617,14.0,2018-06-04 08:30:00,2018-06-04 23:00:00
4,2018-06-05,PortAventura World,32438,14.0,2018-06-05 08:30:00,2018-06-05 23:00:00
...,...,...,...,...,...,...
1177,2022-07-22,PortAventura World,49586,14.5,2022-07-22 08:30:00,2022-07-22 23:00:00
1178,2022-07-23,PortAventura World,51748,14.5,2022-07-23 08:30:00,2022-07-23 23:00:00
1179,2022-07-24,PortAventura World,45261,14.5,2022-07-24 08:30:00,2022-07-24 23:00:00
1180,2022-07-25,PortAventura World,53764,14.5,2022-07-25 08:30:00,2022-07-25 23:00:00


**9. Removing Covid Data**

Since the Covid impacted the attendance in the park, we decided to remove the period of Covid-19 (from the 14/03/2020 to 30/06/2021) in our dataframe.

In [22]:
attendance = attendance[(attendance['USAGE_DATE'] < pd.Timestamp('2020-03-14')) | (attendance['USAGE_DATE'] > pd.Timestamp('2021-06-30'))]
attendance

Unnamed: 0,USAGE_DATE,FACILITY_NAME,attendance,TIME CLOSED,min_DEB_TIME,max_FIN_TIME
0,2018-06-01,PortAventura World,46804,14.0,2018-06-01 08:30:00,2018-06-01 23:00:00
1,2018-06-02,PortAventura World,57940,14.0,2018-06-02 08:30:00,2018-06-02 23:00:00
2,2018-06-03,PortAventura World,44365,14.0,2018-06-03 08:30:00,2018-06-03 23:00:00
3,2018-06-04,PortAventura World,37617,14.0,2018-06-04 08:30:00,2018-06-04 23:00:00
4,2018-06-05,PortAventura World,32438,14.0,2018-06-05 08:30:00,2018-06-05 23:00:00
...,...,...,...,...,...,...
1177,2022-07-22,PortAventura World,49586,14.5,2022-07-22 08:30:00,2022-07-22 23:00:00
1178,2022-07-23,PortAventura World,51748,14.5,2022-07-23 08:30:00,2022-07-23 23:00:00
1179,2022-07-24,PortAventura World,45261,14.5,2022-07-24 08:30:00,2022-07-24 23:00:00
1180,2022-07-25,PortAventura World,53764,14.5,2022-07-25 08:30:00,2022-07-25 23:00:00


In [23]:
waiting_without_covid = waiting[(waiting['WORK_DATE'] < pd.Timestamp('2020-03-14')) | (waiting['WORK_DATE'] > pd.Timestamp('2021-06-30'))]
waiting_without_covid

Unnamed: 0,WORK_DATE,DEB_TIME,DEB_TIME_HOUR,FIN_TIME,ENTITY_DESCRIPTION_SHORT,WAIT_TIME_MAX,NB_UNITS,GUEST_CARRIED,CAPACITY,ADJUST_CAPACITY,OPEN_TIME,UP_TIME,DOWNTIME,NB_MAX_UNIT,PARK
0,2018-01-01,2018-01-01 21:00:00,21,2018-01-01 21:15:00,Roller Coaster,0,2.0,0.0,0.000,0.00,0,0,0,2.0,PortAventura World
1,2018-01-01,2018-01-01 19:30:00,19,2018-01-01 19:45:00,Bumper Cars,5,18.0,148.0,254.749,254.75,15,15,0,18.0,PortAventura World
2,2018-01-01,2018-01-01 22:30:00,22,2018-01-01 22:45:00,Rapids Ride,0,1.0,0.0,0.000,0.00,0,0,0,2.0,PortAventura World
3,2018-01-01,2018-01-01 12:45:00,12,2018-01-01 13:00:00,Crazy Dance,5,1.0,46.0,250.001,250.00,15,15,0,1.0,PortAventura World
5,2018-01-01,2018-01-01 18:15:00,18,2018-01-01 18:30:00,Free Fall,50,3.0,0.0,0.000,0.00,0,0,0,3.0,PortAventura World
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3509319,2022-08-18,2022-08-18 18:45:00,18,2022-08-18 19:00:00,Himalaya Ride,0,0.0,0.0,0.000,0.00,0,0,0,1.0,PortAventura World
3509320,2022-08-18,2022-08-18 10:15:00,10,2022-08-18 10:30:00,Crazy Dance,0,0.0,0.0,0.000,0.00,0,0,0,1.0,PortAventura World
3509321,2022-08-18,2022-08-18 09:15:00,9,2022-08-18 09:30:00,Crazy Dance,0,0.0,0.0,0.000,0.00,0,0,0,1.0,PortAventura World
3509322,2022-08-18,2022-08-18 20:30:00,20,2022-08-18 20:45:00,Giga Coaster,0,0.0,0.0,0.000,0.00,0,0,0,24.0,PortAventura World


**10. Exporting Files**

In [24]:
attendance.to_csv('attendance_without_covid.csv', index=False)
schedule.to_csv('schedule_cleaned.csv', index=False)
link_attraction.to_csv('attraction.csv', index=False)
parade.to_csv('parade_cleaned.csv', index=False)
waiting.to_csv('waiting_cleaned.csv', index=False)
waiting_without_covid.to_csv('waiting_cleaned_without_covid.csv', index=False)
weather.to_csv('weather_cleaned.csv', index=False)