Name: Kyle Salgado-Gouker <br>
Date: October 7, 2023 <br>
Class: DSC540 - Professor Williams <br>
Project Deadline 2

In [1]:
#!pip install lxml
# !pip install SciPy
# !pip install python-Levenshtein
# !pip install fuzzywuzzy

In [2]:
import time

# file system searches etc
import os
from os.path import basename, exists
import glob

# regular expressions
import re
import math
import random

# data frames and smart arrays etc
import pandas as pd
import numpy as np

# web access and html parsing (urllib, its submodules)
import requests
import urllib
import urllib.request
import urllib.error
from urllib.request import urlretrieve
import ssl


# parser of web pages
from bs4 import BeautifulSoup
# more efficient parsing.
import lxml

# for plots
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# Fuzzy string matching
from fuzzywuzzy import fuzz

# for accessing sql
import sqlite3

# fancy table printing
from tabulate import tabulate

In [3]:
# For testing: Make warnings fatal.
# After this cell every warning raised by any later cell (including warnings that
# come from inside third-party libraries) is raised as an exception instead of
# being printed, so execution stops at the first warning.

import warnings
warnings.filterwarnings("error")

In [4]:
# Constants

# Relative directory that holds the downloaded source files used by this notebook.
FINAL_DATA_DIRECTORY = "data/final"

# Check if the directory exists
if not os.path.exists(FINAL_DATA_DIRECTORY):
    # If it doesn't exist, create it (including any missing parent directories)
    os.makedirs(FINAL_DATA_DIRECTORY)
    print(f"Directory '{FINAL_DATA_DIRECTORY}' created.")


In [5]:
# Remote data sources and the local files they are cached in.
# Queue Times API documentation page.
QUEUE_TIMES_API = "https://queue-times.com/en-US/pages/api"
# Wikipedia page with the amusement park ranking tables, and its local HTML copy.
WIKIPEDIA_PARK_RANKINGS = "https://en.wikipedia.org/wiki/List_of_amusement_park_rankings"
WIKIPEDIA_PARK_RANKINGS_FILE = FINAL_DATA_DIRECTORY+"/amusement_park_rankings.html"
# Local copy of the roller coaster flat file.
RCDB_CSV_FILE = FINAL_DATA_DIRECTORY+"/coaster_db.csv"
# GitHub "blob" URL of the roller coaster CSV; downloadRawFile appends "?raw=true" to fetch it.
# NOTE(review): the name says RCSB while the matching file constant says RCDB — presumably a typo.
RCSB_URL = "https://github.com/RobMulla/twitch-stream-projects/blob/main/001-rollercoaster-dataset/dbv1.csv"
# Queue Times page listing all parks grouped by country (scraped further down).
QUEUE_TIMES_PARK_LIST_URL = "https://queue-times.com/en-US/parks?group=country"

In [6]:
# General Functions

# Request headers sent by download(): a browser-like User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
}

# download (defined further down in this cell): a good-citizen download function
#     url - the url accessed
#     destination - local file to write
#
# It respects HTTP 429 and waits instead of pounding the server.

# Function to disable SSL certificate verification
def disable_ssl_verification() -> None:
    """Replace the ssl module's default HTTPS context factory with the unverified one.

    SECURITY NOTE: this is a global change to the ``ssl`` module, so any code that
    relies on that default context (presumably the urllib-based ``urlretrieve``
    used by ``downloadFile``) stops validating server certificates.
    """
    ssl._create_default_https_context = ssl._create_unverified_context

# Call the function to disable SSL verification
# This is to workaround an SSL certificate error I am getting.
disable_ssl_verification()

def download(url, destination, secure=True):
    """Download ``url`` with the shared request headers and save the body to ``destination``.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    destination : str
        Local path the response text is written to (always encoded as UTF-8,
        which is the encoding the notebook uses when it reads the files back).
    secure : bool, default True
        Passed to ``requests.get`` as ``verify``; ``False`` skips certificate checks.

    Returns
    -------
    None
        Every outcome is reported with ``print``; nothing is raised to the caller
        for network errors, non-200 responses or write failures.

    Notes
    -----
    On HTTP 429 the function sleeps for the number of seconds given in the
    ``Retry-After`` header and then tries again with the same arguments
    (including ``secure``). It gives up when the header is missing or is not a
    whole number of seconds.
    """
    # A loop (rather than recursion) keeps repeated 429 responses from growing the stack.
    while True:
        try:
            # Send a GET request with headers
            response = requests.get(url, headers=headers, verify=secure)
        except (requests.exceptions.RequestException, Warning):
            # Warning is included because this notebook promotes warnings to
            # exceptions (e.g. the insecure-request warning when secure=False).
            print("Failed to download " + url)
            return

        if response.status_code == 200:
            try:
                with open(destination, 'w', encoding='utf-8') as f:
                    f.write(response.text)
            except OSError:
                print("Error writing " + destination)
                return
            print("Downloaded " + destination)
            return

        if response.status_code != 429:
            print("Website returned " + str(response.status_code))
            return

        # Rate limited: honour the server's Retry-After value if it is usable.
        retry_after = response.headers.get("Retry-After")
        if not retry_after:
            print("Rate limit exceeded. Retry-After header not found.")
            return
        try:
            retry_after_seconds = int(retry_after)
        except ValueError:
            # Retry-After may also be an HTTP date, which int() cannot parse.
            print("Rate limit exceeded. Retry-After header is not a number of seconds: " + retry_after)
            return
        print("Rate limit exceeded. Waiting for " + str(retry_after_seconds) + " seconds.")
        time.sleep(retry_after_seconds)

def downloadFile(url, filename):
    """Download ``url`` to ``filename`` unless that file already exists.

    Parameters
    ----------
    url : str
        Address handed to ``urlretrieve``.
    filename : str
        Local path to write, and the path checked for an existing copy.

    Returns
    -------
    tuple
        ``(local_path, response_headers)`` as returned by ``urlretrieve`` after a
        download, or ``(filename, None)`` when the file was already present, so
        callers can always unpack two values.
    """
    if exists(filename):
        # Already cached: skip the network but keep the return shape consistent.
        return filename, None
    local, response_headers = urlretrieve(url, filename)
    print("Downloaded " + local + "\n")
    return local, response_headers

def downloadRawFile(url, filename):
    """Fetch the raw form of ``url`` into ``filename`` when no local copy exists.

    The raw form is requested by appending ``?raw=true`` to ``url``; the actual
    transfer is delegated to ``downloadFile``. Nothing is returned.
    """
    if os.path.exists(filename):
        # A cached copy is present, so there is nothing to do.
        return
    saved_path, response_headers = downloadFile(url + "?raw=true", filename)


### Useful functions.

In [7]:
def pretty_print_df(df, rows=None):
    """Print ``df`` as a 'pretty' tabulate grid followed by a separator rule.

    When ``rows`` is given, only the first ``rows`` rows (``df.head(rows)``) are shown.
    The index is never printed.
    """
    shown = df if rows is None else df.head(rows)
    print(tabulate(shown, headers='keys', tablefmt='pretty', showindex=False))
    rule = "=" * 40
    # Blank line, rule, blank line: visual break between consecutive tables.
    print("\n" + rule + "\n")

def find_table_by_class(soup, className):
    """Return the first ``<table>`` in ``soup`` whose class matches ``className``.

    Parameters
    ----------
    soup
        Parsed document (or any object exposing ``find(name, class_=...)``).
    className : str
        CSS class to look for.

    Returns
    -------
    Whatever ``soup.find`` returns for the query (the original version looked the
    table up but never returned it, so callers always received ``None``).
    """
    return soup.find('table', class_=className)

# Builds a title decorated by stars (returned, not printed).
def formatFancyTitle(title):
    """Return ``title`` framed by a row of stars above and below.

    The frame is ``len(title) + 4`` stars wide so it lines up with the
    ``"* title *"`` middle row; the result ends with a newline.
    """
    border = "*" * (len(title) + 4)
    middle = f"* {title} *"
    return "\n".join([border, middle, border]) + "\n"

def formatTestStat(value, dec=6):
    """Format ``value`` as fixed-point text with ``dec`` digits after the decimal point."""
    return f"{value:.{dec}f}"

# Defaults for drawText / drawTable / drawReport below.
# white (RGB triple; drawText creates the image in 'RGB' mode)
TABLE_BACKGROUND_COLOR = (255, 255, 255)
# black
TABLE_FONT_COLOR = (0, 0, 0)
TABLE_FONT_SIZE = 12
# Default image size passed to Image.new as (width, height).
TABLE_WIDTH = 600
TABLE_HEIGHT = 800
# NOTE(review): absolute font path that looks macOS-specific; drawText passes it to
# ImageFont.truetype, so it presumably needs changing on other systems — confirm.
TYPEFACE_FILE = "/Library/Fonts/Menlo.ttc"

def drawText(image_filename, text, title = "", width=TABLE_WIDTH, height=TABLE_HEIGHT, background_color=TABLE_BACKGROUND_COLOR, font_name=TYPEFACE_FILE, font_color=TABLE_FONT_COLOR, font_size=TABLE_FONT_SIZE):
    """Render ``text`` onto a blank image and save it as ``image_filename``.

    When ``title`` is non-empty, a starred banner built by ``formatFancyTitle``
    plus a blank line is placed in front of ``text`` before drawing.

    Returns the text that was actually drawn (banner included when a title was given).

    NOTE(review): ``Image``, ``ImageFont`` and ``ImageDraw`` are not imported in the
    visible import cell — presumably they come from Pillow; confirm they are in
    scope before this function is called.
    """
    # Blank canvas filled with the background colour.
    canvas = Image.new('RGB', (width, height), background_color)
    typeface = ImageFont.truetype(font_name, font_size)
    pen = ImageDraw.Draw(canvas)
    # Fixed top-left offset where the text starts.
    origin = (10, 10)
    if len(title) > 0:
        text = formatFancyTitle(title) + "\n" + text
    pen.text(origin, text, font=typeface, fill=font_color)
    canvas.save(image_filename)
    return text
    
def drawTable(image_filename, table, title="", width=TABLE_WIDTH, height=TABLE_HEIGHT, background_color=TABLE_BACKGROUND_COLOR, font_name=TYPEFACE_FILE, font_color=TABLE_FONT_COLOR, font_size=TABLE_FONT_SIZE):
    """Draw ``table`` (already formatted text) to ``image_filename`` and echo it.

    All rendering options are forwarded to ``drawText``. The text that was drawn
    (with the title banner, if any) is printed and returned.
    """
    rendered = drawText(
        image_filename,
        table,
        title,
        width=width,
        height=height,
        background_color=background_color,
        font_name=font_name,
        font_color=font_color,
        font_size=font_size,
    )
    print(rendered)
    return rendered
    
def drawReport(image_filename, text, title="", width=TABLE_WIDTH, height=TABLE_HEIGHT, background_color=TABLE_BACKGROUND_COLOR, font_name=TYPEFACE_FILE, font_color=TABLE_FONT_COLOR, font_size=TABLE_FONT_SIZE):
    """Draw a report to ``image_filename`` and print it.

    Parameters
    ----------
    image_filename : str
        Path of the image file written by ``drawText``.
    text : str
        Report body.
    title : str, default ""
        Optional title; when non-empty it is drawn as a starred banner above the
        body and the banner is also printed before the body.
    width, height, background_color, font_name, font_color, font_size
        Rendering options forwarded to ``drawText``.

    Returns
    -------
    str
        The report body ``text`` (without the banner).
    """
    # Always forward the rendering options. The previous version dropped them when
    # no title was given, so custom sizes/fonts/colours were silently ignored.
    # drawText itself skips the banner for an empty title.
    drawText(image_filename, text, title, width, height, background_color, font_name, font_color, font_size)
    if len(title) > 0:
        print(formatFancyTitle(title))
    print(text)
    return text
    
# Lays three multi-line tables out side by side.
def combineTables(table1, table2, table3):
    """Join three text tables row by row into one block of text.

    Each table is stripped and split into lines. Row ``i`` of the result is row
    ``i`` of every table separated by single spaces (an empty string stands in
    for a table that has run out of rows), with surrounding whitespace stripped.
    Every output row ends with a newline.
    """
    tables = [block.strip().split("\n") for block in (table1, table2, table3)]
    tallest = max(len(rows) for rows in tables)

    def row_or_blank(rows, position):
        # Shorter tables contribute an empty cell once they are exhausted.
        return rows[position] if position < len(rows) else ""

    merged_rows = []
    for position in range(tallest):
        cells = [row_or_blank(rows, position) for rows in tables]
        merged_rows.append(" ".join(cells).strip())

    return "".join(row + "\n" for row in merged_rows)



In [8]:
# Cache both sources locally so the rest of the notebook can run without a connection.
for _remote_url, _local_file in (
    (WIKIPEDIA_PARK_RANKINGS, WIKIPEDIA_PARK_RANKINGS_FILE),
    (RCSB_URL, RCDB_CSV_FILE),
):
    downloadRawFile(_remote_url, _local_file)

# Decode the saved Wikipedia page explicitly as UTF-8 to work around a
# character conversion issue seen with the default encoding.
encoding = "utf-8"
with open(WIKIPEDIA_PARK_RANKINGS_FILE, 'r', encoding=encoding) as rankings_file:
    # Keep the whole page in memory for parsing later on.
    rankings_html = rankings_file.read()
    

Downloaded data/final/coaster_db.csv



### The Flatfile has a lot of problems. 

We can still use it but first we need to apply some fixes. 

#### Transformation 1: Drop the columns we don't need.

* Column 0: Ride Name  <<<=== Manually renamed in Excel.
* Column 1 & 2: Drop
* Column 3: Park Name
* Column 4-5: Drop
* Column 6: Opening Date Keep
* Column 7-8: Drop
* Column 9: Manufacturer
* Column 10-11: Drop
* Column 12: Height
* Column 13: Length
* Column 14: Speed
* Column 15: Inversions
* Column 16: Duration keep
* Column 17: Capacity keep
* Column 18: Height Restriction
* Column 19-21: Drop
* Column 22: Cost Keep
* Column 23-end: Drop


In [9]:
# Load only the columns the analysis needs from the roller coaster CSV.
columns_to_load = [
    "Ride name", "Location", "Opening date", "Type", "Manufacturer",
    "Height", "Length", "Speed", "Inversions", "Duration", "Capacity",
    "Height Restriction", "Cost", "Drop", "Max vertical angle", "G-force",
]

# Every column is read as text for now; they are converted to more useful
# types later, once the values have been cleaned.
dtype_options = {column_name: str for column_name in columns_to_load}

rcdb_df = pd.read_csv(RCDB_CSV_FILE, usecols=columns_to_load, dtype=dtype_options)


In [10]:

# Show every ride with its location, ordered by ride name.
desired_columns = ['Ride name', 'Location']
sorted_df = rcdb_df.sort_values(by='Ride name')

ride_location_table = tabulate(sorted_df[desired_columns], headers='keys', tablefmt='pretty', showindex=False)
print(ride_location_table)


+------------------------------------------------------------------+-----------------------------------------+
|                            Ride name                             |                Location                 |
+------------------------------------------------------------------+-----------------------------------------+
|     ("Super Grover's Box Car Derby", 'SeaWorld San Antonio')     |                   nan                   |
|          ('Canyon Blaster', 'Six Flags Magic Mountain')          |        Six Flags Magic Mountain         |
|                   ('Cobra', 'Tivoli Friheden')                   |             Tivoli Friheden             |
|                   ('Comet', 'Waldameer Park')                    |             Waldameer Park              |
|              ('Crazy Bird', 'Happy Valley Tianjin')              |          Happy Valley Tianjin           |
|              ('Desmo Race', 'Mirabilandia (Italy)')              |                   nan                   |
|

#### Transformation 2: Delete obvious trash.

* Remove the records with nan in Location or Ride name.
* Remove the records with garbage Ride names (the ones in parentheses), which will not match Queue Time rides.

In [11]:
print(len(rcdb_df), " original records.")

# Rows without a location or a ride name (NaN) cannot be matched to a park.
rcdb_df = rcdb_df.dropna(subset=['Location', 'Ride name'])

# Build one keep-mask: non-empty location, non-empty ride name, and a ride name
# that is not wrapped in parentheses from its first to its last character.
has_location = rcdb_df['Location'] != ''
has_ride_name = rcdb_df['Ride name'] != ''
is_parenthesised = rcdb_df['Ride name'].str.contains(r'^\(.+\)$')
rcdb_df = rcdb_df[has_location & has_ride_name & ~is_parenthesised]

print(len(rcdb_df), " records after discarding trash.")


695  original records.
572  records after discarding trash.


#### Transformation 3: Drop duplicates.

For now I will define duplicates as same Ride name, same Location.

In [12]:
# A duplicate is a row that repeats both the ride name and the location of an earlier row.
duplicate_key = ['Ride name', 'Location']
rcdb_df = rcdb_df.drop_duplicates(subset=duplicate_key)

print(len(rcdb_df), " records after dropping duplicates.")


572  records after dropping duplicates.


In [13]:
# First 500 cleaned rides by name (columns chosen by `desired_columns` set earlier).
sorted_df = rcdb_df.sort_values(by='Ride name')
first_500_rides = sorted_df.head(500)
print(tabulate(first_500_rides[desired_columns], headers='keys', tablefmt='pretty', showindex=False))


+-------------------------------------------------------+-----------------------------------------+
|                       Ride name                       |                Location                 |
+-------------------------------------------------------+-----------------------------------------+
|              10 Inversion Roller Coaster              |           Chimelong Paradise            |
|                        ARTHUR                         |               Europa-Park               |
|                         Abyss                         |             Adventure World             |
|                        Abyssus                        |              Energylandia               |
|                      Accelerator                      |              Drayton Manor              |
|                        Acrobat                        |           Nagashima Spa Land            |
|                    Adrenaline Peak                    |           Oaks Amusement Park           |


#### The park name in the flat file must be the same as in the Queue Times file.

First, though, I need a universal park name between the other two data sources.

### Read the wikipedia page using bs4.

In [14]:
# Local import: read_html needs a file-like object (see below).
from io import StringIO

# Parse the HTML using Beautiful Soup with the lxml parser
soup = BeautifulSoup(rankings_html, 'lxml')

# Find the tables
tables = soup.find_all('table')

# Initialize an empty list to store DataFrames for each table
dataframes = []

# Only tables[2], tables[3] and tables[4] are used; the first two tables on the
# page are skipped.
for table in tables[2:5]:
    # Each input holds exactly one <table>, so read_html's list has one element.
    # The markup is wrapped in StringIO because passing literal HTML to read_html
    # is deprecated in pandas >= 2.1, and this notebook makes warnings fatal.
    df = pd.read_html(StringIO(str(table)))[0]
    dataframes.append(df)

if not dataframes:
    # Fail with a clear message instead of pandas' generic concat error.
    raise ValueError("Expected ranking tables at positions 2-4 of the Wikipedia page, found none.")

# Concatenate the DataFrames into one
all_wiki_amusement_parks_df = pd.concat(dataframes, ignore_index=True)

# Drop the 'Rank' column. Do this before finding duplicates.
all_wiki_amusement_parks_df = all_wiki_amusement_parks_df.drop(columns=['Rank'])

# Remove duplicate rows based on all columns
all_wiki_amusement_parks_df = all_wiki_amusement_parks_df.drop_duplicates()

# Replace NaN values with 0 in the combined DataFrame
all_wiki_amusement_parks_df = all_wiki_amusement_parks_df.fillna(0)

# Convert all columns except 'Amusement park' and 'Location' to numeric.
# NOTE(review): errors='coerce' turns non-numeric text into NaN again, after the
# fillna(0) above — confirm whether those cells should end up as 0 instead.
columns_to_convert = all_wiki_amusement_parks_df.columns.difference(['Amusement park', 'Location'])
all_wiki_amusement_parks_df[columns_to_convert] = all_wiki_amusement_parks_df[columns_to_convert].apply(pd.to_numeric, errors='coerce')


### Let's see the full list of amusement parks!

In [15]:
# Order the parks by name and add two empty text columns that the matching
# steps below are meant to fill in.
all_wiki_amusement_parks_df = (
    all_wiki_amusement_parks_df
    .sort_values(by='Amusement park')
    .assign(**{'Park number': "", 'Best match': ""})
)

# Display the first ten rows of the combined DataFrame
wiki_preview = all_wiki_amusement_parks_df.head(10)
print(tabulate(wiki_preview, headers='keys', tablefmt='pretty', showindex=False))


+--------------------------------------------------------+------------------------------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+----------+-----------+-----------+-----------+-----------+-------------+------------+
|                     Amusement park                     |              Location              |  2009[1]  |  2010[3]  |  2011[3]  |  2012[4]  |  2013[5]   |  2014[6]   |  2015[7]   |  2016[8]   |  2017[9]   |  2018[10]  | 2019[11] | 2020[12]  | 2021[13]  |  2010[2]  | 2021[14]  | Park number | Best match |
+--------------------------------------------------------+------------------------------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+----------+-----------+-----------+-----------+-----------+-------------+------------+
|          Alton Towers at Alton Towers Resort           |       Alton, Unit

In [16]:
# Park names next to the two matching columns, which are still empty strings at this point.
print(all_wiki_amusement_parks_df[['Amusement park', 'Park number', 'Best match']])

                                       Amusement park Park number Best match
27                Alton Towers at Alton Towers Resort                       
40                                 Beto Carrero World                       
11                            Busch Gardens Tampa Bay                       
19                                Canada's Wonderland                       
10                                        Cedar Point                       
31                    Chessington World of Adventures                       
22                                        De Efteling                       
8                    Disney California Adventure Park                       
6   Disney's Animal Kingdom at Walt Disney World R...                       
3   Disney's Hollywood Studios at Walt Disney Worl...                       
4                                     Disneyland Park                       
20                Disneyland Park at Disneyland Paris                       

In [17]:
# Fetch the Queue Times page that lists every park grouped by country.
url = QUEUE_TIMES_PARK_LIST_URL
response = requests.get(url)

# NOTE(review): when the request fails, QT_park_list_df is never created, so the
# later cells that reference it will fail — confirm this is acceptable.
if response.status_code != 200:
    print("Failed to fetch the URL.")
else:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Parallel lists, one entry per park.
    amusement_parks, park_numbers, countries = [], [], []

    # Each <div class="panel"> is read as one country: its <h2> gives the country
    # name and every <a class="panel-block"> inside it is one park.
    for panel in soup.find_all('div', class_='panel'):
        country_name = panel.find('h2').text.strip()

        for a_tag in panel.find_all('a', class_='panel-block'):
            link_text = a_tag.text.strip()
            link = a_tag['href']
            # Park name is the first line of the link text; the park number is
            # the last path segment of the link target.
            amusement_parks.append(link_text.split("\n")[0])
            park_numbers.append(link.split("/")[-1])
            countries.append(country_name)

    QT_park_list_df = pd.DataFrame({
        'Amusement park': amusement_parks,
        'Park number': park_numbers,
        'Country': countries
    })

    print(tabulate(QT_park_list_df, headers='keys', tablefmt='pretty', showindex=False))

+-------------------------------------------+-------------+---------------------+
|              Amusement park               | Park number |       Country       |
+-------------------------------------------+-------------+---------------------+
|                Familypark                 |     322     |       Austria       |
|                Bellewaerde                |     276     |       Belgium       |
|               Bobbejaanland               |     311     |       Belgium       |
|            Plopsaland De Panne            |     54      |       Belgium       |
|              Walibi Belgium               |     14      |       Belgium       |
|            Beto Carrero World             |     319     |       Brazil        |
|            Canada's Wonderland            |     58      |       Canada        |
|            La Ronde, Montreal             |     48      |       Canada        |
|          Shanghai Disney Resort           |     30      |        China        |
|             Dj

#### Fuzzy match Amusement Parks by name and move Park Number field from QT_park_list_df into all_wiki_amusement_parks_df.

#### The fuzzy matching algorithm cannot resolve a few important differences between some wikipedia/queuetime entries.


In [18]:
# Parks whose Queue Times number is assigned by hand, before any automatic matching.
manual_park_numbers = {
    'Disneyland Park': '16',
    'Disneyland Hong Kong': '31',
    'Disneyland Park at Disneyland Paris': '4',
    'Magic Kingdom Theme Park at Walt Disney World Resort': '6',
    'Walt Disney Studios Park at Disneyland Paris': '28',
}
for wiki_name, assigned_number in manual_park_numbers.items():
    name_matches = all_wiki_amusement_parks_df['Amusement park'] == wiki_name
    all_wiki_amusement_parks_df.loc[name_matches, 'Park number'] = assigned_number

# Parks that Queue Times doesn't include; they are removed from the Wikipedia list.
parks_not_in_queue_times = [
    'Fantasilandia', 'La Feria', 'Parque Mundo Aventura', 'Parque Plaza Sésamo', 'Parque Warner',
    'Parque Xcaret', 'Parque de la Costa', 'Theme Parque Nacional del Café', 'Puy du Fou',
    'Mundo Petapa', 'Futuroscope', 'La Feria Chapultepec Mágico', 'Tivoli Gardens',
]
is_not_covered = all_wiki_amusement_parks_df['Amusement park'].isin(parks_not_in_queue_times)
indices_to_remove = all_wiki_amusement_parks_df.index[is_not_covered].tolist()

# Remove rows with the specified indices
all_wiki_amusement_parks_df.drop(indices_to_remove, inplace=True)



In [19]:
# Give every Wikipedia park a Queue Times park number.
# NOTE(review): the Queue Times rows are scanned in order and the first row that
# matches exactly OR by substring wins, so an earlier substring hit can pre-empt
# a later exact match.
for w, row_wiki in all_wiki_amusement_parks_df.iterrows():
    # Parks that already carry a number (set by hand earlier) are left untouched.
    if row_wiki['Park number'] != "":
        continue

    amusement_park_w = row_wiki['Amusement park']

    # Best fuzzy candidate seen so far, used only if nothing matches directly.
    best_match_index, best_match_score = None, -1

    for q, row_QT in QT_park_list_df.iterrows():
        amusement_park_Q = row_QT['Amusement park']

        # Direct hit: identical names, or one name contained in the other.
        direct_hit = (
            amusement_park_w == amusement_park_Q
            or amusement_park_Q in amusement_park_w
            or amusement_park_w in amusement_park_Q
        )
        if direct_hit:
            all_wiki_amusement_parks_df.at[w, 'Park number'] = row_QT['Park number']
            break

        # No direct hit on this row: remember it if it is the closest name so far.
        fuzz_score = fuzz.ratio(amusement_park_w, amusement_park_Q)
        if fuzz_score > best_match_score:
            best_match_score = fuzz_score
            best_match_index = q

    # Fall back to the closest fuzzy candidate when no number was assigned above.
    if all_wiki_amusement_parks_df.at[w, 'Park number'] == "" and best_match_index is not None:
        all_wiki_amusement_parks_df.at[w, 'Park number'] = QT_park_list_df.at[best_match_index, 'Park number']


In [20]:
# The two data frames after matching: Wikipedia parks first, then Queue Times parks.
desired_columns = ['Amusement park', 'Park number']
for matched_df in (all_wiki_amusement_parks_df, QT_park_list_df):
    sorted_df = matched_df.sort_values(by='Amusement park')
    print(tabulate(sorted_df[desired_columns], headers='keys', tablefmt='pretty', showindex=False))


+------------------------------------------------------------+-------------+
|                       Amusement park                       | Park number |
+------------------------------------------------------------+-------------+
|            Alton Towers at Alton Towers Resort             |      1      |
|                     Beto Carrero World                     |     319     |
|                  Busch Gardens Tampa Bay                   |     24      |
|                    Canada's Wonderland                     |     58      |
|                        Cedar Point                         |     50      |
|              Chessington World of Adventures               |      3      |
|                        De Efteling                         |     160     |
|              Disney California Adventure Park              |     17      |
|    Disney's Animal Kingdom at Walt Disney World Resort     |      8      |
|   Disney's Hollywood Studios at Walt Disney World Resort   |      7      |

### Now we have a uniform method of referencing parks.

- We need to apply it to the flat file.

#### Transformation 4: Use a uniform park reference. 

* Match location against Amusement park field.
* Create Park number field.

In [21]:
# Here is the flat file.
# Every ride starts with an empty-string park number; an empty string is what the
# matching loop in the next cell treats as "not matched yet".
rcdb_df['Park number'] = ""

# Use location to match against Amusement park. When it matches add Park number.


In [22]:
# Assign a park number to every ride in the flat file.
#
# For each ride the location is compared first with the Queue Times park names
# and then, only if that found nothing, with the Wikipedia park names. An exact
# or substring match stores the park number as-is. Otherwise the closest fuzzy
# match across both lists is stored with a leading "-" to mark it as a guess.
for rcdb_index, row_rcdb in rcdb_df.iterrows():
    location = row_rcdb['Location']

    # `row_rcdb` is a snapshot taken by iterrows(): it never sees the values
    # written through rcdb_df.at below. The match state is therefore tracked in a
    # local flag; checking the snapshot (as before) let the Wikipedia pass run
    # and overwrite a park number that the Queue Times pass had already found.
    matched = row_rcdb['Park number'] != ""

    # Best fuzzy candidate seen so far across both reference lists.
    best_park_number = None
    best_match_score = -1
    best_match_amusement_park = ""

    # Pass 1: Queue Times park list.
    if not matched:
        for q, row_QT in QT_park_list_df.iterrows():
            amusement_park_Q = row_QT['Amusement park']
            park_number = row_QT['Park number']

            # exact match first
            if location == amusement_park_Q:
                print(f"Q Exact match for {location} and {amusement_park_Q}  - park = {park_number}")
                matched = True

            # Queue Times name is contained in the location
            elif location.find(amusement_park_Q) != -1:
                print(f"Q Near match (location contains QT) for {location} and {amusement_park_Q}  - park = {park_number}")
                matched = True

            # location is contained in the Queue Times name
            elif amusement_park_Q.find(location) != -1:
                print(f"Q Near match (QT contains location) for {location} and {amusement_park_Q}  - park = {park_number}")
                matched = True

            # fuzzy match: only remember the closest candidate
            else:
                fuzz_score = fuzz.ratio(location, amusement_park_Q)
                if fuzz_score > best_match_score:
                    best_match_score = fuzz_score
                    best_park_number = park_number
                    best_match_amusement_park = amusement_park_Q

            if matched:
                rcdb_df.at[rcdb_index, 'Park number'] = park_number
                break

    # Pass 2: Wikipedia park list, only for rides still unmatched.
    if not matched:
        for w, row_wiki in all_wiki_amusement_parks_df.iterrows():
            amusement_park_w = row_wiki['Amusement park']
            park_number = row_wiki['Park number']

            # exact match first
            if amusement_park_w == location:
                print(f"W Exact match for {amusement_park_w} and {location} - park = {park_number}")
                matched = True

            # Wikipedia name is contained in the location
            elif location.find(amusement_park_w) != -1:
                print(f"W Near match (location contains amusement_park_w) for {amusement_park_w} and {location} - park = {park_number}")
                matched = True

            # location is contained in the Wikipedia name
            elif amusement_park_w.find(location) != -1:
                print(f"W Near match (amusement_park_w contains location) for {amusement_park_w} and {location} - park = {park_number}")
                matched = True

            # fuzzy match: only remember the closest candidate
            else:
                fuzz_score = fuzz.ratio(location, amusement_park_w)
                if fuzz_score > best_match_score:
                    best_match_score = fuzz_score
                    best_park_number = park_number
                    best_match_amusement_park = amusement_park_w

            if matched:
                rcdb_df.at[rcdb_index, 'Park number'] = park_number
                break

    # Nothing matched directly: store the closest fuzzy candidate, flagged with "-".
    if not matched and best_park_number is not None:
        rcdb_df.at[rcdb_index, 'Park number'] = "-"+best_park_number
        print(f"Fuzzy match for {location} is {best_match_amusement_park}")


Fuzzy match for Chimelong Paradise is Disneyland Park Paris
Fuzzy match for Adventure World is Adventure Island
Q Exact match for Energylandia and Energylandia  - park = 317
Fuzzy match for Drayton Manor is Paultons Park
Fuzzy match for Nagashima Spa Land is Phantasialand
Fuzzy match for Oaks Amusement Park is Paultons Park
Q Exact match for Kings Island and Kings Island  - park = 60
Q Exact match for Carowinds and Carowinds  - park = 59
W Near match (amusement_park_w contains location) for Europa-Park at Europa-Park Resort and Europa-Park - park = 51
Q Exact match for Busch Gardens Williamsburg and Busch Gardens Williamsburg  - park = 23
Fuzzy match for Nigloland is Kings Island
Q Exact match for Six Flags St. Louis and Six Flags St. Louis  - park = 36
Q Exact match for Kings Dominion and Kings Dominion  - park = 62
Fuzzy match for Walygator Parc is Paultons Park
Q Exact match for Six Flags Magic Mountain and Six Flags Magic Mountain  - park = 32
Q Exact match for Busch Gardens Willia

Q Exact match for Six Flags Over Georgia and Six Flags Over Georgia  - park = 35
Q Exact match for Movie Park Germany and Movie Park Germany  - park = 310
Q Exact match for Canada's Wonderland and Canada's Wonderland  - park = 58
Q Exact match for Knott's Berry Farm and Knott's Berry Farm  - park = 61
Fuzzy match for Santa Cruz Beach Boardwalk is Beto Carrero World
Fuzzy match for Attractiepark Slagharen is Avonturenpark Hellendoorn
Q Exact match for Six Flags Magic Mountain and Six Flags Magic Mountain  - park = 32
Q Exact match for California's Great America and California's Great America  - park = 57
Q Exact match for Six Flags Over Georgia and Six Flags Over Georgia  - park = 35
Q Near match (QT contains location) for La Ronde and La Ronde, Montreal  - park = 48
Q Exact match for Six Flags Fiesta Texas and Six Flags Fiesta Texas  - park = 39
Q Exact match for Six Flags Great America and Six Flags Great America  - park = 38
Q Exact match for Six Flags Magic Mountain and Six Flags Ma

Q Exact match for Six Flags Darien Lake and Six Flags Darien Lake  - park = 281
Q Exact match for Movie Park Germany and Movie Park Germany  - park = 310
Fuzzy match for Flamingo Land Resort is Shanghai Disney Resort
Fuzzy match for Tibidabo Amusement Park is PortAventura Park
Q Exact match for Dollywood and Dollywood  - park = 55
Q Exact match for Walibi Rhône-Alpes and Walibi Rhône-Alpes  - park = 301
Q Exact match for Kings Island and Kings Island  - park = 60
Q Exact match for Alton Towers and Alton Towers  - park = 1
Q Exact match for Thorpe Park and Thorpe Park  - park = 2
Fuzzy match for Cliff's Amusement Park is Disney California Adventure Park
Q Exact match for Six Flags Over Texas and Six Flags Over Texas  - park = 34
Fuzzy match for Nickelodeon Universe is Universal Studios At Universal Orlando
Q Exact match for Blackpool Pleasure Beach and Blackpool Pleasure Beach  - park = 273
Q Exact match for Six Flags Magic Mountain and Six Flags Magic Mountain  - park = 32
Q Exact matc

Q Exact match for Six Flags Discovery Kingdom and Six Flags Discovery Kingdom  - park = 33
Fuzzy match for Wet'n'Wild Gold Coast is SeaWorld Orlando
Fuzzy match for Kongeparken is Dorney Park
Fuzzy match for Family Kingdom Amusement Park is Disney California Adventure Park
Fuzzy match for Wild Adventures is Adventure Island
Q Exact match for Thorpe Park and Thorpe Park  - park = 2
Fuzzy match for Everland is Toverland
Fuzzy match for Kentucky Kingdom is Animal Kingdom
Fuzzy match for Linnanmäki is Disneyland
Fuzzy match for Fuji-Q Highland is Walibi Holland
Q Near match (location contains QT) for Dorney Park & Wildwater Kingdom and Dorney Park  - park = 69
Q Exact match for Phantasialand and Phantasialand  - park = 56
Q Exact match for Six Flags Magic Mountain and Six Flags Magic Mountain  - park = 32
Q Exact match for Busch Gardens Williamsburg and Busch Gardens Williamsburg  - park = 23
Q Exact match for Dollywood and Dollywood  - park = 55
Fuzzy match for SeaWorld San Antonio is Sea

#### After Matching

1. Park number positive = near match or exact match.
2. Park number negative = fuzzy match.

In [23]:
# Review the matching step: list every coaster alphabetically by ride name,
# showing only its location and the park number it was given.
desired_columns = ['Ride name', 'Location', 'Park number']
sorted_df = rcdb_df.sort_values(by='Ride name')

matched_view = sorted_df[desired_columns]
print(
    tabulate(
        matched_view,
        headers='keys',
        tablefmt='pretty',
        showindex=False,
    )
)


+-------------------------------------------------------+-----------------------------------------+-------------+
|                       Ride name                       |                Location                 | Park number |
+-------------------------------------------------------+-----------------------------------------+-------------+
|              10 Inversion Roller Coaster              |           Chimelong Paradise            |     -4      |
|                        ARTHUR                         |               Europa-Park               |     51      |
|                         Abyss                         |             Adventure World             |     -97     |
|                        Abyssus                        |              Energylandia               |     317     |
|                      Accelerator                      |              Drayton Manor              |     -49     |
|                        Acrobat                        |           Nagashima Spa Land  

#### Fuzzy Matches That Worked

* Hagrid’s Magical Creatures Motorbike Adventure     |    Universal's Islands of Adventure     |     -64 
*  Insane                         |               Gröna Lund                |    -166  
*  Jetline                        |               Gröna Lund                |    -166  
*  Jurassic World VelociCoaster              |    Universal's Islands of Adventure     |     -64  
*  Kvasten                        |               Gröna Lund                |    -166
*  Monster                        |               Gröna Lund                |    -166 
*  Orkanen                        |            Fårup Sommarland             |     -18 
*  Steel Eel                       |          SeaWorld San Antonio           |     -22 
*  Texas Stingray                     |          SeaWorld San Antonio           |     -22 
*  The Great White                    |          SeaWorld San Antonio           |     -22 
*  The Incredible Hulk Coaster              |    Universal's Islands of Adventure     |     -64 
*  Tornado                        |     Parque de Atracciones de Madrid     |    -321   
*  Vilda Musen                      |               Gröna Lund                |    -166   

For these coasters, change the Park number to positive by stripping the leading '-'.

In [24]:
# A handful of fuzzy matches turned out to be correct for important coasters.
# Their 'Park number' values (stored as strings with a leading '-') are listed
# here so the '-' can be stripped from exactly these entries.
resolved_fuzzy_parks = ["-18", "-22", "-64", "-166", "-321"]


def remove_negative_sign(park_number):
    """Return ``park_number`` without its '-' if it is a resolved fuzzy match.

    Parameters
    ----------
    park_number : str
        A park number as stored in the 'Park number' column, e.g. "-18" or "51".

    Returns
    -------
    str
        The value with every '-' removed when it appears in
        ``resolved_fuzzy_parks``; otherwise the value unchanged.
    """
    is_resolved = park_number in resolved_fuzzy_parks
    return park_number.replace('-', '') if is_resolved else park_number

# Run every 'Park number' through remove_negative_sign and write the result
# back, so the resolved fuzzy matches become positive.
park_numbers = rcdb_df['Park number']
rcdb_df['Park number'] = park_numbers.apply(remove_negative_sign)

# Print the alphabetical ride listing again to inspect the updated numbers.
desired_columns = ['Ride name', 'Location', 'Park number']
sorted_df = rcdb_df.sort_values(by='Ride name')

updated_view = sorted_df[desired_columns]
print(
    tabulate(
        updated_view,
        headers='keys',
        tablefmt='pretty',
        showindex=False,
    )
)


+-------------------------------------------------------+-----------------------------------------+-------------+
|                       Ride name                       |                Location                 | Park number |
+-------------------------------------------------------+-----------------------------------------+-------------+
|              10 Inversion Roller Coaster              |           Chimelong Paradise            |     -4      |
|                        ARTHUR                         |               Europa-Park               |     51      |
|                         Abyss                         |             Adventure World             |     -97     |
|                        Abyssus                        |              Energylandia               |     317     |
|                      Accelerator                      |              Drayton Manor              |     -49     |
|                        Acrobat                        |           Nagashima Spa Land  

### The Fuzzy Match worked! 

#### We fixed the coaster entries for the fuzzy-matched parks.

### Transformation 5: Drop all rides that are not in the Queue Times Database

In [25]:
# Drop every ride whose 'Park number' still contains '-'.
# The resolved fuzzy matches had their '-' stripped earlier in the notebook, so
# any remaining '-' marks a fuzzy match that was not confirmed.
# regex=False: '-' is matched as a literal character rather than as a pattern.
unresolved_mask = rcdb_df['Park number'].str.contains('-', regex=False)

# Take an explicit copy of the filtered rows. Warnings are fatal in this
# notebook, so a SettingWithCopyWarning raised by a later column assignment on
# a filtered slice could abort the run.
rcdb_df = rcdb_df[~unresolved_mask].copy()

# Number of rides left after the filter.
print(len(rcdb_df))

366


In [26]:
# List the rides that remain after the filter, alphabetically by ride name,
# with their location and park number.
desired_columns = ['Ride name', 'Location', 'Park number']
sorted_df = rcdb_df.sort_values(by='Ride name')

remaining_view = sorted_df[desired_columns]
print(
    tabulate(
        remaining_view,
        headers='keys',
        tablefmt='pretty',
        showindex=False,
    )
)


+-------------------------------------------------------+----------------------------------+-------------+
|                       Ride name                       |             Location             | Park number |
+-------------------------------------------------------+----------------------------------+-------------+
|                        ARTHUR                         |           Europa-Park            |     51      |
|                        Abyssus                        |           Energylandia           |     317     |
|                   Adventure Express                   |           Kings Island           |     60      |
|                       Afterburn                       |            Carowinds             |     59      |
|                  Alpenexpress Enzian                  |           Europa-Park            |     51      |
|                      Alpengeist                       |    Busch Gardens Williamsburg    |     23      |
|                   American Thunder 


## Next Step:

* Webscrape the Queue Times Pages for Ride Entries in each park.
* Add missing rides to the database.