In [4]:
# pip install lxml
#!pip install SciPy
#!pip install python-Levenshtein
#!pip install fuzzywuzzy

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.22.0-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.22.0
  Downloading Levenshtein-0.22.0-cp39-cp39-win_amd64.whl (101 kB)
     -------------------------------------- 101.0/101.0 kB 1.5 MB/s eta 0:00:00
Collecting rapidfuzz<4.0.0,>=2.3.0
  Downloading rapidfuzz-3.3.1-cp39-cp39-win_amd64.whl (1.8 MB)
     ---------------------------------------- 1.8/1.8 MB 6.1 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.22.0 python-Levenshtein-0.22.0 rapidfuzz-3.3.1


In [5]:
import time

# file system searches etc
import os
from os.path import basename, exists
import glob

# regular expressions
import re
import math
import random

# data frames and smart arrays etc
import pandas as pd
import numpy as np

# web access and html parsing (urllib, its submodules)
import requests
import urllib
import urllib.request
import urllib.error
from urllib.request import urlretrieve
import ssl


# parser of web pages
from bs4 import BeautifulSoup
# more efficient parsing.
import lxml

# for plots
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# Fuzzy string matching
from fuzzywuzzy import fuzz

# for accessing sql
import sqlite3

# fancy table printing
from tabulate import tabulate

In [6]:
# For testing: Make warnings fatal.

import warnings
warnings.filterwarnings("error")

In [7]:
# Constants

FINAL_DATA_DIRECTORY = "data/final"

# Check if the directory exists
if not os.path.exists(FINAL_DATA_DIRECTORY):
    # If it doesn't exist, create it
    os.makedirs(FINAL_DATA_DIRECTORY)
    print(f"Directory '{FINAL_DATA_DIRECTORY}' created.")


Directory 'data/final' created.


In [8]:
QUEUE_TIMES_API = "https://queue-times.com/en-US/pages/api"
WIKIPEDIA_PARK_RANKINGS = "https://en.wikipedia.org/wiki/List_of_amusement_park_rankings"
WIKIPEDIA_PARK_RANKINGS_FILE = FINAL_DATA_DIRECTORY+"/amusement_park_rankings.html"
RCDB_CSV_FILE = FINAL_DATA_DIRECTORY+"/coaster_db.csv"
RCSB_URL = "https://github.com/RobMulla/twitch-stream-projects/blob/main/001-rollercoaster-dataset/dbv1.csv"
QUEUE_TIMES_PARK_LIST_URL = "https://queue-times.com/en-US/parks?group=country"

In [9]:
# General Functions

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
}

# download: A good citizen download function
#     url - the url accessed
#     destination - local file to write
#
# respects code 429 and waits instead of pounding.

# Function to disable SSL certificate verification
def disable_ssl_verification():
    ssl._create_default_https_context = ssl._create_unverified_context

# Call the function to disable SSL verification
# This is to workaround an SSL certificate error I am getting.
disable_ssl_verification()

def download(url, destination, secure=True):
    try:
        # Send a GET request with headers
        response = requests.get(url, headers=headers, verify=secure)
        # Check if the request was successful
        if response.status_code == 200:
            with open(destination, 'w') as f:
                f.write(response.text)
            print("Downloaded " + destination)
        elif response.status_code == 429:
            # Extract the Retry-After header value
            retry_after = response.headers.get("Retry-After")
            if retry_after:
                # Convert the Retry-After value to seconds
                retry_after_seconds = int(retry_after)
                print("Rate limit exceeded. Waiting for " + str(retry_after_seconds) + " seconds.")
                time.sleep(retry_after_seconds)
                # Retry the request after waiting
                download(url, destination)
            else:
                print("Rate limit exceeded. Retry-After header not found.")
        else:
            print("Website returned " + str(response.status_code))
    except urllib.error.HTTPError:
        print("Failed to download " + url)
    except Exception:
        print("Error writing " + destination)
    return

def downloadFile(url, filename):
    if not exists(filename):
        local, _ = urlretrieve(url, filename)
        print("Downloaded " + local + "\n")
        return local, _

def downloadRawFile(url, filename):
    if not os.path.exists(filename):
        # Modify the URL to the raw content URL (replace "github.com" with "raw.githubusercontent.com")
        raw_url = url + "?raw=true"
        # Download the raw content
        local, _ = downloadFile(raw_url, filename)


### Useful functions.

In [10]:
def pretty_print_df(df, rows=None):
    if rows is not None:
        df = df.head(rows)  # Use head() to limit the DataFrame to the specified number of rows
    # Use Tabulate to show the data.
    print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False))
    print("\n" + "="*40 + "\n")  # Separation between DataFrames

def find_table_by_class(soup, className):
    table = soup.find('table', class_=className)

# Prints a title decorated by stars.
def formatFancyTitle(title):
    # Calculate the length of the title
    title_length = len(title)
    # format title with decoration
    title = "*" * (title_length + 4) + "\n" + f"* {title} *" + "\n" + "*" * (title_length + 4) + "\n"
    return title

def formatTestStat(value, dec=6):
    format_string = "{:."+str(dec)+"f}"
    return format_string.format(value)

# white
TABLE_BACKGROUND_COLOR = (255, 255, 255)
# black
TABLE_FONT_COLOR = (0, 0, 0)
TABLE_FONT_SIZE = 12
TABLE_WIDTH = 600
TABLE_HEIGHT = 800
TYPEFACE_FILE = "/Library/Fonts/Menlo.ttc"

def drawText(image_filename, text, title = "", width=TABLE_WIDTH, height=TABLE_HEIGHT, background_color=TABLE_BACKGROUND_COLOR, font_name=TYPEFACE_FILE, font_color=TABLE_FONT_COLOR, font_size=TABLE_FONT_SIZE):
    # Create an image with white background
    image = Image.new('RGB', (width, height), background_color)
    # Set the font style and size
    font = ImageFont.truetype(font_name, font_size)
    # Create a drawing context
    draw = ImageDraw.Draw(image)
    # Calculate the position to start drawing the table
    x, y = 10, 10
    # Add an optional title.
    if len(title) > 0:
        text = formatFancyTitle(title) + "\n" + text
    # Draw the table onto the image
    draw.text((x, y), text, font=font, fill=font_color)
    # Save the image
    image.save(image_filename)
    return text
    
def drawTable(image_filename, table, title="", width=TABLE_WIDTH, height=TABLE_HEIGHT, background_color=TABLE_BACKGROUND_COLOR, font_name=TYPEFACE_FILE, font_color=TABLE_FONT_COLOR, font_size=TABLE_FONT_SIZE):
    text = drawText(image_filename, table, title, width, height, background_color, font_name, font_color, font_size)
    print(text)
    return text
    
def drawReport(image_filename, text, title="", width=TABLE_WIDTH, height=TABLE_HEIGHT, background_color=TABLE_BACKGROUND_COLOR, font_name=TYPEFACE_FILE, font_color=TABLE_FONT_COLOR, font_size=TABLE_FONT_SIZE):
    if len(title) > 0:
        drawText(image_filename, text, title, width, height, background_color, font_name, font_color, font_size)
        print(formatFancyTitle(title))
    else:
        drawText(image_filename, text)
    print(text)
    return text
    
# formats long tables side by side.    
def combineTables(table1, table2, table3):
    # Split the input strings into rows
    table1_rows = table1.strip().split("\n")
    table2_rows = table2.strip().split("\n")
    table3_rows = table3.strip().split("\n")
    
    max_row_count = max(len(table1_rows), len(table2_rows), len(table3_rows))
    combined_table = ""
    
    for row_idx in range(max_row_count):
        # Get the corresponding rows from each table
        table1_row = table1_rows[row_idx] if row_idx < len(table1_rows) else ""
        table2_row = table2_rows[row_idx] if row_idx < len(table2_rows) else ""
        table3_row = table3_rows[row_idx] if row_idx < len(table3_rows) else ""

        # Combine the rows into a single row
        combined_row = f"{table1_row} {table2_row} {table3_row}".strip()

        # Add the combined row to the overall table
        combined_table += combined_row + "\n"

    return combined_table



In [11]:
# Download the file locally. (might not have Internet this weekend)
downloadRawFile(WIKIPEDIA_PARK_RANKINGS, WIKIPEDIA_PARK_RANKINGS_FILE)

# Workaround weird character conversion issue.
encoding = "utf-8"
# Read the HTML from the URL. Force "utf-8" encoding to workaround issue.
with open(WIKIPEDIA_PARK_RANKINGS_FILE, 'r', encoding = encoding) as rankings_file:
    # Read the contents of the file into a buffer
    rankings_html = rankings_file.read()
    

Downloaded data/final/amusement_park_rankings.html



#### Utility functions for pretty print of data frames (later) and extracting a particular table from soup.

### 2. Read the page using bs4. (Reading from a downloaded file, see note above)

In [12]:
# Parse the HTML using Beautiful Soup with the lxml parser
soup = BeautifulSoup(rankings_html, 'lxml')

# Find the tables
tables = soup.find_all('table')

# Initialize an empty list to store DataFrames for each table
dataframes = []

# Iterate through the first five tables (excluding the first table)
for table in tables[2:5]:
    # Convert the table to a DataFrame
    df = pd.read_html(str(table))[0]  # Assuming the first table is the one you want
    if 'Amusement Park' in df.columns:
        df['Amusement Park'] = df['Amusement Park'].apply(lambda x: x['title'] if isinstance(x, dict) and 'title' in x else x)
    dataframes.append(df)
    
    # Concatenate the DataFrames into one
all_wiki_amusement_parks_df = pd.concat(dataframes, ignore_index=True)

# Drop the 'Rank' column. Do this before finding duplicates.
all_wiki_amusement_parks_df.drop(columns=['Rank'], inplace=True)

# Remove duplicate rows based on all columns
all_wiki_amusement_parks_df.drop_duplicates(inplace=True)

# Replace NaN values with 0 in the combined DataFrame
all_wiki_amusement_parks_df.fillna(0, inplace=True)

# Convert all columns except 'Amusement Park' and 'Location' to numeric
columns_to_convert = all_wiki_amusement_parks_df.columns.difference(['Amusement park', 'Location'])
all_wiki_amusement_parks_df[columns_to_convert] = all_wiki_amusement_parks_df[columns_to_convert].apply(pd.to_numeric, errors='coerce')


### Let's see the full list of amusement parks!

In [13]:
all_wiki_amusement_parks_df = all_wiki_amusement_parks_df.sort_values(by='Amusement park')

all_wiki_amusement_parks_df['Park number'] = ""
all_wiki_amusement_parks_df['Best match'] = ""

# Display the combined DataFrame
print(tabulate(all_wiki_amusement_parks_df.head(10), headers='keys', tablefmt='pretty', showindex=False))


+--------------------------------------------------------+------------------------------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+----------+-----------+-----------+-----------+-----------+-------------+------------+
|                     Amusement park                     |              Location              |  2009[1]  |  2010[3]  |  2011[3]  |  2012[4]  |  2013[5]   |  2014[6]   |  2015[7]   |  2016[8]   |  2017[9]   |  2018[10]  | 2019[11] | 2020[12]  | 2021[13]  |  2010[2]  | 2021[14]  | Park number | Best match |
+--------------------------------------------------------+------------------------------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+----------+-----------+-----------+-----------+-----------+-------------+------------+
|          Alton Towers at Alton Towers Resort           |       Alton, Unit

In [14]:
print(all_wiki_amusement_parks_df[['Amusement park', 'Park number', 'Best match']])

                                       Amusement park Park number Best match
27                Alton Towers at Alton Towers Resort                       
40                                 Beto Carrero World                       
11                            Busch Gardens Tampa Bay                       
19                                Canada's Wonderland                       
10                                        Cedar Point                       
31                    Chessington World of Adventures                       
22                                        De Efteling                       
8                    Disney California Adventure Park                       
6   Disney's Animal Kingdom at Walt Disney World R...                       
3   Disney's Hollywood Studios at Walt Disney Worl...                       
4                                     Disneyland Park                       
20                Disneyland Park at Disneyland Paris                       

In [15]:
# Define the URL of the HTML page
url = QUEUE_TIMES_PARK_LIST_URL

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using Beautiful Soup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Initialize lists to store the data
    amusement_parks = []
    park_numbers = []
    countries = []

    # Find all the <div> panels
    div_panels = soup.find_all('div', class_='panel')

    # Iterate through the panels
    for panel in div_panels:
        # Find the <h2> tag inside the panel
        country_name = panel.find('h2').text.strip()
        
        # Find all the <a> tags inside the panel
        a_tags = panel.find_all('a', class_='panel-block')
        
        # Iterate through the <a> tags
        for a_tag in a_tags:
            # Extract the amusement park name and link
            amusement_park = a_tag.text.strip()
            link = a_tag['href'] 
            amusement_park = amusement_park.split("\n")[0]
            # Append the data to the lists
            amusement_parks.append(amusement_park)
            park_numbers.append(link.split("/")[-1])
            countries.append(country_name)

    # Create a DataFrame from the collected data
    QT_park_list_df = pd.DataFrame({
        'Amusement park': amusement_parks,
        'Park number': park_numbers,
        'Country': countries
    })

    # Display the DataFrame
    print(tabulate(QT_park_list_df, headers='keys', tablefmt='pretty', showindex=False))
else:
    print("Failed to fetch the URL.")

+-------------------------------------------+-------------+---------------------+
|              Amusement park               | Park number |       Country       |
+-------------------------------------------+-------------+---------------------+
|                Familypark                 |     322     |       Austria       |
|                Bellewaerde                |     276     |       Belgium       |
|               Bobbejaanland               |     311     |       Belgium       |
|            Plopsaland De Panne            |     54      |       Belgium       |
|              Walibi Belgium               |     14      |       Belgium       |
|            Beto Carrero World             |     319     |       Brazil        |
|            Canada's Wonderland            |     58      |       Canada        |
|            La Ronde, Montreal             |     48      |       Canada        |
|          Shanghai Disney Resort           |     30      |        China        |
|             Dj

### Fuzzy matching to move Park Number field from QT_park_list_df into all_wiki_amusement_parks_df.

We are trying to add a Park number column to all_wiki_amusement_parks_df['Amusement park']
all_wiki_amusement_parks_df and QT_park_list_df both have a field called 'Amusement park', but they may not match.
So here is the idea.

# First, we find all_wiki_amusement_parks_df['Amusement park'] in QT_park_list_df['Amusement park']. If there is a match put the Park number in all_wiki_amusement_parks_df 
# Second, if no match, then find QT_park_list_df['Amusement park'] in all_wiki_amusement_parks_df['Amusement park']. If there is a match put the Park number in all_wiki_amusement_parks_df 
# Third, generate a fuzzy string comparison using distance function of Levenshtein for every unmatched entry in all_wiki_amusement_parks_df to find the entry in QT_park_list_df with the smallest return value of distance. Add the park number and 
put the result of distance in a new column called 'Distance'. The 'Distance' field will only have values if there were no matches before.

# Note: Park number is a field in QT_park_list_df.
# The goal is to create a matching Park number foor every entry in all_wiki_amusement_parks_df and add it to the dataframe.

In [16]:
for w, row_wiki in all_wiki_amusement_parks_df.iterrows():
    amusement_park_w = row_wiki['Amusement park']
    
    # Initialize variables to store the best match and its index
    best_match_index = None
    best_match_score = -1
    
    # Iterate through each row in QT_park_list_df
    for q, row_QT in QT_park_list_df.iterrows():
        amusement_park_Q = row_QT['Amusement park']
        
        # exact match first
        if row_wiki['Park number'] == "" and amusement_park_w == amusement_park_Q:
            print(f"Exact match for {amusement_park_w} and {amusement_park_Q}")
            all_wiki_amusement_parks_df.at[w, 'Park number'] = row_QT['Park number']
            break
            
        # if queue time string is in wiki string
        elif row_wiki['Park number'] == "" and amusement_park_w.find(amusement_park_Q) != -1:
            print(f"Near match (wiki contains QT) for {amusement_park_w} and {amusement_park_Q}")
            all_wiki_amusement_parks_df.at[w, 'Park number'] = row_QT['Park number']
            break
            
        # if wiki string is in queue time string
        elif row_wiki['Park number'] == "" and amusement_park_Q.find(amusement_park_w) != -1:
            print(f"Near match (QT contains wiki) for {amusement_park_w} and {amusement_park_Q}")
            all_wiki_amusement_parks_df.at[w, 'Park number'] = row_QT['Park number']
            break
            
        # fuzzy match
        elif row_wiki['Park number'] == "":
            fuzz_score = fuzz.ratio(amusement_park_w, amusement_park_Q)
            print(f"Fuzzy match for {amusement_park_w} and {amusement_park_Q}: {fuzz_score}")
            
            if fuzz_score > best_match_score:
                best_match_score = fuzz_score
                best_match_index = q
                
    if all_wiki_amusement_parks_df.at[w, 'Park number'] == "" and best_match_index is not None:
        all_wiki_amusement_parks_df.at[w, 'Park number'] = QT_park_list_df.at[best_match_index, 'Park number']


Fuzzy match for Alton Towers at Alton Towers Resort and Familypark: 9
Fuzzy match for Alton Towers at Alton Towers Resort and Bellewaerde: 22
Fuzzy match for Alton Towers at Alton Towers Resort and Bobbejaanland: 8
Fuzzy match for Alton Towers at Alton Towers Resort and Plopsaland De Panne: 33
Fuzzy match for Alton Towers at Alton Towers Resort and Walibi Belgium: 16
Fuzzy match for Alton Towers at Alton Towers Resort and Beto Carrero World: 30
Fuzzy match for Alton Towers at Alton Towers Resort and Canada's Wonderland: 22
Fuzzy match for Alton Towers at Alton Towers Resort and La Ronde, Montreal: 23
Fuzzy match for Alton Towers at Alton Towers Resort and Shanghai Disney Resort: 35
Fuzzy match for Alton Towers at Alton Towers Resort and Djurs Sommerland: 24
Fuzzy match for Alton Towers at Alton Towers Resort and Fårup Sommerland: 20
Fuzzy match for Alton Towers at Alton Towers Resort and Legoland Billund: 20
Near match (wiki contains QT) for Alton Towers at Alton Towers Resort and Alto

Fuzzy match for Kings Island and Disneyland Hong Kong: 38
Fuzzy match for Kings Island and Cinecittà World: 37
Fuzzy match for Kings Island and Gardaland: 38
Fuzzy match for Kings Island and Legoland Japan: 38
Fuzzy match for Kings Island and Tokyo Disneyland: 43
Fuzzy match for Kings Island and Tokyo DisneySea: 22
Fuzzy match for Kings Island and Universal Studios Japan: 29
Fuzzy match for Kings Island and Six Flags Mexico: 29
Fuzzy match for Kings Island and Avonturenpark Hellendoorn: 27
Fuzzy match for Kings Island and Efteling: 30
Fuzzy match for Kings Island and Toverland: 38
Fuzzy match for Kings Island and Walibi Holland: 46
Fuzzy match for Kings Island and Energylandia: 50
Fuzzy match for Kings Island and Legoland Korea: 38
Fuzzy match for Kings Island and Ferrari Land: 42
Fuzzy match for Kings Island and Parque de Atracciones Madrid: 30
Fuzzy match for Kings Island and Parque Warner Madrid: 12
Fuzzy match for Kings Island and PortAventura Park: 21
Fuzzy match for Kings Island 

Fuzzy match for PortAventura Park at PortAventura World and Disneyland Hong Kong: 17
Fuzzy match for PortAventura Park at PortAventura World and Cinecittà World: 30
Fuzzy match for PortAventura Park at PortAventura World and Gardaland: 21
Fuzzy match for PortAventura Park at PortAventura World and Legoland Japan: 19
Fuzzy match for PortAventura Park at PortAventura World and Tokyo Disneyland: 15
Fuzzy match for PortAventura Park at PortAventura World and Tokyo DisneySea: 11
Fuzzy match for PortAventura Park at PortAventura World and Universal Studios Japan: 23
Fuzzy match for PortAventura Park at PortAventura World and Six Flags Mexico: 4
Fuzzy match for PortAventura Park at PortAventura World and Avonturenpark Hellendoorn: 41
Fuzzy match for PortAventura Park at PortAventura World and Efteling: 13
Fuzzy match for PortAventura Park at PortAventura World and Toverland: 25
Fuzzy match for PortAventura Park at PortAventura World and Walibi Holland: 11
Fuzzy match for PortAventura Park at 

Fuzzy match for Tivoli Gardens and Legoland Deutschland: 29
Fuzzy match for Tivoli Gardens and Movie Park Germany: 31
Fuzzy match for Tivoli Gardens and Phantasialand: 30
Fuzzy match for Tivoli Gardens and Rulantica: 17
Fuzzy match for Tivoli Gardens and Disneyland Hong Kong: 24
Fuzzy match for Tivoli Gardens and Cinecittà World: 28
Fuzzy match for Tivoli Gardens and Gardaland: 43
Fuzzy match for Tivoli Gardens and Legoland Japan: 36
Fuzzy match for Tivoli Gardens and Tokyo Disneyland: 33
Fuzzy match for Tivoli Gardens and Tokyo DisneySea: 21
Fuzzy match for Tivoli Gardens and Universal Studios Japan: 32
Fuzzy match for Tivoli Gardens and Six Flags Mexico: 13
Fuzzy match for Tivoli Gardens and Avonturenpark Hellendoorn: 31
Fuzzy match for Tivoli Gardens and Efteling: 27
Fuzzy match for Tivoli Gardens and Toverland: 43
Fuzzy match for Tivoli Gardens and Walibi Holland: 36
Fuzzy match for Tivoli Gardens and Energylandia: 15
Fuzzy match for Tivoli Gardens and Legoland Korea: 29
Fuzzy matc

In [18]:
# Display the combined DataFrame
desired_columns = ['Amusement park', 'Park number', 'Best match']
sorted_df = all_wiki_amusement_parks_df.sort_values(by='Amusement park')
print(tabulate(sorted_df[desired_columns], headers='keys', tablefmt='pretty', showindex=False))
desired_columns = ['Amusement park', 'Park number']
sorted_df = QT_park_list_df.sort_values(by='Amusement park')
print(tabulate(sorted_df[desired_columns], headers='keys', tablefmt='pretty', showindex=False))


+------------------------------------------------------------+-------------+------------+
|                       Amusement park                       | Park number | Best match |
+------------------------------------------------------------+-------------+------------+
|            Alton Towers at Alton Towers Resort             |      1      |            |
|                     Beto Carrero World                     |     319     |            |
|                  Busch Gardens Tampa Bay                   |     24      |            |
|                    Canada's Wonderland                     |     58      |            |
|                        Cedar Point                         |     50      |            |
|              Chessington World of Adventures               |      3      |            |
|                        De Efteling                         |     160     |            |
|              Disney California Adventure Park              |     17      |            |
|    Disne