In [4]:
# importing packages
import re
import time
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy.stats as stats
import seaborn as sns
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Scraping the Wikipedia 2018/2016 top 100 Table

The code below scrapes the '100 most visited cities in 2016/2018 according to Euromonitor' table from Wikipedia.

In [12]:
# Download the page once and reuse the HTML for both BeautifulSoup and pandas,
# instead of fetching it a second time through pd.read_html(url).
wiki_url = 'https://en.wikipedia.org/wiki/List_of_cities_by_international_visitors'
r = requests.get(wiki_url, timeout=30)
r.raise_for_status()  # stop here on an HTTP error rather than parsing an error page
text = r.text
soup = BeautifulSoup(text, "html.parser")  # explicit parser avoids GuessedAtParserWarning

# Recent pandas versions deprecate passing a literal HTML string, so wrap it in StringIO.
tables = pd.read_html(StringIO(text))
# NOTE(review): index 2 is assumed to be the Euromonitor table; verify if the page layout changes.
top_cities = tables[2]

# Keep only rows that actually carry a Euromonitor rank, then the three columns used later.
top_cities = top_cities.dropna(subset=['Rank (Euromonitor)'])
final_top = top_cities[['Rank (Euromonitor)', 'City', 'Country / Territory']]

# The index is written deliberately: later cells expect (and drop) the resulting "Unnamed: 0" column.
final_top.to_csv('top_cities.csv')

# Adding the BudgetYourTrip.com links to the top_cities.csv

In [5]:
# Locations of the hand-curated link list and of this step's output
city_links = "city_links.txt"
output_file = "cities.csv"

# Ranked city table produced by the Wikipedia scraping step
cities = pd.read_csv("top_cities.csv")

# One URL per line; the first line of the file is skipped
with open(city_links, "r", encoding="utf-8") as links_file:
    city_urls = [raw_line.strip() for raw_line in links_file.readlines()[1:]]

# The column assignment below is positional, so the two lengths must agree
if len(city_urls) != len(cities):
    raise ValueError("Error: The number of cities and URLs do not match!")

cities["Links"] = city_urls

# Written before "Unnamed: 0" is dropped, so cities.csv still contains that column
cities.to_csv(output_file, index=False)

cities = cities.drop(columns=["Unnamed: 0"])

pd.DataFrame(cities)

Unnamed: 0,Rank (Euromonitor),City,Country / Territory,Links
0,1.0,Hong Kong,Hong Kong,https://www.budgetyourtrip.com/hong-kong
1,2.0,Bangkok,Thailand,https://www.budgetyourtrip.com/thailand
2,3.0,London,United Kingdom,https://www.budgetyourtrip.com/united-kingdom/...
3,4.0,Macau,Macau,https://www.budgetyourtrip.com/budgetreportadv...
4,5.0,Singapore,Singapore,https://www.budgetyourtrip.com/singapore
...,...,...,...,...
95,96.0,Jeju,South Korea,https://www.budgetyourtrip.com/south-korea/che...
96,97.0,Porto,Portugal,https://www.budgetyourtrip.com/portugal/porto
97,98.0,Rhodes,Greece,https://www.budgetyourtrip.com/greece/rhodes
98,99.0,Rio de Janeiro,Brazil,https://www.budgetyourtrip.com/brazil/rio-de-j...


# Scraping stay duration costs from BudgetYourTrip.com (with written consent from BudgetYourTrip.com)

The Terms of Use ask users not to run web scrapers against their data, so I emailed BudgetYourTrip.com to ask whether I could scrape the site for pricing information. They gave me permission; however, I will not be publicly displaying the data scraped from their website.

In [22]:
# Load the city table (including its "Links" column) from cities.csv
df = pd.read_csv("cities.csv")

# Collects one dict of duration costs per link that is scraped successfully
scraped_duration_data = []

# clean_cost() cleans the cost data and converts it to a float

def clean_cost(text):
    """Convert a scraped cost string such as "1,234.50" to a float.

    Thousands separators (commas) and surrounding whitespace are removed
    before conversion.

    Parameters
    ----------
    text : str
        Raw text of a cost element.

    Returns
    -------
    float
        The parsed value, or ``np.nan`` when ``text`` is not a string
        (e.g. ``None``) or does not contain a valid number.
    """
    try:
        return float(text.replace(",", "").strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input such as None;
        # ValueError: text that is not a number (e.g. "" or "n/a").
        return np.nan

# scraping the costs from a single URL
def scrape_duration_costs(url, timeout=30):
    """Scrape the daily / weekly / monthly trip costs from one page.

    The first seven ``<span class="curvalue">`` elements on the page are
    mapped, in document order, to the labels below.
    NOTE(review): that ordering is taken from the original mapping; verify
    it against the live page layout.

    Parameters
    ----------
    url : str
        Page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the whole run.

    Returns
    -------
    dict or None
        ``{"Links": url, <label>: float, ...}`` on success; ``None`` when the
        request fails or fewer than seven cost values are found.
    """
    labels = (
        "Daily Cost",
        "1 Week (Individual)",
        "2 Weeks (Individual)",
        "1 Month (Individual)",
        "1 Week (Couple)",
        "2 Weeks (Couple)",
        "1 Month (Couple)",
    )

    try:
        response = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Timeouts are a RequestException subclass, so they are reported here too.
        print(f"Error scraping {url}: {e}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    costs = soup.find_all("span", class_="curvalue")

    # Not enough values to fill every label: treat the page as unusable.
    if len(costs) < len(labels):
        return None

    record = {"Links": url}
    for label, span in zip(labels, costs):
        record[label] = clean_cost(span.text)
    return record

# scraping the costs from multiple URLs
# Visit each link in turn; links that yield no data are simply left out
for link in df["Links"]:
    duration_costs = scrape_duration_costs(link)
    if duration_costs:
        scraped_duration_data.append(duration_costs)

    # Pause between requests
    time.sleep(1)

# One row per successfully scraped link
scraped_duration_df = pd.DataFrame(scraped_duration_data)

# Persist the results for the merge step
scraped_duration_df.to_csv("scraped_duration_costs.csv", index=False)

# Scraping the "Mid-range" column from the "___ on a Budget" Tables on BudgetYourTrip.com

In [26]:
# Set up Selenium WebDriver (headless mode) because the tables are in JavaScript
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run without opening a browser
chrome_options.add_argument("--disable-gpu")
# Chrome's --window-size switch takes "width,height"; the "1920x1080" form is not a valid value
chrome_options.add_argument("--window-size=1920,1080")

# Initialize WebDriver (webdriver_manager supplies the matching chromedriver binary)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# Function to clean and convert cost values to float
def clean_cost1(value):
    """Convert a scraped table cell such as "$1,234 (€1,100)" to a float.

    Cleaning steps:
    - remove currency conversions given in parentheses
    - remove "$" signs and thousands separators (commas), matching clean_cost()
    - average ranges written with a hyphen or en/em dash ("2-7" -> 4.5);
      non-numeric fragments around the dash are ignored
    - convert whatever remains to a float

    Parameters
    ----------
    value : str or None
        Raw cell text.

    Returns
    -------
    float or None
        The parsed (or averaged) value; ``None`` for empty input or text that
        cannot be interpreted as a number.
    """
    if not value:
        return None  # Nothing to parse

    # Strip parenthesised conversions first, then symbols that float() rejects
    value = re.sub(r"\(.*?\)", "", value)
    value = value.replace("$", "").replace(",", "").strip()

    # Handle ranges (e.g., "2-7" -> (2+7)/2 = 4.5); also accept en/em dashes
    parts = [part.strip() for part in re.split(r"[-\u2013\u2014]", value)]
    if len(parts) > 1:
        # Only plain decimals count; a malformed token such as "1.2.3" is
        # skipped rather than allowed to raise from float()
        numbers = [
            float(part) for part in parts
            if re.fullmatch(r"\d+\.?\d*|\.\d+", part)
        ]
        if numbers:
            return sum(numbers) / len(numbers)

    # Single value
    try:
        return float(value)
    except ValueError:
        return None  # Not a number

# Function to scrape the mid-range column from a table
def scrape_mid_range(url, wait_seconds=3):
    """Scrape the third column of the first table on a JavaScript-rendered page.

    The page is loaded with the module-level Selenium ``driver``; after a
    fixed wait, the rendered HTML is parsed and each table row with at least
    three ``<td>`` cells contributes ``{first cell text: cleaned third cell}``.

    Parameters
    ----------
    url : str
        Page to load.
    wait_seconds : float, optional
        Seconds to sleep after loading so that JavaScript can populate the table.

    Returns
    -------
    dict or None
        ``{"URL": url, <category>: float or None, ...}``; ``None`` when the
        page has no table or any error occurs while scraping.
    """
    try:
        driver.get(url)
        time.sleep(wait_seconds)  # Wait for JavaScript to load

        soup = BeautifulSoup(driver.page_source, "html.parser")
        table = soup.find("table")

        if not table:
            # the line below is for debugging purposes/ to see which URL doesn't have a table
            print(f"Skipping {url}: Table not found")
            return None

        # <tbody> is optional in HTML; fall back to the table itself so a
        # table without it is still read instead of failing on None.
        body = table.find("tbody") or table
        city_data = {"URL": url}

        for row in body.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) >= 3:
                category = cells[0].text.strip()  # First column: Category
                mid_range_value = clean_cost1(cells[2].text.strip())  # Third column: Mid-Range

                city_data[category] = mid_range_value

        return city_data

    except Exception as e:
        # Deliberate best-effort: report the failing URL and carry on with the rest
        print(f"Error scraping {url}: {e}")
        return None

# Function to loop through all links in cities.csv
def scrape_all_links(csv_file, output_file="mid_range_costs.csv"):
    """Scrape the mid-range costs for every link listed in ``csv_file``.

    Parameters
    ----------
    csv_file : str
        CSV file containing a "Links" column with the pages to scrape.
    output_file : str, optional
        Where the combined results are written (without the index).

    Returns
    -------
    pandas.DataFrame
        One row per successfully scraped link, with the "URL" key renamed to
        "Links". Links for which scrape_mid_range() returns nothing are omitted.
    """
    df_cities = pd.read_csv(csv_file)

    scraped_data = []

    for url in df_cities["Links"]:
        city_costs = scrape_mid_range(url)
        if city_costs:
            scraped_data.append(city_costs)

        time.sleep(1)  # Prevent rate-limiting

    # Convert scraped data into a DataFrame and rename "URL" to "Links"
    df_scraped = pd.DataFrame(scraped_data).rename(columns={"URL": "Links"})

    # Save the results to a CSV file
    df_scraped.to_csv(output_file, index=False)

    # Return the frame so the caller can inspect or display it; a bare
    # expression inside a function body displays nothing.
    return df_scraped
# Run the scraper for all the links; always shut the browser down afterwards,
# even if scraping raises, so no headless Chrome process is left running.
try:
    scrape_all_links("cities.csv")
finally:
    driver.quit()


In [44]:
# Replace the header row of mid_range_costs.csv with hand-written column names
MID_RANGE_COLUMNS = [
    "Links",
    "Accommodation1",
    "Local Transportation1",
    "Food2",
    "Entertainment1",
    "Alcohol2",
]

# The file's own header line is skipped and the remaining rows are read without a header
mid_range_costs = (
    pd.read_csv("mid_range_costs.csv", header=None, skiprows=1)
    .set_axis(MID_RANGE_COLUMNS, axis="columns")
)

mid_range_costs.head()


Unnamed: 0,Links,Accommodation1,Local Transportation1,Food2,Entertainment1,Alcohol2
0,https://www.budgetyourtrip.com/hong-kong,77.0,8.78,54.0,95.0,20.5
1,https://www.budgetyourtrip.com/thailand,38.0,13.0,30.0,20.0,9.5
2,https://www.budgetyourtrip.com/united-kingdom/...,158.0,33.0,75.0,41.0,19.5
3,https://www.budgetyourtrip.com/budgetreportadv...,114.0,1.0,37.0,11.0,
4,https://www.budgetyourtrip.com/singapore,99.0,10.0,46.0,33.0,27.5


# Merging the .csv files together

In [52]:
# Inputs produced by the earlier steps
cities_df = pd.read_csv("cities.csv")
costs_df = pd.read_csv("scraped_duration_costs.csv")

# A city is identified by its name together with its country / territory
city_key = ["City", "Country / Territory"]

# Attach the duration costs to each city, keeping the first row per city
cities_costs = cities_df.merge(costs_df, how="left", on="Links")
cities_costs_cleaned = cities_costs.drop_duplicates(subset=city_key, keep="first")

# Attach the mid-range costs, de-duplicate again, and drop the leftover index column
final_df = (
    cities_costs_cleaned
    .merge(mid_range_costs, how="left", on="Links")
    .drop_duplicates(subset=city_key, keep="first")
    .drop(columns=["Unnamed: 0"])
)

# Write the combined table to disk
final_df.to_csv("final_trip_costs.csv", index=False)