In [1]:
# import necessary libraries

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from requests_html import AsyncHTMLSession

In [3]:
# download the page at the url and parse its HTML into a BeautifulSoup object (soup)

In [4]:
# Download the Wikipedia article and parse its HTML so the table can be searched.
url = "https://en.wikipedia.org/wiki/List_of_the_largest_fast_food_restaurant_chains"

# A timeout keeps the cell from hanging indefinitely if the server never answers.
page = requests.get(url, timeout=30)

# Fail loudly on an HTTP error (4xx/5xx) instead of silently parsing an error page.
page.raise_for_status()

soup = BeautifulSoup(page.text, 'lxml')

In [5]:
# acquire table from soup as its own subtext

In [6]:
# Grab the first <table> matched by the "wikitable sortable" class string.
table = soup.find('table', class_ = "wikitable sortable")

# find() returns None when nothing matches; stop here with a clear message
# rather than letting a later cell fail with an opaque AttributeError.
if table is None:
    raise ValueError('No <table class="wikitable sortable"> was found in the downloaded page')

In [7]:
# create a Dictionary to store each row as we extract data from soup

In [8]:
# Column-oriented accumulator: every header maps to its own (initially empty)
# list, which receives one value per scraped table row.
data = {
    column: []
    for column in ('Country of Origin', 'Name', 'Number of Locations', 'Revenue')
}

In [9]:
# go through for loop for getting each individual value and adding to the dictionary

In [10]:
# Matches bracketed markers such as "[1]" so they can be removed from the
# scraped text. Defined once, since it is identical for every row.
pattern = r'\[[^\]]*\]'

# Number of leading <td> cells consumed per row:
# country, name, number of locations, revenue (in that order).
expected_cells = 4

# [1:] skips the first <tr> (presumably the header row).
for row in table.find_all('tr')[1:]:
    # Look the cells up once per row instead of once per column.
    cells = row.find_all('td')

    # A row without any <td> carries no record. Skip it explicitly instead of
    # swallowing the IndexError and re-appending the previous row's values.
    if not cells:
        continue

    # Extract the text of the cells that exist and pad a short row with ''
    # so every column list grows by exactly one entry per row.
    texts = [cell.get_text().strip() for cell in cells[:expected_cells]]
    texts += [''] * (expected_cells - len(texts))
    country, name, locNum, revenue = texts

    # append each of the values to the dictionary
    data['Country of Origin'].append(country)
    data['Name'].append(name)
    data['Number of Locations'].append(re.sub(pattern, '', locNum))
    data['Revenue'].append(re.sub(pattern, '', revenue))

In [11]:
# get rid of ': num' in the data

In [12]:
# Keep only the text before the first ":" in each location count, updating
# the existing list in place.
location_counts = data['Number of Locations']
for position, text in enumerate(location_counts):
    # partition() yields the whole string as its head when no ":" is present.
    location_counts[position] = text.partition(":")[0]

In [13]:
# Keep only the text before the first ":" in each revenue entry. Slice
# assignment rewrites the contents of the existing list object.
revenues = data['Revenue']
revenues[:] = [text.split(":")[0] for text in revenues]

In [14]:
# now pass the dictionary to a DataFrame

In [15]:
# Column headers, in the order they should appear in the frame.
column_order = ['Country of Origin', 'Name', 'Number of Locations', 'Revenue']

# Transpose the per-column lists into one tuple per table row.
records = list(zip(*(data[column] for column in column_order)))

chain_df = pd.DataFrame(records, columns=column_order)

In [16]:
# Remove pandas' column/row display limits, then render the frame as the
# cell's output.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)
chain_df

Unnamed: 0,Country of Origin,Name,Number of Locations,Revenue
0,United States,McDonald's,"40,275 (2022)",US$23.2 billion (2021)
1,United States,Subway,"36,999 (2021)",US$16.1 billion (2020)
2,United States,Starbucks,"36,170 (2023)",US$32.3 billion (2022)
3,China,Mixue Ice Cream & Tea,"36,153 (2023)",RMB13.6 billion (2021)
4,United States,KFC,"26,934 (2021)",US$31.3 billion (2021)
5,United States,Burger King,"19,247 (2021)",US$23.4 billion (2021)
6,United States,Pizza Hut,"18,848 (2021)",US$17.7 billion (2021)
7,United States,Domino's,"18,381 (2021)",US$12.9 billion (2021)
8,China,Luckin Coffee,"13,273 (2023)",RMB13.3 billion (2022)
9,United States,Dunkin',11300,US$1.37 billion (2020)


In [17]:
# now to start with cleaning up the dataset, we will reset the index to start at 1 instead of 0

In [18]:
# Label the rows 1..N. Assigning a fresh RangeIndex (instead of adding 1 to
# the current index) keeps this cell idempotent: re-running it no longer
# shifts the labels by another step each time.
chain_df.index = pd.RangeIndex(start=1, stop=len(chain_df) + 1)

In [19]:
# now to fill all missing value cells with 'Unknown'

In [20]:
# Swap every cell that is exactly the empty string for the placeholder
# 'Unknown', modifying chain_df itself.
chain_df.replace(to_replace='', value='Unknown', inplace=True)

In [21]:
# Make sure no column or row is elided, then render the cleaned frame as the
# cell's output.
for unlimited_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(unlimited_option, None)
chain_df

Unnamed: 0,Country of Origin,Name,Number of Locations,Revenue
1,United States,McDonald's,"40,275 (2022)",US$23.2 billion (2021)
2,United States,Subway,"36,999 (2021)",US$16.1 billion (2020)
3,United States,Starbucks,"36,170 (2023)",US$32.3 billion (2022)
4,China,Mixue Ice Cream & Tea,"36,153 (2023)",RMB13.6 billion (2021)
5,United States,KFC,"26,934 (2021)",US$31.3 billion (2021)
6,United States,Burger King,"19,247 (2021)",US$23.4 billion (2021)
7,United States,Pizza Hut,"18,848 (2021)",US$17.7 billion (2021)
8,United States,Domino's,"18,381 (2021)",US$12.9 billion (2021)
9,China,Luckin Coffee,"13,273 (2023)",RMB13.3 billion (2022)
10,United States,Dunkin',11300,US$1.37 billion (2020)


In [22]:
# now to finally export dataset as csv file

In [23]:
# Destination of the exported CSV.
# NOTE(review): this is an absolute path on one specific machine; consider a
# configurable output directory so the notebook runs elsewhere.
output_csv_path = r"C:\Users\miria\OneDrive\Documents\DS Learn\output datasets\largestfastfoodrestaurantchains(webscraping).csv"

# index=False leaves the row labels out of the file.
chain_df.to_csv(output_csv_path, index=False)