# Scraping Data from McDonald's SG Online Menu

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Map each category label to the URL of its listing page. The two parallel
# lists and the per-category names below are all derived from this mapping,
# so labels and URLs stay in the same order.
category_pages = {
    'beverages': 'https://www.mcdonalds.com.sg/food-category/beverages',
    'breakfast': 'https://www.mcdonalds.com.sg/food-category/breakfast',
    'burgers': 'https://www.mcdonalds.com.sg/food-category/burgers',
    'chicken': 'https://www.mcdonalds.com.sg/food-category/chicken',
    'desserts': 'https://www.mcdonalds.com.sg/food-category/desserts',
    'light': 'https://www.mcdonalds.com.sg/food-category/eat-light-under-500-calories',
    'family': 'https://www.mcdonalds.com.sg/food-category/for-the-family',
    'sharing': 'https://www.mcdonalds.com.sg/food-category/sharing',
    'saladsandwraps': 'https://www.mcdonalds.com.sg/food-category/salads-and-wraps',
    'sides': 'https://www.mcdonalds.com.sg/food-category/sides',
}

categories_str = list(category_pages)
categories_url = list(category_pages.values())

# Expose every category URL under its own name as well.
(beverages, breakfast, burgers, chicken, desserts,
 light, family, sharing, saladsandwraps, sides) = categories_url


# check that all the urls allow us to retrieve data
def check_response(urls):
    """Check that every URL in *urls* responds with HTTP status 200.

    Each URL is requested once, in order, and checking stops at the first
    response whose status code is not 200.

    Args:
        urls: Iterable of URL strings passed to ``requests.get``.

    Returns:
        The first response whose status code is not 200, or the string
        ``'Responses are all 200'`` when every request returned 200
        (including when *urls* is empty).
    """
    for url in urls:
        # Keep the response so the object handed back is the one that was
        # actually checked, instead of the result of a second request.
        response = requests.get(url)
        if response.status_code != 200:
            return response
    return 'Responses are all 200'
        

# Run the check against every category page; as the last expression of the
# cell, the returned value is shown as the cell output.
check_response(categories_url)

'Responses are all 200'

In [3]:
# For every category page, collect the elements with class
# 'category-item__name' and store them twice: under a module-level name
# taken from categories_html (e.g. beverages_html) and, in the same order as
# categories_url, in html_list.
categories_html = [
    'beverages_html',
    'breakfast_html',
    'burgers_html',
    'chicken_html',
    'desserts_html',
    'light_html',
    'family_html',
    'sharing_html',
    'saladsandwraps_html',
    'sides_html',
]
html_list = []

for page_url, variable_name in zip(categories_url, categories_html):
    category_page = BeautifulSoup(requests.get(page_url).text, 'lxml')
    name_tags = category_page.find_all(class_='category-item__name')
    globals()[variable_name] = name_tags
    html_list.append(name_tags)

In [4]:
from urllib.parse import urljoin

category = []
item = []
itemurl = []

# Walk the categories in lockstep with the elements scraped for them and
# record, per item: its category label, its displayed name and the full URL
# of its own page.
for label, name_tags in zip(categories_str, html_list):
    for tag in name_tags:
        category.append(label)
        item.append(tag.text)
        # urljoin gives the same result as plain concatenation for
        # root-relative hrefs ('/food/...') and also copes with hrefs that
        # are already absolute or lack the leading slash.
        itemurl.append(urljoin('https://www.mcdonalds.com.sg', tag.a['href']))

In [5]:
# One empty accumulator list per output column. The grouping lists are built
# first and then unpacked into the individual column names, so
# nutrition_list / allergen_list hold the very same list objects, in this
# order.
nutrition_list = [[] for _ in range(8)]
(energy_kcal, protein_g, totalfat_g, saturatedfat_g,
 cholesterol_mg, carbohydrates_g, dietaryfibres_g, sodium_mg) = nutrition_list

allergen_list = [[] for _ in range(10)]
(egg, fish, milk, peanuts, soyabeans,
 treenuts, wheat, gluten, msg, sulphite) = allergen_list

itemurl_list = []

In [6]:
def _nutrition_values(page):
    """Return the value strings of the page's 'card__table' table, in table order."""
    table = page.find('table', class_='card__table')
    cell_texts = [cell.text for cell in table.find_all('td')]
    # Cells with no text at all are dropped before stripping, so a
    # whitespace-only cell still counts as a position.
    cell_texts = [text.strip() for text in cell_texts if text]
    # Every second cell, starting with the second, is read as a value; only
    # the part before the indented line break inside the cell is kept.
    return [text.split('\n            ')[0] for text in cell_texts[1::2]]


def _checkbox_flags(checkbox_list):
    """Return one bool per <span> inside *checkbox_list*, in document order."""
    # NOTE(review): the flag is read from the serialized tag: False when the
    # 30th character of str(span) is '>', True otherwise. Presumably a ticked
    # entry has a longer opening tag -- confirm against the page markup.
    return [str(span)[29:30] != '>' for span in checkbox_list.find_all('span')]


for url in itemurl:
    # Download and parse each item page once; the nutrition table and both
    # checkbox lists are read from the same parsed document.
    page = BeautifulSoup(requests.get(url).text, 'lxml')

    # pull nutrition facts from each item's individual page
    values = _nutrition_values(page)
    if len(values) < len(nutrition_list):
        raise ValueError(
            f'expected at least {len(nutrition_list)} nutrition values, '
            f'found {len(values)}: {url}'
        )
    # int(float(...)) drops any fractional part of the published figure.
    nutrition_row = [int(float(value)) for value in values[:len(nutrition_list)]]

    # pull allergen and food sensitivities data from each item's individual page
    checkbox_lists = page.find_all('ul', class_='checkbox-list')
    allergen_flags = _checkbox_flags(checkbox_lists[0])
    sensitivity_flags = _checkbox_flags(checkbox_lists[1])
    flags = allergen_flags + sensitivity_flags
    if len(flags) != len(allergen_list):
        raise ValueError(
            f'expected {len(allergen_list)} allergen/sensitivity flags, '
            f'found {len(flags)}: {url}'
        )

    # Append only after the whole page parsed cleanly, so every column list
    # always has exactly one entry per processed item.
    for column, value in zip(nutrition_list, nutrition_row):
        column.append(value)
    for column, flag in zip(allergen_list, flags):
        column.append(flag)

In [7]:
# Gather the scraped columns under their output names (insertion order is
# the column order) and turn them into one table with a row per menu item.
menu_columns = dict(
    category=category,
    item=item,
    energy_kcal=energy_kcal,
    protein_g=protein_g,
    totalfat_g=totalfat_g,
    saturatedfat_g=saturatedfat_g,
    cholesterol_mg=cholesterol_mg,
    carbohydrates_g=carbohydrates_g,
    dietaryfibres_g=dietaryfibres_g,
    sodium_mg=sodium_mg,
    egg=egg,
    fish=fish,
    milk=milk,
    peanuts=peanuts,
    soyabeans=soyabeans,
    treenuts=treenuts,
    wheat=wheat,
    gluten=gluten,
    msg=msg,
    sulphite=sulphite,
)
menu = pd.DataFrame(menu_columns)

# Preview the first five rows.
menu.head(5)

Unnamed: 0,category,item,energy_kcal,protein_g,totalfat_g,saturatedfat_g,cholesterol_mg,carbohydrates_g,dietaryfibres_g,sodium_mg,egg,fish,milk,peanuts,soyabeans,treenuts,wheat,gluten,msg,sulphite
0,beverages,Dasani® Drinking Water,0,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False
1,beverages,Coca-Cola® Original Taste Less Sugar (Small),109,0,0,0,0,27,0,7,False,False,False,False,False,False,False,False,False,False
2,beverages,Coca-Cola® Original Taste Less Sugar (Medium),152,0,0,0,0,38,0,10,False,False,False,False,False,False,False,False,False,False
3,beverages,Coca-Cola® Original Taste Less Sugar (Large),221,0,0,0,0,55,0,15,False,False,False,False,False,False,False,False,False,False
4,beverages,Coca-Cola® Zero Sugar (Small),0,0,0,0,0,0,0,29,False,False,False,False,False,False,False,False,False,False


In [9]:
# Show the dtype pandas inferred for each column of the scraped table.
menu.dtypes

category           object
item               object
energy_kcal         int64
protein_g           int64
totalfat_g          int64
saturatedfat_g      int64
cholesterol_mg      int64
carbohydrates_g     int64
dietaryfibres_g     int64
sodium_mg           int64
egg                  bool
fish                 bool
milk                 bool
peanuts              bool
soyabeans            bool
treenuts             bool
wheat                bool
gluten               bool
msg                  bool
sulphite             bool
dtype: object

In [13]:
# Save the table to 'mcdmenu.xlsx' (a relative path); index=False leaves the
# DataFrame's row index out of the sheet.
menu.to_excel(r'mcdmenu.xlsx', index = False)