In [334]:
# Step 1: Import the required libraries
## pip install requests bs4 pandas json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json 
import datetime 

In [336]:
# Step 2: Fetch the webpage content 
url = 'https://gofood.co.id/en/jakarta/restaurant/mcdonald-s-kemang-b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82/reviews'
response = requests.get(url)
html_content = response.content
today_date = datetime.date.today()

In [54]:
# Step 3: Parse the HTML content 
soup = BeautifulSoup(html_content, 'html.parser')

##### Step 4: Identify the HTML structure of the reviews 
1. Use browser developer tools (usually opened with F12)
2. Inspect the review elements to find patterns in the HTML structure
3. For example, if the reviews are contained within specific tags with unique classes or ids.

In this case, we are using 

In [236]:
print(soup)

<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><meta content="width=device-width, initial-scale=1" name="viewport"/><meta content="Browse around for good food, pick what you like, and GoFood can deliver it to you." name="description"/><link href="https://gofood.co.id/en/jakarta/restaurant/mcdonald-s-kemang-b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82/reviews" hreflang="en" rel="alternate"/><link href="https://gofood.co.id/jakarta/restaurant/mcdonald-s-kemang-b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82/reviews" hreflang="id" rel="alternate"/><link href="https://gofood.co.id/jakarta/restaurant/mcdonald-s-kemang-b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82/reviews" hreflang="x-default" rel="alternate"/><meta content="summary_large_image" name="twitter:card"/><meta content="830829963645099" property="fb:app_id"/><meta content="Browse around for good food, pick what you like, and GoFood can deliver it to you." property="og:description"/><meta content="https://gofood.co.id/en/jakarta/restaurant/mcdon

In [244]:
# Step 5: Extract the review data 
script_tags = soup.find_all('script', type = 'application/ld+json')

In [246]:
print(script_tags)

[<script type="application/ld+json">{"@context":"https://schema.org","@type":"WebSite","url":"https://gofood.co.id","potentialAction":[]}</script>, <script type="application/ld+json">{"@context":"https://schema.org","@type":"Organization","name":"GoFood","url":"https://gofood.co.id","logo":"https://i.gojekapi.com/darkroom/gofood-id/v2/images/uploads/f1a8bef4-02a8-4de5-ab95-aa2b1c5e9883_gofood-icon.png","sameAs":["https://www.instagram.com/gofoodindonesia"]}</script>, <script type="application/ld+json">{"@context":"https://schema.org","@type":"Restaurant","@id":"b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82","name":"McDonald&apos;s, Kemang","description":"","url":"/en/jakarta/restaurant/mcdonald-s-kemang-b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82","priceRange":"40k-100k","hasMenu":{"@type":"Menu","name":"McDonald&apos;s, Kemang","description":"","hasMenuSection":[{"@type":"MenuSection","name":"Promo Hari Ini","description":"McDonald&apos;s, Kemang Promo Hari Ini","hasMenuItem":[{"@type":"MenuItem","na

In [380]:
reviews_data = []

# Extract and parse JSON content
for script in script_tags:
    json_content = json.loads(script.string)
    if 'review' in json_content:
        review = json_content['review']
        for review in reviews:
            review_data = {
                "reviewed_at": review['datePublished'],
                "review_date": datetime.datetime.strptime(review['datePublished'], "%Y-%m-%dT%H:%M:%S.%f%z").date(),
                "review_text": review['reviewBody'],
                "merchant_id_jc": json_content['@id'],
                "merchant_name": review['name'],
                "reviewer_name": review['author']['name'],
                "review_rating": review['reviewRating']['ratingValue'],
                "scrap_date": today_date
            } 
            reviews_data.append(review_data)

In [382]:
# Create a Pandas DataFrame
reviews_df = pd.DataFrame(reviews_data)

# Display the DataFrame
reviews_df.head()

Unnamed: 0,reviewed_at,review_date,review_text,merchant_id_jc,merchant_name,reviewer_name,review_rating,scrap_date
0,2024-07-02T23:25:02.622196+07:00,2024-07-02,Sengaja Tidak Lengkap?\nPembelian sebelumnya b...,b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald&apos;s, Kemang",Agnes,1,2024-07-08
1,2024-06-27T16:37:19.390543+07:00,2024-06-27,"I asked for one extra sauce, not given. not th...",b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald&apos;s, Kemang",J******,2,2024-07-08
2,2024-06-26T14:16:13.258087+07:00,2024-06-26,Padahal saya blg air minum nya gausah dan diga...,b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald&apos;s, Kemang",S***,5,2024-07-08
3,2024-06-25T12:50:10.354828+07:00,2024-06-25,big macnya kagak ada sayuran segala??\npadahal...,b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald&apos;s, Kemang",Narendra Nabhastala,2,2024-07-08
4,2024-06-20T22:24:08.389291+07:00,2024-06-20,Gak sesuai permintan,b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald&apos;s, Kemang",B****,1,2024-07-08


In [384]:
aggregateRatings_data = []

# Extract and parse JSON content
for script in script_tags:
    json_content = json.loads(script.string)
    if 'aggregateRating' in json_content:
        aggregateRating = json_content['aggregateRating']
        aggregateRating_data = {
            "merchant_name": json_content['name'],
            "merchant_id_jc": json_content['@id'],
            "rating_count": aggregateRating['ratingCount'],
            "rating_value": aggregateRating['ratingValue'],
            "scrap_date": today_date
        } 
        aggregateRatings_data.append(aggregateRating_data)

In [386]:
# Create a Pandas DataFrame
aggregate_ratings_df = pd.DataFrame(aggregateRatings_data)

# Display the DataFrame
print(aggregate_ratings_df)

             merchant_name                        merchant_id_jc  \
0  McDonald&apos;s, Kemang  b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82   

   rating_count  rating_value  scrap_date  
0         91244           4.8  2024-07-08  


In [390]:
# Merge DataFrames
merged_df = pd.merge(reviews_df, aggregate_ratings_df, how='outer', on=['merchant_id_jc', 'merchant_name', 'scrap_date'])

merged_df['merchant_name'] = merged_df['merchant_name'].str.replace("&apos;", "'")
column_order = [
    'scrape_date',
    'merchant_id_jc',
    'merchant_name',
    'rating_count',
    'rating_value',
    'review_date',
    'reviewed_at',
    'reviewer_name',
    'review_text',
    'review_rating'
]

# Display the merged DataFrame
merged_df

Unnamed: 0,reviewed_at,review_date,review_text,merchant_id_jc,merchant_name,reviewer_name,review_rating,scrap_date,rating_count,rating_value
0,2024-07-02T23:25:02.622196+07:00,2024-07-02,Sengaja Tidak Lengkap?\nPembelian sebelumnya b...,b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald's, Kemang",Agnes,1,2024-07-08,91244,4.8
1,2024-06-27T16:37:19.390543+07:00,2024-06-27,"I asked for one extra sauce, not given. not th...",b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald's, Kemang",J******,2,2024-07-08,91244,4.8
2,2024-06-26T14:16:13.258087+07:00,2024-06-26,Padahal saya blg air minum nya gausah dan diga...,b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald's, Kemang",S***,5,2024-07-08,91244,4.8
3,2024-06-25T12:50:10.354828+07:00,2024-06-25,big macnya kagak ada sayuran segala??\npadahal...,b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald's, Kemang",Narendra Nabhastala,2,2024-07-08,91244,4.8
4,2024-06-20T22:24:08.389291+07:00,2024-06-20,Gak sesuai permintan,b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald's, Kemang",B****,1,2024-07-08,91244,4.8
5,2024-06-20T17:37:28.12715+07:00,2024-06-20,Saya sudah order paket dan nugget dan tambah 1...,b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald's, Kemang",jully,1,2024-07-08,91244,4.8
6,2024-06-18T21:16:10.728027+07:00,2024-06-18,French Fries nya mantabs masih panas dan empuk...,b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald's, Kemang",A**** R**,5,2024-07-08,91244,4.8
7,2024-06-18T03:01:43.812024+07:00,2024-06-18,pepek burger nya cuma di kasih daging sama rot...,b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald's, Kemang",Daeng,1,2024-07-08,91244,4.8
8,2024-06-15T21:12:32.318665+07:00,2024-06-15,"minta ayam paha atas, yg dikasih paha bawah",b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald's, Kemang",Maria Angeline,3,2024-07-08,91244,4.8
9,2024-06-13T16:26:58.227643+07:00,2024-06-13,"Nasi basi ini yg lu kash,ga dksh saos,nasi kmr...",b7bc28fc-3a60-4e6f-87bf-9e7b4444ce82,"McDonald's, Kemang",Gilang ridiawan,1,2024-07-08,91244,4.8


In [392]:
file_name = 'scrape_gofood_mcdonald_kemang_20240708.csv'
merged_df.to_csv(file_name, index = 'False')