In [0]:
!pip install google_play_scraper

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import json
import os
import time
from datetime import datetime, timedelta, timezone
from io import StringIO

from azure.storage.blob import BlobClient
from google_play_scraper import reviews, Sort

#CONFIGURATION
# Google Play package name of the app whose reviews are collected.
APP_ID = "com.agoda.mobile.consumer"
# The job works on a UTC+8 calendar (presumably Malaysia time, matching country='my').
# Deriving the date from an aware "now" gives the same result on a UTC kernel and
# stays correct when the kernel's local timezone is something else.
LOCAL_TZ = timezone(timedelta(hours=8))
YESTERDAY = (datetime.now(LOCAL_TZ) - timedelta(days=1)).date()
# Value passed as `count` to each reviews() call.
BATCH_SIZE = 200
# Safety cap on how many reviews are kept before paging stops.
MAX_TOTAL = 100000

#Azure Blob Storage info
# NOTE(review): Azure account URLs normally look like
# https://<account>.blob.core.windows.net - the account name appears to be missing; confirm.
ACCOUNT_URL = "https://blob.core.windows.net"
# Secrets must never be stored in the notebook. The SAS token is read from the
# environment; it is None when the variable is unset, in which case the upload
# cannot authenticate.
# NOTE(review): any token that was ever committed in this file should be revoked and rotated.
SAS_TOKEN = os.environ.get("AZURE_STORAGE_SAS_TOKEN")
CONTAINER_NAME = "bronze-webscrape"

# SCRAPE GOOGLE PLAY REVIEWS
# Pages through the newest-first review feed and keeps only reviews dated YESTERDAY.
# Paging stops at the first review older than YESTERDAY, when the feed is exhausted,
# or once MAX_TOTAL reviews have been collected.
print(f"Scraping Agoda reviews for {YESTERDAY}...")

all_reviews = []
next_token = None

while True:
    result, next_token = reviews(
        APP_ID,
        lang='en',
        country='my',
        sort=Sort.NEWEST,
        count=BATCH_SIZE,
        continuation_token=next_token
    )

    # An empty page means there is nothing left to fetch. The library appears to hand
    # back a (truthy) token object even when the feed is exhausted, so without this
    # check the loop would keep requesting empty pages forever.
    if not result:
        print("No more reviews returned. Stopping.")
        break

    for r in result:
        review_date = r.get("at")
        # Skip entries without a usable timestamp; they cannot be assigned to a day.
        if not isinstance(review_date, datetime):
            continue

        # NOTE(review): `at` is compared as-is against the UTC+8 based YESTERDAY -
        # confirm the library reports it on the same clock.
        if review_date.date() < YESTERDAY:
            print("Reached reviews before yesterday. Stopping.")
            # Clearing the token makes the outer loop terminate after this page.
            next_token = None
            break

        if review_date.date() == YESTERDAY:
            # datetime objects are not JSON serializable; store ISO-8601 strings instead.
            r["at"] = review_date.isoformat()
            if isinstance(r.get("repliedAt"), datetime):
                r["repliedAt"] = r["repliedAt"].isoformat()
            all_reviews.append(r)

    print(f"Collected {len(all_reviews)} reviews so far...")

    if not next_token or len(all_reviews) >= MAX_TOTAL:
        break

    # Pause between pages to avoid hammering the endpoint.
    time.sleep(1)

# UPLOAD TO AZURE BLOB STORAGE
# Writes the collected reviews as one JSON document named after YESTERDAY,
# replacing any blob of the same name already in the container.
print("Uploading to Azure Blob Storage...")

# Fail early with a clear message: without a credential the client would fall back
# to an anonymous request and the service would answer with an opaque auth error.
if not SAS_TOKEN:
    raise RuntimeError("SAS_TOKEN is empty; cannot upload to Azure Blob Storage.")

filename = f"agoda_reviews_{YESTERDAY.strftime('%Y%m%d')}.json"

# Serialize straight to UTF-8 bytes; ensure_ascii=False keeps non-ASCII review text readable.
byte_data = json.dumps(all_reviews, indent=2, ensure_ascii=False).encode('utf-8')

# The context manager closes the client's underlying HTTP session once the upload is done.
with BlobClient(account_url=ACCOUNT_URL,
                container_name=CONTAINER_NAME,
                blob_name=filename,
                credential=SAS_TOKEN) as blob:
    blob.upload_blob(byte_data, overwrite=True)

print(f"Upload complete: {filename} to container '{CONTAINER_NAME}'")


Scraping Agoda reviews for 2025-05-31...
Reached reviews before yesterday. Stopping.
Collected 12 reviews so far...
Uploading to Azure Blob Storage...
Upload complete: agoda_reviews_20250531.json to container 'bronze-webscrape'
