In [0]:
%pip install emoji langdetect langcodes iso-639 requests

import requests
import json
import pandas as pd
import emoji
from langdetect import detect, DetectorFactory
from langcodes import Language
from iso639 import languages
from datetime import datetime, timedelta
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
DetectorFactory.seed = 42  # Ensure reproducibility

#Step 1: Load JSON Data for Previous Day

# Define date range from May 1, 2025 to yesterday
start_date = datetime(2025, 5, 1)
end_date = datetime.utcnow() + timedelta(hours=8) - timedelta(days=1)

date_range = [(start_date + timedelta(days=i)).strftime('%Y%m%d')
              for i in range((end_date - start_date).days + 1)]

container_url = "https://wqd7007.blob.core.windows.net/bronze-webscrape"
sas_token = "sv=2024-11-04&ss=bfqt&srt=sco&sp=rwdlacupiytfx&se=2026-06-30T20:37:08Z&st=2025-05-18T12:37:08Z&spr=https&sig=ztkYcXLKHQ9nC5CE3PThs1OY%2FTDHHzSZ8JD4J6JUc1s%3D"

# Build SAS URL
for date_str in date_range:
    file_name = f"agoda_reviews_{date_str}.json"
    sas_url = f"{container_url}/{file_name}?{sas_token}"
    print(f"\nProcessing file for: {date_str}")

    response = requests.get(sas_url)
    if response.status_code != 200:
        print(f"Skipping missing file: {file_name}")
        continue

    data = response.json()
    df = pd.DataFrame(data)
    if not data or df.empty or 'content' not in df.columns:
        print(f"Skipping empty or invalid file: {file_name}")
        continue

    #Initial Cleaning
    df_copied = df.copy(deep=True)
    df_copied.drop(columns=['userImage', 'reviewCreatedVersion'], errors='ignore', inplace=True)

    #Remove emojis
    def remove_emojis(text):
        return emoji.replace_emoji(str(text), replace='').strip()

    df_copied['content_no_emojis'] = df_copied['content'].apply(remove_emojis)

    #Language detection using langdetect
    def detect_language(text):
        try:
            return detect(str(text))
        except:
            return "Unknown"

    df_copied['language_code'] = df_copied['content_no_emojis'].astype(str).apply(detect_language)

    #Convert ISO codes to full language names
    def detect_language_name(code):
        try:
            return languages.get(part1=code).name
        except:
            return "Unknown"

    df_copied['language'] = df_copied['language_code'].apply(detect_language_name)

    #Correct English misclassifications via keyword heuristics
    english_terms = [
        'amazing','advertising','advertisement','average','awful','awesome','awesomeness','a+',
        'best','better','brilliant','bravo','comfortable','company','convenience','convenient',
        'coupon','customer service','deal','discount','easy to','easy booking','efficient',
        'excellent','exceptional','expensive','experience','fake','false','fantastic','fast',
        'friendly','fraud','fraudulent','good','great','happy','hassle','helpful','horrible',
        'hidden cost','like','liked','love','loved','luv it','marvelous','nice','nice hotel',
        'not bad','no comment','not working','okay','okey','ok','outstanding','perfect','poor',
        'process','quality','quick','recommended','reliable','response','satisfied','scam',
        'scammer','seamless','simple','slow','smart','so far','star','stunning','super','thank',
        'thx','to use','transaction','useful','useless','value','very bad','very nice','very well',
        'wow','wonderful','worse','worst','works','yeah','yes'
    ]

    df_cp = df_copied.copy(deep=True)
    not_english = df_cp["language_code"] != "en"
    for term in english_terms:
        df_cp.loc[
            not_english &
            df_cp['content_no_emojis'].str.lower().str.contains(term, na=False),
            'language_code'
        ] = 'en'
    df_cp['language'] = df_cp['language_code'].apply(detect_language_name)

    #Manually map zh-cn and zh-tw
    df_cp.loc[df_cp['language_code'] == 'zh-cn', 'language'] = 'Chinese'
    df_cp.loc[df_cp['language_code'] == 'zh-tw', 'language'] = 'Chinese (Traditional)'

    # Step 8: Detect Burmese content by Unicode block
    def contains_burmese(text):
        return any('\u1000' <= ch <= '\u109F' for ch in str(text))

    df_cp.loc[(df_cp['language_code'] == 'Unknown') & df_cp['content_no_emojis'].apply(contains_burmese), 'language_code'] = 'my'
    df_cp.loc[df_cp['language_code'] == 'my', 'language'] = 'Burmese'

    #Convert circled emoji letters to plain English characters
    emoji_to_char = {
        '🅐': 'A', '🅑': 'B', '🅒': 'C', '🅓': 'D', '🅔': 'E', '🅕': 'F', '🅖': 'G', '🅗': 'H', '🅘': 'I',
        '🅙': 'J', '🅚': 'K', '🅛': 'L', '🅜': 'M', '🅝': 'N', '🅞': 'O', '🅟': 'P', '🅠': 'Q', '🅡': 'R',
        '🅢': 'S', '🅣': 'T', '🅤': 'U', '🅥': 'V', '🅦': 'W', '🅧': 'X', '🅨': 'Y', '🅩': 'Z'
    }
    def convert_emoji_to_text(text):
        return ''.join(emoji_to_char.get(ch, ch) for ch in str(text))

    df_cp['content_no_emojis'] = df_cp['content_no_emojis'].astype(str).apply(convert_emoji_to_text)

    #Fallback detection for short strings or Unknowns
    short_text_lang_dict = {
        'nice': 'en', 'bonjour': 'fr', 'hola': 'es', 'ciao': 'it',
        'hallo': 'de', 'salamat': 'tl', 'xin chào': 'vi'
    }
    def detect_lang_code(text):
        try:
            text = str(text).strip().lower()
            if len(text.split()) <= 2:
                if text in short_text_lang_dict:
                    return short_text_lang_dict[text]
            return detect(text)
        except:
            return "Unknown"

    # Apply fallback to Unknowns
    mask_unknown = df_cp['language'] == 'Unknown'
    df_cp.loc[mask_unknown, 'content_no_emojis'] = df_cp.loc[mask_unknown, 'content_no_emojis'].str.lower()
    df_cp.loc[mask_unknown, 'language_code'] = df_cp.loc[mask_unknown, 'content_no_emojis'].apply(detect_lang_code)
    df_cp.loc[mask_unknown, 'language'] = df_cp.loc[mask_unknown, 'language_code'].apply(detect_language_name)

    #Drop Unknown language type as they are empty now
    data = df_cp[~(df_cp['language']=='Unknown')].reset_index(drop=True)

    #Print results
    print("Before: ", df_cp.shape)
    print("After: ", data.shape)
    print("Difference: ", df_cp.shape[0] - data.shape[0])

    #Final output checks
    print(f" Number of languages after dropping Unknown: {data['language'].nunique()}")
    lang_distribution = data['language'].value_counts().reset_index()
    lang_distribution.columns = ['language', 'count']
    display(lang_distribution)

    spark_df = spark.createDataFrame(data)
    spark_df.count() 

    spark_df.write.format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable("silver_dataprocessing.default.silver_agoda_reviews_details")


In [0]:
%sql
SELECT * FROM silver_dataprocessing.default.silver_agoda_reviews_details


reviewId,userName,content,score,thumbsUpCount,at,replyContent,repliedAt,appVersion,content_no_emojis,language_code,language
f05e4047-5b89-4ac3-8d71-e26b9fc6ba5d,Manoj Jain,"Agoda is a popular hotel booking app known for its wide selection of accommodations worldwide and competitive prices. Whether you’re planning a luxury getaway or a budget trip, Agoda offers great deals with a user-friendly interface that makes booking simple and fast.",5,0,2025-06-03T06:36:15,Thank you for sharing your positive experience with us. Your feedback is much appreciated. We're glad to know that you're happy and we are looking forward for your next journey with us! Have a wonderful day!,2025-06-03T06:37:34,,"Agoda is a popular hotel booking app known for its wide selection of accommodations worldwide and competitive prices. Whether you’re planning a luxury getaway or a budget trip, Agoda offers great deals with a user-friendly interface that makes booking simple and fast.",en,English
219f7878-1c36-42f4-86ac-5716ac9a5928,RatedR9o7,DO NOT USE THIS APP OR BOOK WITH AGODA!!! I booked a room and canceled 3 days before and they still charged me even though it says anything before 24hrs can be canceled. I contacted the hotel manager to dispute it and they said they have no record of my booking so agoda stole my money. and they change the prices on you for the hotel listing.,1,0,2025-06-03T04:54:49,"Hi RatedR9o7, thank you for reaching out to us. We strive to provide the best experience for our customers. Please share your booking ID, email address, and concern through our Help Center page, ""https://www.agoda.com/info/contact.html"". We look forward to assisting you! ^AA",2025-06-03T05:03:49,,DO NOT USE THIS APP OR BOOK WITH AGODA!!! I booked a room and canceled 3 days before and they still charged me even though it says anything before 24hrs can be canceled. I contacted the hotel manager to dispute it and they said they have no record of my booking so agoda stole my money. and they change the prices on you for the hotel listing.,en,English
b35650c5-8f18-4bd7-9faa-3425d4c15746,Sirisuk Jintaviwatwong,ลงแล้วทำเครื่องค้างๆ เปิดแอพอื่นแล้ว crashed เปิดไม่ได้,1,0,2025-06-03T04:53:39,"เราขออภัยในความไม่สะดวกที่เกิดขึ้น เราพยายามอย่างเต็มที่เพื่อมอบประสบการณ์ที่ดีที่สุดให้กับลูกค้าของเรา โปรดแจ้งรหัสการจอง ที่อยู่อีเมล และข้อกังวลของคุณผ่านหน้าศูนย์ช่วยเหลือของเรา ""https://www.agoda.com/info/contact.html"" เราหวังว่าจะได้ช่วยเหลือคุณ ^ Ethan",2025-06-03T04:59:45,,ลงแล้วทำเครื่องค้างๆ เปิดแอพอื่นแล้ว crashed เปิดไม่ได้,en,English
d577621d-aadd-4595-9425-88c8492fa4ba,Jiang Zong Zhe,PayNow QR Payment does not work. Generated QR code cannot be detected,1,0,2025-06-03T03:45:34,"Hi Jiang Zong Zhe, We're sorry for any inconvenience. Please let us assist you with your booking. Send us detailed information by email to googlesupport@agoda.com, and include your case number #1015102 in the subject. We will get back to you as soon as possible. TK",2025-06-03T03:59:07,13.19.0,PayNow QR Payment does not work. Generated QR code cannot be detected,en,English
53d4952f-5fbd-4c82-af3c-8c35c973347d,Alimsar Limqas,feedback,5,0,2025-06-03T00:21:24,Thank you for sharing your positive experience with us. Your feedback is much appreciated. We're glad to know that you're happy and we are looking forward for your next journey with us! Have a wonderful day!,2025-06-03T00:27:34,,feedback,en,English
