In [None]:
import pandas as pd, re, json, httpx, asyncio
from lxml import html
from webcrawler import (
    ProductInfo,
    Earphones,
    Phone,
    Tablet,
    Watch,
    Screen,
    Laptop,
)
from dataclasses import asdict

In [49]:
async def async_inspect(
    url,
    *,
    client: httpx.AsyncClient | None = None,
    xpath: str | None = None,
    sub_content: str | list[str] | None = None,
    semaphore: asyncio.Semaphore | None = None,
    encoding: str | None = None,
    retries: int = 3,
    retry_delay: float = 2.0,
):
    """
    Asynchronously fetch ``url`` and inspect its HTML content.

    Args:
        url: Address to request with a GET.
        client: Existing ``httpx.AsyncClient`` to reuse. When omitted, a
            temporary client (10 s timeout, browser-like headers) is created
            and closed for this call.
        xpath: Optional XPath expression evaluated against the parsed document.
        sub_content: One regex, or a list of regexes, used to shrink the page
            before parsing. All matches are joined with newlines; if nothing
            matches, the whole page is parsed instead. Requires ``encoding``.
        semaphore: Semaphore held while the request is in flight. When omitted,
            a new ``Semaphore(5)`` is created for this call only, so it cannot
            limit concurrency across calls; pass a shared one for that.
        encoding: Codec used to decode the response body into text. When
            omitted, the raw bytes are handed to the parser.
        retries: Number of attempts made when a request/transport error occurs.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The result of ``xpath`` when given, otherwise the pretty-printed HTML
        as a string. ``None`` when the arguments are inconsistent, the server
        answers with an error status, every attempt fails, or parsing raises.
        Failures are reported with ``print`` rather than raised.
    """
    if sub_content and not encoding:
        print("Cannot use 'sub_content' regex without encoding bytes-like content.")
        return

    if semaphore is None:  # limit number of concurrent processes
        semaphore = asyncio.Semaphore(5)

    async def get_response(http_client: httpx.AsyncClient):
        """Return a successful response, or None after reporting the failure."""
        last_exception = None
        for attempt in range(1, retries + 1):
            try:
                response = await http_client.get(url)
                response.raise_for_status()
                return response
            except httpx.HTTPStatusError as e:
                # An error status is not retried, and the error page must not
                # be returned to the caller for parsing.
                print(f"Inspecting {url} failed >> {e}")
                return None
            except (httpx.RequestError, httpx.TimeoutException) as e:
                last_exception = e
                if attempt < retries:  # no pointless wait after the last try
                    print(f"{repr(e)}. Retry after {retry_delay}sec...")
                    await asyncio.sleep(retry_delay)
        print(f"Inspecting {url} failed {retries} times >> {last_exception}")
        return None

    async with semaphore:
        if client is not None:  # global client
            resp = await get_response(client)
        else:
            async with httpx.AsyncClient(
                timeout=10.0,
                headers={
                    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:141.0) Gecko/20100101 Firefox/141.0",
                    "Connection": "keep-alive",
                },
            ) as own_client:
                resp = await get_response(own_client)

    if resp is None:  # failure already reported by get_response
        return

    # Decode once; every later step works on this value.
    page = resp.content if not encoding else resp.content.decode(encoding)

    # reduce html content
    content = page
    if sub_content:
        patterns = [sub_content] if isinstance(sub_content, str) else sub_content
        matches = []
        for pattern in patterns:
            matches.extend(re.findall(pattern, page, re.S))
        if matches:  # otherwise fall back to the full page
            content = "\n".join(matches)

    # inspect
    try:
        source = html.fromstring(content)
        if xpath:
            return source.xpath(xpath)
        return html.tostring(source, pretty_print=True, encoding="unicode")
    except Exception as e:
        print(f"Error occurs while inspecting {url} >> {e}")
        return

In [52]:
data = await async_inspect(
    "https://www.thegioididong.com/dtdd/samsung-galaxy-a07-4gb-64gb?rg=1",
    encoding="utf-8",
    sub_content=[
        r'<script[^>]*id="productld"[^>]*>.*?</script',
        r'<section[^>]*class="detail detailv2"[^>]*>.*?</section>',
    ],
    # [^>] and .*? are for non-greedy
)
print(data)

<html><head><script type="application/ld+json" id="productld">{"@context":"https://schema.org","@type":"Product","name":"Điện thoại Samsung Galaxy A07 4GB/64GB","url":"https://www.thegioididong.com/dtdd/samsung-galaxy-a07-4gb-64gb","image":{"@type":"ImageObject","contentUrl":"https://cdn.tgdd.vn/Products/Images/42/341802/samsung-galaxy-a07-violet-thumb-600x600.jpg"},"description":"Mua điện thoại Samsung Galaxy A07 4GB/64GB giá tốt, chính hãng, tặng sạc nhanh 25W trị giá 540K, mua trả chậm 0% lãi suất - trả trước từ 0đ. Mua ngay!","sku":"341802","mpn":"341802","brand":{"@type":"Brand","name":["Samsung"]},"review":null,"aggregateRating":null,"additionalProperty":[{"@type":"PropertyValue","name":"Hệ điều hành","value":"Android 15"},{"@type":"PropertyValue","name":"Chip xử lý (CPU)","value":"MediaTek Helio G99"},{"@type":"PropertyValue","name":"Tốc độ CPU","value":"2 nhân 2.2 GHz & 6 nhân 2.0 GHz"},{"@type":"PropertyValue","name":"Chip đồ họa (GPU)","value":"Đang cập nhật"},{"@type":"Pro

In [53]:
urls = ["https://www.thegioididong.com/laptop/lenovo-loq-15irx9-i5-83dv003cvn"] * 10
p = []
for i in urls:
    full_data = dict()
    specs_data = []

    fetched = await async_inspect(
        "https://www.thegioididong.com/tai-nghe/tai-nghe-tws-jbl-wave-beam-2",
        xpath="//script[@id='productld']|//div[@class='box-specifi']/ul/li[.//span[@class='circle']"
        + "or .//a[contains(@class,'tzLink')] or .//span[@class='']]",
        encoding="utf-8",
    )
    data = [
        re.sub(r"\s{2,}", ", ", i.text_content().strip())
        for i in fetched
        if ":" in i.text_content()
    ]

    json_content = [i for i in data if re.findall(r"{|}", i)]
    full_data = json.loads(json_content[0])

    tags_content = [
        (i.split(":")[0].strip(), "".join(i.split(":")[1:]).removeprefix(",").strip())
        for i in data
        if not re.findall(r"{|}", i)
    ]
    specs_data.extend(tags_content)

    p.append(specs_data)

p

[[('Thời lượng pin tai nghe', 'Dùng 10 giờ - Sạc 2 giờ'),
  ('Thời lượng pin hộp sạc', 'Dùng 40 giờ - Sạc 2 giờ'),
  ('Cổng sạc', 'Type-C'),
  ('Công nghệ âm thanh',
   'Driver 8 mm, Smart Ambient, JBL Pure Bass Sound, Active Noise Cancellation'),
  ('Tương thích', 'macOS, Android, iOS, Windows'),
  ('Ứng dụng kết nối', 'JBL Headphones'),
  ('Tiện ích',
   'Voice Aware, 4 Micro chống ồn, Chống nước & bụi IP54, Sạc nhanh'),
  ('Kết nối cùng lúc', '2 thiết bị'),
  ('Công nghệ kết nối', 'Bluetooth 5.3'),
  ('Điều khiển', 'Cảm ứng chạm'),
  ('Phím điều khiển',
   'Phát/dừng chơi nhạc, Chuyển bài hát, Bật trợ lí ảo, Nhận/Ngắt cuộc gọi'),
  ('Kích thước', 'Dài 3.3 cm - Rộng 2.4 cm - Cao 2.1 cm'),
  ('Khối lượng', '4.3 g'),
  ('Thương hiệu của', 'Mỹ'),
  ('Sản xuất tại', 'Trung Quốc'),
  ('Hãng', 'JBL. Xem thông tin hãng')],
 [('Thời lượng pin tai nghe', 'Dùng 10 giờ - Sạc 2 giờ'),
  ('Thời lượng pin hộp sạc', 'Dùng 40 giờ - Sạc 2 giờ'),
  ('Cổng sạc', 'Type-C'),
  ('Công nghệ âm thanh',
   '

In [None]:
def parse_common_info(data: dict):
    """Build the fields shared by every product from its JSON-LD dictionary.

    Args:
        data: Parsed JSON-LD product object. ``sku``, ``name``,
            ``offers.price``, ``brand.name[0]`` and ``url`` are required and
            raise ``KeyError`` when absent. ``aggregateRating`` and
            ``additionalProperty`` may be missing or null.

    Returns:
        ``asdict`` of the populated ``ProductInfo``. ``category`` is only
        assigned when one of the rules below applies:

        * URL path segment ``laptop`` / ``may-tinh-bang`` /
          ``man-hinh-may-tinh`` -> ``Laptop`` / ``Tablet`` / ``Screen``.
        * Otherwise, when a size/weight property exists: ``dtdd`` is
          ``Smartphone`` from 135 g, else ``Phone``; ``dong-ho-thong-minh`` is
          ``Smartwatch`` above 33.5 mm wide, else ``Smartband``; anything else
          is ``Headphone`` above 100 g, else ``Earphones`` when a
          ``Jack cắm`` property exists and ``Earbuds`` when it does not.
        * Otherwise, segment ``tai-nghe`` -> ``Earphones``.
    """
    prd = ProductInfo(
        sku=data["sku"].strip(),
        name=data["name"].strip(),
        price=int(data["offers"]["price"]),
        brand=data["brand"]["name"][0].strip(),
        url=data["url"].strip(),
    )

    rating = data.get("aggregateRating")
    if rating:
        prd.rating = rating["ratingValue"]
        # NOTE(review): the lowercase key is what this code has always read;
        # schema.org spells it "reviewCount", so fall back to that spelling.
        count_key = "reviewcount" if "reviewcount" in rating else "reviewCount"
        prd.reviews_count = int(rating[count_key])

    # The property list may be null in the payload; treat that as empty.
    properties = data.get("additionalProperty") or []

    def first_value(*names):
        """Value of the first property whose name is one of ``names``."""
        for prop in properties:
            if prop["name"] in names:
                return prop["value"]
        return None

    released = first_value("Thời điểm ra mắt", "Thời gian ra mắt", "Năm ra mắt")
    prd.release_date = released.strip() if released is not None else None

    # check device type
    dimensions = first_value("Kích thước, khối lượng", "Khối lượng")
    has_jack = any(prop["name"] == "Jack cắm" for prop in properties)

    # "https://host/<segment>/..." -> index 3 after splitting on "/".
    url_parts = prd.url.split("/")
    segment = url_parts[3] if len(url_parts) > 3 else None

    by_segment = {  # classify by url hint
        "laptop": "Laptop",
        "may-tinh-bang": "Tablet",
        "man-hinh-may-tinh": "Screen",
    }

    if segment in by_segment:
        prd.category = by_segment[segment]
    elif dimensions is not None:  # classify by weight and width
        g_vals = re.findall(r"(\d+\.?\d*)\s?(?:g|\()", dimensions)
        gam = float(g_vals[0]) if g_vals else None  # actual weight value in gam

        if segment == "dtdd":
            prd.category = "Smartphone" if gam and gam >= 135.0 else "Phone"
        elif segment == "dong-ho-thong-minh":
            mm_vals = re.findall(r"Ngang\s?(\d+\.?\d*)\s?mm", dimensions)
            mm = float(mm_vals[0]) if mm_vals else None  # actual width value in mm
            prd.category = "Smartwatch" if mm and mm > 33.5 else "Smartband"
        elif gam and gam > 100.0:
            prd.category = "Headphone"
        else:
            prd.category = "Earphones" if has_jack else "Earbuds"
    elif segment == "tai-nghe":
        prd.category = "Earphones"

    return asdict(prd)

In [None]:
def parse_specs_info(data: dict, device: str):
    def cpu():
        value = [
            j.strip()
            for i, j in data
            if i in ["Công nghệ CPU", "Chip xử lý (CPU)", "CPU"]
        ]
        return value[0] if value else None

    def cpu_cores():
        value = [j.strip() for i, j in data if i == "Số nhân"]
        return value[0] if value else None

    def cpu_threads():
        value = [j.strip() for i, j in data if i == "Số luồng"]
        return value[0] if value else None

    def cpu_speed():
        value = [j.strip() for i, j in data if i == "Tốc độ CPU"]
        return value[0] if value else None

    def gpu():
        value = [
            j.strip()
            for i, j in data
            if i in ["Chip đồ hoạ (GPU)", "Chip đồ họa (GPU)", "Card màn hình"]
        ]
        return value[0] if value else None

    def ram():
        value = [j.strip() for i, j in data if i == "RAM"]
        return value[0] if value else None

    def max_ram():
        value = [j.strip() for i, j in data if i == "Hỗ trợ RAM tối đa"]
        return value[0] if value else None

    def ram_type():
        value = [j.strip() for i, j in data if i == "Loại RAM"]
        return value[0] if value else None

    def ram_bus():
        value = [j.strip() for i, j in data if i == "Tốc độ Bus RAM"]
        return value[0] if value else None

    def storage():
        value = [
            j.strip()
            for i, j in data
            if i in ["Ổ cứng", "Dung lượng lưu trữ", "Bộ nhớ trong"]
        ]
        return value[0] if value else None

    def webcam():
        value = [j.strip() for i, j in data if i == "Webcam"]
        return value[0] if value else None

    def rearcam_specs():
        if device == "tablet":
            value = [j.strip() for i, j in data if i == "Độ phân giải"]
            return value[1] if value else None
        else:
            value = [j.strip() for i, j in data if i == "Độ phân giải camera sau"]
            return value[0] if value else None

    def frontcam_specs():
        if device == "tablet":
            value = [j.strip() for i, j in data if i == "Độ phân giải"]
            return value[-1] if value else None
        value = [j.strip() for i, j in data if i == "Độ phân giải camera trước"]
        return value[0] if value else None

    def screen_tech():
        value = [j.strip() for i, j in data if i == "Công nghệ màn hình"]
        return value[0] if value else None

    def screen_type():
        value = [
            j.strip()
            for i, j in data
            if i in ["Chất liệu mặt", "Mặt kính cảm ứng", "Loại màn hình"]
        ]
        return value[0] if value else None

    def screen_size():
        value = [
            j.strip() for i, j in data if i in ["Kích thước màn hình", "Màn hình rộng"]
        ]
        return value[0].split("-")[0].strip() if value else None

    def screen_panel():
        if device in ["laptop", "screen"]:
            value = [j.strip() for i, j in data if i == "Tấm nền"]
            return value[0] if value else None
        value = [j.strip() for i, j in data if i == "Công nghệ màn hình"]
        return value[0] if value else None

    def screen_res():
        value = [
            j.strip() for i, j in data if i in ["Độ phân giải", "Độ phân giải màn hình"]
        ]
        return value[0] if value else None

    def screen_rate():
        if device in ["screen", "laptop"]:
            value = [j.strip() for i, j in data if i == "Tần số quét"]
            return value[0] if value else None
        value = [j.strip() for i, j in data if i == "Màn hình rộng"]
        return (
            re.sub(r".*\s(\d+\.?\d*\s*Hz)", r"\1", value[0].split("-")[1])
            if value
            else None
        )

    def screen_nits():
        if device == "laptop":
            value = [j.strip() for i, j in data if i == "Công nghệ màn hình"]
            return (
                re.sub(r".*\s+(\d+\s*nits).*", r"\1", value[0])
                if value and re.findall(r"\d+\s?nits", value[0])
                else None
            )
        value = [j.strip() for i, j in data if i == "Độ sáng tối đa"]
        return value[0] if value else None

    def os():
        value = [j.strip() for i, j in data if i == "Hệ điều hành"]
        return value[0] if value else None

    def water_resistant():
        if device == "earphones":
            value = [j.strip() for i, j in data if i == "Tiện ích"]
            return (
                re.sub(r".*(IP\d+).*", r"\1", value[0])
                if value and re.findall(r"IP\d+", value[0])
                else None
            )
        value = [
            j.strip()
            for i, j in data
            if i in ["Chống nước / Kháng nước", "Kháng nước, bụi"]
        ]
        return value[0] if value else None

    def battery():
        value = [
            j.strip()
            for i, j in data
            if i in ["Thông tin Pin", "Dung lượng pin", "Thời lượng pin tai nghe"]
        ]
        return value[0] if value else None

    def charger():
        value = [j.strip() for i, j in data if i == "Hỗ trợ sạc tối đa"]
        return value[0] if value else None

    def weight():
        if device in ["laptop", "screen"]:
            value = [
                j.strip()
                for i, j in data
                if i in ["Khối lượng có chân đế", "Kích thước"]
            ]
            return value[0].split("-")[-1].strip() if value else None
        value = [
            j.strip() for i, j in data if i in ["Kích thước, khối lượng", "Khối lượng"]
        ]
        return (
            re.sub(r"(.*Nặng\s+)?(\d+\.?\d*)\s*[g(].*", r"\2", value[0]) + " g"
            if value
            else None
        )

    def material():
        value = [
            j.strip() for i, j in data if i in ["Chất liệu khung viền", "Chất liệu"]
        ]
        return value[0] if value else None

    def connectivity():
        value = {
            j.strip() if j else ""
            for i, j in data
            if i
            in [
                "Wifi",
                "Bluetooth",
                "Kết nối khác",
                "Kết nối không dây",
                "Kết nối",
                "Công nghệ kết nối",
            ]
        }
        return ", ".join(value) if value else None

    def network():
        value = [j.strip() for i, j in data if i == "Mạng di động"]
        return value[0] if value else None

    def ports():
        value = {
            j.strip() if j else ""
            for i, j in data
            if i
            in [
                "Jack tai nghe",
                "Cổng kết nối/sạc",
                "Cổng giao tiếp",
                "Cổng sạc",
                "Jack cắm",
                "Cổng kết nối",
            ]
        }
        return ", ".join(value) if value else None

    def sound_tech():
        value = [j.strip() for i, j in data if i == "Công nghệ âm thanh"]
        return value[0] if value else None

    def compatible():
        value = [j.strip() for i, j in data if i == "Tương thích"]
        return value[0] if value else None

    def control():
        value = [j.strip() for i, j in data if i == "Điều khiển"]
        return value[0] if value else None

    def case_battery():
        value = [j.strip() for i, j in data if i == "Thời lượng pin hộp sạc"]
        return value[0] if value else None

    def power_consumption():
        value = [j.strip() for i, j in data if i == "Công suất tiêu thụ điện"]
        return value[0] if value else None

    match device:
        case "phone":
            return {
                "cpu": cpu(),
                "cpu_speed": cpu_speed(),
                "gpu": gpu(),
                "ram": ram(),
                "storage": storage(),
                "rearcam_specs": rearcam_specs(),
                "frontcam_specs": frontcam_specs(),
                "screen_type": screen_type(),
                "screen_size": screen_size(),
                "screen_panel": screen_panel(),
                "screen_res": screen_res(),
                "screen_rate": screen_rate(),
                "screen_nits": screen_nits(),
                "os": os(),
                "water_resistant": water_resistant(),
                "battery": battery(),
                "charger": charger(),
                "weight": weight(),
                "material": material(),
                "connectivity": connectivity(),
                "network": network(),
                "ports": ports(),
            }
        case "tablet":
            return {
                "cpu": cpu(),
                "cpu_speed": cpu_speed(),
                "gpu": gpu(),
                "ram": ram(),
                "storage": storage(),
                "rearcam_specs": rearcam_specs(),
                "frontcam_specs": frontcam_specs(),
                "screen_size": screen_size(),
                "screen_panel": screen_panel(),
                "screen_res": screen_res(),
                "screen_rate": screen_rate(),
                "os": os(),
                "water_resistant": water_resistant(),
                "battery": battery(),
                "charger": charger(),
                "weight": weight(),
                "material": material(),
                "connectivity": connectivity(),
                "network": network(),
                "ports": ports(),
            }
        case "laptop":
            return {
                "cpu": cpu(),
                "cpu_cores": cpu_cores(),
                "cpu_threads": cpu_threads(),
                "cpu_speed": cpu_speed(),
                "gpu": gpu(),
                "ram": ram(),
                "max_ram": max_ram(),
                "ram_type": ram_type(),
                "ram_bus": ram_bus(),
                "storage": storage(),
                "webcam": webcam(),
                "screen_panel": screen_panel(),
                "screen_size": screen_size(),
                "screen_tech": screen_tech(),
                "screen_res": screen_res(),
                "screen_rate": screen_rate(),
                "screen_nits": screen_nits(),
                "os": os(),
                "battery": battery(),
                "weight": weight(),
                "material": material(),
                "connectivity": connectivity(),
                "ports": ports(),
            }
        case "watch":
            return {
                "cpu": cpu(),
                "storage": storage(),
                "screen_type": screen_type(),
                "screen_panel": screen_panel(),
                "screen_size": screen_size(),
                "os": os(),
                "water_resistant": water_resistant(),
                "connectivity": connectivity(),
                "battery": battery(),
                "weight": weight(),
                "material": material(),
            }
        case "earphones":
            return {
                "sound_tech": sound_tech(),
                "compatible": compatible(),
                "control": control(),
                "water_resistant": water_resistant(),
                "ports": ports(),
                "connectivity": connectivity(),
                "battery": battery(),
                "case_battery": case_battery(),
                "weight": weight(),
            }
        case "screen":
            return {
                "screen_type": screen_type(),
                "screen_panel": screen_panel(),
                "screen_size": screen_size(),
                "screen_tech": screen_tech(),
                "screen_res": screen_res(),
                "screen_rate": screen_rate(),
                "power_consumption": power_consumption(),
                "ports": ports(),
                "weight": weight(),
            }

In [None]:
# Combine the shared product fields with the earphones-specific specs taken
# from the most recently scraped page, then show the record as a dict.
common_fields = parse_common_info(full_data)
spec_fields = parse_specs_info(specs_data, "earphones")
p = Earphones(**common_fields, **spec_fields)
asdict(p)

In [None]:
# Load the scraped earphones CSV, then display one row per SKU ordered by name.
df = pd.read_csv("../data/scraped/thegioididong_earphones_2025-09-04.csv")

unique_by_sku = df.drop_duplicates(subset="sku")
unique_by_sku.sort_values("name")