In [None]:
import pandas as pd, re, json
from webcrawler import (
    ProductInfo,
    Crawler,
    Earphone,
    Phone,
    Tablet,
    Watch,
    Screen,
    Laptop,
)
from dataclasses import asdict

In [None]:
full_data = dict()
specs_data = []
include = [
    ("*", "@id='productld'", ""),  # tag, condition, ancestor
    ("span", "@class='circle'", "/ancestor::li"),
    ("a", "contains(@class,'tzLink')", "/ancestor::li"),
    ("span", "@class=''", "/ancestor::li"),
]

fetched = await Crawler.async_inspect(
    "https://www.thegioididong.com/tai-nghe/tai-nghe-bluetooth-true-wireless-ava-go-p310?utm_flashsale=1",
    xpath="|".join([f"//{i[0]}[{i[1]}]{i[2]}" for i in include]),
    encoding="utf-8",
)
data = [
    re.sub(r"\s{2,}", ", ", i.text_content().strip())
    for i in fetched
    if ":" in i.text_content()
]

json_content = [i for i in data if re.findall(r"{|}", i)]
full_data = json.loads(json_content[0])

tags_content = [
    (i.split(":")[0].strip(), "".join(i.split(":")[1:]).removeprefix(",").strip())
    for i in data
    if not re.findall(r"{|}", i)
]
specs_data.extend(tags_content)

specs_data

In [None]:
def parse_common_info(data: dict):
    """Build a ``ProductInfo`` from a product metadata dict and return it as a dict.

    Args:
        data: Product metadata. The keys ``sku``, ``name``, ``offers.price``,
            ``brand.name`` and ``url`` are required. ``aggregateRating`` and
            ``additionalProperty`` are optional and may be missing or empty.

    Returns:
        ``dataclasses.asdict`` of the populated ``ProductInfo``.

    Raises:
        KeyError: if one of the required keys is missing.

    Category rules:
        * URL section ``laptop`` / ``may-tinh-bang`` / ``man-hinh-may-tinh``
          maps to "Laptop" / "Tablet" / "Screen".
        * Any other section is classified only when a weight/dimension property
          is present:
          - ``dtdd``: "Smartphone" at 135 g or more, else "Phone".
          - ``dong-ho-thong-minh``: "Smartwatch" when wider than 33.5 mm,
            else "Smartband".
          - anything else: "Headphone" above 100 g, otherwise "Earphone" when
            a "Jack cắm" property exists, else "Earbuds".
        * Otherwise ``category`` is left at whatever ``ProductInfo`` defaults to.
    """
    # Indexing ``[0]`` on a plain string would keep only its first character,
    # so only index when the name is not already a string.
    raw_brand = data["brand"]["name"]
    brand = raw_brand if isinstance(raw_brand, str) else raw_brand[0]

    # Accept decimal strings such as "490000.00", which int() alone rejects.
    raw_price = data["offers"]["price"]
    try:
        price = int(raw_price)
    except ValueError:
        price = int(float(raw_price))

    prd = ProductInfo(
        sku=data["sku"].strip(),
        name=data["name"].strip(),
        price=price,
        brand=brand.strip(),
        url=data["url"].strip(),
    )

    # Products without ratings may omit the key entirely.
    rating = data.get("aggregateRating")
    if rating:
        prd.rating = rating["ratingValue"]
        # Prefer the key this notebook has always read; fall back to the
        # schema.org spelling ``reviewCount``.
        reviews = rating.get("reviewcount", rating.get("reviewCount"))
        if reviews is not None:
            prd.reviews_count = int(reviews)

    properties = data.get("additionalProperty") or []

    def first_value(*names):
        """Value of the first property whose name is in ``names``, else None."""
        for prop in properties:
            if prop["name"] in names:
                return prop["value"]
        return None

    released = first_value("Thời điểm ra mắt", "Thời gian ra mắt", "Năm ra mắt")
    prd.release_date = released.strip() if released is not None else None

    # Inputs for the device-type heuristics below.
    dimensions = first_value("Kích thước, khối lượng", "Khối lượng")
    has_jack = any(prop["name"] == "Jack cắm" for prop in properties)

    # The 4th "/"-separated piece of the URL is the site section; an empty
    # section stands in for URLs too short to have one.
    url_parts = prd.url.split("/")
    section = url_parts[3] if len(url_parts) > 3 else ""

    if section == "laptop":  # classify by url hint
        prd.category = "Laptop"
    elif section == "may-tinh-bang":
        prd.category = "Tablet"
    elif section == "man-hinh-may-tinh":
        prd.category = "Screen"
    elif dimensions is not None:  # classify by weight and width
        # First number directly followed by "g" or "(" is taken as the weight.
        g_vals = re.findall(r"(\d+\.?\d*)\s?(?:g|\()", dimensions)
        gam = float(g_vals[0]) if g_vals else None  # weight in grams

        if section == "dtdd":
            prd.category = "Smartphone" if gam and gam >= 135.0 else "Phone"
        elif section == "dong-ho-thong-minh":
            mm_vals = re.findall(r"Ngang\s?(\d+\.?\d*)\s?mm", dimensions)
            mm = float(mm_vals[0]) if mm_vals else None  # width in mm
            prd.category = "Smartwatch" if mm and mm > 33.5 else "Smartband"
        elif gam and gam > 100.0:
            prd.category = "Headphone"
        else:
            prd.category = "Earphone" if has_jack else "Earbuds"

    return asdict(prd)

In [None]:
def parse_specs_info(data: dict, device: str):
    def cpu():
        value = [
            j.strip()
            for i, j in data
            if i in ["Công nghệ CPU", "Chip xử lý (CPU)", "CPU"]
        ]
        return value[0] if value else None

    def cpu_cores():
        value = [j.strip() for i, j in data if i == "Số nhân"]
        return value[0] if value else None

    def cpu_threads():
        value = [j.strip() for i, j in data if i == "Số luồng"]
        return value[0] if value else None

    def cpu_speed():
        value = [j.strip() for i, j in data if i == "Tốc độ CPU"]
        return value[0] if value else None

    def gpu():
        value = [
            j.strip()
            for i, j in data
            if i in ["Chip đồ hoạ (GPU)", "Chip đồ họa (GPU)", "Card màn hình"]
        ]
        return value[0] if value else None

    def ram():
        value = [j.strip() for i, j in data if i == "RAM"]
        return value[0] if value else None

    def max_ram():
        value = [j.strip() for i, j in data if i == "Hỗ trợ RAM tối đa"]
        return value[0] if value else None

    def ram_type():
        value = [j.strip() for i, j in data if i == "Loại RAM"]
        return value[0] if value else None

    def ram_bus():
        value = [j.strip() for i, j in data if i == "Tốc độ Bus RAM"]
        return value[0] if value else None

    def storage():
        value = [
            j.strip()
            for i, j in data
            if i in ["Ổ cứng", "Dung lượng lưu trữ", "Bộ nhớ trong"]
        ]
        return value[0] if value else None

    def webcam():
        value = [j.strip() for i, j in data if i == "Webcam"]
        return value[0] if value else None

    def rearcam_specs():
        if device == "tablet":
            value = [j.strip() for i, j in data if i == "Độ phân giải"]
            return value[1] if value else None
        else:
            value = [j.strip() for i, j in data if i == "Độ phân giải camera sau"]
            return value[0] if value else None

    def frontcam_specs():
        if device == "tablet":
            value = [j.strip() for i, j in data if i == "Độ phân giải"]
            return value[-1] if value else None
        value = [j.strip() for i, j in data if i == "Độ phân giải camera trước"]
        return value[0] if value else None

    def screen_tech():
        value = [j.strip() for i, j in data if i == "Công nghệ màn hình"]
        return value[0] if value else None

    def screen_type():
        value = [
            j.strip()
            for i, j in data
            if i in ["Chất liệu mặt", "Mặt kính cảm ứng", "Loại màn hình"]
        ]
        return value[0] if value else None

    def screen_size():
        value = [
            j.strip() for i, j in data if i in ["Kích thước màn hình", "Màn hình rộng"]
        ]
        return value[0].split("-")[0].strip() if value else None

    def screen_panel():
        if device in ["laptop", "screen"]:
            value = [j.strip() for i, j in data if i == "Tấm nền"]
            return value[0] if value else None
        value = [j.strip() for i, j in data if i == "Công nghệ màn hình"]
        return value[0] if value else None

    def screen_res():
        value = [
            j.strip() for i, j in data if i in ["Độ phân giải", "Độ phân giải màn hình"]
        ]
        return value[0] if value else None

    def screen_rate():
        if device in ["screen", "laptop"]:
            value = [j.strip() for i, j in data if i == "Tần số quét"]
            return value[0] if value else None
        value = [j.strip() for i, j in data if i == "Màn hình rộng"]
        return (
            re.sub(r".*\s(\d+\.?\d*\s*Hz)", r"\1", value[0].split("-")[1])
            if value
            else None
        )

    def screen_nits():
        if device == "laptop":
            value = [j.strip() for i, j in data if i == "Công nghệ màn hình"]
            return (
                re.sub(r".*\s+(\d+\s*nits).*", r"\1", value[0])
                if value and re.findall(r"\d+\s?nits", value[0])
                else None
            )
        value = [j.strip() for i, j in data if i == "Độ sáng tối đa"]
        return value[0] if value else None

    def os():
        value = [j.strip() for i, j in data if i == "Hệ điều hành"]
        return value[0] if value else None

    def water_resistant():
        if device == "earphone":
            value = [j.strip() for i, j in data if i == "Tiện ích"]
            return (
                re.sub(r".*(IP\d+).*", r"\1", value[0])
                if value and re.findall(r"IP\d+", value[0])
                else None
            )
        value = [
            j.strip()
            for i, j in data
            if i in ["Chống nước / Kháng nước", "Kháng nước, bụi"]
        ]
        return value[0] if value else None

    def battery():
        value = [
            j.strip()
            for i, j in data
            if i in ["Thông tin Pin", "Dung lượng pin", "Thời lượng pin tai nghe"]
        ]
        return value[0] if value else None

    def charger():
        value = [j.strip() for i, j in data if i == "Hỗ trợ sạc tối đa"]
        return value[0] if value else None

    def weight():
        if device in ["laptop", "screen"]:
            value = [
                j.strip()
                for i, j in data
                if i in ["Khối lượng có chân đế", "Kích thước"]
            ]
            return value[0].split("-")[-1].strip() if value else None
        value = [
            j.strip() for i, j in data if i in ["Kích thước, khối lượng", "Khối lượng"]
        ]
        return (
            re.sub(r"(.*Nặng\s+)?(\d+\.?\d*)\s*[g(].*", r"\2", value[0]) + " g"
            if value
            else None
        )

    def material():
        value = [
            j.strip() for i, j in data if i in ["Chất liệu khung viền", "Chất liệu"]
        ]
        return value[0] if value else None

    def connectivity():
        value = {
            j.strip() if j else ""
            for i, j in data
            if i
            in [
                "Wifi",
                "Bluetooth",
                "Kết nối khác",
                "Kết nối không dây",
                "Kết nối",
                "Công nghệ kết nối",
            ]
        }
        return ", ".join(value) if value else None

    def network():
        value = [j.strip() for i, j in data if i == "Mạng di động"]
        return value[0] if value else None

    def ports():
        value = {
            j.strip() if j else ""
            for i, j in data
            if i
            in [
                "Jack tai nghe",
                "Cổng kết nối/sạc",
                "Cổng giao tiếp",
                "Cổng sạc",
                "Jack cắm",
                "Cổng kết nối",
            ]
        }
        return ", ".join(value) if value else None

    def sound_tech():
        value = [j.strip() for i, j in data if i == "Công nghệ âm thanh"]
        return value[0] if value else None

    def speaker_driver():
        value = [j.strip() for i, j in data if i == "Công nghệ âm thanh"]
        return (
            re.sub(r".*(Driver\s+\d+\s*mm).*", r"\1", value[0])
            if value and re.findall(r"Driver\s+\d+\s*mm", value[0])
            else None
        )

    def compatible():
        value = [j.strip() for i, j in data if i == "Tương thích"]
        return value[0] if value else None

    def control():
        value = [j.strip() for i, j in data if i == "Điều khiển"]
        return value[0] if value else None

    def case_battery():
        value = [j.strip() for i, j in data if i == "Thời lượng pin hộp sạc"]
        return value[0] if value else None

    def power_consumption():
        value = [j.strip() for i, j in data if i == "Công suất tiêu thụ điện"]
        return value[0] if value else None

    match device:
        case "phone":
            return {
                "cpu": cpu(),
                "cpu_speed": cpu_speed(),
                "gpu": gpu(),
                "ram": ram(),
                "storage": storage(),
                "rearcam_specs": rearcam_specs(),
                "frontcam_specs": frontcam_specs(),
                "screen_type": screen_type(),
                "screen_size": screen_size(),
                "screen_panel": screen_panel(),
                "screen_res": screen_res(),
                "screen_rate": screen_rate(),
                "screen_nits": screen_nits(),
                "os": os(),
                "water_resistant": water_resistant(),
                "battery": battery(),
                "charger": charger(),
                "weight": weight(),
                "material": material(),
                "connectivity": connectivity(),
                "network": network(),
                "ports": ports(),
            }
        case "tablet":
            return {
                "cpu": cpu(),
                "cpu_speed": cpu_speed(),
                "gpu": gpu(),
                "ram": ram(),
                "storage": storage(),
                "rearcam_specs": rearcam_specs(),
                "frontcam_specs": frontcam_specs(),
                "screen_size": screen_size(),
                "screen_panel": screen_panel(),
                "screen_res": screen_res(),
                "screen_rate": screen_rate(),
                "os": os(),
                "water_resistant": water_resistant(),
                "battery": battery(),
                "charger": charger(),
                "weight": weight(),
                "material": material(),
                "connectivity": connectivity(),
                "network": network(),
                "ports": ports(),
            }
        case "laptop":
            return {
                "cpu": cpu(),
                "cpu_cores": cpu_cores(),
                "cpu_threads": cpu_threads(),
                "cpu_speed": cpu_speed(),
                "gpu": gpu(),
                "ram": ram(),
                "max_ram": max_ram(),
                "ram_type": ram_type(),
                "ram_bus": ram_bus(),
                "storage": storage(),
                "webcam": webcam(),
                "screen_panel": screen_panel(),
                "screen_size": screen_size(),
                "screen_tech": screen_tech(),
                "screen_res": screen_res(),
                "screen_rate": screen_rate(),
                "screen_nits": screen_nits(),
                "os": os(),
                "battery": battery(),
                "weight": weight(),
                "material": material(),
                "connectivity": connectivity(),
                "ports": ports(),
            }
        case "watch":
            return {
                "cpu": cpu(),
                "storage": storage(),
                "screen_type": screen_type(),
                "screen_panel": screen_panel(),
                "screen_size": screen_size(),
                "os": os(),
                "water_resistant": water_resistant(),
                "connectivity": connectivity(),
                "battery": battery(),
                "weight": weight(),
                "material": material(),
            }
        case "earphone":
            return {
                "sound_tech": sound_tech(),
                "speaker_driver": speaker_driver(),
                "compatible": compatible(),
                "control": control(),
                "water_resistant": water_resistant(),
                "ports": ports(),
                "connectivity": connectivity(),
                "battery": battery(),
                "case_battery": case_battery(),
                "weight": weight(),
            }
        case "screen":
            return {
                "screen_type": screen_type(),
                "screen_panel": screen_panel(),
                "screen_size": screen_size(),
                "screen_tech": screen_tech(),
                "screen_res": screen_res(),
                "screen_rate": screen_rate(),
                "power_consumption": power_consumption(),
                "ports": ports(),
                "weight": weight(),
            }