In [None]:
import pandas as pd, re,json
from webcrawler import Crawler, Phone, Laptop, Tablet, Watch, Screen, Earphones
from dataclasses import asdict

In [None]:
def parse_specs_info(data: list[tuple[str, str]], device: str):
    """Pick the spec fields relevant to ``device`` out of raw (label, value) pairs.

    Args:
        data: Re-iterable collection of ``(label, value)`` pairs (for example
            the ``specs_data`` list of tuples). Labels are compared exactly
            against the Vietnamese captions used below, and the same label may
            occur more than once. The pairs are scanned once per field.
        device: One of ``"phone"``, ``"tablet"``, ``"laptop"``, ``"watch"``,
            ``"earphones"`` or ``"screen"``.

    Returns:
        A dict mapping each field name of the given device kind to the
        extracted string, with ``None`` for fields that are absent. When
        ``device`` is not one of the supported kinds, ``None`` is returned
        instead of a dict.
    """

    def values(*labels):
        """Stripped values of every pair whose label is in ``labels``, in input order."""
        return [value.strip() for label, value in data if label in labels]

    def first(*labels):
        """First value carrying one of ``labels``, or None when there is none."""
        found = values(*labels)
        return found[0] if found else None

    def first_match(pattern, *labels):
        """Text matched by ``pattern`` inside the first matching value, else None."""
        text = first(*labels)
        hit = re.search(pattern, text) if text is not None else None
        return hit.group(0) if hit else None

    def joined(*labels):
        """Distinct matching values joined with ", " in input order, else None."""
        texts = [
            value.strip() if value else ""
            for label, value in data
            if label in labels
        ]
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # joined string is the same on every run (a set would not guarantee it).
        distinct = dict.fromkeys(texts)
        return ", ".join(distinct) if distinct else None

    def rearcam_specs():
        if device == "tablet":
            # Tablets reuse the bare "Độ phân giải" label for several rows; the
            # second occurrence is taken as the rear camera.
            found = values("Độ phân giải")
            return found[1] if len(found) > 1 else None
        return first("Độ phân giải camera sau")

    def frontcam_specs():
        if device == "tablet":
            # Last "Độ phân giải" row is taken as the front camera.
            # NOTE(review): with a single row this returns that same row —
            # confirm whether None would be more appropriate.
            found = values("Độ phân giải")
            return found[-1] if found else None
        return first("Độ phân giải camera trước")

    def screen_size():
        text = first("Kích thước màn hình", "Màn hình rộng")
        # Only the part before the first "-" is the size.
        return text.split("-")[0].strip() if text is not None else None

    def screen_panel():
        if device in ("laptop", "screen"):
            return first("Tấm nền")
        return first("Công nghệ màn hình")

    def screen_rate():
        if device in ("screen", "laptop"):
            return first("Tần số quét")
        # Other devices embed the refresh rate in the screen-size row; return
        # just the "<number> Hz" token, without any text that follows it.
        return first_match(r"\d+\.?\d*\s*Hz", "Màn hình rộng")

    def screen_nits():
        if device == "laptop":
            return first_match(r"\d+\s?nits", "Công nghệ màn hình")
        return first("Độ sáng tối đa")

    def water_resistant():
        if device == "earphones":
            return first_match(r"IP[X0-9]+", "Tiện ích")
        return first("Chống nước / Kháng nước", "Kháng nước, bụi")

    def weight():
        if device in ("laptop", "screen"):
            text = first("Khối lượng có chân đế", "Kích thước")
            # The weight is the part after the last "-".
            return text.split("-")[-1].strip() if text is not None else None
        text = first("Kích thước, khối lượng", "Khối lượng")
        if text is None or not re.search(r"(.*Nặng\s+)?\d+\.?\d*\s*[g(]", text):
            return None
        return re.sub(r"(.*Nặng\s+)?(\d+\.?\d*)\s*[g(].*", r"\2", text) + " g"

    # Field name -> zero-argument extractor. Extractors are only invoked for
    # the fields of the requested device.
    extractors = {
        "cpu": lambda: first("Công nghệ CPU", "Chip xử lý (CPU)", "CPU"),
        "cpu_cores": lambda: first("Số nhân"),
        "cpu_threads": lambda: first("Số luồng"),
        "cpu_speed": lambda: first("Tốc độ CPU"),
        "gpu": lambda: first(
            "Chip đồ hoạ (GPU)", "Chip đồ họa (GPU)", "Card màn hình"
        ),
        "ram": lambda: first("RAM"),
        "max_ram": lambda: first("Hỗ trợ RAM tối đa"),
        "ram_type": lambda: first("Loại RAM"),
        "ram_bus": lambda: first("Tốc độ Bus RAM"),
        "storage": lambda: first("Ổ cứng", "Dung lượng lưu trữ", "Bộ nhớ trong"),
        "webcam": lambda: first("Webcam"),
        "rearcam_specs": rearcam_specs,
        "frontcam_specs": frontcam_specs,
        "screen_tech": lambda: first("Công nghệ màn hình"),
        "screen_type": lambda: first(
            "Chất liệu mặt", "Mặt kính cảm ứng", "Loại màn hình"
        ),
        "screen_size": screen_size,
        "screen_panel": screen_panel,
        "screen_res": lambda: first("Độ phân giải", "Độ phân giải màn hình"),
        "screen_rate": screen_rate,
        "screen_nits": screen_nits,
        "os": lambda: first("Hệ điều hành"),
        "water_resistant": water_resistant,
        "battery": lambda: first(
            "Thông tin Pin", "Dung lượng pin", "Thời lượng pin tai nghe"
        ),
        "charger": lambda: first("Hỗ trợ sạc tối đa"),
        "weight": weight,
        "material": lambda: first("Chất liệu khung viền", "Chất liệu"),
        "connectivity": lambda: joined(
            "Wifi",
            "Bluetooth",
            "Kết nối khác",
            "Kết nối không dây",
            "Kết nối",
            "Công nghệ kết nối",
        ),
        "network": lambda: first("Mạng di động"),
        "ports": lambda: joined(
            "Jack tai nghe",
            "Cổng kết nối/sạc",
            "Cổng giao tiếp",
            "Cổng sạc",
            "Jack cắm",
            "Cổng kết nối",
        ),
        "sound_tech": lambda: first("Công nghệ âm thanh"),
        "compatible": lambda: first("Tương thích"),
        "control": lambda: first("Điều khiển"),
        "case_battery": lambda: first("Thời lượng pin hộp sạc"),
        "power_consumption": lambda: first("Công suất tiêu thụ điện"),
    }

    # Fields reported per device kind, in output order.
    device_fields = {
        "phone": (
            "cpu",
            "cpu_speed",
            "gpu",
            "ram",
            "storage",
            "rearcam_specs",
            "frontcam_specs",
            "screen_type",
            "screen_size",
            "screen_panel",
            "screen_res",
            "screen_rate",
            "screen_nits",
            "os",
            "water_resistant",
            "battery",
            "charger",
            "weight",
            "material",
            "connectivity",
            "network",
            "ports",
        ),
        "tablet": (
            "cpu",
            "cpu_speed",
            "gpu",
            "ram",
            "storage",
            "rearcam_specs",
            "frontcam_specs",
            "screen_size",
            "screen_panel",
            "screen_res",
            "screen_rate",
            "os",
            "water_resistant",
            "battery",
            "charger",
            "weight",
            "material",
            "connectivity",
            "network",
            "ports",
        ),
        "laptop": (
            "cpu",
            "cpu_cores",
            "cpu_threads",
            "cpu_speed",
            "gpu",
            "ram",
            "max_ram",
            "ram_type",
            "ram_bus",
            "storage",
            "webcam",
            "screen_panel",
            "screen_size",
            "screen_tech",
            "screen_res",
            "screen_rate",
            "screen_nits",
            "os",
            "battery",
            "weight",
            "material",
            "connectivity",
            "ports",
        ),
        "watch": (
            "cpu",
            "storage",
            "screen_type",
            "screen_panel",
            "screen_size",
            "os",
            "water_resistant",
            "connectivity",
            "battery",
            "weight",
            "material",
        ),
        "earphones": (
            "sound_tech",
            "compatible",
            "control",
            "water_resistant",
            "ports",
            "connectivity",
            "battery",
            "case_battery",
            "weight",
        ),
        "screen": (
            "screen_type",
            "screen_panel",
            "screen_size",
            "screen_tech",
            "screen_res",
            "screen_rate",
            "power_consumption",
            "ports",
            "weight",
        ),
    }

    fields = device_fields.get(device)
    if fields is None:
        # Unsupported device kinds produce no dict at all.
        return None
    return {name: extractors[name]() for name in fields}

In [None]:
full_data = None  # include some specs info but messy
specs_data = []  # full specs info
product = None
path = None


fetched = await Crawler.async_inspect(
    "https://www.thegioididong.com/dong-ho-thong-minh/masstel-smart-hero-20",
    xpath="//script[@id='productld']|//div[@class='box-specifi']/ul/"
    + "li[.//span[@class='circle'] or .//a[contains(@class,'tzLink')] or .//span[@class='']]",
    limit_content_in=[
        r'<script[^>]*id="productld"[^>]*>.*?</script>',
        r'<section[^>]*class="detail detailv2"[^>]*>.*?</section>',
    ],  # [^>] and .*? are for non-greedy
    encoding="utf-8",
)

# classify fetched data
data = [
    re.sub(r"\s{2,}", ", ", i.text_content().strip())
    for i in fetched
    if ":" in i.text_content()
]  # remove noise from fetched json
json_content = [i for i in data if re.findall(r"{|}", i)]
tags_content = [
    (
        i.split(":")[0].strip(),
        "".join(i.split(":")[1:]).removeprefix(",").strip(),
    )
    for i in data
    if not re.findall(r"{|}", i)
]

full_data = json.loads(json_content[0])
specs_data.extend(tags_content)

In [None]:
# Build a Watch from the fields parsed out of specs_data for the "watch" device kind,
# then show it as a plain dict (bare last expression, so the notebook renders it).
# NOTE(review): 1413 and "safds" look like placeholder positional arguments — confirm
# what the first two Watch parameters mean before reusing this cell.
p = Watch(1413, "safds", **parse_specs_info(specs_data, "watch"))
asdict(p)

In [None]:
# Load a previously scraped CSV; the path is relative to the notebook's working directory.
# NOTE(review): judging by the file name this is the "screens" scrape dated 2025-09-05 — confirm.
df = pd.read_csv("../data/scraped/thegioididong_screens_2025-09-05.csv")



# Bare expression: the notebook renders the frame for inspection.
df
# df.columns