In [1]:
%pip install beautifulsoup4 lxml openpyxl

Note: you may need to restart the kernel to use updated packages.


In [None]:
from html_handler import convert_html_to_xlsx
from pathlib import Path


HTML_INPUT_DIR = Path("../data/html")
OUTPUT_DIR = Path("../data/dictionaries")  # Folder contains .xlsx
# parents=True so a missing ../data directory is created instead of raising
# FileNotFoundError on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Convert every HTML file in the input folder. The language pair is read from
# the file name, which is expected to look like "<source>-<target>.html".
for html_path in HTML_INPUT_DIR.glob("*.html"):
    lang_codes = html_path.stem.split("-")
    if len(lang_codes) < 2:
        # No "-" in the name: there is no target language to extract, so skip
        # the file with a clear message rather than failing with IndexError.
        print(f"Skipping {html_path}: file name is not of the form <source>-<target>.html")
        continue
    source_lang, target_lang = lang_codes[0], lang_codes[1]

    df = convert_html_to_xlsx(
        html_path=html_path,
        out_folder=OUTPUT_DIR,
        source_lang=source_lang,
        target_lang=target_lang,
    )
    print(
        f"Converted {html_path} to {OUTPUT_DIR / (html_path.stem + '.xlsx')} with {len(df)} entries."
    )


Converted ../data/html/vi-ko.html to ../data/dictionaries/vi-ko.xlsx with 11729 entries.
Converted ../data/html/ko-vi.html to ../data/dictionaries/ko-vi.xlsx with 568 entries.
Converted ../data/html/ge-vi.html to ../data/dictionaries/ge-vi.xlsx with 23543 entries.
Converted ../data/html/vi-ge.html to ../data/dictionaries/vi-ge.xlsx with 11432 entries.


In [None]:
import re
from pathlib import Path
from typing import Union

import pandas as pd
from bs4 import BeautifulSoup

def html_to_xlsx_clean(
    html_file: Union[str, Path],
    out_file: Union[str, Path],
    header_tag: str = "h2",
    header_class: str = "calibre2",
) -> pd.DataFrame:
    """Extract dictionary entries from an HTML file and save them as .xlsx.

    Every ``header_tag`` element carrying the CSS class ``header_class`` is
    treated as a headword. The siblings that follow it, up to the next such
    header, are scanned line by line; each line starting with "-" becomes one
    row after the leading dashes/spaces and any ``{...}`` spans are removed.

    Args:
        html_file: Path of the HTML file to read (decoded as UTF-8, undecodable
            bytes are ignored).
        out_file: Path of the Excel file to write. Its parent directory is
            created if it does not exist.
        header_tag: Tag name that marks a headword (e.g. "h2" or "h3").
        header_class: CSS class the headword tags carry.

    Returns:
        DataFrame with columns Source, Type, Pronunciation, Target. Type and
        Pronunciation are always empty strings.
    """
    with open(html_file, "r", encoding="utf-8", errors="ignore") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    rows = []

    for h in soup.find_all(header_tag, class_=header_class):
        headword = h.get_text(strip=True)

        for sib in h.next_siblings:
            # The next headword ends the current entry. Plain text nodes have
            # no tag name, so the getattr default keeps .get() from being
            # called on them.
            if (getattr(sib, "name", None) == header_tag
                    and header_class in sib.get("class", [])):
                break

            if isinstance(sib, str):
                text = sib.strip()
            elif hasattr(sib, "get_text"):
                text = sib.get_text(separator="\n", strip=True)
            else:
                text = ""

            for line in text.split("\n"):
                line = line.strip()
                # Only lines introduced by a dash are kept as translations.
                if not line.startswith("-"):
                    continue
                tgt = line.lstrip("- ").strip()
                # Drop brace-delimited spans embedded in the translation.
                tgt = re.sub(r"\{.*?\}", "", tgt).strip()
                if tgt:
                    rows.append({
                        "Source": headword,
                        "Type": "",
                        "Pronunciation": "",
                        "Target": tgt
                    })

    df = pd.DataFrame(rows, columns=["Source", "Type", "Pronunciation", "Target"])
    # Make sure the destination folder exists before writing.
    Path(out_file).parent.mkdir(parents=True, exist_ok=True)
    df.to_excel(out_file, index=False)
    print(f"Converted {len(df)} valid rows -> {out_file} (using {header_tag} tags)")
    return df



In [None]:


# Header tag to pass as `header_tag` for each dictionary (presumably the tag
# that marks a headword in that HTML file; only vn-fr is exercised below):
#   vn-jp: h2
#   fr-vn: h3
#   vn-fr: h2

# NOTE(review): despite the "DIR" / "FR_VN" names, both values are *file* paths
# for the vn-fr dictionary. `Path` is not imported in this cell, so it must
# already be in scope from an earlier cell.
# NOTE(review): the input file also matches the "*.html" glob of the batch
# conversion cell; confirm that processing it in both places is intended.
HTML_INPUT_DIR_FR_VN = Path("../data/html/vn-fr.html")
OUTPUT_DIR_FR_VN = Path("../data/dictionaries/vn-fr.xlsx") 



# Bare call as the last expression so the notebook displays the returned DataFrame.
html_to_xlsx_clean(HTML_INPUT_DIR_FR_VN, OUTPUT_DIR_FR_VN, header_tag="h2")

Converted 37992 valid rows -> ../data/dictionaries/vn-fr.xlsx (using h2 tags)


Unnamed: 0,Source,Type,Pronunciation,Target
0,a dua,,,faire chorus avec; suivre par snobisme = A_dua...
1,a ha,,,hourra!; hurrah! = A ha! máy_bay địch cháy rồi...
2,a hoàn,,,"(lit., arch.) servante"
3,a la hán,,,(rel.) arhant; arahant; lo-hant
4,a phiến,,,(arch.) (cũng nói a_phiện) opium
...,...,...,...,...
37987,ực,,,bruit de déglutition =ừng ực +(redoublement; a...
37988,ỷ,,,(bot.) idésia -loge des tablettes ancestrales ...
37989,ỷ lại,,,compter passivement sur; ne compter que sur; s...
37990,ỷ quyền,,,se prévaloir de son autorité (de l'autorité de...
