In [13]:
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import os

## Static Variables

In [4]:
# ---------------------------------------------------------------------------
# Naming tokens (combined into the generated audio file names)
# ---------------------------------------------------------------------------
# Language code. Other values noted by the author: "gr", "m", "s", "a", "uz", "cz", "th"
LANGUAGE = "hu"
# Content-type token used for vocabulary pages
VOCABULARY = "vo"
# Content-type token used for sentence pages
SENTENCE = "es"

# ---------------------------------------------------------------------------
# Input locations (all relative to DATA_DIR)
# ---------------------------------------------------------------------------
DATA_DIR = "./data"
# Workbook with the vocabulary sheets
VOCABULARY_EXCEL_PATH = f"{DATA_DIR}/중급헝가리어 어휘 (600).xlsx"
# Workbook with the example-sentence sheets
SENTENCE_EXCEL_PATH = f"{DATA_DIR}/중급헝가리어 문장 (700).xlsx"

## LOAD TEMPLATE 

In [25]:
# Parse the HTML page template into a BeautifulSoup tree so its tags can be
# looked up and edited in the cells below.
template_path = f"{DATA_DIR}/template.html"
with open(template_path, encoding="UTF8") as fp:
    template = BeautifulSoup(fp, 'html.parser')

## FIND TAGS

In [8]:
# Header element of the page; its text is the "<week>.<section>" title
template.find("div", attrs={"class": "child-div page-header-text"})

<div class="child-div page-header-text">1주차.단어</div>

In [9]:
# Span that receives the foreign-language text (holds a placeholder in the template)
template.find("span", attrs={"id": "text1", "class": "block-text-span"})

<span class="block-text-span" id="text1">헝가리어 텍스트 입력</span>

In [10]:
# Span that receives the Korean text; it is matched together with its
# "display: none" inline style.
template.find(
    "span",
    attrs={"id": "text2", "class": "block-text-span", "style": "display: none"},
)

<span class="block-text-span" id="text2" style="display: none">한국어 텍스트 입력</span>

In [11]:
# Audio file currently referenced by the first <audio> element
template.find("audio", attrs={"id": "audio1"}).find("source")["src"]

'audio/gr-es-01-001.mp3'

In [12]:
# Audio file currently referenced by the second <audio> element
template.find("audio", attrs={"id": "audio2"}).find("source")["src"]

'audio/gr-es-01-001.mp3'

## CONVERT

In [16]:
# Sample values for a single page: week 2, item 44
n_week, contents_idx = 2, 44

# Pattern: audio/<language>-<content type>-<2-digit week>-<3-digit item>.mp3
audio_file_name = (
    f"audio/{LANGUAGE}-{VOCABULARY}-{n_week:02}-{contents_idx:03}.mp3"
)

In [30]:
# Swap the header text for the chosen week. The expression evaluates to the
# string that was replaced, which is what the cell displays.
template.find(
    "div", attrs={"class": "child-div page-header-text"}
).string.replace_with(f"{n_week}주차.단어")

'1주차.단어'

In [None]:
# Point both <audio> elements at the same generated audio file
for audio_id in ("audio1", "audio2"):
    template.find("audio", attrs={"id": audio_id}).find("source")["src"] = audio_file_name

## READ Excel

### VOCABULARY

In [32]:
# Generate one HTML page per vocabulary entry for weeks 1-14.
#
# Input : VOCABULARY_EXCEL_PATH, one sheet per week named "단어-{n_week}주차",
#         read without a header row into two columns: LANGUAGE and "kr".
# Output: ./result/단어/{n_week}주차/{row index + 2}.html

# The template text is identical for every page, so read it from disk once.
# Each page still gets its own freshly parsed tree because the tree is mutated.
with open(f"{DATA_DIR}/template.html", encoding="UTF8") as fp:
    template_html = fp.read()

for n_week in tqdm(range(1, 15)):
    # Convert vocabulary
    vocabulary_df = pd.read_excel(
        VOCABULARY_EXCEL_PATH,
        sheet_name=f"단어-{n_week}주차",
        header=None,
        names=[LANGUAGE, "kr"]
    )

    # One output directory per week; exist_ok keeps re-runs safe without a
    # separate existence check on every row.
    save_path = f"./result/단어/{n_week}주차"
    os.makedirs(save_path, exist_ok=True)

    for sent_idx, sentence in vocabulary_df.iterrows():
        template = BeautifulSoup(template_html, 'html.parser')

        # Fill in the header and the two text spans
        template.find("div", {"class": "child-div page-header-text"}).string.replace_with(f"{n_week}주차.단어")
        template.find("span", {"id": "text1", "class": "block-text-span"}).string.replace_with(sentence[LANGUAGE])
        template.find("span", {"id": "text2", "class": "block-text-span", "style": "display: none"}).string.replace_with(sentence["kr"])

        # Audio files are numbered from 001 (row index + 1); both <audio>
        # elements point at the same file.
        audio_file_name = f"audio/{LANGUAGE}-{VOCABULARY}-{n_week:02}-{(sent_idx+1):03}.mp3"
        template.find("audio", {"id": "audio1"}).find("source")["src"] = audio_file_name
        template.find("audio", {"id": "audio2"}).find("source")["src"] = audio_file_name

        # NOTE(review): page files are numbered from 2 (row index + 2) while audio
        # starts at 1 - presumably page 1 is reserved for something else; confirm.
        with open(f"{save_path}/{sent_idx+2}.html", "w", encoding="UTF8") as file:
            file.write(str(template))

100%|██████████| 14/14 [00:01<00:00,  7.51it/s]


### SENTENCE

In [33]:
# Generate one HTML page per example sentence for weeks 1-14.
#
# Input : SENTENCE_EXCEL_PATH, one sheet per week named "예문-{n_week}주차",
#         read without a header row into two columns: LANGUAGE and "kr".
# Output: ./result/문장/{n_week}주차/{row index + 2}.html

# The template text is identical for every page, so read it from disk once.
# Each page still gets its own freshly parsed tree because the tree is mutated.
with open(f"{DATA_DIR}/template.html", encoding="UTF8") as fp:
    template_html = fp.read()

for n_week in tqdm(range(1, 15)):
    # Convert sentence
    sentence_df = pd.read_excel(
        SENTENCE_EXCEL_PATH,
        sheet_name=f"예문-{n_week}주차",
        header=None,
        names=[LANGUAGE, "kr"]
    )

    # One output directory per week; exist_ok keeps re-runs safe without a
    # separate existence check on every row.
    save_path = f"./result/문장/{n_week}주차"
    os.makedirs(save_path, exist_ok=True)

    for sent_idx, sentence in sentence_df.iterrows():
        template = BeautifulSoup(template_html, 'html.parser')

        # Fill in the header and the two text spans
        template.find("div", {"class": "child-div page-header-text"}).string.replace_with(f"{n_week}주차.문장")
        template.find("span", {"id": "text1", "class": "block-text-span"}).string.replace_with(sentence[LANGUAGE])
        template.find("span", {"id": "text2", "class": "block-text-span", "style": "display: none"}).string.replace_with(sentence["kr"])

        # Audio files are numbered from 001 (row index + 1); both <audio>
        # elements point at the same file.
        audio_file_name = f"audio/{LANGUAGE}-{SENTENCE}-{n_week:02}-{(sent_idx+1):03}.mp3"
        template.find("audio", {"id": "audio1"}).find("source")["src"] = audio_file_name
        template.find("audio", {"id": "audio2"}).find("source")["src"] = audio_file_name

        # NOTE(review): page files are numbered from 2 (row index + 2) while audio
        # starts at 1 - presumably page 1 is reserved for something else; confirm.
        with open(f"{save_path}/{sent_idx+2}.html", "w", encoding="UTF8") as file:
            file.write(str(template))

100%|██████████| 14/14 [00:02<00:00,  5.75it/s]
