### Tải thư viện Selenium
- Đối với máy tính cá nhân dùng lệnh: `pip install selenium webdriver_manager`
- Đối với colab, sử dụng cell bên dưới để tải selenium

In [1]:
# %%shell
# # Ubuntu no longer distributes chromium-browser outside of snap
# #
# # Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# # Add debian buster
# cat > /etc/apt/sources.list.d/debian.list << "EOF"
# deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
# deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
# deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
# EOF

# # Add keys
# apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
# apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
# apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
# apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
# apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# # Prefer debian repo for chromium* packages only
# # Note the double-blank lines between entries
# cat > /etc/apt/preferences.d/chromium.pref << "EOF"
# Package: *
# Pin: release a=eoan
# Pin-Priority: 500


# Package: *
# Pin: origin "deb.debian.org"
# Pin-Priority: 300


# Package: chromium*
# Pin: origin "deb.debian.org"
# Pin-Priority: 700
# EOF

# # Install chromium and chromium-driver
# apt-get update
# apt-get install chromium chromium-driver

# # Install selenium
# pip install selenium

### Import các thư viện cần thiết

In [1]:
import pandas as pd
import re
import os
import requests
import time
import random

from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### Khởi tạo Selenium driver    
Driver trong Selenium đóng vai trò như trình duyệt web, giúp thực hiện các thao tác như truy cập vào trang web dựa vào đường dẫn, thao tác chuyển trang,...

In [2]:
# Timeouts, in seconds.
WEBDRIVER_DELAY_TIME_INT = 10  # timeout of the shared explicit `wait` below
TIMEOUT_INT = 10  # implicit wait applied to every element lookup on `driver`

service = Service()

# NOTE: despite the historical `chrome_` prefix, these are Microsoft Edge
# options; the name is kept so existing references keep working.
chrome_options = webdriver.EdgeOptions()
# Headless mode is requested through the command-line flag only. The former
# `chrome_options.headless = True` assignment duplicated this flag and is not
# supported by recent Selenium releases, so it has been dropped.
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Chromium-based browsers expect "width,height"; the previous "1920x1080"
# value is not in that form.
chrome_options.add_argument("--window-size=1920,1080")

driver = webdriver.Edge(service=service, options=chrome_options)
# NOTE(review): an implicit wait is set here while explicit WebDriverWait
# objects are used elsewhere in this file; Selenium's documentation advises
# against mixing the two — confirm this is intended.
driver.implicitly_wait(TIMEOUT_INT)
wait = WebDriverWait(driver, WEBDRIVER_DELAY_TIME_INT)

### Trích xuất nội dung thơ
- Dữ liệu về các bài thơ sẽ được lấy từ trang web [thivien.net](https://www.thivien.net/)

#### Tự code

In [4]:
def extract_poem_links(driver, url):
    """Open *url* and collect the poem links listed on that page.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the listing page to open.

    Returns:
        A list of dicts with keys ``'title'`` (the anchor text) and
        ``'link'`` (the anchor ``href``), one per list item whose header
        anchor could be read.
    """
    driver.get(url)
    items_xpath = '//*[@class="page-content container"]//div[@class="page-content-main"]//div[@class="list-item"]'

    collected = []
    for item in driver.find_elements(By.XPATH, items_xpath):
        try:
            anchor = item.find_element(By.XPATH, './/h4[@class="list-item-header"]/a')
            entry = {
                'title': anchor.text,
                'link': anchor.get_attribute('href'),
            }
        except Exception as e:
            # Best effort: report the unreadable list item and move on.
            print(f'Error extracting link: {e}')
        else:
            collected.append(entry)
    return collected

# Visit search-result pages 1 through 10 and pool every extracted link
# into a single list.
poem_links = []
for i in range(1, 11):
    links = extract_poem_links(driver, 
                           url=f'https://www.thivien.net/searchpoem.php?PoemType=16&ViewType=1&Country=2&Page={i}')
    poem_links.extend(links)
# Trailing expression: shows the number of collected links as the cell output.
len(poem_links)

100

In [5]:
def scrape_poems(driver, poem_links):
    """Visit each poem page and gather its text and source note.

    Args:
        driver: Selenium WebDriver used to load the poem pages.
        poem_links: Sequence of dicts carrying ``'title'`` and ``'link'``.

    Returns:
        A list of dicts with keys ``'title'``, ``'content'``, ``'source'``
        and ``'link'``. Pages that raise while being processed are reported
        on stdout and left out of the result.
    """
    records = []
    for entry in tqdm(poem_links):
        link = entry['link']
        try:
            driver.get(link)
            # Random pause of 3-5 seconds after each page load.
            time.sleep(random.uniform(3, 5))

            body_el = driver.find_element(By.XPATH, '//div[@class="poem-content"]/p')
            source_el = driver.find_element(By.XPATH, '//div[@class="small"]')
            records.append({
                'title': entry['title'],
                'content': body_el.text,
                'source': source_el.text,
                'link': link,
            })
        except Exception as e:
            print(f'Error processing {link}: {e}')
    return records

In [None]:
# Scrape every poem referenced in `poem_links` using the shared driver.
datasets = scrape_poems(driver=driver, poem_links=poem_links)

In [7]:
# Number of poems that were scraped successfully.
len(datasets)

75

In [8]:
# Save as a CSV file; index=True also writes the row index as the first column.
df = pd.DataFrame(datasets)
df.to_csv('poem_dataset.csv', index=True)

In [9]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    75 non-null     object
 1   content  75 non-null     object
 2   source   75 non-null     object
 3   link     75 non-null     object
dtypes: object(4)
memory usage: 2.5+ KB


#### code AIO

In [3]:
def extract_poem_links(driver, page_idx):
    """Load one search-result page and collect the poem links it lists.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        page_idx: Value substituted into the ``Page`` query parameter.

    Returns:
        A list of dicts with keys ``"title"`` (the anchor text) and
        ``"url"`` (the anchor ``href``), one per list item whose header
        anchor could be read.
    """
    main_url = f"https://www.thivien.net/searchpoem.php?PoemType=16&ViewType=1&Country=2&Age[]=3&Page={page_idx}"
    driver.get(main_url)
    # Random pause of 3-5 seconds after the page load.
    time.sleep(random.uniform(3, 5))

    def read_entry(item):
        # Title and address both come from the anchor inside the item header.
        anchor = item.find_element(By.XPATH, './/h4[@class="list-item-header"]/a')
        return {"title": anchor.text, "url": anchor.get_attribute("href")}

    items_xpath = '//*[@class="page-content container"]//div[@class="page-content-main"]//div[@class="list-item"]'
    found = []
    for item in driver.find_elements(By.XPATH, items_xpath):
        try:
            found.append(read_entry(item))
        except Exception as e:
            # Best effort: report the unreadable list item and move on.
            print(f"Error extracting link: {e}")
    return found

In [4]:
def clean_poem_html(html):
    """Reduce a poem's inner HTML to text, keeping only heading-style bold tags.

    Steps, in order:
      1. drop ``<img ...>`` tags;
      2. drop ``<i>...</i>`` elements together with their content;
      3. unwrap ``<b>...</b>`` unless it is followed by two or more ``<br>``
         tags (those are kept, tags included, so that
         ``process_poem_content`` can use them as section headings);
      4. turn every ``<br>`` into a newline;
      5. drop bare ``<p>`` / ``</p>`` tags.

    Args:
        html: HTML fragment as a string.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    html = re.sub(r"<img.*?>", "", html, flags=re.IGNORECASE)
    html = re.sub(r"<i>.*?</i>", "", html, flags=re.IGNORECASE | re.DOTALL)
    # The captured group must never run past a closing </b>. With a plain
    # lazy ``.*?`` the negative lookahead could be satisfied by stretching
    # the match from a heading's <b> to a *later* inline </b>, which removed
    # the heading's opening tag and the inline bold's closing tag.
    html = re.sub(
        r"<b>((?:(?!</b>).)*)</b>(?!\s*(?:<br\s*/?>\s*){2,})",
        r"\1",
        html,
        flags=re.IGNORECASE,
    )
    html = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
    html = re.sub(r"</?p>", "", html, flags=re.IGNORECASE)

    return html.strip()

def process_poem_content(html, poem_src, poem_url, default_title=""):
    """Clean a poem's HTML and split it into one record per bold heading.

    The HTML is first passed through ``clean_poem_html``. Every
    ``<b>...</b>`` that is followed by at least two newlines is treated as a
    heading: its text becomes the record title and the text up to the next
    heading (or the end of the input) becomes the record content. When no
    heading is found, the whole cleaned text is returned as a single record
    titled *default_title*.

    Args:
        html: Raw inner HTML of the poem container.
        poem_src: Source note stored under ``"source"`` in every record.
        poem_url: Page address stored under ``"url"`` in every record.
        default_title: Title used when the text contains no heading.

    Returns:
        A list of dicts with keys ``"title"``, ``"content"``, ``"source"``
        and ``"url"``.
    """
    text = clean_poem_html(html)

    def make_record(title, content):
        return {
            "title": title,
            "content": content,
            "source": poem_src,
            "url": poem_url,
        }

    headings = list(
        re.finditer(r"<b>(.*?)</b>\s*\n{2,}", text, flags=re.IGNORECASE)
    )
    if not headings:
        return [make_record(default_title, text)]

    # Each section runs from the end of its heading to the start of the next
    # heading; the last section runs to the end of the text.
    section_stops = [h.start() for h in headings[1:]] + [len(text)]
    return [
        make_record(h.group(1).strip(), text[h.end():stop].strip("\n"))
        for h, stop in zip(headings, section_stops)
    ]

In [5]:
def scrape_poem(driver, poem_url, default_title):
    """Load one poem page and return the poem records extracted from it.

    Args:
        driver: Selenium WebDriver used to load the page.
        poem_url: Address of the poem page.
        default_title: Title handed to ``process_poem_content`` as fallback.

    Returns:
        Whatever ``process_poem_content`` returns for the page's poem HTML.

    Any exception raised while loading the page or while waiting for the
    poem container propagates to the caller; a failure to read the source
    note only results in an empty source string.
    """
    driver.get(poem_url)
    # Random pause of 3-5 seconds after the page load.
    time.sleep(random.uniform(3, 5))

    waiter = WebDriverWait(driver, 10)
    content_locator = (By.CSS_SELECTOR, "div.poem-content")
    source_locator = (By.XPATH, '//div[@class="small"]')

    # The poem container must be visible before its markup is read.
    container = waiter.until(EC.visibility_of_element_located(content_locator))
    inner_html = container.get_attribute("innerHTML")

    try:
        source_text = waiter.until(
            EC.presence_of_element_located(source_locator)
        ).text
    except Exception:
        # The source note is optional: fall back to an empty string.
        source_text = ""

    return process_poem_content(inner_html, source_text, poem_url, default_title)

def scrape_poems(driver, num_pages=10):
    """Scrape every poem linked from search-result pages 1..num_pages.

    Args:
        driver: Selenium WebDriver shared by all page loads.
        num_pages: Number of result pages to walk, starting at page 1.

    Returns:
        A flat list with the records returned by ``scrape_poem`` for every
        poem that could be processed. Poems that raise are reported on
        stdout and skipped; errors from ``extract_poem_links`` are not
        caught here.
    """
    collected = []
    for page_no in tqdm(range(1, num_pages + 1)):
        for entry in extract_poem_links(driver, page_no):
            poem_url = entry["url"]
            try:
                collected += scrape_poem(driver, poem_url, default_title=entry['title'])
            except Exception as e:
                print(f"Error processing {poem_url}: {e}")
    return collected

In [6]:
# Run the full crawl. The browser is closed in `finally` so that it is shut
# down even when the crawl raises part-way through (previously an exception
# skipped `driver.quit()` and left the browser process running).
try:
    datasets = scrape_poems(driver, num_pages=10)
finally:
    driver.quit()

  0%|          | 0/10 [00:00<?, ?it/s]

 40%|████      | 4/10 [03:53<05:46, 57.75s/it]

Error processing https://www.thivien.net/Tu%E1%BB%87-S%E1%BB%B9/18/poem-vvXMqDr3PYay-7iOBtJn6A: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF738BCA7D5+20677]
	simdutf::get_active_implementation [0x00007FF738B26F80+492656]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF738E1DC8A+1807418]
	(No symbol) [0x00007FF7389672AC]
	(No symbol) [0x00007FF73896758B]
	(No symbol) [0x00007FF7389A85A7]
	(No symbol) [0x00007FF73898849F]
	(No symbol) [0x00007FF73895D6ED]
	(No symbol) [0x00007FF7389A5D61]
	(No symbol) [0x00007FF7389881C3]
	(No symbol) [0x00007FF73895CC36]
	(No symbol) [0x00007FF73895C0C4]
	(No symbol) [0x00007FF73895CA63]
	(No symbol) [0x00007FF738A458DD]
	(No symbol) [0x00007FF738A51D42]
	(No symbol) [0x00007FF738A4A703]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF738CA47DA+262026]
	simdutf::get_active_implementation [0x00007FF738B346B1+547745]
	simdutf::get_active_implementation [0x00007FF738B2D964+519764]
	simdutf::get_active_implem

 50%|█████     | 5/10 [05:32<06:03, 72.63s/it]

Error processing https://www.thivien.net/Tr%E1%BB%A5-V%C5%A9/01/poem-k6MlVSOJZfPvxk-XTGJRvA: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF738BCA7D5+20677]
	simdutf::get_active_implementation [0x00007FF738B26F80+492656]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF738E1DC8A+1807418]
	(No symbol) [0x00007FF7389672AC]
	(No symbol) [0x00007FF73896758B]
	(No symbol) [0x00007FF7389A85A7]
	(No symbol) [0x00007FF73898849F]
	(No symbol) [0x00007FF73895D6ED]
	(No symbol) [0x00007FF7389A5D61]
	(No symbol) [0x00007FF7389881C3]
	(No symbol) [0x00007FF73895CC36]
	(No symbol) [0x00007FF73895C0C4]
	(No symbol) [0x00007FF73895CA63]
	(No symbol) [0x00007FF738A458DD]
	(No symbol) [0x00007FF738A51D42]
	(No symbol) [0x00007FF738A4A703]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF738CA47DA+262026]
	simdutf::get_active_implementation [0x00007FF738B346B1+547745]
	simdutf::get_active_implementation [0x00007FF738B2D964+519764]
	simdutf::get_active_implement

100%|██████████| 10/10 [13:41<00:00, 82.10s/it]


In [7]:
# Save as a CSV file; index=True also writes the row index as the first column.
df2 = pd.DataFrame(datasets)
df2.to_csv('poem_dataset2.csv', index=True)

In [8]:
# Preview the first rows of the scraped dataset.
df2.head()

Unnamed: 0,title,content,source,url
0,“Bạn xấu như chiếc bóng”,Bạn xấu như chiếc bóng\nCứ bám riết theo anh\n...,[Thông tin 1 nguồn tham khảo đã được ẩn],https://www.thivien.net/Th%C3%A1i-B%C3%A1-T%C3...
1,“Cái làm ta hạnh phúc”,Cái làm ta hạnh phúc\nThực ra cũng chẳng nhiều...,[Thông tin 1 nguồn tham khảo đã được ẩn],https://www.thivien.net/Th%C3%A1i-B%C3%A1-T%C3...
2,“Chiều vừa xốp trên tay”,Chiều vừa xốp trên tay\nChợt nghe thoáng ong b...,[Thông tin 1 nguồn tham khảo đã được ẩn],https://www.thivien.net/L%C3%A2m-Huy-Nhu%E1%BA...
3,“Chơi thân không có nghĩa”,Chơi thân không có nghĩa\nKhông cãi nhau bao g...,[Thông tin 1 nguồn tham khảo đã được ẩn],https://www.thivien.net/Th%C3%A1i-B%C3%A1-T%C3...
4,“Có thể buồn chút ít”,"Có thể buồn chút ít\nMột mình, không người yêu...",[Thông tin 1 nguồn tham khảo đã được ẩn],https://www.thivien.net/Th%C3%A1i-B%C3%A1-T%C3...
