In [5]:
%%shell
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Register the Debian buster repositories as extra apt sources. Each entry is
# tied to its own keyring file through signed-by, so these keys are only
# trusted for the Debian archives listed here.
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Fetch the Debian signing keys from the Ubuntu keyserver
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into the dedicated keyring file referenced by signed-by above
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
# NOTE(review): the first stanza pins "release a=eoan"; on any other Ubuntu
# release it matches nothing - confirm whether it should name the release
# this notebook actually runs on.
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# -y answers the confirmation prompt so the cell can run unattended
# (e.g. under "Restart & Run All").
apt-get update
apt-get install -y chromium chromium-driver

# Install selenium
pip install selenium

Executing: /tmp/apt-key-gpghome.NeBHJd4A9K/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.z3zplBkhZI/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.nnqHrp5jj4/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://security.ubuntu.com/ubuntu jammy



In [6]:
import pandas as pd
import os
import requests
import time
import re
import random

from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [7]:
class PoemCrawler:
    """Selenium-based crawler that collects poems from thivien.net search results.

    Typical use::

        with PoemCrawler() as crawler:
            poems = crawler.scrape_poems(num_pages=10)
            crawler.save_to_csv(poems)

    The instance owns a headless Chrome session. Release it with ``close()``
    or by using the crawler as a context manager.
    """

    # Search-result listing URL; ``{page}`` is replaced by the page number.
    SEARCH_URL = 'https://www.thivien.net/searchpoem.php?VNPoem=th%C3%B4i&PoemType=13&ViewType=1&Country=2&Sort=Views&SortOrder=desc&Page={page}'

    def __init__(self, chromedriver_path='/usr/bin/chromedriver', timeout=10):
        """Start a headless Chrome session.

        Args:
            chromedriver_path: Path to the chromedriver executable.
            timeout: Seconds used for both the implicit wait and the explicit
                ``WebDriverWait`` stored in ``self.wait``.
        """
        self.timeout = timeout
        # Pre-set so close() is safe even if the browser fails to start below.
        self.driver = None

        # Configure Chrome options
        service = Service(executable_path=chromedriver_path)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        # Chrome expects "--window-size=<width>,<height>"; the previous
        # "1920x1080" spelling is not in that form.
        chrome_options.add_argument('--window-size=1920,1080')

        # Initialize driver
        self.driver = webdriver.Chrome(service=service, options=chrome_options)
        # NOTE(review): Selenium's documentation advises against combining an
        # implicit wait with explicit waits; kept here to preserve the
        # existing timing behaviour.
        self.driver.implicitly_wait(timeout)
        self.wait = WebDriverWait(self.driver, timeout)

    def __enter__(self):
        """Return the crawler itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser on leaving the ``with`` block; never suppresses errors."""
        self.close()
        return False

    def clean_poem_content(self, html):
        """Reduce a poem's inner HTML to plain text lines.

        Images, ``<i>...</i>`` and ``<span>...</span>`` blocks are removed
        together with their contents. ``<p>`` and ``<br>`` tags become line
        breaks, any remaining tags are stripped, and HTML entities are decoded.
        Only lines with at least 6 whitespace-separated words are kept.

        Args:
            html: Inner HTML string of the poem container.

        Returns:
            The kept lines joined by ``\\n``, each with single-space word
            separation; an empty string when no line qualifies.
        """
        # Local import: the parameter name ``html`` shadows the stdlib module.
        from html import unescape

        # Remove elements whose content is not part of the poem text
        html = re.sub(r'<img\b[^>]*>', '', html, flags=re.IGNORECASE)
        html = re.sub(r'<i\b[^>]*>.*?</i>', '', html, flags=re.IGNORECASE | re.DOTALL)
        html = re.sub(r'<span.*?</span>', '', html, flags=re.IGNORECASE | re.DOTALL)

        # Convert <p> (with or without attributes) and <br> tags to line breaks
        html = re.sub(r'</?p\b[^>]*>', '\n', html, flags=re.IGNORECASE)
        html = re.sub(r'<br\s*/?>', '\n', html, flags=re.IGNORECASE)

        # Strip every remaining tag (e.g. <b>, <a ...>) but keep its text,
        # then decode entities. Decoding comes last so an escaped "&lt;b&gt;"
        # is never mistaken for markup.
        html = re.sub(r'<[^>]+>', '', html)
        html = unescape(html)

        # Format for Luc Bat style (filter lines with at least 6 words).
        # split()/join also collapses runs of whitespace inside a line.
        formatted_lines = []
        for line in html.strip().split('\n'):
            words = line.split()
            if len(words) >= 6:  # Luc Bat typically has at least 6 words per line
                formatted_lines.append(" ".join(words))

        return "\n".join(formatted_lines)

    def extract_poem_source(self):
        """Return the text of the first ``div`` with class ``small`` on the current page.

        Returns:
            The element text, or ``''`` when the element is not found within
            the wait timeout or any other error occurs.
        """
        try:
            poem_src_tag = self.wait.until(
                EC.presence_of_element_located((By.XPATH, '//div[@class="small"]'))
            )
            return poem_src_tag.text
        except Exception:
            # Source information is optional; a missing tag is not an error.
            return ''

    def extract_poem_links(self, page_idx):
        """Collect poem titles and URLs from one search-result page.

        Args:
            page_idx: Page number substituted into ``SEARCH_URL``.

        Returns:
            A list of ``{'title': ..., 'url': ...}`` dicts, one per list item
            that has a header link with a URL.
        """
        self.driver.get(self.SEARCH_URL.format(page=page_idx))
        time.sleep(random.uniform(2, 3))  # Randomised pause between page loads

        content_tags_xpath = '//*[@class="page-content container"]//div[@class="page-content-main"]//div[@class="list-item"]'
        content_tags = self.driver.find_elements(By.XPATH, content_tags_xpath)
        poem_links = []

        for tag in content_tags:
            try:
                link_element = tag.find_element(By.XPATH, './/h4[@class="list-item-header"]/a')
                poem_title = link_element.text
                poem_url = link_element.get_attribute('href')
            except Exception:
                # Best effort: skip list items without a usable header link.
                continue
            if poem_url:
                # A link without an href cannot be visited, so it is not queued.
                poem_links.append({'title': poem_title, 'url': poem_url})

        return poem_links

    def scrape_poem(self, poem_url, poem_title):
        """Load one poem page and extract its title, text and source.

        Args:
            poem_url: URL of the poem page.
            poem_title: Fallback title, used when the poem HTML has no ``<h1>``.

        Returns:
            A dict with keys ``title``, ``content``, ``source`` and ``url``.

        Raises:
            Whatever the WebDriver raises when the page cannot be loaded or
            ``div.poem-content`` does not become visible within the timeout.
        """
        self.driver.get(poem_url)
        time.sleep(random.uniform(1.5, 2.5))  # Randomised pause between page loads

        poem_content_tag = self.wait.until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.poem-content'))
        )
        html_content = poem_content_tag.get_attribute('innerHTML')
        poem_src = self.extract_poem_source()

        # Prefer a title embedded in the poem HTML; otherwise use the list title
        title_match = re.search(r'<h1>(.*?)</h1>', html_content, flags=re.IGNORECASE)
        title = title_match.group(1).strip() if title_match else poem_title

        # Clean and format poem content
        content = self.clean_poem_content(html_content)

        return {
            'title': title,
            'content': content,
            'source': poem_src,
            'url': poem_url
        }

    def scrape_poems(self, num_pages=10):
        """Scrape every poem linked from search pages ``1..num_pages``.

        A poem that fails to scrape is reported on stdout and skipped.

        Args:
            num_pages: Number of search-result pages to walk.

        Returns:
            A list of the dicts produced by ``scrape_poem``.
        """
        datasets = []

        for page_idx in tqdm(range(1, num_pages + 1), desc="Scraping pages"):
            poem_links = self.extract_poem_links(page_idx)

            for poem in tqdm(poem_links, desc=f"Page {page_idx} poems", leave=False):
                poem_url = poem['url']
                poem_title = poem['title']

                try:
                    poem_data = self.scrape_poem(poem_url, poem_title)
                    datasets.append(poem_data)
                except Exception as e:
                    # One broken page must not abort the whole crawl.
                    print(f"Error processing {poem_url}: {str(e)}")
                    continue

        return datasets

    def save_to_csv(self, datasets, output_file='poem_dataset.csv'):
        """Write the collected poems to a UTF-8 CSV file.

        Args:
            datasets: List of poem dicts as returned by ``scrape_poems``.
            output_file: Destination path of the CSV file.

        Returns:
            The ``pandas.DataFrame`` that was written (index column included).
        """
        df = pd.DataFrame(datasets)
        df.to_csv(output_file, index=True, encoding='utf-8')
        print(f"Successfully saved {len(datasets)} poems to {output_file}")
        return df

    def close(self):
        """Quit the WebDriver; safe to call repeatedly or after a failed start."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            try:
                driver.quit()
            finally:
                # Drop the reference so a second close() is a no-op.
                self.driver = None

In [14]:
crawler = PoemCrawler()

try:
    # Scrape poems from 10 pages
    datasets = crawler.scrape_poems(num_pages=10)
finally:
    # Always shut the headless browser down, even if scraping fails or is
    # interrupted; otherwise the Chromium process stays alive in the kernel.
    crawler.close()

# Save to csv
df = pd.DataFrame(datasets)
df.to_csv('poem_lucbat_realfinal_dataset.csv', index=True)

Scraping pages:   0%|          | 0/10 [00:00<?, ?it/s]
Page 1 poems:   0%|          | 0/10 [00:00<?, ?it/s][A
Page 1 poems:  10%|█         | 1/10 [00:07<01:05,  7.32s/it][A
Page 1 poems:  20%|██        | 2/10 [00:10<00:41,  5.18s/it][A
Page 1 poems:  30%|███       | 3/10 [00:14<00:31,  4.56s/it][A
Page 1 poems:  40%|████      | 4/10 [00:18<00:24,  4.15s/it][A
Page 1 poems:  50%|█████     | 5/10 [00:24<00:25,  5.00s/it][A
Page 1 poems:  60%|██████    | 6/10 [00:29<00:18,  4.72s/it][A
Page 1 poems:  70%|███████   | 7/10 [00:42<00:22,  7.64s/it][A
Page 1 poems:  80%|████████  | 8/10 [00:47<00:13,  6.60s/it][A
Page 1 poems:  90%|█████████ | 9/10 [00:51<00:05,  5.77s/it][A
Page 1 poems: 100%|██████████| 10/10 [00:57<00:00,  5.92s/it][A
Scraping pages:  10%|█         | 1/10 [01:06<09:55, 66.19s/it]
Page 2 poems:   0%|          | 0/10 [00:00<?, ?it/s][A
Page 2 poems:  10%|█         | 1/10 [00:03<00:28,  3.17s/it][A
Page 2 poems:  20%|██        | 2/10 [00:06<00:28,  3.52s/it][A
P