In [None]:
!pip install beautifulsoup4



In [None]:
from enum import Enum
from bs4 import BeautifulSoup

In [None]:
class ScrapeType(Enum):
  """Identifies which kind of scraper `getScraper` should construct."""
  TEXT = 1  # plain-text documents; mapped to TextFileScraper by getScraper
  HTML = 2  # HTML pages; mapped to HTMLScraper by getScraper

In [None]:
# Scrapes parallel video subtitles from "https://ailt.ilrdf.org.tw/colloquial/index"
class HTMLScraper:
  """Extracts parallel subtitle lines from an HTML page.

  Call `init_soup` with the page markup first: `parallel_scrape` reads the
  parsed tree that `init_soup` stores on `self.soup`.
  """

  def init_soup(self, html: str, parser: str = 'html.parser'):
    """Parse `html` and keep the resulting tree on `self.soup`.

    Args:
      html: Markup of the page to scrape.
      parser: Name of the BeautifulSoup parser backend. Defaults to the
        standard-library 'html.parser'; pass e.g. 'lxml' if that package is
        installed and preferred.
    """
    # Naming the parser explicitly avoids bs4's GuessedAtParserWarning and keeps
    # the parse tree from depending on which optional parsers are installed.
    self.soup = BeautifulSoup(html, parser)

  def parallel_scrape(self, source_class: str, target_class: str):
    """Pair source-language subtitles with target-language subtitles.

    Args:
      source_class: CSS class of the <div>s holding source-language lines
        (e.g. 'translation_ind').
      target_class: CSS class of the <div>s holding target-language lines
        (e.g. 'translation_zh').

    Returns:
      A list of [source_line, target_line] lists. The source line is the text
      of the div's 'ind_dictionary' spans joined by single spaces; the target
      line is the div's full text with surrounding whitespace stripped.
    """
    sources = self.soup.find_all('div', class_=source_class)
    targets = self.soup.find_all('div', class_=target_class)

    pairs = []
    # Divs are paired by position; zip stops at the shorter list, so any
    # unmatched trailing divs are dropped.
    for source_div, target_div in zip(sources, targets):
      word_spans = source_div.find_all('span', class_='ind_dictionary')
      source_line = ' '.join(span.text for span in word_spans)
      target_line = target_div.get_text(strip=True)
      pairs.append([source_line, target_line])

    return pairs

In [None]:
# Scrapes parallel sentences from presidential apologies or any text file.
class TextFileScraper:
  """Builds parallel sentence pairs from two line-aligned UTF-8 text files.

  Call `set_doc_locations` first: `parallel_scrape` reads the two paths it
  stores on the instance.
  """

  def set_doc_locations(self, d1: str, d2: str):
    """Record the file paths to read.

    Args:
      d1: Path of the source-language file.
      d2: Path of the target-language file.
    """
    self.source_loc = d1
    self.target_loc = d2

  @staticmethod
  def _read_sentences(path: str):
    """Return every line of `path`, stripped and terminated by one newline."""
    with open(path, 'r', encoding='utf-8') as file:
      # Iterating the file (instead of read().split) does not yield a phantom
      # empty entry after the final newline, which previously produced a
      # blank or misaligned last pair. Blank lines inside the file are kept
      # so positional alignment is preserved.
      return [line.strip() + '\n' for line in file]

  def parallel_scrape(self):
    """Pair line i of the source file with line i of the target file.

    Returns:
      A list of [source_sentence, target_sentence] lists. Each sentence has
      surrounding whitespace stripped and ends with a single newline.
    """
    source_sents = self._read_sentences(self.source_loc)
    target_sents = self._read_sentences(self.target_loc)

    # Min-length alignment (may change): zip stops at the shorter file, so
    # surplus lines in the longer file are ignored.
    return [[source_sent, target_sent]
            for source_sent, target_sent in zip(source_sents, target_sents)]

In [None]:
# Factory method for creating scrapers
def getScraper(scrapeType : ScrapeType):
    """Instantiate the scraper class registered for `scrapeType`.

    Args:
        scrapeType: Member of ScrapeType selecting the scraper to build.

    Returns:
        A new TextFileScraper for ScrapeType.TEXT, or a new HTMLScraper for
        ScrapeType.HTML.

    Raises:
        KeyError: If `scrapeType` has no registered scraper class.
    """
    scraper_cls = {
        ScrapeType.TEXT: TextFileScraper,
        ScrapeType.HTML: HTMLScraper,
    }[scrapeType]

    return scraper_cls()