In [None]:
import time
import pickle
from itertools import combinations

import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select

import pyderman

from tqdm import tqdm

from lazyme import zigzag, retry

In [None]:
# Obtain the chromedriver location through pyderman; it is handed to Chrome below.
path = pyderman.install(browser=pyderman.chrome)

# Configure Chrome to run headless.
# NOTE: headless mode sometimes does not work for this site.
options = Options()
options.headless = True

# Start the browser session used by every later cell.
driver = webdriver.Chrome(path, options=options)

In [None]:
# Load the gov.sg translation resources page in the browser session.
# Every later cell interacts with the form and results grid on this page.
driver.get('https://www.gov.sg/resources/translation')

In [None]:
# Pick the "All" entry of the category drop-down; its option value is '-1'.
category = Select(driver.find_element_by_name("content_0$DdlCategory"))
category.select_by_value('-1')

In [None]:
# Select the source and target languages.
# Both drop-downs are looked up first, then a value is chosen in each.
from_lang = Select(driver.find_element_by_name("content_0$DdlFrom"))
to_lang = Select(driver.find_element_by_name("content_0$DdlTo"))

from_lang.select_by_value('1') # English.
to_lang.select_by_value('2')   # Mandarin.

In [None]:
@retry(Exception, delay=1)
def find_last_page(driver):
    """Jump to the last results page and return its page number.

    Clicks the pager's "Last" button, parses the resulting page source and
    reads the text of the final ``<span>`` inside the ``rgPager`` table row,
    converted to ``int``.

    The function is wrapped with ``retry(Exception, delay=1)`` from lazyme;
    presumably it is re-invoked after a delay when any exception is raised
    (e.g. the pager is not in the page yet) -- confirm against lazyme.

    :param driver: the Selenium webdriver showing the search results.
    :return: page number shown last in the pager, as an int.
    """
    last_button_id = "content_0_RGridTranslation_ctl00_ctl03_ctl01_Last"
    driver.find_element_by_id(last_button_id).click()
    # Parse whatever the browser currently holds after the click.
    soup = BeautifulSoup(driver.page_source, 'lxml')
    pager_row = soup.find("tr", attrs={"class":"rgPager"})
    pager_spans = pager_row.find_all('span')
    # The final span in the pager row carries the last page number.
    return int(pager_spans[-1].text)
    
# Click on the "search" (magnifying glass) button to populate the results grid.
driver.find_element_by_name("content_0$BtnTranslateSearch").click()
# Find the last page number; this leaves the browser on the last page.
last_page = find_last_page(driver)
# Go back to the first page so that scraping can start from the beginning.
driver.find_element_by_id("content_0_RGridTranslation_ctl00_ctl03_ctl01_BtnFirst").click()

In [None]:
# Sanity check on the page count returned by find_last_page.
# NOTE(review): 1045 is a pinned value from a live website; presumably it was
# the page count when the notebook was written and will need updating
# whenever the site's content changes -- confirm before re-running.
assert last_page == 1045

In [None]:
@retry(Exception, delay=1)
def munge_page_for_translations(driver):
    """Extract the translation pairs shown on the current results page.

    The page source is read once, parsed with BeautifulSoup, and the stripped
    text of every non-empty ``<div>`` under the first ``<tbody>`` is collected
    in document order. Consecutive items are then paired up: items at even
    positions become keys and the items that follow them become values.

    The function is wrapped with ``retry(Exception, delay=1)`` from lazyme;
    presumably it is re-invoked after a delay when any exception is raised
    (e.g. ``find('tbody')`` returning ``None`` while the grid is loading) --
    confirm against lazyme.

    :param driver: the Selenium webdriver showing a results page.
    :return: dict mapping each even-positioned text to the text after it.
    """
    # Read the page source a single time and parse that snapshot, so there is
    # only one round trip to the browser.
    html = driver.page_source
    bsoup = BeautifulSoup(html, 'lxml')
    # Strip each div's text once, then drop the empty ones.
    stripped = (div.text.strip() for div in bsoup.find('tbody').find_all('div'))
    translations = [text for text in stripped if text]
    # zigzag splits a list into two by alternation, even and odd items.
    # zip(*iterable) iterates through the zigzag lists one pair at a time.
    return dict(zip(*zigzag(translations)))

In [None]:
# Preview the entries extracted from the page currently shown in the browser.
munge_page_for_translations(driver)

In [None]:
# Accumulates the entries of every results page; later pages overwrite
# earlier ones when the same key appears again.
terminology = {}

# Iterate through the pages and get the dictionary entries for each page.
for i in tqdm(range(last_page)):
    translations = munge_page_for_translations(driver)
    terminology.update(translations)
    # The final page has no successor, so only advance while pages remain.
    if i < last_page - 1:
        # Moves to the next page.
        driver.find_element_by_id("content_0_RGridTranslation_ctl00_ctl03_ctl01_Next").click()
        # Actually pause so the grid can refresh before it is scraped again.
        # driver.implicitly_wait() only configures the element-lookup timeout
        # and returns immediately, so it does not give the page time to load.
        time.sleep(1.5)

In [None]:
# Turn the scraped key/value pairs into a table with one row per entry
# and two named columns.
df = pd.DataFrame(
    data=list(terminology.items()),
    columns=['english', 'mandarin'],
)

In [None]:
# Show the first rows of the table as a quick visual check.
df.head()

In [None]:
# Write the table to disk as a tab-separated file, leaving out the index.
df.to_csv(
    '../datasets/gov-sg-terms-translations.tsv',
    sep='\t',
    quotechar='"',
    index=False,
)

In [None]:
# Load the saved tab-separated file back, as an example of re-reading it.
pd.read_csv(
    '../datasets/gov-sg-terms-translations.tsv',
    sep='\t',
    quotechar='"',
)