In [50]:
import time
import pickle
from itertools import combinations

import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select

import pyderman

from tqdm import tqdm

from lazyme import zigzag, retry

In [32]:
# Get the path of a chromedriver binary through pyderman.
path = pyderman.install(browser=pyderman.chrome)

# Configure Chrome to run headless.
# NOTE(review): the original remark on this flag says headless mode does not
# work for this site, yet the flag is still switched on -- confirm which is intended.
options = Options()
options.headless = True

# Launch the browser session that the rest of the notebook drives.
driver = webdriver.Chrome(path, options=options)

chromedriver is already installed.


In [33]:
# Open the gov.sg translation resources page in the browser.
translation_url = 'https://www.gov.sg/resources/translation'
driver.get(translation_url)

In [34]:
# Locate the category dropdown and choose its "All" entry (value '-1').
category_dropdown = driver.find_element_by_name("content_0$DdlCategory")
category = Select(category_dropdown)
category.select_by_value('-1')

In [35]:
# Select the source and target languages for the query.
from_lang = Select(driver.find_element_by_name("content_0$DdlFrom"))
to_lang = Select(driver.find_element_by_name("content_0$DdlTo"))

from_lang.select_by_value('1') # English.
to_lang.select_by_value('2')   # Mandarin.

In [36]:
@retry(Exception, delay=1)
def find_last_page(driver):
    """Jump to the last results page and return its page number.

    Clicks the pager's "Last" control, parses the resulting page source and
    reads the text of the final <span> inside the ``rgPager`` table row,
    converted to ``int``.

    The browser is left on the last page. Any exception (for example the
    pager row not being present in the parsed source yet) is handed to the
    ``retry`` decorator, which re-invokes this function, including the click.
    """
    # Navigate to the last page.
    last_button = driver.find_element_by_id("content_0_RGridTranslation_ctl00_ctl03_ctl01_Last")
    last_button.click()
    # Parse the pager row; its final <span> carries the last page number.
    soup = BeautifulSoup(driver.page_source, 'lxml')
    pager_row = soup.find("tr", attrs={"class": "rgPager"})
    page_spans = pager_row.find_all('span')
    return int(page_spans[-1].text)
    
# Submit the query with the "search" (magnifying glass) button.
search_button = driver.find_element_by_name("content_0$BtnTranslateSearch")
search_button.click()
# Record the number of result pages (this leaves the browser on the last page).
last_page = find_last_page(driver)
# Rewind the result grid to the first page before scraping.
first_button = driver.find_element_by_id("content_0_RGridTranslation_ctl00_ctl03_ctl01_BtnFirst")
first_button.click()

'NoneType' object has no attribute 'find_all', Retrying in 1 seconds...


In [37]:
# Sanity check on the page count parsed from the pager.
# NOTE(review): 1045 is pinned to what the live site reported when this cell
# was run; the cell will fail on a fresh run if the number of pages has
# changed -- confirm whether an exact match is really intended.
assert last_page == 1045

In [40]:
@retry(Exception, delay=1)
def munge_page_for_translations(driver):
    """Extract the term pairs shown on the page currently loaded in `driver`.

    Parses the page source, collects the stripped, non-empty text of every
    <div> inside the first <tbody>, and pairs the texts up alternately
    (1st with 2nd, 3rd with 4th, ...) into a dictionary.

    Returns:
        dict mapping each odd-positioned text to the text that follows it.

    Any exception (for example no <tbody> in the parsed source) is handed to
    the ``retry`` decorator, which re-invokes this function.

    NOTE(review): empty cells are dropped before pairing, so a row with a
    blank source or target cell would shift every later pair on that page --
    confirm the site never renders blank cells.
    """
    # Fetch the page source once so parsing works on a single snapshot
    # (the original fetched it twice and discarded the first copy).
    html = driver.page_source
    bsoup = BeautifulSoup(html, 'lxml')
    # Munge and get the translations, stripping each cell's text only once.
    translations = []
    for div in bsoup.find('tbody').find_all('div'):
        text = div.text.strip()
        if text:
            translations.append(text)
    # zigzag splits a list into two by alternating even and odd items.
    # zip(*iterable) iterates through the zigzag lists one pair at a time.
    return dict(zip(*zigzag(translations)))

In [41]:
# Try the parser on the page currently loaded in the browser and display the result.
munge_page_for_translations(driver)

{'"Coverage for Life, Coverage for All"': '终身受保，人人受保',
 '"Don\'t Drive to Drink. And "You\'ll Never Drink and Drive."': '“不要开车去喝酒就可避免酒后开车。”',
 '"Foreign Equity Investment in Singapore" Report': '《新加坡外来投资》报告',
 '"Key Household Income Trends" Report': '《住户收入主要趋势》报告',
 '"Made in Singapore" flea market': '“新加坡制造跳蚤市场”',
 '"Monthly Digest of Statistics, Singapore"': '《新加坡统计月刊》',
 '"Multi-polar, multi-support" strategy': '多点多极支撑发展战略',
 '"My Charity" page': '"我的慈善"网页',
 '"Population Trends" Report': '《人口趋势》报告',
 '"Really Worth It " Hawker List': '“最物有所值”小贩名单'}

In [42]:
# Accumulates the term pairs from every page; later pages overwrite earlier
# entries that share the same key.
terminology = {}

# implicitly_wait() sets a sticky element-lookup timeout for the whole session;
# it is not a pause. Configure it once instead of re-sending it on every page.
driver.implicitly_wait(1.5)

# Iterate through the pages and get the dictionary entries for each page.
for i in tqdm(range(last_page)):
    translations = munge_page_for_translations(driver)
    terminology.update(translations)
    # Moves to the next page; after the final page there is nothing left to
    # fetch, so the extra "Next" click is skipped.
    if i < last_page - 1:
        driver.find_element_by_id("content_0_RGridTranslation_ctl00_ctl03_ctl01_Next").click()

  1%|▏         | 15/1045 [00:33<40:55,  2.38s/it]

KeyboardInterrupt: 

In [55]:
# Turn the scraped dictionary into (english, mandarin) rows and load them
# into a two-column dataframe.
rows = [(english, mandarin) for english, mandarin in terminology.items()]
df = pd.DataFrame(rows, columns=['english', 'mandarin'])

In [56]:
# Preview the first few rows of the dataframe.
df.head()

Unnamed: 0,english,mandarin
0,"""Coverage for Life, Coverage for All""",终身受保，人人受保
1,"""Don't Drive to Drink. And ""You'll Never Drink...",“不要开车去喝酒就可避免酒后开车。”
2,"""Foreign Equity Investment in Singapore"" Report",《新加坡外来投资》报告
3,"""Key Household Income Trends"" Report",《住户收入主要趋势》报告
4,"""Made in Singapore"" flea market",“新加坡制造跳蚤市场”


In [59]:
# Write the dataframe out as a tab-separated file, without the index column.
df.to_csv(
    '../datasets/gov-sg-terms-translations.tsv',
    sep='\t',
    index=False,
    quotechar='"',
)

In [61]:
# Example: load the saved tab-separated file back into a dataframe.
pd.read_csv(
    '../datasets/gov-sg-terms-translations.tsv',
    sep='\t',
    quotechar='"',
)

Unnamed: 0,english,mandarin
0,"""Coverage for Life, Coverage for All""",终身受保，人人受保
1,"""Don't Drive to Drink. And ""You'll Never Drink...",“不要开车去喝酒就可避免酒后开车。”
2,"""Foreign Equity Investment in Singapore"" Report",《新加坡外来投资》报告
3,"""Key Household Income Trends"" Report",《住户收入主要趋势》报告
4,"""Made in Singapore"" flea market",“新加坡制造跳蚤市场”
...,...,...
10435,ZHENGHUA FLYOVER,正华立交桥
10436,Zhong Clan Association Singapore,新加坡钟氏公会
10437,ZION CLOSE,锡安弄
10438,ZION ROAD,锡安路
