#### LING 111 Project: ENG-CHN word-to-word translation

In [13]:
# Scraping a dictionary: rudimentary steps
# First, request a web page!
# For this one, I'll request an entry from the Cambridge ENG-CHN dictionary.
# I checked https://dictionary.cambridge.org/robots.txt first to make sure scraping these pages is permitted,
# so we can continue!

import requests
from bs4 import BeautifulSoup

# Putting website content into a response object
# Cambridge gives us a ConnectionError when scraping without an agent
# So we're sending a fixed, browser-like User-Agent header to avoid this problem
# The timeout (in seconds) keeps this cell from hanging forever if the server never answers
r = requests.get(
    "https://dictionary.cambridge.org/dictionary/english-chinese-simplified/peacock",
    headers={
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    },
    timeout=10,
)
# Otherwise we could just have done this:
#r = requests.get("https://dictionary.cambridge.org/dictionary/english-chinese-simplified/person")

# check status code: if successfully accessed, will print <Response [200]>
print(r)

# print contents. Will print the entire HTML code of the page (as bytes)!
print(r.content)


<Response [200]>
b'<!doctype html>\n<html lang="en" >\n\n<head>\n    <title>peacock in Simplified Chinese - Cambridge Dictionary</title>\n    <meta charset="utf-8">\n    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n    <meta name="description" content="peacock translate: &#23380;&#38592;, &#27880;&#37325;&#20202;&#34920;&#21644;&#34915;&#39280;&#30340;&#30007;&#23376;&#65292;&#34394;&#33635;&#39556;&#20658;&#30340;&#30007;&#23376;. Learn more in the Cambridge English-Chinese simplified Dictionary.">\n    <meta name="keywords" content="peacock, chinese (simplified), dictionary, english, british, british english, definition, define, meaning, spelling, conjugation, audio pronunciation, free, online">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name=\'viewport\' content="width=device-width,minimum-scale=1,initial-scale=1">\n            \n        \n    \n                \n                    <script type="text/javascript">\n                w

In [9]:
def getTranslation(tlString):
    """Return the most relevant Chinese translation found in a dictionary span.

    Parameters
    ----------
    tlString : str
        HTML formatted as such:
        <span class='trans dtrans dtrans-se break-cj'
        lang='zh-Hans'> [chinese translation equivalent] </span>
        Only the text of the first tag in the string is used. If the string
        contains no tag text at all, the whole string is treated as the
        translation text.

    Returns
    -------
    str
        The translation text with full-width parenthesised sections and
        ellipses removed, cut off at the first ／ ， 、 or ； so that only the
        first listed equivalent remains.
    """
    import re

    # Using regex to get [chinese translation equivalent]:
    # the text between the first '>' and the next '<'
    found = re.search(r"(?<=>).*?(?=<)", tlString)
    match = found.group(0) if found else tlString

    # Removes every parenthesis section. Each "（" is paired with the first
    # "）" that FOLLOWS it (an unclosed "（" swallows the rest of the text).
    # Searching for the closing mark from the start of the string instead
    # loops forever when a stray "）" precedes a later "（".
    match = re.sub(r"（[^）]*(?:）|\Z)", "", match)

    # Drop all ellipsis placeholders, not just the first one
    match = match.replace("…", "")

    # We just want the first entry, which occurs before any
    # slash, comma, enumeration comma or semicolon
    return re.split(r"[／，、；]", match, maxsplit=1)[0]

In [14]:
# The raw HTML from the request is hard to read as-is,
# so BeautifulSoup parses it into a tree we can search.
dictContent = BeautifulSoup(r.content, 'html.parser')

# How do we get our desired Chinese equivalent from this?
# Inspecting the dictionary page shows that each Chinese translation of the
# looked-up word sits in an element that starts with
# "<span class="trans dtrans dtrans-se break-cj" ..."
# find_all with the class_ kwarg and limit=3 collects at most the first three
# of those spans (dictContent.find(class_=...) would return only the first one).
translationClass = "trans dtrans dtrans-se break-cj"
equivs = dictContent.find_all(class_=translationClass, limit=3)

# Show the matched spans
print(equivs)

# Using our defined getTranslation function.
# It reads the text of the first tag in the string it is given, so passing
# the stringified list yields the translation from the first span.
equivsHtml = str(equivs)
mylist = getTranslation(equivsHtml)
print(mylist)


[<span class="trans dtrans dtrans-se break-cj" lang="zh-Hans">孔雀</span>, <span class="trans dtrans dtrans-se break-cj" lang="zh-Hans">注重仪表和衣饰的男子，虚荣骄傲的男子</span>]
孔雀


In [12]:
# Our full scraper script for the Cambridge online dictionary!
# VERSION 1
# Actually, I used a modified version that split the ENG words dataset into
# batches so that I could run the scrapers in parallel. They accomplish
# the same thing as this one large chunk of code.

import json
from urllib.error import HTTPError

# Load english text set (one word per line; blank lines are skipped so we
# never query the bare dictionary URL)
engWords = []
with open("newEngWords.txt") as in_file:
    for line in in_file:
        word = line.strip()
        if word:
            engWords.append(word)

engChnDict = dict()
url = "https://dictionary.cambridge.org/dictionary/english-chinese-simplified/"

# Browser-like User-Agent; Cambridge refuses requests without one.
# Built from adjacent string literals: a backslash continuation inside a
# single literal would embed the next line's indentation in the header.
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
)

# Iterating over every word in our text set
for word in engWords:
    # Creating URL to query
    wordUrl = f"{url}{word}"

    # Getting HTML, formatting, finding translation segment
    try:
        r = requests.get(wordUrl, headers={"User-Agent": USER_AGENT}, timeout=10)
    except (requests.exceptions.RequestException, HTTPError) as err:
        # requests reports network failures and timeouts as RequestException
        # subclasses (urllib's HTTPError is never raised by requests; it is
        # kept only for backward compatibility). Skip this word and carry on
        # instead of aborting the whole scrape.
        continue
    rhtml = BeautifulSoup(r.content, 'html.parser')
    tlString = rhtml.find(class_="trans dtrans dtrans-se break-cj")

    # If word isn't in dictionary, rhtml.find returns None
    # as there is no trans dtrans dtrans-se break-cj class, since
    # Cambridge sends us to the homepage when a word isn't in the dictionary
    if tlString is None:
        continue

    # Word is in dictionary, so get the translation
    tlString = str(tlString)
    zhEquiv = getTranslation(tlString)
    engChnDict[word] = zhEquiv

# Pushing the whole dictionary to a JSON file!
with open("cambridgedict.json", "w", encoding='utf-8-sig') as out_file:
    json.dump(engChnDict, out_file, indent=4, ensure_ascii=False)


FileNotFoundError: [Errno 2] No such file or directory: 'newEngWords.txt'

In [160]:
# Post-processing the data sets
def processWord(match):
    """Clean one scraped Chinese translation down to its first plain entry.

    Parameters
    ----------
    match : str or None
        A scraped translation string.

    Returns
    -------
    str or None
        None if the input is None or empty, or if it contains any ASCII
        letter. Otherwise the text with bracketed sections, 《》 marks and
        ellipses removed, cut off at the first ： ／ ， 、 ； or "; ".
        The result may be an empty string if nothing is left after cleaning.
    """
    import re

    # Nothing to process (None used to raise a TypeError further down)
    if not match:
        return None

    # Entries containing Latin letters are rejected outright
    if re.search('[a-zA-Z]', match) is not None:
        return None

    # Removes parenthesis/bracket sections: from a "（" or "[" up to the
    # nearest following "）" or "]". Raw string, so the backslashes reach the
    # regex engine instead of being invalid string escapes.
    match = re.sub(r"[（\[].*?[）\]]", "", match)

    # Removing other non-relevant characters that show up
    match = re.sub('[《》]', '', match)
    match = match.replace("…", "")

    # We just want the first entry, which occurs before any colon, slash,
    # comma, enumeration comma or semicolon (full-width, or ASCII "; ")
    return re.split(r"：|／|，|、|；|; ", match, maxsplit=1)[0]


腹


In [1]:
# Knitting the scraped JSON dicts together
import json
# Load the five partial dictionaries (files numbered 1 through 5)
dictList = []
for i in range(1, 6):
    batchPath = f"cambridge dict jsons/cambridgedict three senses {i}.json"
    with open(batchPath, encoding="utf-8-sig") as in_file:
        cdict = json.load(in_file)
    dictList.append(cdict)

# Merge them in file order; when a key appears in more than one file,
# the value from the later file wins
fullCDict = {}
for partial in dictList:
    fullCDict = fullCDict | partial

# Write the merged dictionary back out as a single JSON file
mergedPath = "cambridge dict jsons/full cdict three senses.json"
with open(mergedPath, "w", encoding="utf-8-sig") as out_file:
    json.dump(fullCDict, out_file, indent=4, ensure_ascii=False)

In [3]:
# Post-processing the text

with open("cambridge dict jsons/full cdict three senses.json", encoding="utf-8-sig") as in_file:
    cdict = json.load(in_file)

engWords = list(cdict.keys())
chnWords = list(cdict.values())
toDel = []

# Walk the English keys alongside their Chinese values.
# Each value is iterated and its None items are dropped:
#   - if nothing is left, the English key is queued for deletion
#   - otherwise the key is reassigned to the filtered list
# NOTE(review): only None items are filtered here; processWord is not called
# in this cell. Presumably the values were already processed when the JSON
# files were produced - confirm.
for engWord, senses in zip(engWords, chnWords):
    totalProcessed = [sense for sense in senses if sense is not None]
    if totalProcessed:
        cdict[engWord] = totalProcessed
    else:
        toDel.append(engWord)

# Deleting after the loop, once every entry has been examined
for word in toDel:
    del cdict[word]

with open("cambridge dict jsons/processed cdict three senses.json", "w", encoding="utf-8-sig") as out_file:
    json.dump(cdict, out_file, indent=4, ensure_ascii=False)