In [80]:
# Yabla dictionary scraper

import requests
from bs4 import BeautifulSoup

# Putting website content into a request object
r = requests.get("https://chinese.yabla.com/chinese-english-pinyin-dictionary.php?define=point")

# check status code: if successfully accessed, will print Response [200]
print(r)

# print contents. Will print the entire HTML code of the page!
print(r.content)

rformat = BeautifulSoup(r.content, 'html.parser')


r2 = requests.get("https://chinese.yabla.com/chinese-english-pinyin-dictionary.php?define=seashell")

rformat2 = BeautifulSoup(r2.content, 'html.parser')


<Response [200]>
b'<!DOCTYPE html>\n\n<html  lang="en">\n<head>\n\n\n<title>point | Definition | Mandarin Chinese Pinyin English Dictionary | Yabla Chinese</title>\n<!-- \n\tServer Page Time: 0.008s rey.yabla.com\n\t-->\n\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n<meta name="msapplication-config" content="none"/>\n<meta name="description" content="point definition at Chinese.Yabla.com, a free online dictionary with English, Mandarin Chinese, Pinyin, Strokes &amp; Audio. Look it up now!" />\n<!-- favicon images for all different resolutions -->\n<link rel="apple-touch-icon-precomposed" sizes="57x57" href="//yabla.vo.llnwd.net/media.yabla.com/images/new-favicons/apple-touch-icon-57x57.png" />\n<link rel="apple-touch-icon-precomposed" sizes="60x60" href="//yabla.vo.llnwd.net/media.yabla.com/images/new-favicons/apple-touch-icon-60x60.png" />\n<link rel="apple-touch-icon-precomposed" sizes="72x72" href="//yabla.vo.llnwd.net/media.yabla.com/images/new-favicons/a

In [86]:
# So, Yabla's dictionary contents are pretty straightforward
# We can find the first/most relevant definition in
# <span class="word">
#   <a href="?define-[character]">[relevant character 1]</a>
#   <a href="?define-[character]">[relevant character 2]</a>
# and so on so forth - a multi-character word thus gets two or three of these
# If the term isn't in the dictionary, we can see this under <ul id="search_results">
# Where <li> will say "No matches found for "
# Actually though, if the term isn't in the dictionary, then what we do is
# use the .find() method on a BeautifulSoup object
# which will just give us None if the term isn't in the dictionary

# Let's write some code to extract just one word!
import re
tlString = rformat.find_all(class_="entry center_maxed", limit=3)
wordStrings = []

for strng in tlString:
    equiv = strng.find(class_="word")
    equivMatches = re.findall("(?<=\">).(?=</a)", str(equiv))
    wordStrings.append("".join(equivMatches))

print(wordStrings)
tlString2 = rformat2.find_all(class_="entry center_maxed")
print(tlString2)

# Pretty sure we can use regex to extract the characters between "> and </a
# import re
# matches = re.findall("(?<=\">).(?=</a)", tlString)
# matches = re.findall('(?<=")(.*?)(?=</a)', str(tlString))
# print(matches)

# print(matches)

# matches2 = re.findall("(?<=\">).(?=</a)", tlString2)
# print(matches2)

# # So our findall method using regex gives us a LIST of individual characters making up the list
# # We can then concatenate the list using "join"

# matchStr = "".join(matches)
# matchStr2 = "".join(matches2)

# print(matchStr)
# print(matchStr2)


['处', '点', '热点']
[]


In [83]:
# Defining a helper function to extract TL equiv from the HTML
def getEquiv(html):
    import re
    wordStrings = []

    for strng in html:
        equiv = strng.find(class_="word")
        equivMatches = re.findall("(?<=\">).(?=</a)", str(equiv))
        wordStrings.append("".join(equivMatches))
    # tlString = str(html)

    # tlChars = re.findall("(?<=\">).(?=</a)", tlString)

    # tlEquiv = "".join(tlChars)

    # return tlEquiv
    return wordStrings



In [87]:
# Our full scraper script for the Yabla online dictionary!

import json
from urllib.error import HTTPError

# Load english text set
engWords = []
with open("C:/Users/jkami/OneDrive/Documents/UCSB/SPR23/LING111 Project/english_lexicon.txt") as in_file:
    for line in in_file:
        engWords.append(line.strip())
engChnDict = dict()
i = 0

# Iterating over every word in our text set
for word in engWords[]:
    if (i % 11900) == 0:
        print(i)
    i += 1
    # Creating URL to query
    wordUrl = f"https://chinese.yabla.com/chinese-english-pinyin-dictionary.php?define={word}"
    
    # Getting HTMl, formatting, finding translation segment
    try:
        r = requests.get(wordUrl)
    except HTTPError as err:
        continue
    rhtml = BeautifulSoup(r.content, 'html.parser')
    # tlString = rhtml.find(class_="word")
    
    tlString = rhtml.find_all(class_="entry center_maxed", limit=3)

    # If word isn't in dictionary, rhtml.find returns None
    if tlString == []:
        continue
    else: 
        # Word is in dictionary, so get the translation
        zhEquiv = getEquiv(tlString)
        
    engChnDict[word] = zhEquiv


# Pushing the whole dictionary to a JSON file!
with open("yabla dict.json", "w", encoding='utf-8-sig') as out_file:
    json.dump(engChnDict, out_file, indent=4, ensure_ascii=False)


0
['前', '关于', '关于']
['亚琛', '阿亨', '阿亨工业大学']
['土猪', '土豚']
['炎亚纶', '亚伦']


In [3]:
# Now to put all three Yabla dictionaries together...

import json
dictList = []

for i in range(1, 4):
    with open(f"yabla dict jsons/yabla dict {i}.json", encoding="utf-8-sig") as in_file:
        ydict = json.load(in_file)
        dictList.append(ydict)

fullYablaDict = dictList[0] | dictList[1] | dictList[2]

# for i in range(2, 6):
#     fullCDict = fullCDict | dictList[i]

with open("yabla dict jsons/full yabla dict.json", "w", encoding="utf-8-sig") as out_file:
    json.dump(fullYablaDict, out_file, indent=4, ensure_ascii=False)