In [None]:
# Move the kernel's working directory up one level; presumably so that relative
# paths such as 'config.yaml' resolve from the parent directory — TODO confirm.
# NOTE(review): this is not idempotent — re-running the cell moves up again.
%cd ..

import json
import os
import random
import time
import urllib
import urllib.request
from urllib.error import HTTPError

import yaml
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm.notebook import tqdm

In [None]:
# Parse the YAML configuration file and keep only its 'dataset' section.
config_path = 'config.yaml'
with open(config_path) as config_file:
    full_config = yaml.safe_load(config_file)
config = full_config['dataset']

In [None]:
# Read one entry per line (surrounding whitespace stripped) from the file named
# by config['common_char_path'], then keep only the first 100 entries.
with open(config['common_char_path'], 'r') as char_file:
    all_chars = [raw_line.strip() for raw_line in char_file]

common = all_chars[:100]

In [None]:
def send_word_key(word, driver):
    """Type ``word`` into the page's input box and click the button.

    The current content of the first element with class
    ``MuiInputBase-input`` is removed one BACK_SPACE keystroke at a time,
    ``word`` is typed into it, and the first element with class
    ``MuiButton-label`` is clicked through JavaScript.

    Args:
        word: Text to enter into the input box.
        driver: Selenium WebDriver whose current page contains the input box
            and the button.

    Raises:
        AssertionError: If the input box does not hold exactly ``word`` after
            the click.
    """
    # The ``find_element_by_*`` helpers were removed in Selenium 4.3;
    # ``find_element(By.<strategy>, value)`` works on Selenium 3 and 4 alike.
    input_box = driver.find_element(By.CLASS_NAME, 'MuiInputBase-input')

    # Empty the field with keystrokes until its reported value is empty.
    while len(input_box.get_attribute('value')) > 0:
        input_box.send_keys(Keys.BACK_SPACE)

    input_box.send_keys(word)

    button_label = driver.find_element(By.CLASS_NAME, 'MuiButton-label')
    # Click from JavaScript instead of WebElement.click().
    driver.execute_script("arguments[0].click()", button_label)

    # Look the input box up again instead of reusing ``input_box`` so the
    # check reads the element as it exists after the click.
    current_value = driver.find_element(
        By.CLASS_NAME, 'MuiInputBase-input').get_attribute('value')
    assert current_value == word, (
        'input box contains {!r}, expected {!r}'.format(current_value, word))
    
def get_img_wrappers(driver):
    """Collect the image grid items present in the driver's current page.

    The page source is parsed with BeautifulSoup (lxml parser) and every
    ``<div>`` whose class attribute matches the grid-item class string below
    is inspected.

    Args:
        driver: Selenium WebDriver; only its ``page_source`` is read.

    Returns:
        A list of ``(img_tag, text)`` tuples, where ``img_tag`` is the first
        ``<img>`` inside the grid item and ``text`` is the text of its first
        ``<p>``. Grid items missing either element are skipped, so callers
        never receive a ``None`` image tag.
    """
    content = driver.page_source
    soup = BeautifulSoup(content, features='lxml')
    wrappers = soup.find_all('div', {'class': 'MuiGrid-root MuiGrid-item MuiGrid-grid-xs-4 MuiGrid-grid-sm-3 MuiGrid-grid-md-2 MuiGrid-grid-lg-2'})
    imgs = []
    for wrapper in wrappers:
        img_tag = wrapper.find('img')
        caption = wrapper.find('p')
        # An incomplete grid item would otherwise raise AttributeError on
        # ``.text`` here, or hand a ``None`` image tag to the caller.
        if img_tag is None or caption is None:
            continue
        imgs.append((img_tag, caption.text))
    return imgs

In [None]:
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to override the default instead of editing the notebook.
chrome_path = os.environ.get('CHROMEDRIVER_PATH', '/Users/kx/chromedriver')
chrome_options = Options()
chrome_options.add_argument("--headless")
# NOTE(review): passing the driver path positionally is the Selenium 3 call
# style; newer Selenium releases expect a Service object — confirm against the
# installed version.
driver = webdriver.Chrome(chrome_path, options=chrome_options)

dict_url = 'https://www.shufadict.com/dict/x'
driver.get(dict_url)

# Poll the page until the <div id="loading"> element is no longer present.
LOADING_TIMEOUT_S = 120  # give up instead of polling forever
POLL_INTERVAL_S = 3
deadline = time.monotonic() + LOADING_TIMEOUT_S

loading = True
while loading:
    content = driver.page_source
    # Name the parser explicitly (the same one get_img_wrappers uses) so the
    # result does not depend on which parsers happen to be installed.
    soup = BeautifulSoup(content, features='lxml')
    loading = soup.find('div', {'id': 'loading'}) is not None
    if loading and time.monotonic() > deadline:
        # Do not leave a headless browser process behind on failure.
        driver.quit()
        raise TimeoutError(
            '{} still loading after {} seconds'.format(dict_url, LOADING_TIMEOUT_S))
    # Sleep on every pass, including the last one, as the original loop did.
    time.sleep(POLL_INTERVAL_S)

In [None]:
# Load the [word, author, filename] records written by earlier runs. On the
# very first run the lookup file does not exist yet (the scraping loop is what
# creates it), so start from an empty list instead of failing.
if os.path.exists(config['lookup_path']):
    with open(config['lookup_path'], 'r') as f:
        lookup_ls = json.load(f)
else:
    lookup_ls = []

# Words that already have at least one record; the scraping loop skips them.
done = {record[0] for record in lookup_ls}

In [None]:
# For every word not yet present in the lookup: search it, wait for the result
# grid to change, download each image and persist the lookup records.
MAX_RETRIES = 3  # extra attempts to wait for the result grid to refresh

prev_img = ''
for i, word in enumerate(common):

    print('[STATUS] {}/{} {}'.format(i + 1, len(common), word))

    if word in done:
        continue

    send_word_key(word, driver)

    word_path = os.path.join(config['raw_dir'], word)
    os.makedirs(word_path, exist_ok=True)

    retries = MAX_RETRIES
    imgs = get_img_wrappers(driver)

    # An empty grid, or a first item identical to the previous word's first
    # item, means the page has not (yet) shown results for this word.
    while (len(imgs) == 0 or imgs[0] == prev_img) and retries > 0:
        time.sleep(random.random() + 5)
        imgs = get_img_wrappers(driver)
        retries -= 1

    if len(imgs) == 0:
        print('[ERROR] 0 images found for {}'.format(word))
        # Nothing to download; also avoids indexing imgs[0] below.
        continue

    if imgs[0] == prev_img:
        # Still showing the previous word's results: saving them would file
        # the wrong images under this word. Skip so a later run can retry.
        print('[ERROR] results did not refresh for {}'.format(word))
        continue

    for img, author in tqdm(imgs):
        img_src = img.get('src') if img is not None else None
        if not img_src:
            print('[ERROR] image without src for {}'.format(word))
            continue

        # File name is the last URL path segment with any '@...' suffix removed.
        filename = img_src.split('/')[-1].split('@')[0]
        lookup = [word, author, filename]
        if lookup in lookup_ls:
            continue

        lookup_ls.append(lookup)
        img_path = os.path.join(word_path, filename)
        if os.path.exists(img_path):
            continue

        try:
            urllib.request.urlretrieve(img_src, img_path)
        except HTTPError:
            print('[ERROR] HTTPError', img_src)

    # Persist after every word so an interrupted run can resume.
    with open(config['lookup_path'], 'w') as f:
        json.dump(lookup_ls, f)

    prev_img = imgs[0]

In [None]:
# End the Selenium session created earlier in the notebook.
driver.quit()