# Scraping common Japanese words with examples

The goal is to obtain a csv file with data scraped from https://iknow.jp/courses

In [None]:
#import libraries
#https://www.linkedin.com/pulse/scraping-therapists-python-selenium-beautifulsoup-scott-edenbaum
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
import numpy as np
import lxml
import pandas as pd
import re
import sys
from bs4 import BeautifulSoup, Comment

def scrape_japanese(url):
    driver = webdriver.Chrome("C:/Users/karol/drivers/chromedriver")  #select selenium web driver
    driver.get(url) #open the url in selenium
    jpVocabTable = [] #this list will be populated with scraped text
    soup = BeautifulSoup(driver.page_source,'html5lib') #grab the content with beautifulsoup for parsing
    main_table = soup.findAll("li",{"class":"item"})  # select the desired html node

    for row in main_table:
        rowList = []
        
        ######## find japanese words ###########################
        
        href = row.contents[0].contents[2].contents[0].find('a', href=True)
        word = row.find_all('a', href=True)
        jpWord= href.text.replace('\'', '')
        
        ######## find transliteration ###########################
        
        translit = row.contents[0].contents[2].contents[0].find('span', {"class":"transliteration"})
        if translit is not None:
            comments = translit.findAll(text=lambda text:isinstance(text, Comment))
            for comment in comments:
                comment.extract()
            transliteration = re.sub('\[|\]', '', translit.text)
        else: #there are cases where there is no transliteration and we want them empty
            transliteration = ""

        ######## find English translation ###########################

        trans = row.contents[0].contents[2].find('p', {"class":"response"})
        #if trans is not None:  #some returning nones
        translation = trans.text.replace('\'', '')

         ######## find example sentence 1 ###########################

        sen1 = row.contents[1].contents[0].contents[2].find('span').text

        ######## find example sentence 1 transliteration ###########################

        sen1transli = row.contents[1].contents[0].contents[2].find('p', {"class":"transliteration"}).text

        ######## find example sentence 1 translation ###########################

        sen1transla = row.contents[1].contents[0].contents[2].find('p', {"class":"translation"}).text

        ######## find example sentence 2 ###########################
        try:
            sen2 = row.contents[1].contents[1].contents[2].find('span').text
        except IndexError as e:
            print(e)

        ######## find example sentence 2 transliteration ###########################
        try:
            sen2transli = row.contents[1].contents[1].contents[2].find('p', {"class":"transliteration"}).text
        except IndexError as e:
            print(e)

        ######## find example sentence 2 translation ###########################
        try:
            sen2transla = row.contents[1].contents[1].contents[2].find('p', {"class":"translation"}).text
        except IndexError as e:
            print(e)


        rowList.extend([jpWord,transliteration, translation, sen1, sen1transli, sen1transla, sen2, sen2transli, sen2transla])
        jpVocabTable.append(rowList)
            
    driver.close()    
    print("Scraping Complete!")
    return(jpVocabTable)
        
        
url = "https://iknow.jp/courses/566921" 
japanese_list = scrape_japanese(url)

In [3]:
import pandas as pd
df = pd.DataFrame(japanese_list)
df.columns = ['word', 'kana', 'translation', 'sentence1', 'sen1kana', 'sen1trans','sentence2','sen2kana', 'sen2trans']
df.to_csv('japaneseWords_100.csv', encoding='utf8', index = False) #don't add an extra line for indexing
df.shape

(100, 9)

In [4]:
random_row = df.sample(n=1)
list_row =random_row.values.tolist()
list_row

[['体',
  'からだ',
  'body, physique, physical condition',
  '私は体が丈夫だ。',
  'わたし は からだ が じょうぶ だ。',
  "I'm physically strong.",
  'タバコは体に悪い。',
  'タバコ は からだ に わるい。',
  'Cigarettes are bad for your health.']]

In [1]:
#Now, repeat scraping for all 1000 words, saving them in separate files
#Note the pages do not follow consecurive numbers https://iknow.jp/courses/566921
#prepare URLs
def createUrlList():
    urlFront = "https://iknow.jp/courses/5669"
    urlEnds = ["21", "22", "24", "25", "26", "27", "28", "29", "30", "32"]
    urlList = []
    for element in urlEnds:
        newListelement = urlFront+element
        urlList.append(newListelement)
    return urlList

urls = createUrlList()
urls

['https://iknow.jp/courses/566921',
 'https://iknow.jp/courses/566922',
 'https://iknow.jp/courses/566924',
 'https://iknow.jp/courses/566925',
 'https://iknow.jp/courses/566926',
 'https://iknow.jp/courses/566927',
 'https://iknow.jp/courses/566928',
 'https://iknow.jp/courses/566929',
 'https://iknow.jp/courses/566930',
 'https://iknow.jp/courses/566932']

When running my function for other vocabulary pages, I got many "list index out of range errors". Sentences of interest were not always in the same place in each of the documents. I ran a few examples and noticed there are 2 possible tree structures. I rewrote my scraping program to check the length of the list, and then depending on the number of list elements retrieve the last element. This could be rewriten in an even nicer way, so it's not hard-coded (get element 2 or 3) but get the last element in a list of a any size. SOmething to improve in the future. Here is my new function:

In [2]:
#import libraries
#https://www.linkedin.com/pulse/scraping-therapists-python-selenium-beautifulsoup-scott-edenbaum
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
import numpy as np
import lxml
import pandas as pd
import re
import sys
from bs4 import BeautifulSoup, Comment

def scrape_japanese_all(url):
    driver = webdriver.Chrome("C:/Users/karol/drivers/chromedriver")  #select selenium web driver
    driver.get(url) #open the url in selenium
    jpVocabTable = [] #this list will be populated with scraped text
    soup = BeautifulSoup(driver.page_source,'html5lib') #grab the content with beautifulsoup for parsing
    main_table = soup.findAll("li",{"class":"item"})  # select the desired html node

    for row in main_table:
        rowList = []
        
        ######## find japanese words ###########################
        
        href = row.contents[0].contents[2].contents[0].find('a', href=True)
        word = row.find_all('a', href=True)
        jpWord= href.text.replace('\'', '')
        
        ######## find transliteration ###########################
        
        translit = row.contents[0].contents[2].contents[0].find('span', {"class":"transliteration"})
        if translit is not None:
            comments = translit.findAll(text=lambda text:isinstance(text, Comment))
            for comment in comments:
                comment.extract()
            transliteration = re.sub('\[|\]', '', translit.text)
        else: #there are cases where there is no transliteration and we want them empty
            transliteration = ""

        ######## find English translation ###########################

        trans = row.contents[0].contents[2].find('p', {"class":"response"})
        #if trans is not None:  #some returning nones
        translation = trans.text.replace('\'', '')
        
        ######## find example sentence 1 ###########################
        
        #create temporary list for find its length
        temp_list_sen1 = row.contents[1].contents[0].contents
        
        # depending on length, ask for the last element       
        if len(temp_list_sen1) ==2 :
            sen1 = row.contents[1].contents[0].contents[1].find('span').text
           # print(sen2)
            sen1transli = row.contents[1].contents[0].contents[1].find('p', {"class":"transliteration"}).text
            sen1transla = row.contents[1].contents[0].contents[1].find('p', {"class":"translation"}).text
        elif len(temp_list_sen1) ==3 :
            sen1 = row.contents[1].contents[0].contents[2].find('span').text
           # print(sen2)
            sen1transli = row.contents[1].contents[0].contents[2].find('p', {"class":"transliteration"}).text
            sen1transla = row.contents[1].contents[0].contents[2].find('p', {"class":"translation"}).text
        else:
            print("different length")

        ######## find example sentence 2 ###########################
    
        temp_list_sen2 = row.contents[1].contents[1].contents
        
        if len(temp_list_sen2) ==2 :
            sen2 = row.contents[1].contents[1].contents[1].find('span').text
           # print(sen2)
            sen2transli = row.contents[1].contents[1].contents[1].find('p', {"class":"transliteration"}).text
            sen2transla = row.contents[1].contents[1].contents[1].find('p', {"class":"translation"}).text
        elif len(temp_list_sen2) ==3 :
            sen2 = row.contents[1].contents[1].contents[2].find('span').text
           # print(sen2)
            sen2transli = row.contents[1].contents[1].contents[2].find('p', {"class":"transliteration"}).text
            sen2transla = row.contents[1].contents[1].contents[2].find('p', {"class":"translation"}).text
        else:
            print("different length")
            
        #add scraped text to list
        rowList.extend([jpWord,transliteration, translation, sen1, sen1transli, sen1transla, sen2, sen2transli, sen2transla])
        jpVocabTable.append(rowList)
            
    driver.close()    
    print("Scraping Complete!")
    return(jpVocabTable)
        
        
#url = "https://iknow.jp/courses/566927" 
#japanese_list = scrape_japanese(url)

In [5]:
# Run scraper for each URl
def getAllVocab(pages_list):   
    for counter, page in enumerate(pages_list, 1):
        japList = scrape_japanese_all(page)
        japDf = pd.DataFrame(japList)
        japDf.columns = ['word', 'kana', 'translation', 'sentence1', 'sen1kana', 'sen1trans','sentence2','sen2kana', 'sen2trans']
        csvFileName = 'japaneseWords_100_' + str(counter) + '.csv'
        japDf.to_csv(csvFileName, encoding='utf8', index = False) #don't add an extra line for indexing

        
######################### enumerate example ############

# my_list = ['apple', 'banana', 'grapes', 'pear']
# for c, value in enumerate(my_list, 1):
#     print(c, value)

# # Output:
# # 1 apple
# # 2 banana
# # 3 grapes
# # 4 pear
#########################################################
getAllVocab(urls)

#this will produce all 10 csv files with 1000 most common japanese words with examples.

Scraping Complete!
Scraping Complete!
Scraping Complete!
Scraping Complete!
Scraping Complete!
Scraping Complete!
Scraping Complete!
Scraping Complete!


ValueError: Length mismatch: Expected axis has 0 elements, new values have 9 elements

The program doesn't always scrape the page properly (not sure why).It fails on various pages, with the: "Length mismatch: Expected axis has 0 elements, new values have 9 elements", the list doesn't get populated at all, on different runs it happens to various pages. I managed to get the 9 csvs, the last one will be done manually below:

In [6]:
        japList = scrape_japanese_all('https://iknow.jp/courses/566932')
        japDf = pd.DataFrame(japList)
        japDf.columns = ['word', 'kana', 'translation', 'sentence1', 'sen1kana', 'sen1trans','sentence2','sen2kana', 'sen2trans']
        csvFileName = 'japaneseWords_100_10.csv'
        japDf.to_csv(csvFileName, encoding='utf8', index = False)

Scraping Complete!
