In [1]:
import cardbuilder
from bs4 import BeautifulSoup
import requests
import re

I began by importing the required libraries and writing a function that returns the content of the page OJAD's word search returns for a given word. It is basically the same as the query_api method from ScrapingMerriamWebster:

In [2]:
def query_api(word, timeout=10):
    """Fetch the OJAD word-search result page for ``word``.

    Args:
        word: The word to look up, e.g. "犬".
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        The raw response body as ``bytes``, ready to be passed to BeautifulSoup.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = "http://www.gavo.t.u-tokyo.ac.jp/ojad/search/index/word:{}".format(word)
    response = requests.get(url, timeout=timeout)
    # Fail here with a clear HTTP error rather than handing an error page to the parser.
    response.raise_for_status()
    return response.content

html = query_api("犬")

In [3]:
soup = BeautifulSoup(html, "html.parser")

In the resulting document there is a table element containing the different forms of the searched word and their pitch accent. There is one table cell with the class "midashi". This cell holds the base form of the word; when you look up a verb it contains both the -ru/-u form and the -masu form. It also has buttons to play the audio for each word form in either the male or the female voice. These buttons work for verbs and adjectives, but they don't seem to work for nouns; I tried several browsers with no luck. I don't think this cell will be useful for scraping the pages, because the text inside basically just gives us back the string argument we passed to query_api(), and the onclick handler of the buttons doesn't give us enough information to work out the specific URLs of the audio files. I looked for the pronounce_play_batch() function in the page's JavaScript and couldn't find it. Maybe you know more about JS than I do and can figure out what's going on with that.

[the page returned by searching for "犬"](http://www.gavo.t.u-tokyo.ac.jp/ojad/search/index/sortprefix:accent/narabi1:kata_asc/narabi2:accent_asc/narabi3:mola_asc/yure:visible/curve:invisible/details:invisible/limit:20/word:%E7%8A%AC)

In each of the pages I scraped in this notebook, there was only one row in the table. But if you looked up a word and there were multiple results, it would be easy enough to iterate over each row of the table and extract the same information. Or maybe it makes more sense to only be concerned with the first result.

In these examples, I know I am using a different syntax than you are to search in the DOM. If I create a scraping OJAD class and make a pull request I will clean it up so that it searches the DOM tree in the same way as ScrapingMerriamWebster. I just haven't used BeautifulSoup very much so I used it in the way I was familiar with. 

In [4]:
midashi = soup.find("p", class_="midashi_word")
midashi.string

'犬'

In [5]:
midashi = soup.find("td", class_="midashi")
midashi

<td class="midashi">
<div class="proc_batch_button_word">
<a class="katsuyo_proc_batch_word_female_button" href="#" onclick="pronounce_play_batch('word','3965','female');return false;"></a>
<a class="katsuyo_proc_batch_word_male_button" href="#" onclick="pronounce_play_batch('word','3965','male');return false;"></a>
</div>
<div class="midashi_wrapper">
<p class="midashi_word">犬</p>
</div>
</td>

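In the table, after the "midashi" cell, there are several cells with the class "katsuyo". soup.find returns only the first of them; judging by its extra "katsuyo_jisho_js" class, this appears to be the cell for the dictionary form.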

In [6]:
katsuyo = soup.find("td", class_="katsuyo")
katsuyo

<td class="katsuyo katsuyo_jisho_js">
<div class="katsuyo_proc">
<p>
<span class="katsuyo_accent"><span class="accented_word"><span class="mola_-2"><span class="inner"><span class="char">い</span></span></span><span class="accent_top mola_-1"><span class="inner"><span class="char">ぬ</span></span></span></span></span>
</p>
<div class="katsuyo_proc_button clearfix">
</div>
</div>
</td>

In [7]:
def print_table_cells(word):
    """Look up ``word`` on OJAD and print the word followed by the first
    "midashi" cell and the first "katsuyo" cell, each separated by a blank line.
    """
    page = BeautifulSoup(query_api(word), "html.parser")
    # First matching <td> for each class, in display order.
    cells = [page.find("td", class_=css_class) for css_class in ("midashi", "katsuyo")]
    # sep="\n\n" leaves exactly one empty line between consecutive items.
    print(word, *cells, sep="\n\n")

In [8]:
print_table_cells("食べる")

食べる

<td class="midashi">
<div class="proc_batch_button_word">
<a class="katsuyo_proc_batch_word_female_button" href="#" onclick="pronounce_play_batch('word','1238','female');return false;"></a>
<a class="katsuyo_proc_batch_word_male_button" href="#" onclick="pronounce_play_batch('word','1238','male');return false;"></a>
</div>
<div class="midashi_wrapper">
<p class="midashi_word">食べる・食べます</p>
</div>
</td>

<td class="katsuyo katsuyo_jisho_js">
<div class="katsuyo_proc">
<p>
<span class="katsuyo_accent"><span class="accented_word"><span class="mola_-3"><span class="inner"><span class="char">た</span></span></span><span class="accent_top mola_-2"><span class="inner"><span class="char">べ</span></span></span><span class="mola_-1"><span class="inner"><span class="char">る</span></span></span></span></span>
</p>
<div class="katsuyo_proc_button clearfix">
<a class="katsuyo_proc_female_button js_proc_female_button" href="#" id="1238_1_1_female" onclick="pronounce_play('1238_1_1_female');return 

In [9]:
html = query_api("歩く")
html[:100]

b'<!DOCTYPE html>\n<html>\n<head>\n    <meta http-equiv="X-UA-Compatible" content="IE=10" />\r\n<meta http-'

In [10]:
soup = BeautifulSoup(html, "html.parser")
katsuyo = soup.find("td", class_="katsuyo")
katsuyo

<td class="katsuyo katsuyo_jisho_js">
<div class="katsuyo_proc">
<p>
<span class="katsuyo_accent"><span class="accented_word"><span class="mola_-3"><span class="inner"><span class="char">あ</span></span></span><span class="accent_top mola_-2"><span class="inner"><span class="char">る</span></span></span><span class="mola_-1"><span class="inner"><span class="char">く</span></span></span></span></span>
</p>
<div class="katsuyo_proc_button clearfix">
<a class="katsuyo_proc_female_button js_proc_female_button" href="#" id="68_1_1_female" onclick="pronounce_play('68_1_1_female');return false;"></a>
<a class="katsuyo_proc_male_button js_proc_male_button" href="#" id="68_1_1_male" onclick="pronounce_play('68_1_1_male');return false;"></a>
</div>
</div>
</td>

In [11]:
buttons = katsuyo.find_all("a")
buttons

[<a class="katsuyo_proc_female_button js_proc_female_button" href="#" id="68_1_1_female" onclick="pronounce_play('68_1_1_female');return false;"></a>,
 <a class="katsuyo_proc_male_button js_proc_male_button" href="#" id="68_1_1_male" onclick="pronounce_play('68_1_1_male');return false;"></a>]

In [12]:
for button in buttons:
    print(button.get("onclick"))

pronounce_play('68_1_1_female');return false;
pronounce_play('68_1_1_male');return false;


In [13]:
onclick = buttons[0].get("onclick")
onclick

"pronounce_play('68_1_1_female');return false;"

In [14]:
import math

def get_pronounce_url(soundfile, base_url="http://www.gavo.t.u-tokyo.ac.jp/ojad/sound4/mp3/"):
    """Build the URL of an OJAD pronunciation MP3 from its sound-file id.

    Args:
        soundfile: Id as found in a button's ``pronounce_play('...')`` handler,
            e.g. ``"68_1_1_female"``. The part before the first underscore is
            the numeric word id; the part after the last underscore is the
            voice ("female" / "male").
        base_url: Root of the MP3 tree, ending in "/".

    Returns:
        ``<base_url><voice>/<bucket>/<soundfile>.mp3``. The bucket is the last
        three characters of "00" + (word id // 100), i.e. the id divided by 100
        and zero-padded to three digits: 68 -> "000", 1238 -> "012",
        3965 -> "039".

    Raises:
        ValueError: If the part before the first underscore is not an integer.
    """
    parts = soundfile.split("_")
    word_id, voice = int(parts[0]), parts[-1]
    # Integer floor division keeps the arithmetic exact and needs no math.floor.
    bucket = ("00" + str(word_id // 100))[-3:]
    return "{}{}/{}/{}.mp3".format(base_url, voice, bucket, soundfile)

In [17]:
get_pronounce_url("68_1_1_female")

'http://www.gavo.t.u-tokyo.ac.jp/ojad/sound4/mp3/female/000/68_1_1_female.mp3'

In [18]:
# NOTE(review): print_table_cells() fetches and parses the page itself, so the
# html/soup assigned here are not what it prints from.
html = query_api("犬")
soup = BeautifulSoup(html, "html.parser")

print_table_cells("犬")

犬

<td class="midashi">
<div class="proc_batch_button_word">
<a class="katsuyo_proc_batch_word_female_button" href="#" onclick="pronounce_play_batch('word','3965','female');return false;"></a>
<a class="katsuyo_proc_batch_word_male_button" href="#" onclick="pronounce_play_batch('word','3965','male');return false;"></a>
</div>
<div class="midashi_wrapper">
<p class="midashi_word">犬</p>
</div>
</td>

<td class="katsuyo katsuyo_jisho_js">
<div class="katsuyo_proc">
<p>
<span class="katsuyo_accent"><span class="accented_word"><span class="mola_-2"><span class="inner"><span class="char">い</span></span></span><span class="accent_top mola_-1"><span class="inner"><span class="char">ぬ</span></span></span></span></span>
</p>
<div class="katsuyo_proc_button clearfix">
</div>
</div>
</td>


In [20]:
test_inu_str = "3965_1_1_female"
get_pronounce_url(test_inu_str)

'http://www.gavo.t.u-tokyo.ac.jp/ojad/sound4/mp3/female/039/3965_1_1_female.mp3'

In [21]:
test_taberu = "1238_1_1_female"
get_pronounce_url(test_taberu)

'http://www.gavo.t.u-tokyo.ac.jp/ojad/sound4/mp3/female/012/1238_1_1_female.mp3'

In [22]:
taberu_male = "1238_1_1_male"
get_pronounce_url(taberu_male)

'http://www.gavo.t.u-tokyo.ac.jp/ojad/sound4/mp3/male/012/1238_1_1_male.mp3'

In [23]:
# Request the MP3 for the sound id stored in taberu_male ("1238_1_1_male").
url = get_pronounce_url(taberu_male)
response = requests.get(url, stream=True)
# NOTE(review): evaluating .content displays the whole MP3 as a bytes literal and
# presumably reads the full body despite stream=True — confirm against the requests docs.
response.content

b'\xff\xfb\x94\xc4\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00Info\x00\x00\x00\x0f\x00\x00\x00 \x00\x001\x80\x00\x08\x08\x08\x10\x10\x10\x18\x18\x18   (((000888@@@@HHHPPPXXX```hhhpppxxx\x80\x80\x80\x80\x88\x88\x88\x90\x90\x90\x98\x98\x98\xa0\xa0\xa0\xa8\xa8\xa8\xb0\xb0\xb0\xb8\xb8\xb8\xc0\xc0\xc0\xc0\xc8\xc8\xc8\xd0\xd0\xd0\xd8\xd8\xd8\xe0\xe0\xe0\xe8\xe8\xe8\xf0\xf0\xf0\xf8\xf8\xf8\xff\xff\xff\x00\x00\x009LAME3.99r\x01\xcd\x00\x00\x00\x00.\x05\x00\x00\x14\x80$\x05\x08\x82\x00\x00\x80\x00\x001\x80?\x0e\xc7\x9d\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\

In [24]:
# Save the response body to taberu.mp3, consuming it in 128-byte chunks.
with open("taberu.mp3", "wb") as mp3:
    mp3.writelines(response.iter_content(chunk_size=128))

In [25]:
def get_soup(word):
    """Return the OJAD search result page for ``word`` parsed into a BeautifulSoup tree."""
    return BeautifulSoup(query_api(word), "html.parser")

In [26]:
# Look up 難しい and show its first "midashi" and "katsuyo" cells with a blank line between them.
soup = get_soup("難しい")
midashi, katsuyo = (soup.find("td", class_=css_class) for css_class in ("midashi", "katsuyo"))
print(midashi, katsuyo, sep="\n\n")

<td class="midashi">
<div class="proc_batch_button_word">
<a class="katsuyo_proc_batch_word_female_button" href="#" onclick="pronounce_play_batch('word','3076','female');return false;"></a>
<a class="katsuyo_proc_batch_word_male_button" href="#" onclick="pronounce_play_batch('word','3076','male');return false;"></a>
</div>
<div class="midashi_wrapper">
<p class="midashi_word">難しい・難しいです</p>
</div>
</td>

<td class="katsuyo katsuyo_jisho_js">
<div class="katsuyo_proc">
<p>
<span class="katsuyo_accent"><span class="accented_word"><span class="mola_-5"><span class="inner"><span class="char">む</span></span></span><span class="accent_plain mola_-4"><span class="inner"><span class="char">ず</span></span></span><span class="accent_plain mola_-3"><span class="inner"><span class="char">か</span></span></span><span class="accent_plain mola_-2"><span class="inner"><span class="char">し</span></span></span><span class="accent_plain mola_-1"><span class="inner"><span class="char">い</span></span></span>

In [27]:
accent = katsuyo.find("span", class_="accented_word")
accent

<span class="accented_word"><span class="mola_-5"><span class="inner"><span class="char">む</span></span></span><span class="accent_plain mola_-4"><span class="inner"><span class="char">ず</span></span></span><span class="accent_plain mola_-3"><span class="inner"><span class="char">か</span></span></span><span class="accent_plain mola_-2"><span class="inner"><span class="char">し</span></span></span><span class="accent_plain mola_-1"><span class="inner"><span class="char">い</span></span></span></span>

In [28]:
for span in accent.find_all("span"):
    print(span)

<span class="mola_-5"><span class="inner"><span class="char">む</span></span></span>
<span class="inner"><span class="char">む</span></span>
<span class="char">む</span>
<span class="accent_plain mola_-4"><span class="inner"><span class="char">ず</span></span></span>
<span class="inner"><span class="char">ず</span></span>
<span class="char">ず</span>
<span class="accent_plain mola_-3"><span class="inner"><span class="char">か</span></span></span>
<span class="inner"><span class="char">か</span></span>
<span class="char">か</span>
<span class="accent_plain mola_-2"><span class="inner"><span class="char">し</span></span></span>
<span class="inner"><span class="char">し</span></span>
<span class="char">し</span>
<span class="accent_plain mola_-1"><span class="inner"><span class="char">い</span></span></span>
<span class="inner"><span class="char">い</span></span>
<span class="char">い</span>


In [29]:
morae_elems = list(accent.children)
morae_elems

[<span class="mola_-5"><span class="inner"><span class="char">む</span></span></span>,
 <span class="accent_plain mola_-4"><span class="inner"><span class="char">ず</span></span></span>,
 <span class="accent_plain mola_-3"><span class="inner"><span class="char">か</span></span></span>,
 <span class="accent_plain mola_-2"><span class="inner"><span class="char">し</span></span></span>,
 <span class="accent_plain mola_-1"><span class="inner"><span class="char">い</span></span></span>]

In [30]:
# Turn each mora element into a (kana, flag) pair. The flag is 1 when the
# element's first CSS class starts with "accent" (accent_plain / accent_top)
# and 0 otherwise.
# The flag has its own name so that `accent` keeps pointing at the
# accented_word tag and the cells that use that tag can still be re-run.
accent_data = []
for mora in morae_elems:
    is_accented = 1 if mora.get("class")[0].startswith("accent") else 0
    kana = mora.find("span", class_="char").string
    accent_data.append((kana, is_accented))
print(accent_data)

[('む', 0), ('ず', 1), ('か', 1), ('し', 1), ('い', 1)]


In [31]:
# A function that scrapes the sound files and pitch accent data from the dictionary form of a word
def scrape_pitch_accent_data(word, download_audio=True):
    """Scrape the pitch-accent pattern (and audio) of the first OJAD result for ``word``.

    Only the first "katsuyo" cell of the result page is read.

    Args:
        word: The word to look up on OJAD.
        download_audio: When true (the default), every recording linked from
            that cell is saved in the current working directory as
            ``<sound id>.mp3``. Recordings whose download does not return an
            OK status are skipped.

    Returns:
        A list of ``(kana, flag)`` tuples, one per mora in page order. ``flag``
        is 1 when the mora element carries a CSS class starting with "accent"
        (``accent_plain`` / ``accent_top``) and 0 otherwise.

    Raises:
        ValueError: If the page has no "katsuyo" cell or no accent markup,
            for example because the search returned no results.
        requests.RequestException: Network failures are not caught here.
    """
    soup = get_soup(word)
    katsuyo = soup.find("td", class_="katsuyo")
    if katsuyo is None:
        raise ValueError("OJAD returned no results for {!r}".format(word))

    if download_audio:
        _download_sound_files(katsuyo)

    accented_word = katsuyo.find("span", class_="accented_word")
    if accented_word is None:
        raise ValueError("No pitch accent data found for {!r}".format(word))
    return _read_accent_pattern(accented_word)


def _download_sound_files(cell):
    """Save every recording linked from ``cell`` as ``<sound id>.mp3`` in the working directory."""
    for button in cell.find_all("a"):
        # Anchors without a pronounce_play('...') handler are not audio buttons; skip them.
        match = re.match(r"pronounce_play\('(\w+)'\)", button.get("onclick") or "")
        if match is None:
            continue
        sound_file = match.group(1)
        mp3_response = requests.get(get_pronounce_url(sound_file), timeout=10)
        if mp3_response.ok:
            with open("{}.mp3".format(sound_file), "wb") as mp3_file:
                # The request is not streamed, so the body is already complete: write it in one go.
                mp3_file.write(mp3_response.content)


def _read_accent_pattern(accented_word):
    """Return a ``(kana, flag)`` tuple for each direct child <span> of ``accented_word``."""
    pattern = []
    # recursive=False yields only the per-mora spans (not the nested "inner"/"char"
    # spans) and ignores any text nodes that sit between them.
    for mora in accented_word.find_all("span", recursive=False):
        classes = mora.get("class") or []
        # Look at every class so the result does not depend on attribute order.
        flag = 1 if any(name.startswith("accent") for name in classes) else 0
        char_span = mora.find("span", class_="char")
        # get_text() returns a plain str that does not keep the parse tree alive.
        kana = char_span.get_text() if char_span is not None else mora.get_text()
        pattern.append((kana, flag))
    return pattern

In [32]:
scrape_pitch_accent_data("歩く")

[('あ', 0), ('る', 1), ('く', 0)]

In [33]:
scrape_pitch_accent_data("働く")

[('は', 0), ('た', 1), ('ら', 1), ('く', 1)]

In [34]:
scrape_pitch_accent_data("可愛い")

[('か', 0), ('わ', 1), ('い', 1), ('い', 0)]