# Scrape the Oxford 3000 and Oxford 5000 word lists from the Oxford Learner's Dictionaries website
https://www.oxfordlearnersdictionaries.com/wordlists/oxford3000-5000


In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Open the Oxford 3000/5000 word-list page in a Firefox window.
# The following cell reads the page as it is currently rendered, so the
# wanted list has to be selected in the browser before moving on.
driver = webdriver.Firefox()
driver.get("https://www.oxfordlearnersdictionaries.com/wordlists/oxford3000-5000")

In [3]:
# Make sure oxford 5000 is selected in the browser before running this cell:
# the page source is captured as currently rendered.
content = driver.page_source
# NOTE(review): no parser is named, so BeautifulSoup uses whichever parser is installed.
soup = BeautifulSoup(content)
# The word list is the first <ul> inside the element with id "wordlistsContentPanel".
wordlistsContentPanel = soup.find("div", {"id": "wordlistsContentPanel"})
wordlist = wordlistsContentPanel.find("ul")

## Each entry in `wordlist` looks like this
```
<li data-hw="a" data-ox3000="a1" data-ox5000="a1">
 <a href="/definition/english/a_1">
  a
 </a>
 <span class="pos">
  indefinite article
 </span>
 <div>
  <span class="belong-to">
   a1
  </span>
  <div class="sound audio_play_button icon-audio pron-uk" data-src-mp3="/media/english/uk_pron/a/a__/a__gb/a__gb_2.mp3" data-src-ogg="/media/english/uk_pron_ogg/a/a__/a__gb/a__gb_2.ogg">
  </div>
  <div class="sound audio_play_button icon-audio pron-us" data-src-mp3="/media/english/us_pron/a/a__/a__us/a__us_2_rr.mp3" data-src-ogg="/media/english/us_pron_ogg/a/a__/a__us/a__us_2_rr.ogg">
  </div>
 </div>
</li>
```

In [4]:
# Columns we wish to extract for every entry of the word list
columns = ["word", "definition_link", "type", "cefr", "UK", "US"]

# Extract fields from wordlist.
# Rows are collected in a plain list and the DataFrame is built once at the
# end; concatenating inside the loop re-copies the whole frame per entry.
records = []
words = wordlist.findChildren("li", recursive=False)
for word in words:
    if word.has_attr("class"):
        # Skip where class="hidden"
        continue
    a = word.find("a")
    word_name = a.text
    definition_link = a["href"]
    # Named word_type (not `type`) so the builtin is not shadowed for later cells.
    word_type = word.find("span").text
    div = word.find("div")
    cefr = div.find("span").text
    try:
        uk = div.find("div", {"class": "sound audio_play_button icon-audio pron-uk"})["data-src-mp3"]
        us = div.find("div", {"class": "sound audio_play_button icon-audio pron-us"})["data-src-mp3"]
    except TypeError:
        # find() returned None for one of the audio buttons: record no audio at all.
        uk = ""
        us = ""
    records.append({
        "word": word_name,
        "definition_link": definition_link,
        "type": word_type,
        "cefr": cefr,
        "UK": uk,
        "US": us,
    })

# One row per visible <li>; the index is 0..n-1.
df = pd.DataFrame.from_records(records, columns=columns)

In [35]:
# Save data to pkl file
#df.to_pickle("./data/df.pkl")
# Load the word list from the saved pickle; this replaces whatever `df`
# currently holds.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you created yourself.
df = pd.read_pickle("./data/df.pkl")

## Scrape audio with requests
Use `requests` to download the audio for each word and save it to the `./audio` directory.

Files are named `{word}_uk.mp3` and `{word}_us.mp3`.

Example:
* word: A 
* relative URL: `/media/english/us_pron/a/a__/a__us/a__us_2_rr.mp3`
* full URL: `https://www.oxfordlearnersdictionaries.com/media/english/us_pron/a/a__/a__us/a__us_2_rr.mp3`

```
Host: www.oxfordlearnersdictionaries.com
User-Agent: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Accept: audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5
Accept-Language: en-US,en;q=0.5
Referer: https://www.oxfordlearnersdictionaries.com/wordlists/oxford3000-5000
Range: bytes=0-
Connection: keep-alive
Cookie: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 
Sec-Fetch-Dest: audio
Sec-Fetch-Mode: no-cors
Sec-Fetch-Site: same-origin
If-Modified-Since: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
If-None-Match: ""
```

In [8]:
import requests
from pathlib import Path
from time import sleep
import string


In [None]:
# select which data to grab or continue if time-out or connectionrefused etc
# MUST_START_WITH = ['a']
# MUST_START_WITH = [x for x in string.ascii_lowercase if x != 'a']
MUST_START_WITH = [x for x in string.ascii_lowercase]
CONTINUE_AFTER_WORD, AFTER_WORD = False, "visual"
TIME_BETWEEN_REQUESTS = 0 # time to wait between requests (in seconds)
REQUEST_TIMEOUT = 30 # give up on a single request after this many seconds

AUDIO_PATH = Path("./audio")
BASE_URL = 'https://www.oxfordlearnersdictionaries.com'

callback = 'angular.callbacks._0'
headers = {
    'Referer': 'https://www.oxfordlearnersdictionaries.com/wordlists/oxford3000-5000',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/75.0.3770.100 Safari/537.36',
}
params = (
    ('callback', callback),
)

# Opening a file inside a missing directory fails, so create it up front.
AUDIO_PATH.mkdir(parents=True, exist_ok=True)

# Local copy of the resume flag so the configuration constant is not mutated.
skipping = CONTINUE_AFTER_WORD

with requests.Session() as session:
    for _, row in df.iterrows():
        # Resume support: skip every row up to and including AFTER_WORD
        if skipping:
            if row.word == AFTER_WORD:
                skipping = False
            continue

        # Select data to request
        if row.word[0].lower() not in MUST_START_WITH:
            continue

        # Skip if no audio urls
        if row.US == "" or row.UK == "":
            continue

        # Request and write the UK file, then the US file.
        # NOTE(review): the file name depends only on the word, so rows that
        # share a word write to the same two files.
        for accent, relative_url in (("uk", row.UK), ("us", row.US)):
            filename = Path(f"{row.word}_{accent}.mp3")
            url = BASE_URL + relative_url

            response = session.get(url, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
            # Fail on HTTP errors before looking at the payload.
            response.raise_for_status()
            content_type = response.headers.get("Content-Type", "")
            if content_type != "audio/mpeg":
                # Explicit check instead of `assert`, which is skipped under `python -O`.
                raise RuntimeError(
                    f"Unexpected Content-Type {content_type!r} for {row.word!r} ({url})"
                )
            with open(AUDIO_PATH / filename, "wb") as file:
                file.write(response.content)

            sleep(TIME_BETWEEN_REQUESTS)

## Dictionary definition
Request the definition page of each word, for example:
* word: A
* Relative URL: `/definition/english/a_1`
* Full URL: `https://www.oxfordlearnersdictionaries.com/definition/english/a_1`

In [6]:
from selenium.webdriver.firefox.options import Options

# Run Firefox without a visible window for the per-word definition pages.
options = Options()
# The `headless` property is deprecated and then removed in newer Selenium 4
# releases; passing the browser flag works regardless of the Selenium version.
options.add_argument("-headless")
driver = webdriver.Firefox(options=options)

# Empty frame that the definition-scraping cell appends to (it may be run
# several times to resume after a crash).
columns = ["word", "phon_br", "phon_n_am", "definition", "example"]
df_definition = pd.DataFrame(columns=columns)

In [15]:
# This cell runs slowly and tends to crash multiple times
# Use the CONTINUE_AFTER_WORD option
# select which data to grab or continue if time-out or connectionrefused etc
# MUST_START_WITH = ['a']
# MUST_START_WITH = [x for x in string.ascii_lowercase if x != 'a']
MUST_START_WITH = [x for x in string.ascii_lowercase]
# Resume after the (OFFSET + 1)-th row whose word equals AFTER_WORD.
CONTINUE_AFTER_WORD, AFTER_WORD, OFFSET = True, "register", 1
BASE_URL = 'https://www.oxfordlearnersdictionaries.com'

# Local copy of the resume flag so the configuration constant is not mutated.
skipping = CONTINUE_AFTER_WORD
offset = 0
# Rows scraped in this run; appended to df_definition in one go (see `finally`).
records = []
try:
    for idx, row in df.iterrows():
        # Continue after word
        if skipping:
            if row.word == AFTER_WORD:
                if offset == OFFSET:
                    skipping = False
                else:
                    offset += 1
            continue

        # Select data to request
        if row.word[0].lower() not in MUST_START_WITH:
            continue

        # Request definition
        url = BASE_URL + row.definition_link
        driver.get(url)

        # Extract fields
        content = driver.page_source
        soup = BeautifulSoup(content)

        # Computed before first use: the diagnostics below print it.
        # NOTE(review): links with a "#fragment" keep the fragment in page_id;
        # such ids show up in the "definition ..." lines this cell prints —
        # confirm whether the fragment should be stripped before the lookup.
        page_id = row.definition_link.split('/')[-1]

        try:
            phonetics = soup.find("span", {"class": "phonetics"})
            phon_br = phonetics.find("div", {"class":"phons_br"}).find("span", {"class": "phon"}).text
            phon_n_am = phonetics.find("div", {"class":"phons_n_am"}).find("span", {"class": "phon"}).text
        except AttributeError:
            # A find() returned None somewhere along the chain.
            print("phonetics", row.word, page_id)
            phon_br = ""
            phon_n_am = ""

        main_container = soup.find("div", {"id": "main-container"})
        # Reset per row: without a main container there is no entry, and the
        # entry of the previous word must not be reused.
        entry = main_container.find("div", {"id": page_id}) if main_container is not None else None
        try:
            definition = entry.find("ol").find("li").find("span", {"class": "def"}).text
            examples = entry.find("ol").find("ul", {"class": "examples"}).find("li")
            first_example = ', '.join([x.text for x in examples.findChildren("span", recursive=False)])
            example = first_example.replace("<span>", "").replace("</span>", "")
        except AttributeError:
            # Any missing element blanks both fields, including a definition
            # that was found when only the example is missing.
            print("definition", row.word, page_id)
            definition = ""
            example = ""

        # Add data
        records.append({
            "word": row.word,
            "phon_br": phon_br,
            "phon_n_am": phon_n_am,
            "definition": definition,
            "example": example,
        })
finally:
    # Runs on a crash or interrupt as well, so rows scraped so far are kept
    # and the cell can be resumed with CONTINUE_AFTER_WORD / AFTER_WORD.
    if records:
        df_definition = pd.concat([df_definition, pd.DataFrame.from_records(records)])


definition rely rely
definition row row1_1#row1_sng_1
definition sake sake1
definition seventeen seventeen
definition seventy seventy
definition sixteen sixteen
definition sixty sixty
definition sum sum_2
definition text text_2#text_sng_9
definition thirteen thirteen
definition thirty thirty
definition this this_1#this_sng_1
definition trillion trillion
definition twenty twenty
definition upon upon
definition wind wind1_1#wind1_sng_1
definition yeah yeah
phonetics yield yield_1


In [36]:
# Show the last rows of the scraped definitions.
df_definition.tail()

Unnamed: 0,word,phon_br,phon_n_am,definition,example
5938,yours,/jɔːz/,/jərz/,of or belonging to you,Is that book yours?
5939,yourself,/jɔːˈself/,/jɔːrˈself/,used when the person or people being spoken to...,Have you hurt yourself?
5940,youth,/juːθ/,/juːθ/,"the time of life when a person is young, espec...","in somebody's youth, He had been a talented mu..."
5941,zero,/ˈzɪərəʊ/,/ˈzɪrəʊ/,0,"Five, four, three, two, one, zero… We have lif..."
5942,zone,/zəʊn/,/zəʊn/,an area or a region with a particular feature ...,a war/combat/demilitarized/exclusion zone


In [37]:
# Save data to pkl file
#df_definition.to_pickle("./data/df_definition.pkl")
# Load the definitions from the saved pickle; this replaces whatever
# `df_definition` currently holds.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you created yourself.
df_definition = pd.read_pickle("./data/df_definition.pkl")

## Fix and merge df and df_definition

In [38]:
# Replace the index with a fresh 0..n-1 range and discard the old one.
df = df.reset_index(drop=True)

In [39]:
# Show the first rows of the word list.
df.head()

Unnamed: 0,word,definition_link,type,cefr,UK,US
0,a,/definition/english/a_1,indefinite article,a1,/media/english/uk_pron/a/a__/a__gb/a__gb_2.mp3,/media/english/us_pron/a/a__/a__us/a__us_2_rr.mp3
1,abandon,/definition/english/abandon_1,verb,b2,/media/english/uk_pron/a/aba/aband/abandon__gb...,/media/english/us_pron/a/aba/aband/abandon__us...
2,ability,/definition/english/ability_1,noun,a2,/media/english/uk_pron/a/abi/abili/ability__gb...,/media/english/us_pron/a/abi/abili/ability__us...
3,able,/definition/english/able_1,adjective,a2,/media/english/uk_pron/a/abl/able_/able__gb_1.mp3,/media/english/us_pron/a/abl/able_/able__us_2.mp3
4,abolish,/definition/english/abolish,verb,c1,/media/english/uk_pron/a/abo/aboli/abolish__gb...,/media/english/us_pron/a/abo/aboli/abolish__us...


In [40]:
# Print (rows, columns) of the word list and of the definitions.
print(df.shape)
print(df_definition.shape)

(5943, 6)
(5943, 5)


In [41]:
# Sort
# df_definition.sort_values(by=['word'], key=lambda col: col.str.lower()).head()

# Replace the index with a fresh 0..n-1 range and discard the old one.
df_definition = df_definition.reset_index(drop=True)

In [42]:
# Both frames must have the same number of rows.
print(df.shape)
print(df_definition.shape)
assert len(df) == len(df_definition)

(5943, 6)
(5943, 5)


In [43]:
# pd.concat(axis=1) pairs rows by index label only, so check that both frames
# list the same words in the same order before placing them side by side.
assert df["word"].tolist() == df_definition["word"].tolist(), \
    "df and df_definition are not row-aligned"
# The result carries two "word" columns, one from each frame.
df_concat = pd.concat([df, df_definition], axis=1)

In [44]:
# (rows, columns) of the combined frame.
df_concat.shape

(5943, 11)

## Add missing dictionary entries
Fill in the definitions that are still empty, either by entering them manually or by looking them up in another dictionary.

In [45]:
# Rows whose definition is an empty string.
df_concat[df_concat["definition"] == ""]

Unnamed: 0,word,definition_link,type,cefr,UK,US,word.1,phon_br,phon_n_am,definition,example
219,amount,/definition/english/amount_2,verb,b2,/media/english/uk_pron/a/amo/amoun/amount__gb_...,/media/english/us_pron/a/amo/amoun/amount__us_...,amount,/əˈmaʊnt/,/əˈmaʊnt/,,
1093,consist,/definition/english/consist,verb,b1,/media/english/uk_pron/x/xco/xcons/xconsist__g...,/media/english/us_pron/x/xco/xcons/xconsist__u...,consist,/kənˈsɪst/,/kənˈsɪst/,,
1104,constitutional,/definition/english/constitutional_2,adjective,c1,/media/english/uk_pron/x/xco/xcons/xconstituti...,/media/english/us_pron/x/xco/xcons/xconstituti...,constitutional,/ˌkɒnstɪˈtjuːʃənl/,/ˌkɑːnstɪˈtuːʃənl/,,
1137,contrary,/definition/english/contrary1_1#contrary1_sng_1,adjective,c1,/media/english/uk_pron/x/xco/xcont/xcontrary__...,/media/english/us_pron/x/xco/xcont/xcontrary__...,contrary,/ˈkɒntrəri/,/ˈkɑːntreri/,,
1428,deprive,/definition/english/deprive,verb,c1,/media/english/uk_pron/d/dep/depri/deprive__gb...,/media/english/us_pron/d/dep/depri/deprive__us...,deprive,/dɪˈpraɪv/,/dɪˈpraɪv/,,
1431,derive,/definition/english/derive,verb,b2,/media/english/uk_pron/d/der/deriv/derive__gb_...,/media/english/us_pron/d/der/deriv/derive__us_...,derive,/dɪˈraɪv/,/dɪˈraɪv/,,
1473,devote,/definition/english/devote,verb,b2,/media/english/uk_pron/d/dev/devot/devote__gb_...,/media/english/us_pron/d/dev/devot/devote__us_...,devote,/dɪˈvəʊt/,/dɪˈvəʊt/,,
1717,eight,/definition/english/eight#eight_sng_1,number,a1,/media/english/uk_pron/e/eig/eight/eight__gb_2...,/media/english/us_pron/e/eig/eight/eight__us_1...,eight,/eɪt/,/eɪt/,,
1718,eighteen,/definition/english/eighteen,number,a1,/media/english/uk_pron/e/eig/eight/eighteen__g...,/media/english/us_pron/e/eig/eight/eighteen__u...,eighteen,/ˌeɪˈtiːn/,/ˌeɪˈtiːn/,,
1719,eighty,/definition/english/eighty,number,a1,/media/english/uk_pron/e/eig/eight/eighty__gb_...,/media/english/us_pron/e/eig/eight/eighty__us_...,eighty,/ˈeɪti/,/ˈeɪti/,,


In [46]:
import json

# Import tusharlock10 dictionary: one JSON file per initial letter
# (DA.json ... DZ.json), merged into a single dict.
tusharlock10_dictionary = {}
for symbol in string.ascii_uppercase:
    filename = Path('./tusharlock10-Dictionary') / Path('D' + symbol + '.json')
    # Explicit encoding: without it the platform default is used, which is
    # not UTF-8 everywhere.
    with open(filename, mode='r', encoding='utf-8') as f:
        dictionary = json.load(f)
    # In-place merge (later files win on duplicate keys) instead of rebuilding
    # the whole dict for every letter.
    tusharlock10_dictionary.update(dictionary)

In [47]:
# Look up a fallback meaning for a word in the tusharlock10 dictionary
def tusharlock10_meaning(w,type):
    """Return the first listed meaning of `w` if its word type equals `type`.

    The word is looked up upper-cased in `tusharlock10_dictionary`. Only the
    first value of the entry's 'MEANINGS' mapping is considered: its element
    0 is compared case-insensitively with `type`, and its element 1 is
    returned on a match.

    Returns "" when the types differ, or when a KeyError, IndexError or
    AttributeError occurs during the lookup.
    """
    try:
        entry = tusharlock10_dictionary[w.upper()]
        first_sense = list(entry['MEANINGS'].values())[0]
        if first_sense[0].upper() != type.upper():
            return ""
        return first_sense[1]
    except (KeyError, IndexError, AttributeError):
        return ""

# Fill definitions that are still empty from the fallback dictionary.
for idx, row in df_concat.iterrows():
    if row["definition"] != "":
        continue
    # When df_concat has two "word" columns, row["word"] is a Series rather
    # than a string; both copies name the same word, so take the first.
    word = row["word"]
    if isinstance(word, pd.Series):
        word = word.iloc[0]
    # .loc[idx, "definition"] writes one cell; .loc["definition"] on its own
    # addresses a row labelled "definition" instead.
    df_concat.loc[idx, "definition"] = tusharlock10_meaning(word, row["type"])


In [48]:
# Save data to pkl file
df_concat.to_pickle("./data/df_concat.pkl")
# To continue from the saved file instead, uncomment:
# df_concat = pd.read_pickle("./data/df_concat.pkl")

## Prepare df_concat for JSON output

In [49]:
# Show the first rows of the combined frame.
df_concat.head()

Unnamed: 0,word,definition_link,type,cefr,UK,US,word.1,phon_br,phon_n_am,definition,example
0,a,/definition/english/a_1,indefinite article,a1,/media/english/uk_pron/a/a__/a__gb/a__gb_2.mp3,/media/english/us_pron/a/a__/a__us/a__us_2_rr.mp3,a,/ə/,/ə/,used before countable or singular nouns referr...,a man/horse/unit
1,abandon,/definition/english/abandon_1,verb,b2,/media/english/uk_pron/a/aba/aband/abandon__gb...,/media/english/us_pron/a/aba/aband/abandon__us...,abandon,/əˈbændən/,/əˈbændən/,"to leave somebody, especially somebody you are...","abandon somebody, The baby had been abandoned ..."
2,ability,/definition/english/ability_1,noun,a2,/media/english/uk_pron/a/abi/abili/ability__gb...,/media/english/us_pron/a/abi/abili/ability__us...,ability,/əˈbɪləti/,/əˈbɪləti/,the fact that somebody/something is able to do...,People with the disease may lose their ability...
3,able,/definition/english/able_1,adjective,a2,/media/english/uk_pron/a/abl/able_/able__gb_1.mp3,/media/english/us_pron/a/abl/able_/able__us_2.mp3,able,/ˈeɪbl/,/ˈeɪbl/,"to have the skill, intelligence, opportunity, ...",You must be able to speak French for this job.
4,abolish,/definition/english/abolish,verb,c1,/media/english/uk_pron/a/abo/aboli/abolish__gb...,/media/english/us_pron/a/abo/aboli/abolish__us...,abolish,/əˈbɒlɪʃ/,/əˈbɑːlɪʃ/,"to officially end a law, a system or an instit...",This tax should be abolished.


In [50]:
# Drop the extra "word" column (the second copy, which came from
# df_definition) by name rather than by a hard-coded position: every column
# whose label already appeared further left is removed.
df_concat = df_concat.loc[:, ~df_concat.columns.duplicated()]
# definition_link and the raw audio URLs are not exported; audio is
# referenced by local file name instead.
df_concat = df_concat.drop(['definition_link', 'UK', 'US'], axis=1)

In [51]:
# Show the first rows after dropping the unwanted columns.
df_concat.head()

Unnamed: 0,word,type,cefr,phon_br,phon_n_am,definition,example
0,a,indefinite article,a1,/ə/,/ə/,used before countable or singular nouns referr...,a man/horse/unit
1,abandon,verb,b2,/əˈbændən/,/əˈbændən/,"to leave somebody, especially somebody you are...","abandon somebody, The baby had been abandoned ..."
2,ability,noun,a2,/əˈbɪləti/,/əˈbɪləti/,the fact that somebody/something is able to do...,People with the disease may lose their ability...
3,able,adjective,a2,/ˈeɪbl/,/ˈeɪbl/,"to have the skill, intelligence, opportunity, ...",You must be able to speak French for this job.
4,abolish,verb,c1,/əˈbɒlɪʃ/,/əˈbɑːlɪʃ/,"to officially end a law, a system or an instit...",This tax should be abolished.


In [52]:
# Audio file names derived from the word: "<word>_uk.mp3" and "<word>_us.mp3".
word_text = df_concat["word"].astype(str)
df_concat["uk"] = word_text + "_uk.mp3"
df_concat["us"] = word_text + "_us.mp3"

In [53]:
# Show the first rows including the new audio file-name columns.
df_concat.head()

Unnamed: 0,word,type,cefr,phon_br,phon_n_am,definition,example,uk,us
0,a,indefinite article,a1,/ə/,/ə/,used before countable or singular nouns referr...,a man/horse/unit,a_uk.mp3,a_us.mp3
1,abandon,verb,b2,/əˈbændən/,/əˈbændən/,"to leave somebody, especially somebody you are...","abandon somebody, The baby had been abandoned ...",abandon_uk.mp3,abandon_us.mp3
2,ability,noun,a2,/əˈbɪləti/,/əˈbɪləti/,the fact that somebody/something is able to do...,People with the disease may lose their ability...,ability_uk.mp3,ability_us.mp3
3,able,adjective,a2,/ˈeɪbl/,/ˈeɪbl/,"to have the skill, intelligence, opportunity, ...",You must be able to speak French for this job.,able_uk.mp3,able_us.mp3
4,abolish,verb,c1,/əˈbɒlɪʃ/,/əˈbɑːlɪʃ/,"to officially end a law, a system or an instit...",This tax should be abolished.,abolish_uk.mp3,abolish_us.mp3


## Oxford 5000

In [54]:
# df_5000 is a second name for the same object as df_concat (no copy is made).
df_5000 = df_concat
# Write the full list as pickle, JSON and CSV.
df_5000.to_pickle("./data/oxford_5000.pkl")
# df_5000 = pd.read_pickle("./data/oxford_5000.pkl")
df_5000.to_json('./data/oxford_5000.json', orient='index', index=True, indent=2)
df_5000.to_csv('./data/oxford_5000.csv', index=True)

## Oxford 3000

In [55]:
# Open the word-list page in a new (visible) Firefox window.
# NOTE(review): `driver` is rebound here without quitting the previously
# created driver — confirm whether the old browser should be closed first.
driver = webdriver.Firefox()
driver.get("https://www.oxfordlearnersdictionaries.com/wordlists/oxford3000-5000")

In [57]:
# Make sure oxford 3000 is selected in the browser before running this cell:
# the page source is captured as currently rendered.
content = driver.page_source
# NOTE(review): no parser is named, so BeautifulSoup uses whichever parser is installed.
soup = BeautifulSoup(content)
# The word list is the first <ul> inside the element with id "wordlistsContentPanel".
wordlistsContentPanel = soup.find("div", {"id": "wordlistsContentPanel"})
wordlist = wordlistsContentPanel.find("ul")

In [58]:
# Columns we wish to extract; they are the keys used to match rows of df_5000
columns = ["word", "type", "cefr"]

# Extract fields from wordlist.
# Rows are collected in a plain list and the DataFrame is built once at the
# end; concatenating inside the loop re-copies the whole frame per entry.
# The link and audio URLs are not read here because they are not stored.
records = []
words = wordlist.findChildren("li", recursive=False)
for word in words:
    if word.has_attr("class"):
        # Skip where class="hidden"
        continue
    word_name = word.find("a").text
    # Named word_type (not `type`) so the builtin is not shadowed for later cells.
    word_type = word.find("span").text
    cefr = word.find("div").find("span").text
    records.append({
        "word": word_name,
        "type": word_type,
        "cefr": cefr,
    })

df = pd.DataFrame.from_records(records, columns=columns)

In [59]:
## Fill in missing oxford 3000 list from oxford 5000
# Not the last expression of the cell, so this preview is not displayed.
df.head()
# Inner join (how='inner'): keeps the df_5000 rows whose (word, type, cefr)
# also occur in the scraped list `df`; all other columns come from df_5000.
df_3000=pd.merge(df_5000, df, how='inner', on=['word','type','cefr'])

In [60]:
# Write the Oxford 3000 list as pickle, JSON and CSV.
df_3000.to_pickle("./data/oxford_3000.pkl")
# df_3000 = pd.read_pickle("./data/oxford_3000.pkl")
df_3000.to_json('./data/oxford_3000.json', orient='index', index=True, indent=2)
df_3000.to_csv('./data/oxford_3000.csv', index=True)

## Exclusive 5000

In [61]:
# Open the word-list page in a new (visible) Firefox window.
# NOTE(review): `driver` is rebound here without quitting the previously
# created driver — confirm whether the old browser should be closed first.
driver = webdriver.Firefox()
driver.get("https://www.oxfordlearnersdictionaries.com/wordlists/oxford3000-5000")

In [62]:
# Make sure oxford 5000 exclusive is selected in the browser before running
# this cell: the page source is captured as currently rendered.
content = driver.page_source
# NOTE(review): no parser is named, so BeautifulSoup uses whichever parser is installed.
soup = BeautifulSoup(content)
# The word list is the first <ul> inside the element with id "wordlistsContentPanel".
wordlistsContentPanel = soup.find("div", {"id": "wordlistsContentPanel"})
wordlist = wordlistsContentPanel.find("ul")

In [63]:
# Columns we wish to extract; they are the keys used to match rows of df_5000
columns = ["word", "type", "cefr"]

# Extract fields from wordlist.
# Rows are collected in a plain list and the DataFrame is built once at the
# end; concatenating inside the loop re-copies the whole frame per entry.
# The link and audio URLs are not read here because they are not stored.
records = []
words = wordlist.findChildren("li", recursive=False)
for word in words:
    if word.has_attr("class"):
        # Skip where class="hidden"
        continue
    word_name = word.find("a").text
    # Named word_type (not `type`) so the builtin is not shadowed for later cells.
    word_type = word.find("span").text
    div = word.find("div")
    try:
        cefr = div.find("span").text
    except AttributeError:
        # find() returned None (no level <span>, or no <div>): print the entry
        # for inspection and keep it with an empty level. Only this expected
        # failure is caught; anything else should surface.
        print(word)
        cefr = ""
    records.append({
        "word": word_name,
        "type": word_type,
        "cefr": cefr,
    })

df = pd.DataFrame.from_records(records, columns=columns)

<li data-hw="accounting"> <a href="/definition/english/accounting">accounting</a> <span class="pos">noun</span><div> <div class="sound audio_play_button icon-audio pron-uk" data-src-mp3="/media/english/uk_pron/a/acc/accou/accounting__gb_1.mp3" data-src-ogg="/media/english/uk_pron_ogg/a/acc/accou/accounting__gb_1.ogg"> </div> <div class="sound audio_play_button icon-audio pron-us" data-src-mp3="/media/english/us_pron/a/acc/accou/accounting__us_1.mp3" data-src-ogg="/media/english/us_pron_ogg/a/acc/accou/accounting__us_1.ogg"> </div></div></li>
<li data-hw="angrily"> <a href="/definition/english/angrily">angrily</a> <span class="pos">adverb</span><div> <div class="sound audio_play_button icon-audio pron-uk" data-src-mp3="/media/english/uk_pron/a/ang/angri/angrily__gb_1.mp3" data-src-ogg="/media/english/uk_pron_ogg/a/ang/angri/angrily__gb_1.ogg"> </div> <div class="sound audio_play_button icon-audio pron-us" data-src-mp3="/media/english/us_pron/a/ang/angri/angrily__us_1.mp3" data-src-ogg="

In [64]:
## Fill in missing exclusive oxford 5000 list from oxford 5000
# Not the last expression of the cell, so this preview is not displayed.
df.head()
# Inner join (how='inner'): keeps the df_5000 rows whose (word, type, cefr)
# also occur in the scraped list `df`; all other columns come from df_5000.
# NOTE(review): scraped rows with an empty cefr only match df_5000 rows whose
# cefr is empty too — verify that no entries are lost by the join.
df_5000_exclusive=pd.merge(df_5000, df, how='inner', on=['word','type','cefr'])


In [65]:
# Show the first rows of the exclusive list.
df_5000_exclusive.head()

Unnamed: 0,word,type,cefr,phon_br,phon_n_am,definition,example,uk,us
0,abolish,verb,c1,/əˈbɒlɪʃ/,/əˈbɑːlɪʃ/,"to officially end a law, a system or an instit...",This tax should be abolished.,abolish_uk.mp3,abolish_us.mp3
1,abortion,noun,c1,/əˈbɔːʃn/,/əˈbɔːrʃn/,the deliberate ending of a pregnancy at an ear...,to support/oppose abortion,abortion_uk.mp3,abortion_us.mp3
2,absence,noun,c1,/ˈæbsəns/,/ˈæbsəns/,the fact of somebody being away from a place w...,"in somebody's absence, The decision was made i...",absence_uk.mp3,absence_us.mp3
3,absent,adjective,c1,/ˈæbsənt/,/ˈæbsənt/,"not in a place because of illness, etc.",He was absent from work for two weeks.,absent_uk.mp3,absent_us.mp3
4,absorb,verb,b2,/əbˈzɔːb/,/əbˈzɔːrb/,"to take in a liquid, gas or other substance fr...","absorb something, Plants absorb carbon dioxide...",absorb_uk.mp3,absorb_us.mp3


In [66]:
# (rows, columns) of the exclusive list.
df_5000_exclusive.shape

(2138, 9)

In [67]:
# Write the exclusive Oxford 5000 list as pickle, JSON and CSV.
df_5000_exclusive.to_pickle("./data/oxford_5000_exclusive.pkl")
# df_5000_exclusive = pd.read_pickle("./data/oxford_5000_exclusive.pkl")
df_5000_exclusive.to_json('./data/oxford_5000_exclusive.json', orient='index', index=True, indent=2)
df_5000_exclusive.to_csv('./data/oxford_5000_exclusive.csv', index=True)