In [None]:
# Written by Y. Won in 2021. https://linguistry.github.io/ 

In [1]:
# importing required packages
import pandas as pd
import numpy as np

In [2]:
# Load the word list from the Excel workbook and preview its first rows
word_df = pd.read_excel(io='01_word_list_input.xlsx')
word_df.head(n=5)

Unnamed: 0,word
0,Dream
1,have
2,FirST


In [3]:
# Lower-case the words.
# The vectorised .str accessor is used instead of apply(lambda x: x.lower()):
# it leaves missing / non-text cells as missing values rather than raising
# AttributeError on the first such cell.
word_df['word'] = word_df['word'].str.lower()
word_df

Unnamed: 0,word
0,dream
1,have
2,first


## Add pronunciations using an online dictionary

In [4]:
import requests
import time
from bs4 import BeautifulSoup
from nltk import word_tokenize

In [5]:
# Convert the column to pandas' string dtype so that strip() can be used on
# the words in the lookup step below.
word_df['word'] = word_df['word'].astype(pd.StringDtype())
word_df

Unnamed: 0,word
0,dream
1,have
2,first


In [6]:
# crawl pronunciations from an online dictionary 
# this part is modified from https://www.programmersought.com/article/65882923972/
def extract_pronunciation(word):
    """Look up the pronunciation of ``word`` on an online dictionary page.

    The page for the word is downloaded and parsed, and the text of the first
    element matching the pronunciation CSS selector is returned.

    Parameters
    ----------
    word : str
        Word to look up. Surrounding whitespace is stripped before the word
        is placed in the URL.

    Returns
    -------
    str or None
        Text of the first matching element, or ``None`` when the page
        contains no element matching the selector.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or the server does not respond within the
        timeout.
    """
    term = word.strip()
    # The base address is a placeholder to be filled in by the user; the
    # '?q=' suffix could vary depending on the dictionary.
    url = "add an online dictionary address" + term + '?q=' + term
    # Without a timeout, requests may wait indefinitely on a stalled
    # connection, which would hang the whole notebook on a single word.
    wbdata = requests.get(
        url,
        headers={'User-Agent': 'Chrome/88.0.4324.96'},
        timeout=10,
    ).text
    soup = BeautifulSoup(wbdata, 'html.parser')
    # This CSS selector could vary depending on the dictionary's page layout.
    matches = soup.select("div > span.phonetics > div.phons_n_am > span")
    if not matches:
        # No pronunciation element on the page (e.g. the word was not found).
        return None
    return matches[0].get_text()

In [7]:
# Look up every word and store the results in a new 'pronunciation' column.
# The results are collected in a list and assigned in one step, so they line
# up with the rows by position. A hand-maintained counter combined with
# word_df.at[...] would write to the wrong rows (or append new ones) whenever
# the index labels are not exactly 0..n-1.
pronunciations = []
for word in word_df['word']:
    pron = extract_pronunciation(word)
    print(pron)  # progress output: one pronunciation per looked-up word
    pronunciations.append(pron)
word_df['pronunciation'] = pronunciations

/driːm/
/həv/
/fɜːrst/


In [8]:
# Inspect the table now that the 'pronunciation' column has been filled in
word_df

Unnamed: 0,word,pronunciation
0,dream,/driːm/
1,have,/həv/
2,first,/fɜːrst/


In [9]:
# Save the words together with their pronunciations to an Excel workbook
word_df.to_excel(excel_writer="word_pronun_output.xlsx")

## Add example sentences

In [10]:
# Import the corpus into pandas: one sentence per line of the text file.
# The file is read line by line instead of with pd.read_csv(sep="\n"):
# recent pandas versions reject "\n" as a separator with a ValueError, and CSV
# parsing would also interpret quote characters and NA-like tokens that are
# ordinary text in a sentence.
with open('01_corpus_input.txt', encoding='utf-8') as corpus_file:
    # Blank lines carry no sentence and are skipped.
    corpus_lines = [line.rstrip('\n') for line in corpus_file if line.strip()]

sent_df = pd.DataFrame({'text': corpus_lines})
# Lower-case the sentences so they can be matched against the lower-cased words.
sent_df['text'] = sent_df['text'].str.lower()
sent_df

Unnamed: 0,text
0,she referred in one report to the heartbreak o...
1,he would have been correct to say the evidence...
2,"through the first six months of 2020, it was a..."
3,"both candidates, he said, violated that rule d..."
4,enforcement of debate rules can make for unant...
5,this is officially october.
6,and that was just last week.
7,the challenges facing the reelection team are ...
8,several attendees at that event have since tes...
9,his first debate performance ignited a firesto...


In [11]:
# Make a list of sentences: one inner list per word, holding every corpus
# sentence that contains the word as a substring (so 'dream' also matches
# 'dreams').
sent_list = []
for word in word_df['word']:
    # regex=False: treat the word as literal text, so characters such as
    # '.', '?' or '(' in a word are not interpreted as a regular expression.
    # na=False: a missing sentence is not a match and must not be returned as
    # an example.
    has_word = sent_df['text'].str.contains(word, regex=False, na=False)
    sent_list.append(sent_df.loc[has_word, 'text'].tolist())

In [12]:
# Show the collected sentences: one inner list per word, in word_df order
sent_list

[['she referred in one report to the heartbreak of an 11- or 12-year-old girl told to forget her dreams and prepare to marry a man twice her age and have children.'],
 ['she referred in one report to the heartbreak of an 11- or 12-year-old girl told to forget her dreams and prepare to marry a man twice her age and have children.',
  'he would have been correct to say the evidence is scanty.',
  'several attendees at that event have since tested positive.'],
 ['through the first six months of 2020, it was about $130 billion.',
  'both candidates, he said, violated that rule during the first debate.',
  'his first debate performance ignited a firestorm over white supremacy.']]

In [13]:
# Turn the nested list into a DataFrame: one row per word and one column per
# matching sentence; rows with fewer sentences are padded with missing values.
sentences = pd.DataFrame(data=sent_list)
sentences

Unnamed: 0,0,1,2
0,she referred in one report to the heartbreak o...,,
1,she referred in one report to the heartbreak o...,he would have been correct to say the evidence...,several attendees at that event have since tes...
2,"through the first six months of 2020, it was a...","both candidates, he said, violated that rule d...",his first debate performance ignited a firesto...


In [14]:
# Put the sentence columns to the right of the word / pronunciation columns.
# Rows are paired by index label.
Add_sent = pd.concat(objs=[word_df, sentences], axis='columns')
Add_sent

Unnamed: 0,word,pronunciation,0,1,2
0,dream,/driːm/,she referred in one report to the heartbreak o...,,
1,have,/həv/,she referred in one report to the heartbreak o...,he would have been correct to say the evidence...,several attendees at that event have since tes...
2,first,/fɜːrst/,"through the first six months of 2020, it was a...","both candidates, he said, violated that rule d...",his first debate performance ignited a firesto...


In [15]:
# Save the combined table (word, pronunciation, example sentences) to Excel
Add_sent.to_excel(excel_writer="word_pronun_sentences_output.xlsx")

In [16]:
# End of Code