# Find most common words

### Create environment

In [2]:
# import necessary libraries
import operator
import pandas as pd
import requests
import string
import time

from bs4 import BeautifulSoup
from collections import Counter
from encoding import decode

In [3]:
# constants
odyssey_url = "https://www.sacred-texts.com/cla/homer/greek/ody09.htm"
iliad_url = "https://www.sacred-texts.com/cla/homer/greek/ili06.htm"

base_start = 'http://www.perseus.tufts.edu/hopper/morph?l='
base_end = '&la=greek'

headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}

In [4]:
# get text
ody9 = BeautifulSoup(requests.get(odyssey_url, headers=headers).text, "html.parser")
ody9_book = ody9.find_all('p')
ody9_lines = []
for line in ody9_book:
    ody9_lines.append(line.text)

ili6 = BeautifulSoup(requests.get(iliad_url, headers=headers).text, "html.parser")
ili6_book = ili6.find_all('p')
ili6_lines = []
for line in ili6_book:
    ili6_lines.append(line.text)

### Clean text

In [5]:
# remove irrelevant lines
ody9_lines = [x for x in ody9_lines if x!='\xa0'][:-1]
ili6_lines = [x for x in ili6_lines if x!='\xa0'][:-1]

# remove line numbers
for i in range(len(ody9_lines)):
    if ((i+1)%5)==0:
        ody9_lines[i] = ody9_lines[i].replace(f' {i+1}', '')
for i in range(len(ili6_lines)):
    if ((i+1)%5)==0:
        ili6_lines[i] = ili6_lines[i].replace(f' {i+1}', '')

# choose relevent iliad lines
ili6_lines = ili6_lines[118:236]

## Find most common words

In [6]:
# save odyssey vocabulary
ody9_text = ' '.join(ody9_lines)
ody9_text = ody9_text.translate(str.maketrans('', '', string.punctuation))
ody9_dict = Counter(ody9_text.split())
ody9_lemmatised = {}

for word in ody9_dict.keys():
    decoded = decode(word)

    definitions = BeautifulSoup(requests.get(base_start+decoded+base_end, headers=headers).text, "html.parser")
    gr = definitions.find_all('h4')
    en = definitions.find_all('span')
    
    gr_list = []
    for lemma in gr:
        gr_list.append(lemma)
    en_list = []
    for defs in en:
        en_list.append(defs)


    for i in range(len(gr_list)):
        if gr_list[i] in ody9_lemmatised.keys():
            ody9_lemmatised[gr_list[i].text][1] += ody9_dict[word]
        else:
            ody9_lemmatised[gr_list[i].text] = [str(en_list[i].text), ody9_dict[word]]
    
    time.sleep(3)

words = [word for word in ody9_lemmatised.keys()]
defs = []
freq = []
for word in ody9_lemmatised.keys():
    defs.append(ody9_lemmatised[word][0])
    freq.append(ody9_lemmatised[word][1])

df = pd.DataFrame(list(zip(words, defs, freq)), columns=['word', 'definition', 'frequency'])
df.to_csv('odyssey9_vocab.csv', header=True)

In [44]:
ody9_lemmatised

{'ὁ': ['\n\t\tthe following\n    \t', 3],
 'τόνος': ['\n\t\tthat by which a thing is stretched,\n    \t', 12],
 'τονή': ['\n\t\tprolongation\n    \t', 12],
 'δέ': ['\n\t\tbut\n    \t', 24],
 'ἀπαμείβομαι': ['\n\t\treply, answer,\n    \t', 1],
 'πρόσφημι': ['\n\t\tspeak to, address\n    \t', 3],
 'πολύμητις': ['\n\t\tof many counsels,\n    \t', 1],
 'Ὀδυσσεύς': ['\n\t\tOdysseus,\n    \t', 1],
 'Ἀλκίνοος': ['\n[definition unavailable]\n', 1],
 'Κρείων': ['\n\t\tCreon.\n    \t', 1],
 'κρείων': ['\n\t\truler, lord, master\n    \t', 1],
 'κρεῖον': ['\n\t\tmeat-tray, dresser\n    \t', 1],
 'πᾶς': ['\n[definition unavailable]\n', 3],
 'πᾶϲ': ['\n[definition unavailable]\n', 3],
 'ἀριδείκετος': ['\n\t\tfamous, glorious\n    \t', 1],
 'λᾶας': ['\n\t\tstone\n    \t', 1],
 'λαός': ['\n\t\tmen\n    \t', 1],
 'ἄημι': ['\n\t\tvā´ti\n    \t', 3],
 'ἀάω': ['\n\t\thurt, damage,\n    \t', 3],
 'ἐάω': ['\n\t\tsuffer, permit\n    \t', 2],
 'ἐή': ['\n[definition unavailable]\n', 3],
 'εἰμί': ['\n\t\tsum\n 

In [None]:
# save odyssey vocabulary
ili6_text = ' '.join(ili6_lines)
ili6_text = ili6_text.translate(str.maketrans('', '', string.punctuation))
ili6_dict = Counter(ili6_text.split())
ili6_lemmatised = {}

for word in ili6_dict.keys():
    decoded = decode(word)

    definitions = BeautifulSoup(requests.get(base_start+decoded+base_end, headers=headers).text, "html.parser")
    gr = definitions.find_all('h4')
    en = definitions.find_all('span')
    
    gr_list = []
    for lemma in gr:
        gr_list.append(lemma)
    en_list = []
    for defs in en:
        en_list.append(defs)


    for i in range(len(gr_list)):
        if gr_list[i] in ili6_lemmatised.keys():
            ili6_lemmatised[gr_list[i].text][1] += ili6_dict[word]
        else:
            ili6_lemmatised[gr_list[i].text] = [str(en_list[i].text), ili6_dict[word]]
    
    time.sleep(3)

words = [word for word in ili6_lemmatised.keys()]
defs = []
freq = []
for word in ili6_lemmatised.keys():
    defs.append(ili6_lemmatised[word][0])
    freq.append(ili6_lemmatised[word][1])

df = pd.DataFrame(list(zip(words, defs, freq)), columns=['word', 'definition', 'frequency'])
df.to_csv('iliad6_vocab.csv', header=True)