In [244]:
import random
import re
from urllib.parse import urlparse, parse_qs, parse_qsl, quote

import pandas as pd
import requests

from tqdm import tqdm

from bs4 import BeautifulSoup
import bs4

URL_ROOT = 'https://en.wiktionary.org'

soups = {}
def get_soup(path):
    if path in soups:
        return soups[path]
    
    res = requests.get(f"{URL_ROOT}{path}")
    soup = BeautifulSoup(res.text)
    soups[path] = soup
    return soup

# e.g. get_category_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_category_pages(start_path):
    paths = [start_path]    
    next_path = start_path
    soup = get_soup(next_path)
    next_link = soup.find('a', string='next page')
    while next_link:
        next_path = next_link['href']
        paths.append(next_path)
        soup = get_soup(next_path)
        next_link = soup.find('a', string='next page')
    return paths

# e.g. get_lemma_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_lemma_pages(path):
    soup = get_soup(path)
    return [{'href': a['href'], 'title': a['title']} for a in soup.css.select('div#mw-pages li a')]

In [245]:
pd.set_option('display.max_colwidth', 250)

In [246]:
# Category:Armenian_terms_with_usage_examples
# Category:Old_Armenian_terms_with_usage_examples

lps = ([lp for cp in get_category_pages('/wiki/Category:Armenian_terms_with_usage_examples')
    for lp in get_lemma_pages(cp)])
lemmas = pd.DataFrame(lps)
lemmas

Unnamed: 0,href,title
0,/wiki/%D5%A1,ա
1,/wiki/%D5%A1%CC%88,ա̈
2,/wiki/%D5%A1%CC%8A,ա̊
3,/wiki/%D5%A1%D5%A2%D5%A5%D5%A9%D5%A1%D5%BD%D5%B8%D6%82%D5%B6%D5%AF,աբեթասունկ
4,/wiki/%D5%A1%D5%A3%D5%BC%D5%A1%D5%BE,ագռավ
...,...,...
1788,/wiki/%D5%96%D6%80%D5%B8%D6%82%D5%B6%D5%A6%D5%A5,Ֆրունզե
1789,/wiki/%D5%9B,՛
1790,/wiki/%D5%9E,՞
1791,/wiki/%D6%89,։


In [247]:
def getLanguageSections(soup):
    headings = soup.select('div#bodyContent div.mw-heading2')
    
    heading_map = {}

    for heading in headings:
        language = heading.find('h2').text
        content = []
        for sib in heading.next_siblings:
            if sib == '\n':
                continue
            if type(sib) == bs4.element.Comment:
                continue
            if sib in headings:
                break
            content.append(sib)
        heading_map[language] = content
    return heading_map

In [248]:
def getHeadingLevel(cs):
    levels = [c for c in cs if c != 'mw-heading']
    if len(levels) > 0:
        return levels[0]

def getSubSections(section):
    results = []
    result = {}
    current_header = None
    current_subsection = bs4.Tag(name='div')
    top_heading_level = None
    for tag in section:
        classes = tag.attrs.get('class', [])
        if tag.name == 'div' and 'mw-heading' in classes:
            heading_level = getHeadingLevel(classes)
            if not top_heading_level:
                top_heading_level = heading_level
            elif heading_level == top_heading_level and result:
                # the first time you encounter the top-level heading again...
                if len(result) > 1: # this means there were other headings in between
                    results.append(result)
                    result = {}
                else: # consecutive top-level headings means flat hierarchy
                    top_heading_level = '_' # non-existent heading
            if current_header:
                result[current_header] = current_subsection
            current_header = tag.text.replace('[edit]', '')
            current_subsection = bs4.Tag(name='div')
        else:
            # if not heading, add to current heading's subsection 
            current_subsection.append(tag)
    if current_header:
        result[current_header] = current_subsection
    results.append(result)
    return results

In [371]:
entries = {}

for title, href in tqdm([row[['title','href']].to_list() for _, row in lemmas.sample(500).iterrows()]):
#for title, href in tqdm([row[['title','href']].to_list() for _, row in lemmas[lemmas.title.isin(['աման', 'փիս', '-անք', 'համ'])].iterrows()]):
    sections = getLanguageSections(get_soup(href))
    if 'Armenian' not in sections:
        continue
    entries[title] = getSubSections(sections['Armenian'])

100%|██████████| 500/500 [07:39<00:00,  1.09it/s]


In [372]:
results = pd.DataFrame([dict(title=k, idx=i+1, **r) for k,v in entries.items() for i,r in enumerate(v)]).set_index(['title', 'idx'])

In [373]:
results

Unnamed: 0_level_0,Unnamed: 1_level_0,Etymology,Pronunciation,Adjective,Declension,Noun,Antonyms,Related terms,Derived terms,Etymology 1,Alternative forms,...,Preposition,Conjunction,Numeral,Interjection,Prefix,Conjugation,Phrase,Etymology 3,Hyponyms,Etymology 4
title,idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
մատաղ,1,"[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a class=""mw-selflink-fragment"" href=""#Old_Armenian"">մատաղ</a>], , [(], [matał], [)], ; see it for more.\n]]","[[[], \n, [<span class=""ib-brac qualifier-brac"">(</span>, <span class=""ib-content qualifier-content""><span class=""usage-label-accent""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armeni...","[[[<strong class=""Armn headword"" lang=""hy"">մատաղ</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">mataġ</span>, ) (,...","[[\n, [<small> <i>i</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...","[[[<strong class=""Armn headword"" lang=""hy"">մատաղ</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">mataġ</span>, )], ...",,,,,,...,,,,,,,,,,
արդարություն,1,"[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a href=""/wiki/%D5%A1%D6%80%D5%A4%D5%A1%D6%80%D5%B8%D6%82%D5%A9%D5%AB%D6%82%D5%B6#Old_Armenian"" title=""արդարութ...","[[[], \n, [<span class=""ib-brac qualifier-brac"">(</span>, <span class=""ib-content qualifier-content""><span class=""usage-label-accent""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armeni...",,"[[\n, [<small> <i>n</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...","[[[<strong class=""Armn headword"" lang=""hy"">արդարություն</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">ardarutʻyun...","[[[<span class=""Armn"" lang=""hy""><a href=""/wiki/%D5%A1%D5%B6%D5%A1%D6%80%D5%A4%D5%A1%D6%80%D5%B8%D6%82%D5%A9%D5%B5%D5%B8%D6%82%D5%B6#Armenian"" title=""անարդարություն"">անարդարություն</a></span>, , <span class=""mention-gloss-paren annotation-paren"">...","[[[<span class=""Armn"" lang=""hy""><a href=""/wiki/%D5%A1%D6%80%D5%A4%D5%A1%D6%80#Armenian"" title=""արդար"">արդար</a></span>, , <span class=""mention-gloss-paren annotation-paren"">(</span>, <span class=""tr Latn"" lang=""hy-Latn"">ardar</span>, <span class...",,,,...,,,,,,,,,,
սիրահար,1,"[[Literally, ""love-stricken"", from , [<a href=""/wiki/%D5%BD%D5%A5%D6%80#Armenian"" title=""սեր"">սեր</a>], , [(], [ser], [)], +‎ , [<a href=""/wiki/-%D5%A1-#Armenian"" title=""-ա-"">-ա-</a>], , [(], [-a-], [)], +‎ , [<a class=""extiw"" href=""https://e...","[[[], \n, [<span class=""ib-brac qualifier-brac"">(</span>, <span class=""ib-content qualifier-content""><span class=""usage-label-accent""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armeni...",,"[[\n, [<small> <i>i</i>-type, animate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;text...","[[[<strong class=""Armn headword"" lang=""hy"">սիրահար</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">sirahar</span>, ...",,,"[[[<ul><li><span class=""Armn"" lang=""hy""><a href=""/wiki/%D5%BD%D5%AB%D6%80%D5%A1%D5%B0%D5%A1%D6%80%D5%BE%D5%A5%D5%AC#Armenian"" title=""սիրահարվել"">սիրահարվել</a></span> <span class=""mention-gloss-paren annotation-paren"">(</span><span class=""tr Latn...",,,...,,,,,,,,,,
այս,1,,"[[[<span class=""ib-brac qualifier-brac"">(</span>, <span class=""ib-content qualifier-content""><span class=""usage-label-accent""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></...",,,,,,"[[[<ul><li><span class=""Armn"" lang=""hy""><a class=""new"" href=""/w/index.php?title=%D5%A1%D5%B5%D5%BD%D5%AB%D5%B6%D5%B9&amp;action=edit&amp;redlink=1"" title=""այսինչ (page does not exist)"">այսինչ</a></span> <span class=""mention-gloss-paren annotation...","[[[Learned borrowing], from , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a class=""mw-selflink-fragment"" href=""#Old_Armenian"">այս</a>], , [(], [ays], [)], . The ...","[[[<span class=""Armn"" lang=""hy""><a href=""/wiki/%D5%A7%D5%BD#Armenian"" title=""էս"">էս</a></span>, , <span class=""mention-gloss-paren annotation-paren"">(</span>, <span class=""tr Latn"" lang=""hy-Latn"">ēs</span>, <span class=""mention-gloss-paren annot...",...,,,,,,,,,,
այս,2,,,,"[[\n, [<small> <i>i</i>-type, animate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;text...","[[[<strong class=""Armn headword"" lang=""hy"">այս</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">ays</span>, )], \n],...",,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
համատիրություն,1,"[[[<a href=""/wiki/%D5%B0%D5%A1%D5%B4%D5%A1-#Armenian"" title=""համա-"">համա-</a>], , [(], [hama-], [)], +‎ , [<a href=""/wiki/%D5%BF%D5%AB%D6%80%D5%B8%D6%82%D5%A9%D5%B5%D5%B8%D6%82%D5%B6#Armenian"" title=""տիրություն"">տիրություն</a>], , [(], [tirutʻ...","[[[<span class=""ib-brac qualifier-brac"">(</span>, <span class=""ib-content qualifier-content""><span class=""usage-label-accent""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></...",,"[[\n, [<small> <i>n</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...","[[[<strong class=""Armn headword"" lang=""hy"">համատիրություն</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">hamatirut...",,,,,,...,,,,,,,,,,
թառ,1,,"[[[], \n, [<span class=""ib-brac qualifier-brac"">(</span>, <span class=""ib-content qualifier-content""><span class=""usage-label-accent""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armeni...",,"[[\n, [<small> <i>i</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...","[[[<strong class=""Armn headword"" lang=""hy"">թառ</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">tʻaṙ</span>, )], \n]...",,,,"[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Persian_language"" title=""w:Persian language"">Persian</a>], , [<a href=""/wiki/%D8%AA%D8%A7%D8%B1#Persian"" title=""تار"">تار</a>], , [(], [târ], [)], .\n]]",,...,,,,,,,,,,
թառ,2,,,,"[[\n, [<small> <i>i</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...","[[[<strong class=""Armn headword"" lang=""hy"">թառ</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">tʻaṙ</span>, )], \n]...",,,"[[[<span class=""Armn"" lang=""hy""><a class=""new"" href=""/w/index.php?title=%D5%A9%D5%A1%D5%BC%D5%A1%D5%B0%D5%A1%D5%B6&amp;action=edit&amp;redlink=1"" title=""թառահան (page does not exist)"">թառահան</a></span>, , <span class=""mention-gloss-paren annota...",,,...,,,,,,,,,,
օրհներգ,1,"[[[<a href=""/wiki/%D6%85%D6%80%D5%B0%D5%B6%D5%A5%D5%AC#Armenian"" title=""օրհնել"">օրհնել</a>], , [(], [ōrhnel], [)], +‎ , [<a href=""/wiki/%D5%A5%D6%80%D5%A3#Armenian"" title=""երգ"">երգ</a>], , [(], [erg], [)], \n]]","[[[<span class=""ib-brac qualifier-brac"">(</span>, <span class=""ib-content qualifier-content""><span class=""usage-label-accent""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></...",,"[[\n, [<small> <i>i</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...","[[[<strong class=""Armn headword"" lang=""hy"">օրհներգ</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">ōrhnerg</span>, ...",,,,,,...,,,,,,,,,,


In [374]:
columns_by_freq = results.count().sort_values(ascending=False)
results = results[columns_by_freq.index] #[lambda x:x['Verb'].notna()]
columns_by_freq.to_frame().style.bar()

Unnamed: 0,0
Pronunciation,496
Etymology,446
Declension,401
Noun,301
Alternative forms,154
Adjective,137
Derived terms,126
References,74
Related terms,64
Verb,56


In [375]:
def process_pronunciation(tag):
    if type(tag) != bs4.Tag:
        return tag
    
    return '\n'.join([li.text for li in tag.select('li') if not 'Audio' in li.text and not li.text == ''])

results['Pronunciation'] = results['Pronunciation'].apply(process_pronunciation)

In [376]:
def process_basic_text(tag):
    if type(tag) != bs4.Tag:
        return tag
    
    return tag.text.strip()


results['Etymology'] = results['Etymology'].apply(process_basic_text)
results['Etymology'] = results['Etymology 1'].apply(process_basic_text).where(results['Etymology 1'].notna(), results['Etymology'])
results['Etymology'] = results['Etymology 2'].apply(process_basic_text).where(results['Etymology 2'].notna(), results['Etymology'])

In [377]:
parts_of_speech = ['Verb',
 'Adverb',
 'Particle',
 'Pronoun',
 'References',
 'Suffix',
 'Proper noun',
 'Conjunction',
 'Postposition',
 'Interjection',
 'Preposition',
 'Determiner',
 'Punctuation mark',
 'Numeral',
 'Prefix',
 'Article']

In [387]:
results[parts_of_speech].count(axis=1)[lambda x: x==0]

title   idx
այս     1      0
մը      1      0
հանուն  1      0
ավտո-   1      0
ընդդեմ  1      0
աման    2      0
այդ     1      0
վար     3      0
ինձ     1      0
dtype: int64

In [390]:
def process_definition(tag):
    if type(tag) != bs4.Tag:
        return tag
    definitions = []
    if not tag.find('ol'):
        return [{'definition': 'ERROR!!!!'}]
    for li in tag.find('ol').find_all('li'):
        if li.find('dl'):
            detail_text = li.find('dl').text
            definitions.append({'definition': li.text.replace(detail_text, '').strip(), 'detail': li.find('dl').text})
        else:
            definitions.append({'definition': li.text})
    return definitions


In [403]:
pos = 'Verb'
results[results[pos].notna()][pos].sample(10).apply(process_definition).apply(pd.Series).stack().apply(pd.Series).fillna('')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,definition,detail
title,idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
մատակարարել,1,0,"(transitive) to provide, to supply, to purvey",Մեկնարկել է Ռուսաստանից Հայաստանին մատակարարվող գազի գնի շուրջ բանակցությունների հերթական փուլը ― Meknarkel ē Ṙusastanicʻ Hayastanin matakararvoġ gazi gni šurǰ banakcʻutʻyunneri hertʻakan pʻulə ― Another round of negotations on the price of gas s...
փայփայել,1,0,"to caress, to stroke",
փայփայել,1,1,"to show affection, fondness, care",
փայփայել,1,2,"(figuratively) to cherish, to treasure, to hold dear",հույսեր փայփայել ― huyser pʻaypʻayel ― to cherish hopes
ծառայել,1,0,to serve,բանակում ծառայել ― banakum caṙayel ― to serve in the army
խոսել,1,0,to speak,բարձր խոսել ― barjr xosel ― to speak loudly\nԵս հայերեն լավ չեմ խոսում։ ― Es hayeren lav čʻem xosum. ― I don't speak Armenian well.\nՆա անգլերեն շատ լավ է խոսում։ ― Na angleren šat lav ē xosum. ― He speaks English very well.
խոսել,1,1,to talk,Ես ուզում եմ քեզ հետ մի շատ կարևոր բանի մասին խոսել։ ― Es uzum em kʻez het mi šat karewor bani masin xosel. ― I want to talk to you about something very important.
մթնել,1,0,"(intransitive) to get dark, to become dark, to darken",մթնում է ― mtʻnum ē ― it is getting dark
արորել,1,0,to plough,"լույսի ճամփա արորել ― luysi čampʻa arorel ― (figuratively) to cultivate, to educate, to discipline\nարշալույսներ արորել ― aršaluysner arorel ― to build or create a future"
սովորել,1,0,(intransitive) to study,Synonym: ուսանել (usanel)\nսովորում եմ համալսարանում ― sovorum em hamalsaranum ― I study at the university


In [380]:
def find_element(tag, element):
    if type(tag) != bs4.Tag:
        return tag
    return tag.find(element)

In [381]:
(pd.DataFrame({col: {elem: results[col].apply(lambda x:find_element(x,elem)).count()  for elem in ['ol', 'ul']} for col in results.columns}).T['ol']
 [lambda x:x>0][lambda x:x<90].sort_values(ascending=False).index.to_list()
)

['Verb',
 'Adverb',
 'Particle',
 'Pronoun',
 'References',
 'Suffix',
 'Proper noun',
 'Conjunction',
 'Postposition',
 'Interjection',
 'Preposition',
 'Determiner',
 'Punctuation mark',
 'Numeral',
 'Prefix',
 'Article']