In [111]:
import copy
import random
import re
from urllib.parse import urlparse, parse_qs, parse_qsl, quote

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

URL_ROOT = 'https://en.wiktionary.org'

# Module-level cache: request path -> parsed document. Re-running this cell
# resets it, which forces every page to be downloaded again.
soups = {}
def get_soup(path):
    """Fetch a page under URL_ROOT and return it parsed, caching by path.

    Args:
        path: Site-relative path (including any query string) that is
            appended verbatim to URL_ROOT.

    Returns:
        The parsed BeautifulSoup document. Repeated calls with the same
        path return the very same object from the `soups` cache.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeout.
    """
    if path in soups:
        return soups[path]

    # A timeout keeps a stalled connection from hanging the notebook forever.
    res = requests.get(f"{URL_ROOT}{path}", timeout=30)
    # Fail loudly instead of parsing (and permanently caching) an error page.
    res.raise_for_status()
    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parser libraries happen to be installed.
    soup = BeautifulSoup(res.text, 'html.parser')
    soups[path] = soup
    return soup

# e.g. get_category_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_category_pages(start_path):
    """Return the paths of all pages of a paginated category listing.

    Starting at `start_path`, each page is fetched with get_soup and the
    first anchor whose text is exactly 'next page' is followed until a
    page has no such anchor.

    Args:
        start_path: Site-relative path of the first category page.

    Returns:
        A list of paths beginning with `start_path`, in visiting order.
    """
    collected = [start_path]
    current = start_path
    while True:
        link = get_soup(current).find('a', string='next page')
        if not link:
            return collected
        current = link['href']
        collected.append(current)

# e.g. get_lemma_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_lemma_pages(path):
    """List the entries linked from one category page.

    Args:
        path: Site-relative path of a category page.

    Returns:
        A list of {'href': ..., 'title': ...} dicts, one per anchor matched
        by the selector 'div#mw-pages li a', in document order.
    """
    page = get_soup(path)
    lemma_links = []
    for anchor in page.css.select('div#mw-pages li a'):
        lemma_links.append({'href': anchor['href'], 'title': anchor['title']})
    return lemma_links

def get_definitions(headword, languages=('ajp', 'apc')):
    """Extract the definitions listed under a headword element.

    Args:
        headword: Parsed element that contains a <strong> carrying the lemma
            text and a `lang` attribute, and is followed by a sibling <ol>
            of definitions.
        languages: Accepted for backward compatibility; currently unused.
            The default is an immutable tuple so it cannot be mutated
            between calls.

    Returns:
        A list with one dict per <li> found anywhere inside the definition
        list (nested items included). Each dict has the keys 'lemma',
        'language', 'headline' and 'definition', plus 'detail' when the
        item contains a <dl>. An empty list is returned when no <ol>
        follows the headword.
    """
    lemma = headword.strong.get_text()
    language = headword.strong.attrs['lang']

    # Older markup wraps the heading text in a `mw-headline` element; the
    # section parsers in this notebook look for `mw-heading` wrappers
    # instead, so fall back to that before giving up on the heading.
    heading = headword.find_previous(class_='mw-headline')
    if heading is not None:
        headline = heading.string
    else:
        heading = headword.find_previous(class_='mw-heading')
        headline = heading.get_text().replace('[edit]', '') if heading is not None else None

    def_ol = headword.find_next_sibling('ol')
    if def_ol is None:
        return []

    results = []
    for li in def_ol.select('li'):
        result = {'lemma': lemma, 'language': language, 'headline': headline}
        # The definition is every string of the item up to the first one
        # that lives inside a <dl>.
        parts = []
        for s in li.strings:
            if any(p.name == 'dl' for p in s.parents):
                break
            parts.append(s)
        result['definition'] = ''.join(parts).strip()
        if li.dl:
            # Collect the text of descendants of the first <dl> whose class
            # starts with 'e', one per line.
            detail = '\n'.join(e.get_text() for e in li.dl.find_all(class_=re.compile('^e')))
            result['detail'] = detail.strip()
        results.append(result)
    return results



In [201]:
# Raise the per-cell display width limit to 250 characters for rendered frames.
pd.set_option('display.max_colwidth', 250)

In [168]:
# Category:Armenian_terms_with_usage_examples
# Category:Old_Armenian_terms_with_usage_examples

# Walk every page of the category listing, then flatten the per-page link
# records into a single list before building the frame.
category_pages = get_category_pages('/wiki/Category:Armenian_terms_with_usage_examples')
lps = [lemma for page in category_pages for lemma in get_lemma_pages(page)]
lemmas = pd.DataFrame(lps)
lemmas

Unnamed: 0,href,title
0,/wiki/%D5%A1,ա
1,/wiki/%D5%A1%CC%88,ա̈
2,/wiki/%D5%A1%CC%8A,ա̊
3,/wiki/%D5%A1%D5%A2%D5%A5%D5%A9%D5%A1%D5%BD%D5%...,աբեթասունկ
4,/wiki/%D5%A1%D5%A3%D5%BC%D5%A1%D5%BE,ագռավ
...,...,...
1789,/wiki/%D5%96%D6%80%D5%B8%D6%82%D5%B6%D5%A6%D5%A5,Ֆրունզե
1790,/wiki/%D5%9B,՛
1791,/wiki/%D5%9E,՞
1792,/wiki/%D6%89,։


In [303]:
def getLanguageSections(soup):
    """Split a parsed entry page into its language sections.

    Args:
        soup: Parsed page; level-2 headings are located with the selector
            'div#bodyContent div.mw-heading2'.

    Returns:
        A dict mapping the text of each heading's <h2> to the list of nodes
        that follow that heading, up to (not including) the next level-2
        heading. Bare newline strings and comments are left out. Headings
        without an <h2> get no entry but still end the previous section.
        The nodes are the live objects of `soup`, not copies.
    """
    headings = soup.select('div#bodyContent div.mw-heading2')
    # Section boundaries are recognised by object identity. A plain
    # `sib in headings` would fall back to structural equality for every
    # node that is not a heading, which is slower and could end a section
    # early on a look-alike node.
    heading_ids = {id(h) for h in headings}

    heading_map = {}

    for heading in headings:
        h2 = heading.find('h2')
        if h2 is None:
            continue
        content = []
        for sib in heading.next_siblings:
            if id(sib) in heading_ids:
                break
            if sib == '\n':
                continue
            if isinstance(sib, bs4.element.Comment):
                continue
            content.append(sib)
        heading_map[h2.text] = content
    return heading_map

def getSubSections(section):
    """Group the nodes of one language section by sub-heading.

    Args:
        section: Iterable of parsed nodes, e.g. one value of the dict
            returned by getLanguageSections.

    Returns:
        A dict mapping each sub-heading's text (with '[edit]' removed) to a
        fresh <div> holding copies of the nodes that follow that heading.
        Nodes that appear before the first heading are ignored.

    Note:
        When the same heading text occurs more than once in a section, the
        later occurrence replaces the earlier one in the result.
    """
    result = {}
    current_header = None
    current_subsection = None
    for tag in section:
        if tag.name == 'div' and 'mw-heading' in tag.attrs.get('class', []):
            if current_header:
                result[current_header] = current_subsection
            current_header = tag.text.replace('[edit]', '')
            current_subsection = bs4.Tag(name='div')
        elif current_subsection is not None:
            # Appending a node moves it out of its current tree. Append a
            # copy so the source document (which get_soup keeps cached)
            # stays intact and can be parsed again later.
            current_subsection.append(copy.copy(tag))
    if current_header:
        result[current_header] = current_subsection
    return result

In [314]:
# Number of lemma pages to scrape, and the seed that makes the random sample
# repeatable after a kernel restart.
SAMPLE_SIZE = 100
SAMPLE_SEED = 0

# Never ask for more rows than exist; sample() raises in that case.
lemma_sample = lemmas.sample(min(SAMPLE_SIZE, len(lemmas)), random_state=SAMPLE_SEED)

# title -> {sub-heading: <div> of that sub-section} for the Armenian section
# of each sampled page; pages without an Armenian section are skipped.
entries = {}

for title, href in tqdm(list(zip(lemma_sample['title'], lemma_sample['href']))):
    sections = getLanguageSections(get_soup(href))
    if 'Armenian' not in sections:
        continue
    entries[title] = getSubSections(sections['Armenian'])

100%|██████████| 100/100 [01:13<00:00,  1.35it/s]


In [315]:
# One row per entry, one column per sub-heading; count the non-null cells of
# each column and order the columns from most to least populated.
columns_by_freq = (
    pd.DataFrame(entries)
    .T
    .count()
    .sort_values(ascending=False)
)

In [316]:
# Show the counts as a one-column frame styled with in-cell bars.
columns_by_freq.to_frame().style.bar()

Unnamed: 0,0
Pronunciation,99
Etymology,90
Declension,73
Noun,56
Alternative forms,28
Derived terms,26
Adjective,23
Verb,14
Synonyms,12
Adverb,12


In [317]:
# Display the entries with their columns ordered by how often they are filled.
pd.DataFrame(entries).T[columns_by_freq.index]  # to keep only rows with a Verb cell, append [lambda x: x['Verb'].notna()]

Unnamed: 0,Pronunciation,Etymology,Declension,Noun,Alternative forms,Derived terms,Adjective,Verb,Synonyms,Adverb,...,Pronoun,Suffix,Conjugation,Particle,Interjection,Antonyms,Letter,Etymology 3,Punctuation mark,Conjunction
հարցնել,"[[[], \n, [<span class=""usage-label-accent""><span class=""ib-brac"">(</span><span class=""ib-content""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></span><span class=""ib-brac"">...","[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a href=""/wiki/%D5%B0%D5%A1%D6%80%D6%81%D5%A1%D5%B6%D5%A5%D5%B4#Old_Armenian"" title=""հարցանեմ"">հարցանեմ</a>], ...","[[\n, [causative conjugation (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table border=""1px solid #000000"" class=""inflection-table"" style=""border-collapse...",,,,,"[[[<strong class=""Armn headword"" lang=""hy"">հարցնել</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">harcʻnel</span>,...",,,...,,,,,,,,,,
ցողուն,"[[[], \n, [<span class=""usage-label-accent""><span class=""ib-brac"">(</span><span class=""ib-content""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></span><span class=""ib-brac"">...","[[[Learned borrowing], from , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a href=""/wiki/%D6%81%D6%85%D5%B2%D5%B8%D6%82%D5%B6#Old_Armenian"" title=""ցօղուն"">ցօղուն</...","[[\n, [<small> <i>i</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...","[[[<strong class=""Armn headword"" lang=""hy"">ցողուն</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">cʻoġun</span>, )]...","[[[<span class=""Armn"" lang=""hy""><a href=""/wiki/%D6%81%D6%85%D5%B2%D5%B8%D6%82%D5%B6#Armenian"" title=""ցօղուն"">ցօղուն</a></span>, , <span class=""mention-gloss-paren annotation-paren"">(</span>, <span class=""tr Latn"" lang=""hy-Latn"">cʻōġun</span>, <s...","[[[<span class=""Armn"" lang=""hy""><a class=""new"" href=""/w/index.php?title=%D6%81%D5%B8%D5%B2%D5%B8%D6%82%D5%B6%D5%A1%D5%B5%D5%AB%D5%B6&amp;action=edit&amp;redlink=1"" title=""ցողունային (page does not exist)"">ցողունային</a></span>, , <span class=""me...",,,,,...,,,,,,,,,,
մահապատիժ,"[[[], \n, [<span class=""usage-label-accent""><span class=""ib-brac"">(</span><span class=""ib-content""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></span><span class=""ib-brac"">...","[[From , [<a href=""/wiki/%D5%B4%D5%A1%D5%B0#Armenian"" title=""մահ"">մահ</a>], , [(], [mah], [)], +‎ , [<a href=""/wiki/-%D5%A1-#Armenian"" title=""-ա-"">-ա-</a>], , [(], [-a-], [)], +‎ , [<a href=""/wiki/%D5%BA%D5%A1%D5%BF%D5%AB%D5%AA#Armenian"" titl...","[[\n, [<small> <i>i</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...","[[[<strong class=""Armn headword"" lang=""hy"">մահապատիժ</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">mahapatiž</spa...",,,,,,,...,,,,,,,,,,
այլևայլություն,"[[[<span class=""usage-label-accent""><span class=""ib-brac"">(</span><span class=""ib-content""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></span><span class=""ib-brac"">)</span>...","[[[<a href=""/wiki/%D5%A1%D5%B5%D5%AC%D6%87%D5%A1%D5%B5%D5%AC#Armenian"" title=""այլևայլ"">այլեւայլ</a>], , [(], [aylewayl], [)], +‎ , [<a href=""/wiki/-%D5%B8%D6%82%D5%A9%D5%B5%D5%B8%D6%82%D5%B6#Armenian"" title=""-ություն"">-ություն</a>], , [(], [-u...","[[\n, [<small> <i>n</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...","[[[<strong class=""Armn headword"" lang=""hy"">այլևայլություն</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">aylewaylu...",,,,,,,...,,,,,,,,,,
ունկ,"[[[<span class=""usage-label-accent""><span class=""ib-brac"">(</span><span class=""ib-content""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></span><span class=""ib-brac"">)</span>...","[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a href=""/wiki/%D5%B8%D6%82%D5%B6%D5%AF%D5%B6#Old_Armenian"" title=""ունկն"">ունկն</a>], , [(], [unkn], [)], .\n]]","[[\n, [<small> <i>i</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...","[[[<strong class=""Armn headword"" lang=""hy"">ունկ</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">unk</span>, )], \n]...",,,,,"[[[<span class=""ib-brac qualifier-brac"">(</span>, <span class=""ib-content qualifier-content"">ear</span>, <span class=""ib-brac qualifier-brac"">)</span>, <span class=""ib-colon sense-qualifier-colon"">:</span>, , <span class=""Armn"" lang=""hy""><a href...",,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ուրանալ,"[[[<span class=""usage-label-accent""><span class=""ib-brac"">(</span><span class=""ib-content""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></span><span class=""ib-brac"">)</span>...","[[[Inherited], from , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a class=""new"" href=""/w/index.php?title=%D5%B8%D6%82%D6%80%D5%A1%D5%B6%D5%A1%D5%B4&amp;action=edi...",,,,"[[[<ul><li><span class=""Armn"" lang=""hy""><a href=""/wiki/%D5%B0%D5%A1%D5%BE%D5%A1%D5%BF%D5%B8%D6%82%D6%80%D5%A1%D6%81#Armenian"" title=""հավատուրաց"">հավատուրաց</a></span> <span class=""mention-gloss-paren annotation-paren"">(</span><span class=""tr Latn...",,"[[[<strong class=""Armn headword"" lang=""hy"">ուրանալ</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">uranal</span>, )...",,,...,,,,,,,,,,
կուլ,"[[[], \n, [<span class=""usage-label-accent""><span class=""ib-brac"">(</span><span class=""ib-content""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></span><span class=""ib-brac"">...","[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a class=""mw-selflink-fragment"" href=""#Old_Armenian"">կուլ</a>], , [(], [kul], [)], ; see it for more.\n]]","[[\n, [<small> <i>i</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...","[[[<strong class=""Armn headword"" lang=""hy"">կուլ</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">kul</span>, )], \n]...",,"[[[<span class=""Armn"" lang=""hy""><a href=""/wiki/%D5%AF%D5%B8%D6%82%D5%AC_%D5%BF%D5%A1%D5%AC#Armenian"" title=""կուլ տալ"">կուլ տալ</a></span>, , <span class=""mention-gloss-paren annotation-paren"">(</span>, <span class=""tr Latn"" lang=""hy-Latn"">kul ta...",,,,,...,,,,,,,,,,
կացնահար,"[[[<span class=""usage-label-accent""><span class=""ib-brac"">(</span><span class=""ib-content""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></span><span class=""ib-brac"">)</span>...","[[From , [<a href=""/wiki/%D5%AF%D5%A1%D6%81%D5%B6%D5%A1%D5%B0%D5%A1%D6%80%D5%A5%D5%AC#Armenian"" title=""կացնահարել"">կացնահարել</a>], , [(], [kacʻnaharel], [)], .\n]]","[[\n, [<small>nominalized, <i>i</i>-type</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;t...",,,,"[[[<strong class=""Armn headword"" lang=""hy"">կացնահար</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">kacʻnahar</span...",,,,...,,,,,,,,,,
զերդ,"[[[<span class=""usage-label-accent""><span class=""ib-brac"">(</span><span class=""ib-content""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></span><span class=""ib-brac"">)</span>...","[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a class=""mw-selflink-fragment"" href=""#Old_Armenian"">զերդ</a>], , [(], [zerd], [)], .\n]]",,,"[[[<span class=""Armn"" lang=""hy""><a class=""new"" href=""/w/index.php?title=%D5%A6%D5%A5%D6%80%D5%A9&amp;action=edit&amp;redlink=1"" title=""զերթ (page does not exist)"">զերթ</a></span>, , <span class=""mention-gloss-paren annotation-paren"">(</span>, <s...",,,,"[[[<span class=""Armn"" lang=""hy""><a href=""/wiki/%D5%AB%D5%B6%D5%B9%D5%BA%D5%A5%D5%BD#Armenian"" title=""ինչպես"">ինչպես</a></span>, , <span class=""mention-gloss-paren annotation-paren"">(</span>, <span class=""tr Latn"" lang=""hy-Latn"">inčʻpes</span>, <...",,...,,,,,,,,,,"[[[<strong class=""Armn headword"" lang=""hy"">զերդ</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">zerd</span>, )], \n..."


In [325]:
def process_pronunciation(tag):
    """Flatten a pronunciation sub-section into newline-separated text.

    Args:
        tag: A bs4.Tag holding the sub-section. Anything whose type is not
            exactly bs4.Tag (for example a missing-value placeholder)
            yields None.

    Returns:
        The text of every <li> under `tag`, joined with newlines, leaving
        out items whose text is empty or contains 'Audio'; None when `tag`
        is not a bs4.Tag.
    """
    if type(tag) is not bs4.Tag:
        return None

    kept = []
    for li in tag.select('li'):
        text = li.text
        if 'Audio' in text or text == '':
            continue
        kept.append(text)
    return '\n'.join(kept)

# Run the flattening over the Pronunciation column of every scraped entry.
pd.DataFrame(entries).T['Pronunciation'].apply(process_pronunciation)

հարցնել                          (Eastern Armenian) IPA(key): /hɑɾt͡sʰˈnel/, [hɑɾt͡sʰnél]\n(Western Armenian) IPA(key): /hɑɾt͡səˈnel/, [hɑɾt͡sʰənél]
ցողուն                                     (Eastern Armenian) IPA(key): /t͡sʰoˈʁun/, [t͡sʰoʁún]\n(Western Armenian) IPA(key): /t͡soˈʁun/, [t͡sʰoʁún]
մահապատիժ                             (Eastern Armenian) IPA(key): /mɑhɑpɑˈtiʒ/, [mɑhɑpɑtíʒ]\n(Western Armenian) IPA(key): /mɑhɑbɑˈdiʒ/, [mɑhɑbɑdíʒ]
այլևայլություն    (Eastern Armenian) IPA(key): /ɑjlevɑjluˈtʰjun/, [ɑjlevɑjlut͡sʰjún]\n(Western Armenian) IPA(key): /ɑjlevɑjluˈtʏn/, [ɑjlevɑjlutʰʏ́n]
ունկ                                                            (Eastern Armenian) IPA(key): /unk/, [uŋk]\n(Western Armenian) IPA(key): /unɡ/, [uŋɡ]
                                                                                 ...                                                                
ուրանալ                                         (Eastern Armenian) IPA(key): /uɾɑˈnɑl/, [uɾɑnɑ́l]\n(Wester

In [326]:
# Spot-check one entry: take the first non-null Pronunciation cell whose len()
# is greater than 1 (presumably the number of direct children of the section
# container; confirm against bs4) and show its flattened text.
process_pronunciation(pd.DataFrame(entries).T['Pronunciation'][lambda x:x.notna()][lambda x:x.apply(len)>1].iloc[0])

'(Eastern Armenian) IPA(key): /heʁiˈnɑk/, [heʁinɑ́k]\n(Western Armenian) IPA(key): /heʁiˈnɑɡ/, [heʁinɑ́ɡ]'