In [244]:
import random
import re
from urllib.parse import urlparse, parse_qs, parse_qsl, quote

import pandas as pd
import requests

from tqdm import tqdm

from bs4 import BeautifulSoup
import bs4

URL_ROOT = 'https://en.wiktionary.org'

soups = {}
def get_soup(path):
    if path in soups:
        return soups[path]
    
    res = requests.get(f"{URL_ROOT}{path}")
    soup = BeautifulSoup(res.text)
    soups[path] = soup
    return soup

# e.g. get_category_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_category_pages(start_path):
    paths = [start_path]    
    next_path = start_path
    soup = get_soup(next_path)
    next_link = soup.find('a', string='next page')
    while next_link:
        next_path = next_link['href']
        paths.append(next_path)
        soup = get_soup(next_path)
        next_link = soup.find('a', string='next page')
    return paths

# e.g. get_lemma_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_lemma_pages(path):
    soup = get_soup(path)
    return [{'href': a['href'], 'title': a['title']} for a in soup.css.select('div#mw-pages li a')]

In [245]:
pd.set_option('display.max_colwidth', 250)

In [246]:
# Category:Armenian_terms_with_usage_examples
# Category:Old_Armenian_terms_with_usage_examples

lps = ([lp for cp in get_category_pages('/wiki/Category:Armenian_terms_with_usage_examples')
    for lp in get_lemma_pages(cp)])
lemmas = pd.DataFrame(lps)
lemmas

Unnamed: 0,href,title
0,/wiki/%D5%A1,ա
1,/wiki/%D5%A1%CC%88,ա̈
2,/wiki/%D5%A1%CC%8A,ա̊
3,/wiki/%D5%A1%D5%A2%D5%A5%D5%A9%D5%A1%D5%BD%D5%B8%D6%82%D5%B6%D5%AF,աբեթասունկ
4,/wiki/%D5%A1%D5%A3%D5%BC%D5%A1%D5%BE,ագռավ
...,...,...
1788,/wiki/%D5%96%D6%80%D5%B8%D6%82%D5%B6%D5%A6%D5%A5,Ֆրունզե
1789,/wiki/%D5%9B,՛
1790,/wiki/%D5%9E,՞
1791,/wiki/%D6%89,։


In [247]:
def getLanguageSections(soup):
    headings = soup.select('div#bodyContent div.mw-heading2')
    
    heading_map = {}

    for heading in headings:
        language = heading.find('h2').text
        content = []
        for sib in heading.next_siblings:
            if sib == '\n':
                continue
            if type(sib) == bs4.element.Comment:
                continue
            if sib in headings:
                break
            content.append(sib)
        heading_map[language] = content
    return heading_map

In [248]:
def getHeadingLevel(cs):
    levels = [c for c in cs if c != 'mw-heading']
    if len(levels) > 0:
        return levels[0]

def getSubSections(section):
    results = []
    result = {}
    current_header = None
    current_subsection = bs4.Tag(name='div')
    top_heading_level = None
    for tag in section:
        classes = tag.attrs.get('class', [])
        if tag.name == 'div' and 'mw-heading' in classes:
            heading_level = getHeadingLevel(classes)
            if not top_heading_level:
                top_heading_level = heading_level
            elif heading_level == top_heading_level and result:
                # the first time you encounter the top-level heading again...
                if len(result) > 1: # this means there were other headings in between
                    results.append(result)
                    result = {}
                else: # consecutive top-level headings means flat hierarchy
                    top_heading_level = '_' # non-existent heading
            if current_header:
                result[current_header] = current_subsection
            current_header = tag.text.replace('[edit]', '')
            current_subsection = bs4.Tag(name='div')
        else:
            # if not heading, add to current heading's subsection 
            current_subsection.append(tag)
    if current_header:
        result[current_header] = current_subsection
    results.append(result)
    return results

In [417]:
#entries = {}

In [418]:
for title, href in tqdm([row[['title','href']].to_list() for _, row in lemmas.sample(1000).iterrows()]):
#for title, href in tqdm([row[['title','href']].to_list() for _, row in lemmas[lemmas.title.isin(['աման', 'փիս', '-անք', 'համ'])].iterrows()]):
    if title in entries:
        continue
    sections = getLanguageSections(get_soup(href))
    if 'Armenian' not in sections:
        continue
    entries[title] = getSubSections(sections['Armenian'])

 27%|██▋       | 273/1000 [03:26<09:10,  1.32it/s]


KeyboardInterrupt: 

In [419]:
results = pd.DataFrame([dict(title=k, idx=i+1, **r) for k,v in entries.items() for i,r in enumerate(v)]).set_index(['title', 'idx'])

In [420]:
results

Unnamed: 0_level_0,Unnamed: 1_level_0,Etymology,Pronunciation,Verb,Inflection,Etymology 1,Noun,Usage notes,Declension,Derived terms,Etymology 2,...,Interjection,Synonyms,Proper noun,Particle,Pronoun,Antonyms,Letter,Punctuation mark,Postposition,Conjunction
title,idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
հաճախել,1,"[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a class=""new"" href=""/w/index.php?title=%D5%B5%D5%A1%D5%B3%D5%A1%D5%AD%D5%A5%D5%B4&amp;action=edit&amp;redlink=...","[[[<span class=""ib-brac qualifier-brac"">(</span>, <span class=""ib-content qualifier-content""><span class=""usage-label-accent""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></...","[[[<strong class=""Armn headword"" lang=""hy"">հաճախել</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">hačaxel</span>, ...","[[\n, [<i>-el</i>, conjugation (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table border=""1px solid #000000"" class=""inflection-table"" style=""border-colla...",,,,,,,...,,,,,,,,,,
մատ,1,,[],,,[],[],[],[],,,...,,,,,,,,,,
մատ,2,,,,,,[],,,[],[],...,,,,,,,,,,
մատ,3,,,,,,[],,[],,,...,,,,,,,,,,
վստահ,1,"[[[Learned borrowing], from , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a class=""mw-selflink-fragment"" href=""#Old_Armenian"">վստահ</a>], , [(], [vstah], [)], . ...","[[[], \n, [<span class=""ib-brac qualifier-brac"">(</span>, <span class=""ib-content qualifier-content""><span class=""usage-label-accent""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armeni...",,,,,,"[[\n, [<small>nominalized, <i>i</i>-type</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;t...","[[\n, [Derived terms], \n, [\n, <ul><li><span class=""Armn"" lang=""hy""><a class=""new"" href=""/w/index.php?title=%D5%A1%D5%B6%D5%B1%D5%B6%D5%A1%D5%BE%D5%BD%D5%BF%D5%A1%D5%B0&amp;action=edit&amp;redlink=1"" title=""անձնավստահ (page does not exist)"">անձն...",,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
համր,1,"[[[Learned borrowing], from , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a class=""mw-selflink-fragment"" href=""#Old_Armenian"">համր</a>], , [(], [hamr], [)], .\n]]","[[[], \n, [<span class=""ib-brac qualifier-brac"">(</span>, <span class=""ib-content qualifier-content""><span class=""usage-label-accent""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armeni...",,,,"[[[<strong class=""Armn headword"" lang=""hy"">համր</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">hamr</span>, )], \n...",,"[[\n, [<small> <i>i</i>-type, animate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;text...",,,...,,,,,,,,,,
ազգություն,1,"[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a class=""new"" href=""/w/index.php?title=%D5%A1%D5%A6%D5%A3%D5%B8%D6%82%D5%A9%D5%AB%D6%82%D5%B6&amp;action=edit&...","[[[], \n, [<span class=""ib-brac qualifier-brac"">(</span>, <span class=""ib-content qualifier-content""><span class=""usage-label-accent""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armeni...",,,,"[[[<strong class=""Armn headword"" lang=""hy"">ազգություն</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">azgutʻyun</sp...","[[Corresponds well to the usage of Russian , [<a href=""/wiki/%D0%BD%D0%B0%D1%86%D0%B8%D0%BE%D0%BD%D0%B0%D0%BB%D1%8C%D0%BD%D0%BE%D1%81%D1%82%D1%8C#Russian"" title=""национальность"">национа́льность</a>], , [(], [nacionálʹnostʹ], [)], .\n]]","[[\n, [<small> <i>n</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...",,,...,,,,,,,,,,
հառաչանք,1,"[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a class=""mw-selflink-fragment"" href=""#Old_Armenian"">հառաչանք</a>], , [(], [haṙačʻankʻ], [)], .\n]]","[[[<span class=""ib-brac qualifier-brac"">(</span>, <span class=""ib-content qualifier-content""><span class=""usage-label-accent""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></...",,,,"[[[<strong class=""Armn headword"" lang=""hy"">հառաչանք</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">haṙačʻankʻ</spa...",,"[[\n, [<small> <i>i</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...",,,...,,,,,,,,,,
հարց,1,[],[],,,,[],,[],,,...,,,,,,,,,,


In [421]:
columns_by_freq = results.count().sort_values(ascending=False)
results = results[columns_by_freq.index] #[lambda x:x['Verb'].notna()]
columns_by_freq.to_frame().style.bar()

Unnamed: 0,0
Pronunciation,271
Etymology,251
Declension,222
Noun,171
Alternative forms,82
Derived terms,77
Adjective,74
References,50
Related terms,35
Adverb,28


In [422]:
def process_pronunciation(tag):
    if type(tag) != bs4.Tag:
        return tag
    
    return '\n'.join([li.text for li in tag.select('li') if not 'Audio' in li.text and not li.text == ''])

results['Pronunciation'] = results['Pronunciation'].apply(process_pronunciation)

In [423]:
def process_basic_text(tag):
    if type(tag) != bs4.Tag:
        return tag
    
    return tag.text.strip()


results['Etymology'] = results['Etymology'].apply(process_basic_text)
results['Etymology'] = results['Etymology 1'].apply(process_basic_text).where(results['Etymology 1'].notna(), results['Etymology'])
results['Etymology'] = results['Etymology 2'].apply(process_basic_text).where(results['Etymology 2'].notna(), results['Etymology'])

In [424]:
parts_of_speech = ['Verb',
 'Adverb',
 'Particle',
 'Pronoun',
 'References',
 'Suffix',
 'Proper noun',
 'Conjunction',
 'Postposition',
 'Interjection',
 'Preposition',
 'Determiner',
 'Punctuation mark',
 'Numeral',
 'Prefix',
 'Article']

In [425]:
results[parts_of_speech].count(axis=1)[lambda x: x==0]

KeyError: "['Determiner', 'Numeral', 'Prefix', 'Article'] not in index"

In [426]:
def process_definition(tag):
    if type(tag) != bs4.Tag:
        return tag
    definitions = []
    if not tag.find('ol'):
        return [{'definition': 'ERROR!!!!'}]
    for li in tag.find('ol').find_all('li'):
        if li.find('dl'):
            detail_text = li.find('dl').text
            definitions.append({'definition': li.text.replace(detail_text, '').strip(), 'detail': li.find('dl').text})
        else:
            definitions.append({'definition': li.text})
    return definitions


In [433]:
pos = 'Verb'
results[results[pos].notna()][pos].sample(10).apply(process_definition).apply(pd.Series).stack().apply(pd.Series).fillna('')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,definition,detail
title,idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
է,3,0,third-person singular present of եմ (em),Նա հայ է։ ― Na hay ē. ― He is an Armenian.\nՆա հայ չէ։ ― Na hay čʻē. ― He is not an Armenian.\nՆա գիրքը կարդալու է։ ― Na girkʻə kardalu ē. ― He will read the book.
ներել,1,0,to forgive,ների՛ր ինձ ― nerír inj ― forgive me!
պատկանել,1,0,"to belong, to belong to, to pertain",ըստ պատկանելույն ― əst patkaneluyn ― whoever it belongs to will receive it
մտածել,1,0,to think,Ինչի՞ մասին ես մտածում։ (Eastern Armenian)Inčʻi? masin es mtacum.What are you thinking about?\nԻնչի՞ մասին կը մտածես։ (Western Armenian)Inčʻi? masin kə mtaces.What are you thinking about?
մտածել,1,1,"to contemplate, to reflect upon",
մտածել,1,2,"to think, to believe",
մտածել,1,3,"to intend, to plan",
մտածել,1,4,"to worry, to be anxious",
մտածել,1,5,to have an idea,
հպարտանալ,1,0,to be proud (with instrumental),Վարուժանը հպարտանում է իր որդով։ ― Varužanə hpartanum ē ir ordov. ― Varuzhan is proud of his son.


In [380]:
def find_element(tag, element):
    if type(tag) != bs4.Tag:
        return tag
    return tag.find(element)

In [381]:
(pd.DataFrame({col: {elem: results[col].apply(lambda x:find_element(x,elem)).count()  for elem in ['ol', 'ul']} for col in results.columns}).T['ol']
 [lambda x:x>0][lambda x:x<90].sort_values(ascending=False).index.to_list()
)

['Verb',
 'Adverb',
 'Particle',
 'Pronoun',
 'References',
 'Suffix',
 'Proper noun',
 'Conjunction',
 'Postposition',
 'Interjection',
 'Preposition',
 'Determiner',
 'Punctuation mark',
 'Numeral',
 'Prefix',
 'Article']