In [74]:
import random
import re
from urllib.parse import urlparse, parse_qs, parse_qsl, quote

import pandas as pd
import requests

from tqdm import tqdm

from bs4 import BeautifulSoup
import bs4

URL_ROOT = 'https://en.wiktionary.org'

soups = {}
def get_soup(path):
    if path in soups:
        return soups[path]
    
    res = requests.get(f"{URL_ROOT}{path}")
    soup = BeautifulSoup(res.text)
    soups[path] = soup
    return soup

# e.g. get_category_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_category_pages(start_path):
    paths = [start_path]    
    next_path = start_path
    soup = get_soup(next_path)
    next_link = soup.find('a', string='next page')
    while next_link:
        next_path = next_link['href']
        paths.append(next_path)
        soup = get_soup(next_path)
        next_link = soup.find('a', string='next page')
    return paths

# e.g. get_lemma_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_lemma_pages(path):
    soup = get_soup(path)
    return [{'href': a['href'], 'title': a['title']} for a in soup.css.select('div#mw-pages li a')]

In [75]:
pd.set_option('display.max_colwidth', 250)

In [76]:
# Category:Armenian_terms_with_usage_examples
# Category:Old_Armenian_terms_with_usage_examples

lps = ([lp for cp in get_category_pages('/wiki/Category:Armenian_terms_with_usage_examples')
    for lp in get_lemma_pages(cp)])
lemmas = pd.DataFrame(lps)
lemmas

Unnamed: 0,href,title
0,/wiki/%D5%A1,ա
1,/wiki/%D5%A1%CC%88,ա̈
2,/wiki/%D5%A1%CC%8A,ա̊
3,/wiki/%D5%A1%D5%A2%D5%A5%D5%A9%D5%A1%D5%BD%D5%B8%D6%82%D5%B6%D5%AF,աբեթասունկ
4,/wiki/%D5%A1%D5%A3%D5%BC%D5%A1%D5%BE,ագռավ
...,...,...
1789,/wiki/%D5%96%D6%80%D5%B8%D6%82%D5%B6%D5%A6%D5%A5,Ֆրունզե
1790,/wiki/%D5%9B,՛
1791,/wiki/%D5%9E,՞
1792,/wiki/%D6%89,։


In [77]:
def getLanguageSections(soup):
    headings = soup.select('div#bodyContent div.mw-heading2')
    
    heading_map = {}

    for heading in headings:
        language = heading.find('h2').text
        content = []
        for sib in heading.next_siblings:
            if sib == '\n':
                continue
            if type(sib) == bs4.element.Comment:
                continue
            if sib in headings:
                break
            content.append(sib)
        heading_map[language] = content
    return heading_map

def getSubSections(section):
    result = {}
    current_header = None
    current_subsection = bs4.Tag(name='div')
    top_heading_level = None
    for tag in section:
        if tag.name == 'div' and 'mw-heading' in tag.attrs.get('class', []):
            if current_header:
                result[current_header] = current_subsection
            current_header = tag.text.replace('[edit]', '')
            current_subsection = bs4.Tag(name='div')
        else:
            current_subsection.append(tag)
    if current_header:
        result[current_header] = current_subsection
    return result

In [78]:
lemmas[lemmas.title.isin(['աման', 'փիս', '-անք', 'համ'])]

Unnamed: 0,href,title
42,/wiki/%D5%A1%D5%B4%D5%A1%D5%B6,աման
117,/wiki/-%D5%A1%D5%B6%D6%84,-անք
916,/wiki/%D5%B0%D5%A1%D5%B4,համ
1700,/wiki/%D6%83%D5%AB%D5%BD,փիս


In [79]:
entries = {}

#for title, href in tqdm([row[['title','href']].to_list() for _, row in lemmas.sample(1).iterrows()]):
for title, href in tqdm([row[['title','href']].to_list() for _, row in lemmas[lemmas.title.isin(['աման', 'փիս', '-անք', 'համ'])].iterrows()]):
    sections = getLanguageSections(get_soup(href))
    if 'Armenian' not in sections:
        continue
    entries[title] = getSubSections(sections['Armenian'])

entries

100%|██████████| 4/4 [00:00<00:00,  6.69it/s]


{'աման': {'Pronunciation': <div><ul><li class="mw-empty-elt"></li>
  <li><span class="usage-label-accent"><span class="ib-brac">(</span><span class="ib-content"><a class="extiw" href="https://en.wikipedia.org/wiki/Eastern_Armenian" title="w:Eastern Armenian">Eastern Armenian</a></span><span class="ib-brac">)</span></span> <a href="/wiki/Wiktionary:International_Phonetic_Alphabet" title="Wiktionary:International Phonetic Alphabet">IPA</a><sup>(<a href="/wiki/Appendix:Armenian_pronunciation" title="Appendix:Armenian pronunciation">key</a>)</sup>: <span class="IPA">/ɑˈmɑn/</span>, <span class="IPA">[ɑmɑ́n]</span></li>
  <li><span class="usage-label-accent"><span class="ib-brac">(</span><span class="ib-content"><a class="extiw" href="https://en.wikipedia.org/wiki/Western_Armenian" title="w:Western Armenian">Western Armenian</a></span><span class="ib-brac">)</span></span> <a href="/wiki/Wiktionary:International_Phonetic_Alphabet" title="Wiktionary:International Phonetic Alphabet">IPA</a><su

In [80]:
pd.DataFrame(entries)#]

Unnamed: 0,աման,-անք,համ,փիս
Pronunciation,"[[[], \n, [<span class=""usage-label-accent""><span class=""ib-brac"">(</span><span class=""ib-content""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></span><span class=""ib-brac"">...","[[[], \n, [<span class=""usage-label-accent""><span class=""ib-brac"">(</span><span class=""ib-content""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></span><span class=""ib-brac"">...","[[[], \n, [<span class=""usage-label-accent""><span class=""ib-brac"">(</span><span class=""ib-content""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></span><span class=""ib-brac"">...","[[[], \n, [<span class=""usage-label-accent""><span class=""ib-brac"">(</span><span class=""ib-content""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a></span><span class=""ib-brac"">..."
Etymology 1,"[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Middle_Armenian"" title=""w:Middle Armenian"">Middle Armenian</a>], , [<a class=""mw-selflink-fragment"" href=""#Middle_Armenian"">աման</a>], , [(], [aman], [)], , from , [<a class=""extiw""...","[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a class=""mw-selflink-fragment"" href=""#Old_Armenian"">-անք</a>], , [(], [-ankʻ], [)], .\n]]","[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [<a class=""mw-selflink-fragment"" href=""#Old_Armenian"">համ</a>], , [(], [ham], [)], .\n]]","[[[], \nVia , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Ottoman_Turkish"" title=""w:Ottoman Turkish"">Ottoman Turkish</a>], , [<a href=""/wiki/%D9%BE%DB%8C%D8%B3#Ottoman_Turkish"" title=""پیس"">پیس</a>], , [(], [pis], [)], and , [<a class..."
Noun,"[[[<strong class=""Armn headword"" lang=""hy"">աման</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">aman</span>, )], \n...",,"[[[<strong class=""Armn headword"" lang=""hy"">համ</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">ham</span>, )], \n],...","[[[<strong class=""Armn headword"" lang=""hy"">փիս</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">pʻis</span>, )], , ..."
Declension,"[[\n, [<small> <i>i</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...","[[\n, [<small> <i>i</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...","[[\n, [<small> <i>i</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te...","[[\n, [<small> <i>i</i>-type, inanimate</small>, (, <a class=""extiw"" href=""https://en.wikipedia.org/wiki/Eastern_Armenian"" title=""w:Eastern Armenian"">Eastern Armenian</a>, )], \n, [\n, <table class=""inflection-table"" style=""background:#F9F9F9;te..."
Derived terms,"[[[<ul><li><span class=""Armn"" lang=""hy""><a href=""/wiki/%D5%A1%D5%BD%D5%B2%D5%A1%D5%B4%D5%A1%D5%B6#Armenian"" title=""ասղաման"">ասղաման</a></span> <span class=""mention-gloss-paren annotation-paren"">(</span><span class=""tr Latn"" lang=""hy-Latn"">asġaman...","[[[<div class=""CategoryTreeItem""><span class=""CategoryTreeBullet""><a aria-expanded=""false"" class=""CategoryTreeToggle"" data-ct-title=""Armenian_terms_suffixed_with_-անք_(collective)""></a> </span> <a href=""/wiki/Category:Armenian_terms_suffixed_with...","[[[<span class=""Armn"" lang=""hy""><a class=""new"" href=""/w/index.php?title=%D5%B0%D5%A1%D5%B4%D5%B6_%D5%A1%D5%BC%D5%B6%D5%A5%D5%AC&amp;action=edit&amp;redlink=1"" title=""համն առնել (page does not exist)"">համն առնել</a></span>, , <span class=""mention...","[[[<ul><li><span class=""Armn"" lang=""hy""><a class=""new"" href=""/w/index.php?title=%D6%83%D5%AB%D5%BD%D5%A1%D5%AD%D5%B8%D5%BD&amp;action=edit&amp;redlink=1"" title=""փիսախոս (page does not exist)"">փիսախոս</a></span> <span class=""mention-gloss-paren an..."
Descendants,"[[[<span class=""desc-arr"" title=""borrowed"">→</span>, Northern Kurdish: , <span class=""Latn"" lang=""kmr""><a href=""/wiki/aman#Northern_Kurdish"" title=""aman"">aman</a></span>, <style data-mw-deduplicate=""TemplateStyles:r68481116"">.mw-parser-output .d...",,,
Etymology 2,"[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Ottoman_Turkish"" title=""w:Ottoman Turkish"">Ottoman Turkish</a>], , [<a href=""/wiki/%D8%A7%D9%85%D8%A7%D9%86#Ottoman_Turkish"" title=""امان"">امان</a>], .\n]]","[[[], \n, [Inherited], from , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Classical_Armenian"" title=""w:Classical Armenian"">Old Armenian</a>], , [-անք], , [(], [-ankʻ], [)], , the plural of , [<a href=""/wiki/-%D5%A1%D5%B6#Old_Armenian...","[[From , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Middle_Armenian"" title=""w:Middle Armenian"">Middle Armenian</a>], , [<a class=""mw-selflink-fragment"" href=""#Middle_Armenian"">համ</a>], , [(], [ham], [)], , from , [<a class=""extiw"" h...","[[[], \n, [Borrowed], from , [<a class=""extiw"" href=""https://en.wikipedia.org/wiki/Georgian_language"" title=""w:Georgian language"">Georgian</a>], , [<a href=""/wiki/%E1%83%A4%E1%83%98%E1%83%A1%E1%83%98#Georgian"" title=""ფისი"">ფისი</a>], , [(], [p..."
Interjection,"[[[<strong class=""Armn headword"" lang=""hy"">աման</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">aman</span>, )], \n...",,,
Suffix,,"[[[<strong class=""Armn headword"" lang=""hy"">-անք</strong>, , <a href=""/wiki/Wiktionary:Armenian_transliteration"" title=""Wiktionary:Armenian transliteration"">•</a>, (, <span class=""headword-tr tr Latn"" dir=""ltr"" lang=""hy-Latn"">-ankʻ</span>, )], \...",,
References,,"[[[<span class=""citation-whole""><span class=""cited-source""><a class=""extiw"" href=""https://en.wikipedia.org/wiki/Manuk_Abeghian"" title=""w:Manuk Abeghian"">Abeġyan, Manuk</a> (<span class=""None"" lang=""und"">1965</span>) <cite>Hayocʻ lezvi tesutʻyun</...",,


In [81]:
results = pd.DataFrame(entries).T

In [82]:
columns_by_freq = results.count().sort_values(ascending=False)
results = results[columns_by_freq.index] #[lambda x:x['Verb'].notna()]
columns_by_freq.to_frame().style.bar()

Unnamed: 0,0
Pronunciation,4
Etymology 1,4
Declension,4
Derived terms,4
Etymology 2,4
Noun,3
Alternative forms,2
Further reading,2
Descendants,1
Interjection,1


In [83]:
def process_pronunciation(tag):
    if type(tag) != bs4.Tag:
        return tag
    
    return '\n'.join([li.text for li in tag.select('li') if not 'Audio' in li.text and not li.text == ''])

results['Pronunciation'] = results['Pronunciation'].apply(process_pronunciation)

In [84]:
def process_basic_text(tag):
    if type(tag) != bs4.Tag:
        return tag
    
    return tag.text.strip()


results['Etymology'] = results['Etymology'].apply(process_basic_text)
results['Etymology 1'] = results['Etymology 1'].apply(process_basic_text)
results['Etymology 2'] = results['Etymology 2'].apply(process_basic_text)

KeyError: 'Etymology'

In [None]:
results

Unnamed: 0,Pronunciation,Etymology 1,Declension,Derived terms,Etymology 2,Noun,Alternative forms,Further reading,Descendants,Interjection,Suffix,References,Conjunction,Usage notes,Adjective,Adverb
աման,,,[],[],,[],,,[],[],,,,,,
-անք,,,[],[],,,,,,,[],[],,,,
համ,,,[],[],,[],[],[],,,,,[],[],,
փիս,,,[],[],,[],[],[],,,,,,,[],[]


In [None]:
results[results['Etymology 2'].notna()].T.columns

Index(['աման', '-անք', 'համ', 'փիս'], dtype='object')