In [1]:
import random
import re
from urllib.parse import urlparse, parse_qs, parse_qsl, quote

import pandas as pd
import requests

from tqdm import tqdm

from bs4 import BeautifulSoup
import bs4

from markdownify import markdownify as md

pd.set_option('display.max_colwidth', 250)

URL_ROOT = 'https://en.wiktionary.org'

soups = {}
def get_soup(path):
    if path in soups:
        return soups[path]
    
    res = requests.get(f"{URL_ROOT}{path}")
    soup = BeautifulSoup(res.text)
    soups[path] = soup
    return soup

# e.g. get_category_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_category_pages(start_path):
    paths = [start_path]    
    next_path = start_path
    soup = get_soup(next_path)
    next_link = soup.find('a', string='next page')
    while next_link:
        next_path = next_link['href']
        paths.append(next_path)
        soup = get_soup(next_path)
        next_link = soup.find('a', string='next page')
    return paths

# e.g. get_lemma_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_lemma_pages(path):
    soup = get_soup(path)
    return [{'href': a['href'], 'title': a['title']} for a in soup.css.select('div#mw-pages li a')]

In [2]:
# Category:Armenian_terms_with_usage_examples
# Category:Old_Armenian_terms_with_usage_examples

lps = ([lp for cp in get_category_pages('/wiki/Category:Armenian_terms_with_usage_examples')
    for lp in get_lemma_pages(cp)])
lemmas = pd.DataFrame(lps)
lemmas

Unnamed: 0,href,title
0,/wiki/%D5%A1,ա
1,/wiki/%D5%A1%CC%88,ա̈
2,/wiki/%D5%A1%CC%8A,ա̊
3,/wiki/%D5%A1%D5%A2%D5%A5%D5%A9%D5%A1%D5%BD%D5%B8%D6%82%D5%B6%D5%AF,աբեթասունկ
4,/wiki/%D5%A1%D5%A3%D5%BC%D5%A1%D5%BE,ագռավ
...,...,...
1791,/wiki/%D5%96%D6%80%D5%B8%D6%82%D5%B6%D5%A6%D5%A5,Ֆրունզե
1792,/wiki/%D5%9B,՛
1793,/wiki/%D5%9E,՞
1794,/wiki/%D6%89,։


In [32]:
def getLanguageSections(soup):
    headings = soup.select('div#bodyContent div.mw-heading2')
    
    heading_map = {}

    for heading in headings:
        language = heading.find('h2').text
        content = []
        for sib in heading.next_siblings:
            if sib == '\n':
                continue
            if type(sib) == bs4.element.Comment:
                continue
            if sib in headings:
                break
            content.append(sib)
        heading_map[language] = content
    return heading_map

def getHeadingLevel(cs):
    levels = [c for c in cs if c != 'mw-heading']
    if len(levels) > 0:
        return levels[0]

def getSubSections(section):
    results = []
    result = {}
    current_header = None
    current_subsection = bs4.Tag(name='div')
    top_heading_level = None
    for tag in section:
        classes = tag.attrs.get('class', [])
        if tag.name == 'div' and 'mw-heading' in classes:
            heading_level = getHeadingLevel(classes)
            if not top_heading_level:
                top_heading_level = heading_level
            elif heading_level == top_heading_level and result:
                # the first time you encounter the top-level heading again...
                if len(result) > 1: # this means there were other headings in between
                    results.append(result)
                    result = {}
                else: # consecutive top-level headings means flat hierarchy
                    top_heading_level = '_' # non-existent heading
            if current_header:
                result[current_header] = current_subsection
            current_header = tag.text.replace('[edit]', '')
            current_subsection = bs4.Tag(name='div')
        else:
            # if not heading, add to current heading's subsection 
            current_subsection.append(tag)
    if current_header:
        result[current_header] = current_subsection
    results.append(result)
    return results

def find_element(tag, element):
    if type(tag) != bs4.Tag:
        return
    return tag.find(element)

def process_table(tag):
    if type(tag) != bs4.Tag:
        return tag
    return md(str(tag), strip=['a']).strip()

def process_pronunciation(tag):
    if type(tag) != bs4.Tag:
        return tag
    return '\n'.join([li.text for li in tag.select('li') if not 'Audio' in li.text and not li.text == ''])

def process_basic_text(tag):
    if type(tag) != bs4.Tag:
        return tag
    return tag.text.strip()

def process_definition(tag):
    if type(tag) != bs4.Tag:
        return tag
    definitions = []
    if not tag.find('ol'):
        return [{'definition': 'ERROR!!!!'}]
    for li in tag.find('ol').find_all('li'):
        if li.find('dl'):
            detail_text = li.find('dl').text
            definitions.append({'definition': li.text.replace(detail_text, '').strip(), 'detail': li.find('dl').text})
        else:
            definitions.append({'definition': li.text})
    return definitions

entries = {}

In [42]:
for title, href in tqdm([row[['title','href']].to_list() for _, row in lemmas.sample(100).iterrows()]):
#for title, href in tqdm([row[['title','href']].to_list() for _, row in lemmas[lemmas.title.isin(['աման', 'փիս', '-անք', 'համ'])].iterrows()]):
    if title in entries:
        continue
    sections = getLanguageSections(get_soup(href))
    if 'Armenian' not in sections:
        continue
    entries[title] = getSubSections(sections['Armenian'])

100%|██████████| 10/10 [00:06<00:00,  1.46it/s]


In [43]:
results = pd.DataFrame([dict(title=k, idx=i+1, **r) for k,v in entries.items() for i,r in enumerate(v)]).set_index(['title', 'idx'])

In [44]:
raw_results = pd.DataFrame([dict(title=k, idx=i+1, **r) for k,v in entries.items() for i,r in enumerate(v)]).set_index(['title', 'idx'])

# get all potential definition fields (fields that contain <ol>)
(pd.DataFrame({col: {elem: raw_results[col].apply(lambda x:find_element(x,elem)).count() for elem in ['ol', 'table', 'ul']} for col in results.columns}).T
 #[lambda x:x>0][lambda x:x<90].sort_values(ascending=False).index.to_list()
)

Unnamed: 0,ol,table,ul
Alternative forms,0,0,5
Etymology,0,0,0
Pronunciation,0,4,9
Noun,7,0,0
Declension,0,8,0
Adjective,2,0,0
References,1,0,2
Usage notes,0,0,0
Proper noun,1,0,0
Derived terms,0,0,3


In [25]:
columns_by_freq = results.count().sort_values(ascending=False)
results = results[columns_by_freq.index] #[lambda x:x['Verb'].notna()]
columns_by_freq.to_frame().style.bar()

Unnamed: 0,0
Etymology,103
Pronunciation,100
Declension,82
Noun,54
Adjective,40
Alternative forms,36
References,22
Derived terms,19
Related terms,15
Synonyms,13


In [46]:
results['Declension'] = results['Declension'].apply(process_table)
results['Inflection'] = results['Inflection'].apply(process_table)

In [47]:
results['Declension'][lambda x:x.notna()]

title            idx
խեր              1      nominalized, *i*-type (Eastern Armenian)\n\n\n\n|  | singular | plural |\n| --- | --- | --- |\n| nominative | խեր (xer) | խերեր (xerer) |\n| dative | խերի (xeri) | խերերի (xereri) |\n| ablative | խերից (xericʻ) | խերերից (xerericʻ) |\n| instrumen...
քյար             1      *i*-type, inanimate (Eastern Armenian)\n\n\n\n|  | singular | plural |\n| --- | --- | --- |\n| nominative | քյար (kʻyar) | քյարեր (kʻyarer) |\n| dative | քյարի (kʻyari) | քյարերի (kʻyareri) |\n| ablative | քյարից (kʻyaricʻ) | քյարերից (kʻyarericʻ...
Բայդեն           1      *i*-type, animate (Eastern Armenian)\n\n\n\n|  | singular | plural |\n| --- | --- | --- |\n| nominative | Բայդեն (Bayden) | Բայդեններ (Baydenner) |\n| dative | Բայդենի (Baydeni) | Բայդենների (Baydenneri) |\n| ablative | Բայդենից (Baydenicʻ) | Բայ...
ընտրություն      1      *n*-type, inanimate (Eastern Armenian)\n\n\n\n|  | singular | plural |\n| --- | --- | --- |\n| nominative | ընտրություն (əntrutʻyun) 

In [9]:
results['Pronunciation'] = results['Pronunciation'].apply(process_pronunciation)

In [10]:


results['Etymology'] = results['Etymology'].apply(process_basic_text)
results['Etymology'] = results['Etymology 1'].apply(process_basic_text).where(results['Etymology 1'].notna(), results['Etymology'])
results['Etymology'] = results['Etymology 2'].apply(process_basic_text).where(results['Etymology 2'].notna(), results['Etymology'])

In [38]:
parts_of_speech = [c for c in ['Verb',
 'Adverb',
 'Particle',
 'Pronoun',
 'References',
 'Suffix',
 'Proper noun',
 'Postposition',
 'Interjection',
 'Preposition',
 'Determiner',
 'Punctuation mark',
 'Numeral',
 'Prefix',
 'Article', 
 'References',
 'Suffix',
 'Proper noun',
 'Postposition',
 'Pronoun',
 'Particle',
 'Conjunction',
 'Preposition',
 'Prefix',
 'Letter',
 'Determiner',
 'Punctuation mark',
 'Numeral'] if c in results.columns]

parts_of_speech

['Verb',
 'Adverb',
 'Particle',
 'Pronoun',
 'References',
 'Suffix',
 'Proper noun',
 'Postposition',
 'Interjection',
 'Preposition',
 'Numeral',
 'References',
 'Suffix',
 'Proper noun',
 'Postposition',
 'Pronoun',
 'Particle',
 'Preposition',
 'Letter',
 'Numeral']

In [12]:
for col in (parts_of_speech and results.columns):
    results[col] = results[col].apply(process_definition)

In [17]:
import random


pos = 'Verb' #random.choice(parts_of_speech and results.columns)

results[results[pos].notna()][pos].sample(3).apply(pd.Series).stack().apply(pd.Series).fillna('')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,definition,detail
title,idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
տնկել,1,0,(transitive) to plant (to place in soil in order that it may live and grow),ծառ տնկել ― caṙ tnkel ― to plant a tree
տնկել,1,1,"(transitive) to plant, to implant, to fix to place or set something firmly",դրոշ տնկել ― droš tnkel ― to plant a flag
տնկել,1,2,"(transitive) to stretch out, to stick out, to extend, to raise (e.g. a hand, the nose, the tail, the penis)","Synonyms: մեկնել (meknel), ցցել (cʻcʻel)"
նայած,1,0,resultative participle of նայել (nayel),
պատահել,1,0,"(intransitive) to meet accidentally, bump into",
պատահել,1,1,"(intransitive) to happen, occur",ի՞նչ է պատահել ― i?nčʻ ē patahel ― what has happened?
