In [111]:
import random
import re
from urllib.parse import urlparse, parse_qs, parse_qsl, quote

import pandas as pd
import requests

from tqdm import tqdm

from bs4 import BeautifulSoup
import bs4

URL_ROOT = 'https://en.wiktionary.org'

soups = {}
def get_soup(path):
    if path in soups:
        return soups[path]
    
    res = requests.get(f"{URL_ROOT}{path}")
    soup = BeautifulSoup(res.text)
    soups[path] = soup
    return soup

# e.g. get_category_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_category_pages(start_path):
    paths = [start_path]    
    next_path = start_path
    soup = get_soup(next_path)
    next_link = soup.find('a', string='next page')
    while next_link:
        next_path = next_link['href']
        paths.append(next_path)
        soup = get_soup(next_path)
        next_link = soup.find('a', string='next page')
    return paths

# e.g. get_lemma_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_lemma_pages(path):
    soup = get_soup(path)
    return [{'href': a['href'], 'title': a['title']} for a in soup.css.select('div#mw-pages li a')]

def get_definitions(headword, languages=['ajp', 'apc']):
    lemma = headword.strong.get_text()
    language = headword.strong.attrs['lang']
    headline = headword.find_previous(class_='mw-headline').string
    print(lemma)

    def_ol = headword.find_next_sibling('ol')
    
    results = []
    for li in def_ol.select('li'):
        result = {'lemma': lemma, 'language': language, 'headline': headline}
        definition = ''
        for s in li.strings:
            if 'dl' in list(p.name for p in s.parents):
                break
            definition += s
        result['definition'] = definition.strip()
        if li.dl:
            detail = '\n'.join(e.get_text() for e in  li.dl.find_all(class_=re.compile('^e')))
            result['detail'] = detail.strip()
        results.append(result)
    return results



In [71]:
# Category:Armenian_terms_with_usage_examples
# Category:Old_Armenian_terms_with_usage_examples

lps = ([lp for cp in get_category_pages('/wiki/Category:Armenian_terms_with_usage_examples')
    for lp in get_lemma_pages(cp)])
lemmas = pd.DataFrame(lps)
lemmas

Unnamed: 0,href,title
0,/wiki/%D5%A1,ա
1,/wiki/%D5%A1%CC%88,ա̈
2,/wiki/%D5%A1%CC%8A,ա̊
3,/wiki/%D5%A1%D5%A2%D5%A5%D5%A9%D5%A1%D5%BD%D5%...,աբեթասունկ
4,/wiki/%D5%A1%D5%A3%D5%BC%D5%A1%D5%BE,ագռավ
...,...,...
1789,/wiki/%D5%96%D6%80%D5%B8%D6%82%D5%B6%D5%A6%D5%A5,Ֆրունզե
1790,/wiki/%D5%9B,՛
1791,/wiki/%D5%9E,՞
1792,/wiki/%D6%89,։


In [72]:
def get_sections(soup):
    
    subselectors = ','.join(f'.headword[lang="{language}"]' for language in languages)
    return soup.select(f'div#bodyContent h2 ~ p:has({subselectors})')


In [73]:
headings = random_soup.select('div#bodyContent div.mw-heading2')

In [74]:
headings[0].find('h2').text

'Armenian'

In [122]:
def getLanguageSections(soup):
    headings = soup.select('div#bodyContent div.mw-heading2')
    
    heading_map = {}

    for heading in headings:
        language = heading.find('h2').text
        content = []
        for sib in heading.next_siblings:
            if sib == '\n':
                continue
            if type(sib) == bs4.element.Comment:
                continue
            if sib in headings:
                break
            content.append(sib)
        heading_map[language] = content
    return heading_map

In [123]:
def get_random_soup():
    idx = random.randint(0, len(lemmas)-1)
    row = lemmas.iloc[idx]
    print(row.title)
    return get_soup(row.href)
    
random_soup = get_random_soup()

getLanguageSections(random_soup)

վստահ


{'Armenian': [<div class="mw-heading mw-heading3"><h3 id="Etymology">Etymology</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=%D5%BE%D5%BD%D5%BF%D5%A1%D5%B0&amp;action=edit&amp;section=2" title="Edit section: Etymology"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div>,
  <p><a href="/wiki/Appendix:Glossary#learned_borrowing" title="Appendix:Glossary">Learned borrowing</a> from <span class="etyl"><a class="extiw" href="https://en.wikipedia.org/wiki/Classical_Armenian" title="w:Classical Armenian">Old Armenian</a></span> <i class="Armn mention" lang="xcl"><a class="mw-selflink-fragment" href="#Old_Armenian">վստահ</a></i> <span class="mention-gloss-paren annotation-paren">(</span><span class="mention-tr tr Latn" lang="xcl-Latn">vstah</span><span class="mention-gloss-paren annotation-paren">)</span>. <a href="/wiki/Appendix:Glossary#doublet" title="Appendix:Glossary">Doublet</a> of <i class="Armn ment

In [124]:
getLanguageSections(random_soup)['Armenian'][0]

<div class="mw-heading mw-heading3"><h3 id="Etymology">Etymology</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=%D5%BE%D5%BD%D5%BF%D5%A1%D5%B0&amp;action=edit&amp;section=2" title="Edit section: Etymology"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div>

In [149]:
def getSubSections(section):
    result = {}
    current_header = None
    current_subsection = []
    for tag in section:
        if tag.name == 'div' and 'mw-heading' in tag.attrs.get('class', []):
            if current_header:
                result[current_header] = current_subsection
            current_header = tag.text.replace('[edit]', '')
            current_subsection = []
        else:
            current_subsection.append(tag)
    if current_header:
        result[current_header] = current_subsection
    return result


In [151]:
getSubSections(getLanguageSections(random_soup)['Armenian']).keys()

dict_keys(['Etymology', 'Pronunciation', 'Adjective', 'Declension', 'Adverb', 'Derived terms', 'Related terms', 'See also'])

In [138]:
[(x.text.replace(r'.edit.$', ''), x.attrs.get('class', [])) for x in getLanguageSections(random_soup)['Armenian']]

[('Etymology[edit]', ['mw-heading', 'mw-heading3']),
 ('Learned borrowing from Old Armenian վստահ (vstah). Doublet of վստա (vsta) and բստա (bsta).\n',
  []),
 ('Pronunciation[edit]', ['mw-heading', 'mw-heading3']),
 ('\n(Eastern Armenian) IPA(key): /vəsˈtɑh/, [vəstɑ́h]\n(Western Armenian) IPA(key): /vəsˈdɑh/, [vəstɑ́h]\nHyphenation: վըս‧տահ',
  []),
 ('Adjective[edit]', ['mw-heading', 'mw-heading3']),
 ('վստահ • (vstah) (superlative ամենավստահ)\n', []),
 ("sure, certain, confident\nՎստահ չեմ, թե հասկանում ես։ ― Vstah čʻem, tʻe haskanum es. ― I'm not sure you understand.",
  []),
 ('Declension[edit]', ['mw-heading', 'mw-heading4']),
 ('\nnominalized, i-type (Eastern Armenian)\n\n\n\n\n\nsingular\n\nplural\n\n\nnominative\n\nվստահ (vstah)\n\nվստահներ (vstahner)\n\n\ndative\n\nվստահի (vstahi)\n\nվստահների (vstahneri)\n\n\nablative\n\nվստահից (vstahicʻ)\n\nվստահներից (vstahnericʻ)\n\n\ninstrumental\n\nվստահով (vstahov)\n\nվստահներով (vstahnerov)\n\n\nlocative\n\n—\n\n—\n\n\n\n\ndefinite fo