In [1]:
soups = {}

In [2]:
from bs4 import BeautifulSoup
import bs4
import copy
from markdownify import markdownify as md
import pandas as pd
import pickle
import random
import re
import requests
from tqdm import tqdm
from urllib.parse import urlparse, parse_qs, parse_qsl, quote

pd.set_option('display.max_colwidth', 250)

URL_ROOT = 'https://en.wiktionary.org'

def get_soup(path):
    if path in soups:
        return soups[path]
    
    res = requests.get(f"{URL_ROOT}{path}")
    soup = BeautifulSoup(res.text)
    soups[path] = soup
    return soup

# e.g. get_category_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_category_pages(start_path):
    paths = [start_path]    
    next_path = start_path
    soup = get_soup(next_path)
    next_link = soup.find('a', string='next page')
    while next_link:
        next_path = next_link['href']
        paths.append(next_path)
        soup = get_soup(next_path)
        next_link = soup.find('a', string='next page')
    return paths

# e.g. get_lemma_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_lemma_pages(path):
    soup = get_soup(path)
    return [{'href': a['href'], 'title': a['title']} for a in soup.css.select('div#mw-pages li a')]

In [3]:
pickled_soups = pickle.load(open('soups.pickle', 'rb'))
soups |= pickled_soups
len(soups)

1804

In [4]:
[k for k in soups.keys() if k.startswith('/wiki/Category:')]

['/wiki/Category:Armenian_terms_with_usage_examples']

In [5]:
len(soups)

1804

In [244]:
#pickle.dump(soups, open('soups.pickle', 'wb'))

In [6]:
# Category:Armenian_terms_with_usage_examples
# Category:Old_Armenian_terms_with_usage_examples

def get_all_urls_in_category(category_url):
    return [lp['href'] for cp in get_category_pages(category_url) for lp in get_lemma_pages(cp)]

get_all_urls_in_category('/wiki/Category:Armenian_terms_with_usage_examples')

['/wiki/%D5%A1',
 '/wiki/%D5%A1%CC%88',
 '/wiki/%D5%A1%CC%8A',
 '/wiki/%D5%A1%D5%A2%D5%A5%D5%A9%D5%A1%D5%BD%D5%B8%D6%82%D5%B6%D5%AF',
 '/wiki/%D5%A1%D5%A3%D5%BC%D5%A1%D5%BE',
 '/wiki/%D5%A1%D5%A6%D5%A1%D5%BF',
 '/wiki/%D5%A1%D5%A6%D5%A1%D5%BF%D5%A1%D5%B4%D5%A1%D6%80%D5%BF',
 '/wiki/%D5%A1%D5%A6%D5%A3',
 '/wiki/%D5%A1%D5%A6%D5%A3%D5%A1%D5%B4%D5%AB%D5%BB%D5%B5%D5%A1%D5%B6',
 '/wiki/%D5%A1%D5%A6%D5%A3%D5%B8%D6%82%D5%A9%D5%B5%D5%B8%D6%82%D5%B6',
 '/wiki/%D5%A1%D5%A6%D5%A4%D5%A1%D5%AF',
 '/wiki/%D5%A1%D5%A6%D5%A4%D5%A5%D6%81%D5%B8%D6%82%D5%A9%D5%B5%D5%B8%D6%82%D5%B6',
 '/wiki/%D5%A1%D5%A9%D5%AC%D5%A5%D5%BF%D5%AB%D5%AF%D5%A1',
 '/wiki/%D5%A1%D5%AC%D5%A1%D5%B4',
 '/wiki/%D5%A1%D5%AC%D5%A1%D6%80%D5%A5%D5%AC',
 '/wiki/%D5%A1%D5%AC%D5%A5%D5%B0%D5%A5%D6%80%D5%B1',
 '/wiki/%D5%A1%D5%AC%D5%AB%D6%84',
 '/wiki/%D5%A1%D5%AC%D5%BA%D5%A1%D5%AF%D5%A1',
 '/wiki/%D5%A1%D5%AD',
 '/wiki/%D5%A1%D5%AD%D5%B8%D6%80%D5%AA%D5%A1%D5%AF',
 '/wiki/%D5%A1%D5%AD%D6%80',
 '/wiki/%D5%A1%D5%AE%D5%A5%D5%AC',
 '/wiki/-%D5%A

In [7]:
for url in tqdm(get_all_urls_in_category('/wiki/Category:Armenian_terms_with_usage_examples')):
    get_soup(url)

100%|██████████| 1795/1795 [00:00<00:00, 227909.90it/s]


In [8]:
PARTS_OF_SPEECH = ['Noun', 'Verb', 'Adverb', 'Particle', 'Pronoun', 'References', 'Suffix', 'Proper noun', 'Postposition', 'Interjection', 'Preposition', 'Determiner', 'Punctuation mark', 
                   'Numeral', 'Prefix', 'Article', 'References', 'Suffix', 'Proper noun', 'Postposition', 'Pronoun', 'Particle', 'Conjunction', 'Preposition', 'Prefix', 'Letter', 'Determiner', 'Punctuation mark', 'Numeral']

In [34]:
def get_categories(s):
    return [a['title'].replace('Category:','').strip() for a in s.select_one('div#catlinks').select('li a')]

def get_title(s):
    return s.find('h1').text

def get_language_sections(soup):
    headings = soup.select('div#bodyContent div.mw-heading2')
    
    heading_map = {}

    for heading in headings:
        language = heading.find('h2').text
        content = []
        for sib in heading.next_siblings:
            if sib == '\n':
                continue
            if type(sib) == bs4.element.Comment:
                continue
            if sib in headings:
                break
            content.append(sib)
        heading_map[language] = content
    return heading_map

def get_heading_level(classes):
    levels = [c for c in classes if c != 'mw-heading']
    if len(levels) > 0:
        return levels[0]

def get_subsections(section):
    results = []
    result = {}
    current_header = None
    current_subsection = bs4.Tag(name='div')
    top_heading_level = None
    for tag in section:
        classes = tag.attrs.get('class', [])
        if tag.name == 'div' and 'mw-heading' in classes:
            heading_level = get_heading_level(classes)
            if not top_heading_level:
                top_heading_level = heading_level
            elif heading_level == top_heading_level and result:
                # the first time you encounter the top-level heading again...
                if len(result) > 1: # this means there were other headings in between
                    results.append(result)
                    result = {}
                else: # consecutive top-level headings means flat hierarchy
                    top_heading_level = '_' # non-existent heading
            if current_header:
                result[current_header] = current_subsection
            current_header = tag.text.replace('[edit]', '')
            current_subsection = bs4.Tag(name='div')
        else:
            # if not heading, add to current heading's subsection 
            current_subsection.append(copy.copy(tag))
    if current_header:
        result[current_header] = current_subsection
    results.append(result)
    return results

def find_element(tag, element):
    if type(tag) != bs4.Tag:
        return
    return tag.find(element)

def process_table(tag):
    if type(tag) != bs4.Tag:
        return tag
    return md(str(tag), strip=['a']).strip()

def process_pronunciation(tag):
    if type(tag) != bs4.Tag:
        return tag
    return '\n'.join([li.text for li in tag.select('li') if not 'Audio' in li.text and not li.text == ''])

def process_basic_text(tag):
    if type(tag) != bs4.Tag:
        return tag
    return tag.text.strip()

def process_definition(tag):
    if type(tag) != bs4.Tag:
        return tag
    definitions = []
    if not tag.find('ol'):
        return [{'definition': 'ERROR!!!!'}]
    for li in tag.find('ol').find_all('li'):
        if li.find('dl'):
            detail_text = li.find('dl').text
            definitions.append({'definition': li.text.replace(detail_text, '').strip(), 'detail': li.find('dl').text})
        else:
            definitions.append({'definition': li.text})
    return definitions

def process_language_section(s, language):
    language_sections = get_language_sections(s)
    if not language in language_sections:
        return []
    section = language_sections[language]

    subsections = get_subsections(section)
    return subsections

In [35]:
def process_value(key, value):
    if key=='Pronunciation':
        return process_pronunciation(value)
    if key in ['Declension', 'Inflection']:
        return process_table(value)
    if key in PARTS_OF_SPEECH:
        return process_definition(value)
    return process_basic_text(value)

def process_subsection(s):
    return {key: process_value(key, value) for key, value in s.items()}

def process_subsections(ss):
    return [process_subsection(s) for s in ss]

In [46]:
lemma_soups = [v for k,v in soups.items() if k.startswith('/wiki/') and not '/Category:' in k]

random_soup = random.choice(lemma_soups)
print(get_title(random_soup))

հաջողություն


In [37]:
subsections = process_language_section(random_soup, 'Armenian')

In [40]:
process_language_section(random_soup, 'Armenian')

[{'Alternative forms': <div><ul><li><span class="Armn" lang="hy"><a class="new" href="/w/index.php?title=%D5%BF%D5%A5%D5%B2%D5%A5%D5%AF%D5%A1%D5%BF%D5%B8%D6%82%D5%A1%D5%AF%D5%A1%D5%B6&amp;action=edit&amp;redlink=1" title="տեղեկատուական (page does not exist)">տեղեկատուական</a></span> <span class="mention-gloss-paren annotation-paren">(</span><span class="tr Latn" lang="hy-Latn">teġekatuakan</span><span class="mention-gloss-paren annotation-paren">)</span> — <span class="ib-content"><a class="extiw" href="https://en.wikipedia.org/wiki/Classical_Armenian_orthography" title="w:Classical Armenian orthography">traditional orthography</a></span></li></ul></div>,
  'Etymology': <div><p><i class="Armn mention" lang="hy"><a href="/wiki/%D5%BF%D5%A5%D5%B2%D5%A5%D5%AF%D5%A1%D5%BF%D5%B8%D6%82#Armenian" title="տեղեկատու">տեղեկատու</a></i> <span class="mention-gloss-paren annotation-paren">(</span><span class="mention-tr tr Latn" lang="hy-Latn">teġekatu</span><span class="mention-gloss-paren annotati

In [42]:
process_subsections(subsections)

[{'Alternative forms': 'տեղեկատուական (teġekatuakan) — traditional orthography',
  'Etymology': 'տեղեկատու (teġekatu) +\u200e -ական (-akan)',
  'Pronunciation': '(Eastern Armenian) IPA(key): /teʁekɑtvɑˈkɑn/, [teʁekɑtvɑkɑ́n]\n(Western Armenian) IPA(key): /deʁeɡɑdvɑˈɡɑn/, [deʁeɡɑdvɑɡɑ́n]\nHyphenation: տե‧ղե‧կատ‧վա‧կան',
  'Adjective': 'տեղեկատվական • (teġekatvakan)\ninformation; informational; data\nտեղեկատվական պատերազմ ― teġekatvakan paterazm ― information warfare, media war\nտեղեկատվական հիմնապաշար ― teġekatvakan himnapašar ― information database',
  'Declension': 'nominalized, *i*-type (Eastern Armenian)\n\n\n\n|  | singular | plural |\n| --- | --- | --- |\n| nominative | տեղեկատվական (teġekatvakan) | տեղեկատվականներ (teġekatvakanner) |\n| dative | տեղեկատվականի (teġekatvakani) | տեղեկատվականների (teġekatvakanneri) |\n| ablative | տեղեկատվականից (teġekatvakanicʻ) | տեղեկատվականներից (teġekatvakannericʻ) |\n| instrumental | տեղեկատվականով (teġekatvakanov) | տեղեկատվականներով (teġekatv

In [49]:
raw_results = pd.DataFrame([subsection | {"title": get_title(s)} for s in tqdm(lemma_soups[:100]) for subsection in process_language_section(s, 'Armenian')])

100%|██████████| 100/100 [00:01<00:00, 90.87it/s]


In [51]:
# get all potential definition fields (fields that contain <ol>)
elements_by_field = (pd.DataFrame({col: {elem: raw_results[col].apply(lambda x:find_element(x,elem)).count() for elem in ['ol', 'table', 'ul']} for col in raw_results.columns}).T
 #[lambda x:x>0][lambda x:x<90].sort_values(ascending=False).index.to_list()
)
elements_by_field

Unnamed: 0,ol,table,ul
Pronunciation,0,85,100
Etymology 1,0,0,0
Letter,3,0,0
title,0,0,0
See also,0,2,10
Etymology 2,0,0,0
Verb,10,0,0
Etymology 3,0,0,0
Interjection,5,0,0
References,0,0,18


In [52]:
elements_by_field[elements_by_field.ul>0].sort_values('ul', ascending=False)

Unnamed: 0,ol,table,ul
Pronunciation,0,85,100
Alternative forms,0,0,24
Derived terms,0,0,22
References,0,0,18
Related terms,0,0,12
See also,0,2,10
Synonyms,0,0,8
Further reading,0,0,3
Descendants,0,0,2
Antonyms,0,0,2


In [53]:
elements_by_field[elements_by_field.table>0].sort_values('table', ascending=False)

Unnamed: 0,ol,table,ul
Pronunciation,0,85,100
Declension,0,74,0
Inflection,0,8,0
See also,0,2,10
Conjugation,0,1,0


In [56]:
columns_by_freq = raw_results.count().sort_values(ascending=False)
results = raw_results[columns_by_freq.index] #[lambda x:x['Verb'].notna()]
columns_by_freq.to_frame().style.bar()

Unnamed: 0,0
title,109
Pronunciation,100
Etymology,90
Declension,74
Noun,44
Adjective,36
Derived terms,26
Alternative forms,24
References,18
Adverb,13


In [67]:
results = raw_results.copy()
for c in tqdm(results.columns):
    results[c] = results[c].apply(lambda x: process_value(c, x))

100%|██████████| 30/30 [00:02<00:00, 12.82it/s]


In [70]:
results[results.count().sort_values().index[::-1]]

Unnamed: 0,title,Pronunciation,Etymology,Declension,Noun,Adjective,Derived terms,Alternative forms,References,Adverb,...,Pronoun,Further reading,Determiner,Letter,Descendants,Particle,Antonyms,Etymology 3,Conjugation,Conjunction
0,ա,"(Eastern Armenian) IPA(key): /ɑ/, [ɑ]\n(Western Armenian) IPA(key): /ɑ/, [ɑ]",,,,,,,,,...,,,,"[{'definition': 'The first letter of the Armenian alphabet, called այբ (ayb). Represents open back unrounded vowel: [ɑ]. Transliterated as a.', 'detail': 'Ա-ն Բ-ից ջոկել ― A-n B-icʻ ǰokel ― to know one's ABCs; to be literate Ա և Ք ― A ew Kʻ ― fro...",,,,,,
1,ա,,,,,,,,,,...,,,,,,,,,,
2,ա,,,,,,,,,,...,,,,,,,,,,
3,ա,,,,,,,,[{'definition': 'ERROR!!!!'}],,...,,,,,,,,,,
4,ա̈,IPA(key): [æ],Letter ա (a) with an umlaut above.,,,,,,[{'definition': 'ERROR!!!!'}],,...,,,,"[{'definition': 'A letter used in Armenian dialectology. Represents the near-open front vowel: [æ]. Transliterated as ä. Present in the Syunik, Artsakh, Hadrut, Agulis, Akhaltsikhe, and Van regional dialects. Formerly written as ՠ.', 'detail': 'ա...",,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,անհրաժեշտություն,"(Eastern Armenian) IPA(key): /ɑnhəɾɑʒeʃtuˈtʰjun/, [ɑnhəɾɑʒeʃtut͡sʰjún]\n(Eastern Armenian, colloquial) IPA(key): /ɑnɾɑʒeʃtuˈtʰjun/, [ɑnɾɑʒeʃtut͡sʰjún]\n(Eastern Armenian, colloquial) IPA(key): /ɑnɾɑʒeʃtuˈtʰjun/, [ɑnɾɑʒeʃtut͡sʰjún]\n(Western Armen...",անհրաժեշտ (anhražešt) +‎ -ություն (-utʻyun),"*n*-type, inanimate (Eastern Armenian)\n\n\n\n| | singular | plural |\n| --- | --- | --- |\n| nominative | անհրաժեշտություն (anhražeštutʻyun) | անհրաժեշտություններ (anhražeštutʻyunner) |\n| dative | անհրաժեշտության (anhražeštutʻyan) | անհրաժեշտո...","[{'definition': 'necessity, need', 'detail': 'անհրաժեշտության դեպքում ― anhražeštutʻyan depkʻum ― in case of need կենսական անհրաժեշտություն ― kensakan anhražeštutʻyun ― vital necessity առաջին անհրաժեշտության առարկաներ ― aṙaǰin anhražeštutʻyan aṙa...",,,,,,...,,,,,,,,,,
105,անհույս,"(Eastern Armenian) IPA(key): /ɑnˈhujs/, [ɑnhújs]\n(Western Armenian) IPA(key): /ɑnˈhujs/, [ɑnhújs]",From Old Armenian անյոյս (anyoys); see it for more.,"nominalized, *i*-type (Eastern Armenian)\n\n\n\n| | singular | plural |\n| --- | --- | --- |\n| nominative | անհույս (anhuys) | անհույսներ (anhuysner) |\n| dative | անհույսի (anhuysi) | անհույսների (anhuysneri) |\n| ablative | անհույսից (anhuysi...",,"անհույս • (anhuys) (superlative ամենաանհույս)\nhopeless, despairing, desperate\nանհույս իրավիճակ ― anhuys iravičak ― desperate situation",,անյոյս (anyoys),,"[{'definition': 'hopelessly, despairingly, desperately, beyond hope'}]",...,,,,,,,,,,
106,անհուն,"(Eastern Armenian) IPA(key): /ɑnˈhun/, [ɑnhún]\n(Western Armenian) IPA(key): /ɑnˈhun/, [ɑnhún]\nHyphenation: ան‧հուն",From Old Armenian անհուն (anhun).,"nominalized, *i*-type (Eastern Armenian)\n\n\n\n| | singular | plural |\n| --- | --- | --- |\n| nominative | անհուն (anhun) | անհուններ (anhunner) |\n| dative | անհունի (anhuni) | անհունների (anhunneri) |\n| ablative | անհունից (anhunicʻ) | անհո...",,"անհուն • (anhun) (superlative ամենաանհուն)\nbottomless\nանհուն ծով ― anhun cov ― bottomless sea\n(figuratively) infinite, limitless\nանհուն սեր ― anhun ser ― limitless love",,,,,...,,,,,,,,,,
107,անձ,"(Eastern Armenian) IPA(key): /ɑnd͡z/, [ɑnd͡z]\n(Western Armenian) IPA(key): /ɑnt͡s/, [ɑnt͡sʰ]",From Old Armenian անձն (anjn).,"*i*-type, animate (Eastern Armenian)\n\n\n\n| | singular | plural |\n| --- | --- | --- |\n| nominative | անձ (anj) | անձեր (anjer) |\n| dative | անձի (anji) | անձերի (anjeri) |\n| ablative | անձից (anjicʻ) | անձերից (anjericʻ) |\n| instrumental ...","[{'definition': 'person', 'detail': 'գործող անձ ― gorcoġ anj ― character, personage պաշտոնական անձ ― paštonakan anj ― an official մասնավոր անձ ― masnavor anj ― private person'}, {'definition': '(law) entity', 'detail': 'իրավաբանական անձ ― iravaba...",,,,,,...,,,,,,,,,,


# Scratchpad

In [60]:
lemma_soups = [(k,v) for k,v in soups.items() if k.startswith('/wiki/') and not '/Category:' in k]

In [131]:
lemma_soups = [(k,v) for k,v in soups.items() if k.startswith('/wiki/') and not '/Category:' in k]

k, random_soup = random.choice(lemma_soups)
print(k)
[e['title'] for e in random_soup.select_one('div#catlinks').select('li a')]

/wiki/%D5%BE%D5%A1%D6%80%D5%AA


['Category:Armenian terms derived from Old Armenian',
 'Category:Armenian terms with IPA pronunciation',
 'Category:Armenian lemmas',
 'Category:Armenian adjectives',
 'Category:Armenian terms with usage examples',
 'Category:Armenian adverbs',
 'Category:Old Armenian terms borrowed from Parthian',
 'Category:Old Armenian terms derived from Parthian',
 'Category:Old Armenian doublets',
 'Category:Old Armenian lemmas',
 'Category:Old Armenian nouns',
 'Category:Old Armenian terms with usage examples',
 'Category:Old Armenian adjectives',
 'Category:Armenian undefined derivations',
 'Category:Armenian links with redundant wikilinks',
 'Category:Parthian terms in nonstandard scripts',
 'Category:Old Armenian links with redundant wikilinks']

In [132]:
k, random_soup = random.choice(lemma_soups)
print(k)


/wiki/%D5%95%D5%B0%D5%A1%D5%B6%D5%B5%D5%A1%D5%B6


In [134]:
random_soup.find('h1')

<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="Armn">Օհանյան</span></h1>

In [89]:
categories = pd.Series([e['title'] for _,soup in lemma_soups for e in soup.select_one('div#catlinks').select('li a')]).value_counts()

In [120]:
pd.Series([re.match(r'([A-Z][a-z]+ )+', c[9:]).group().strip() for c in categories.index.to_list() if re.match(r'([A-Z][a-z]+ )+', c[9:])]).value_counts()[lambda x:x>2]

Armenian           227
Old Armenian       186
Requests            34
Middle Armenian     33
Pages               16
Macedonian           4
Translingual         4
Parthian             4
Entries              3
Ottoman Turkish      3
Name: count, dtype: int64