In [18]:
import random
import re
from urllib.parse import urlparse, parse_qs, parse_qsl, quote

import pandas as pd
import requests

from tqdm import tqdm

from bs4 import BeautifulSoup
import bs4

import pickle

from markdownify import markdownify as md

pd.set_option('display.max_colwidth', 250)

URL_ROOT = 'https://en.wiktionary.org'

soup = {}
def get_soup(path):
    if path in soups:
        return soups[path]
    
    res = requests.get(f"{URL_ROOT}{path}")
    soup = BeautifulSoup(res.text)
    soups[path] = soup
    return soup

# e.g. get_category_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_category_pages(start_path):
    paths = [start_path]    
    next_path = start_path
    soup = get_soup(next_path)
    next_link = soup.find('a', string='next page')
    while next_link:
        next_path = next_link['href']
        paths.append(next_path)
        soup = get_soup(next_path)
        next_link = soup.find('a', string='next page')
    return paths

# e.g. get_lemma_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_lemma_pages(path):
    soup = get_soup(path)
    return [{'href': a['href'], 'title': a['title']} for a in soup.css.select('div#mw-pages li a')]

In [19]:
pickled_soup = pickle.load(open('soups.pickle', 'rb'))
soup |= pickled_soup
len(soup)

In [21]:
#pickle.dump(soup, open('soups.pickle', 'wb'))

In [10]:
# Category:Armenian_terms_with_usage_examples
# Category:Old_Armenian_terms_with_usage_examples

lps = ([lp for cp in get_category_pages('/wiki/Category:Armenian_terms_with_usage_examples')
    for lp in get_lemma_pages(cp)])
lemmas = pd.DataFrame(lps)
lemmas

Unnamed: 0,href,title
0,/wiki/%D5%A1,ա
1,/wiki/%D5%A1%CC%88,ա̈
2,/wiki/%D5%A1%CC%8A,ա̊
3,/wiki/%D5%A1%D5%A2%D5%A5%D5%A9%D5%A1%D5%BD%D5%B8%D6%82%D5%B6%D5%AF,աբեթասունկ
4,/wiki/%D5%A1%D5%A3%D5%BC%D5%A1%D5%BE,ագռավ
...,...,...
1790,/wiki/%D5%96%D6%80%D5%B8%D6%82%D5%B6%D5%A6%D5%A5,Ֆրունզե
1791,/wiki/%D5%9B,՛
1792,/wiki/%D5%9E,՞
1793,/wiki/%D6%89,։


In [11]:
def getLanguageSections(soup):
    headings = soup.select('div#bodyContent div.mw-heading2')
    
    heading_map = {}

    for heading in headings:
        language = heading.find('h2').text
        content = []
        for sib in heading.next_siblings:
            if sib == '\n':
                continue
            if type(sib) == bs4.element.Comment:
                continue
            if sib in headings:
                break
            content.append(sib)
        heading_map[language] = content
    return heading_map

def getHeadingLevel(cs):
    levels = [c for c in cs if c != 'mw-heading']
    if len(levels) > 0:
        return levels[0]

def getSubSections(section):
    results = []
    result = {}
    current_header = None
    current_subsection = bs4.Tag(name='div')
    top_heading_level = None
    for tag in section:
        classes = tag.attrs.get('class', [])
        if tag.name == 'div' and 'mw-heading' in classes:
            heading_level = getHeadingLevel(classes)
            if not top_heading_level:
                top_heading_level = heading_level
            elif heading_level == top_heading_level and result:
                # the first time you encounter the top-level heading again...
                if len(result) > 1: # this means there were other headings in between
                    results.append(result)
                    result = {}
                else: # consecutive top-level headings means flat hierarchy
                    top_heading_level = '_' # non-existent heading
            if current_header:
                result[current_header] = current_subsection
            current_header = tag.text.replace('[edit]', '')
            current_subsection = bs4.Tag(name='div')
        else:
            # if not heading, add to current heading's subsection 
            current_subsection.append(tag)
    if current_header:
        result[current_header] = current_subsection
    results.append(result)
    return results

def find_element(tag, element):
    if type(tag) != bs4.Tag:
        return
    return tag.find(element)

def process_table(tag):
    if type(tag) != bs4.Tag:
        return tag
    return md(str(tag), strip=['a']).strip()

def process_pronunciation(tag):
    if type(tag) != bs4.Tag:
        return tag
    return '\n'.join([li.text for li in tag.select('li') if not 'Audio' in li.text and not li.text == ''])

def process_basic_text(tag):
    if type(tag) != bs4.Tag:
        return tag
    return tag.text.strip()

def process_definition(tag):
    if type(tag) != bs4.Tag:
        return tag
    definitions = []
    if not tag.find('ol'):
        return [{'definition': 'ERROR!!!!'}]
    for li in tag.find('ol').find_all('li'):
        if li.find('dl'):
            detail_text = li.find('dl').text
            definitions.append({'definition': li.text.replace(detail_text, '').strip(), 'detail': li.find('dl').text})
        else:
            definitions.append({'definition': li.text})
    return definitions

entries = {}

In [15]:
for title, href in tqdm([row[['title','href']].to_list() for _, row in lemmas.iterrows()]):
#for title, href in tqdm([row[['title','href']].to_list() for _, row in lemmas[lemmas.title.isin(['աման', 'փիս', '-անք', 'համ'])].iterrows()]):
    if title in entries:
        continue
    sections = getLanguageSections(get_soup(href))
    if 'Armenian' not in sections:
        continue
    entries[title] = getSubSections(sections['Armenian'])

100%|██████████| 1795/1795 [29:56<00:00,  1.00s/it]  


In [62]:
results = pd.DataFrame([dict(title=k, idx=i+1, **r) for k,v in entries.items() for i,r in enumerate(v)]).set_index(['title', 'idx'])

In [70]:
raw_results = pd.DataFrame([dict(title=k, idx=i+1, **r) for k,v in entries.items() for i,r in enumerate(v)]).set_index(['title', 'idx'])

# get all potential definition fields (fields that contain <ol>)
elements_by_field = (pd.DataFrame({col: {elem: raw_results[col].apply(lambda x:find_element(x,elem)).count() for elem in ['ol', 'table', 'ul']} for col in results.columns}).T
 #[lambda x:x>0][lambda x:x<90].sort_values(ascending=False).index.to_list()
)
elements_by_field

Unnamed: 0,ol,table,ul
Pronunciation,0,776,1782
Etymology 1,0,0,0
Letter,7,0,0
See also,0,10,54
Etymology 2,0,0,0
Verb,225,0,6
Etymology 3,0,0,0
Interjection,21,0,1
References,52,0,210
Etymology,0,4,0


In [67]:
elements_by_field[elements_by_field.ul>0].sort_values('ul', ascending=False)

Unnamed: 0,ol,table,ul
Pronunciation,0,776,1782
Alternative forms,0,0,521
Derived terms,0,0,415
Related terms,0,0,226
References,52,0,210
Synonyms,0,0,179
Further reading,0,0,90
See also,0,10,54
Descendants,0,0,52
Antonyms,0,0,32


In [68]:
elements_by_field[elements_by_field.table>0].sort_values('table', ascending=False)

Unnamed: 0,ol,table,ul
Declension,0,1429,0
Pronunciation,0,776,1782
Inflection,0,150,0
Conjugation,0,42,0
See also,0,10,54
Etymology,0,4,0
Pronoun,35,1,1


In [71]:
columns_by_freq = results.count().sort_values(ascending=False)
results = results[columns_by_freq.index] #[lambda x:x['Verb'].notna()]
columns_by_freq.to_frame().style.bar()

Unnamed: 0,0
Pronunciation,1782
Etymology,1624
Declension,1430
Noun,1049
Alternative forms,521
Adjective,503
Derived terms,451
References,262
Related terms,226
Verb,225


In [55]:
results['Declension'] = results['Declension'].apply(process_table)
results['Inflection'] = results['Inflection'].apply(process_table)

In [56]:
results['Declension'][lambda x:x.notna()]

title       idx
աբեթասունկ  1      *i*-type, inanimate (Eastern Armenian)\n\n\n\n|  | singular | plural |\n| --- | --- | --- |\n| nominative | աբեթասունկ (abetʻasunk) | աբեթասնկեր (abetʻasnker) |\n| dative | աբեթասնկի (abetʻasnki) | աբեթասնկերի (abetʻasnkeri) |\n| ablative | աբեթա...
ագռավ       1      *i*-type, animate (Eastern Armenian)\n\n\n\n|  | singular | plural |\n| --- | --- | --- |\n| nominative | ագռավ (agṙav) | ագռավներ (agṙavner) |\n| dative | ագռավի (agṙavi) | ագռավների (agṙavneri) |\n| ablative | ագռավից (agṙavicʻ) | ագռավներից (a...
ազատ        1      *i*-type, animate (Eastern Armenian)\n\n\n\n|  | singular | plural |\n| --- | --- | --- |\n| nominative | ազատ (azat) | ազատներ (azatner) |\n| dative | ազատի (azati) | ազատների (azatneri) |\n| ablative | ազատից (azaticʻ) | ազատներից (azatnericʻ) ...
ազատամարտ   2      nominalized, *i*-type (Eastern Armenian)\n\n\n\n|  | singular | plural |\n| --- | --- | --- |\n| nominative | ազատամարտ (azatamart) | ազատամարտներ (azatamartn

In [57]:
results['Pronunciation'] = results['Pronunciation'].apply(process_pronunciation)

In [58]:
results['Etymology'] = results['Etymology'].apply(process_basic_text)
results['Etymology'] = results['Etymology 1'].apply(process_basic_text).where(results['Etymology 1'].notna(), results['Etymology'])
results['Etymology'] = results['Etymology 2'].apply(process_basic_text).where(results['Etymology 2'].notna(), results['Etymology'])

In [59]:
parts_of_speech = [c for c in ['Verb',
 'Adverb',
 'Particle',
 'Pronoun',
 'References',
 'Suffix',
 'Proper noun',
 'Postposition',
 'Interjection',
 'Preposition',
 'Determiner',
 'Punctuation mark',
 'Numeral',
 'Prefix',
 'Article', 
 'References',
 'Suffix',
 'Proper noun',
 'Postposition',
 'Pronoun',
 'Particle',
 'Conjunction',
 'Preposition',
 'Prefix',
 'Letter',
 'Determiner',
 'Punctuation mark',
 'Numeral'] if c in results.columns]

parts_of_speech

['Verb',
 'Adverb',
 'Particle',
 'Pronoun',
 'References',
 'Suffix',
 'Proper noun',
 'Postposition',
 'Interjection',
 'Preposition',
 'Determiner',
 'Punctuation mark',
 'Numeral',
 'Prefix',
 'Article',
 'References',
 'Suffix',
 'Proper noun',
 'Postposition',
 'Pronoun',
 'Particle',
 'Conjunction',
 'Preposition',
 'Prefix',
 'Letter',
 'Determiner',
 'Punctuation mark',
 'Numeral']

In [60]:
for col in (parts_of_speech and results.columns):
    results[col] = results[col].apply(process_definition)

In [61]:
import random


pos = 'Verb' #random.choice(parts_of_speech and results.columns)

results[results[pos].notna()][pos].sample(3).apply(pd.Series).stack().apply(pd.Series).fillna('')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,definition,detail
title,idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
պրծնել,1,0,"(intransitive) to be relieved, freed, to be released from or of a difficulty, unwanted task, responsibility etc.",Synonym: ազատվել (azatvel)\nԵ՞րբ ենք պրծնելու այս փորձանքից։ ― E?rb enkʻ prcnelu ays pʻorjankʻicʻ. ― When will we be freed from this misfortune?\nլավ պրծանք (colloquial) ― lav prcankʻ ― we got off easy; it's a good thing that
պրծնել,1,1,"(intransitive) to finish, to end, to be over","Synonyms: ավարտվել (avartvel), վերջանալ (verǰanal)\nԿինոն պրծավ։ ― Kinon prcav. ― The movie ended."
պրծնել,1,2,"(transitive) to finish, to end; to complete (work, lessons, etc.)","Synonyms: ավարտել (avartel), վերջացնել (verǰacʻnel)"
պրծնել,1,3,"(slang, vulgar) to come, to cum, to achieve orgasm",
գեղարվեստականացնել,1,0,"causative of գեղարվեստականանալ (geġarvestakananal)\n(transitive) to fictionalize, make artistic, spice up",Ո՞վ չի ցանկանում մի փոքր գեղարվեստականացնել իր սեռական կյանքը։ ― O?v čʻi cʻankanum mi pʻokʻr geġarvestakanacʻnel ir seṙakan kyankʻə. ― Who doesn't want to spice up their sex life a little?
գեղարվեստականացնել,1,1,"(transitive) to fictionalize, make artistic, spice up",Ո՞վ չի ցանկանում մի փոքր գեղարվեստականացնել իր սեռական կյանքը։ ― O?v čʻi cʻankanum mi pʻokʻr geġarvestakanacʻnel ir seṙakan kyankʻə. ― Who doesn't want to spice up their sex life a little?
ջարդել,1,0,to break into pieces,
ջարդել,1,1,to cut up into pieces,
ջարդել,1,2,"to cut, to destroy by cutting (i.e. a forest)",
ջարդել,1,3,to defeat severely,
