In [234]:
soups = {}

In [263]:
from bs4 import BeautifulSoup
import bs4
import copy
from markdownify import markdownify as md
import pandas as pd
import pickle
import random
import re
import requests
from tqdm import tqdm
from urllib.parse import urlparse, parse_qs, parse_qsl, quote

pd.set_option('display.max_colwidth', 250)

URL_ROOT = 'https://en.wiktionary.org'

def get_soup(path):
    if path in soups:
        return soups[path]
    
    res = requests.get(f"{URL_ROOT}{path}")
    soup = BeautifulSoup(res.text)
    soups[path] = soup
    return soup

# e.g. get_category_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_category_pages(start_path):
    paths = [start_path]    
    next_path = start_path
    soup = get_soup(next_path)
    next_link = soup.find('a', string='next page')
    while next_link:
        next_path = next_link['href']
        paths.append(next_path)
        soup = get_soup(next_path)
        next_link = soup.find('a', string='next page')
    return paths

# e.g. get_lemma_pages("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples")
def get_lemma_pages(path):
    soup = get_soup(path)
    return [{'href': a['href'], 'title': a['title']} for a in soup.css.select('div#mw-pages li a')]

In [264]:
pickled_soups = pickle.load(open('soups.pickle', 'rb'))
soups |= pickled_soups
len(soups)

1804

In [139]:
[k for k in soups.keys() if k.startswith('/wiki/Category:')]

['/wiki/Category:Armenian_terms_with_usage_examples',
 '/wiki/Category:Old_Armenian_terms_with_usage_examples']

In [237]:
len(soups)

9

In [244]:
#pickle.dump(soups, open('soups.pickle', 'wb'))

In [235]:
# Category:Armenian_terms_with_usage_examples
# Category:Old_Armenian_terms_with_usage_examples

def get_all_urls_in_category(category_url):
    return [lp['href'] for cp in get_category_pages(category_url) for lp in get_lemma_pages(cp)]

get_all_urls_in_category('/wiki/Category:Armenian_terms_with_usage_examples')

['/wiki/%D5%A1',
 '/wiki/%D5%A1%CC%88',
 '/wiki/%D5%A1%CC%8A',
 '/wiki/%D5%A1%D5%A2%D5%A5%D5%A9%D5%A1%D5%BD%D5%B8%D6%82%D5%B6%D5%AF',
 '/wiki/%D5%A1%D5%A3%D5%BC%D5%A1%D5%BE',
 '/wiki/%D5%A1%D5%A6%D5%A1%D5%BF',
 '/wiki/%D5%A1%D5%A6%D5%A1%D5%BF%D5%A1%D5%B4%D5%A1%D6%80%D5%BF',
 '/wiki/%D5%A1%D5%A6%D5%A3',
 '/wiki/%D5%A1%D5%A6%D5%A3%D5%A1%D5%B4%D5%AB%D5%BB%D5%B5%D5%A1%D5%B6',
 '/wiki/%D5%A1%D5%A6%D5%A3%D5%B8%D6%82%D5%A9%D5%B5%D5%B8%D6%82%D5%B6',
 '/wiki/%D5%A1%D5%A6%D5%A4%D5%A1%D5%AF',
 '/wiki/%D5%A1%D5%A6%D5%A4%D5%A5%D6%81%D5%B8%D6%82%D5%A9%D5%B5%D5%B8%D6%82%D5%B6',
 '/wiki/%D5%A1%D5%A9%D5%AC%D5%A5%D5%BF%D5%AB%D5%AF%D5%A1',
 '/wiki/%D5%A1%D5%AC%D5%A1%D5%B4',
 '/wiki/%D5%A1%D5%AC%D5%A1%D6%80%D5%A5%D5%AC',
 '/wiki/%D5%A1%D5%AC%D5%A5%D5%B0%D5%A5%D6%80%D5%B1',
 '/wiki/%D5%A1%D5%AC%D5%AB%D6%84',
 '/wiki/%D5%A1%D5%AC%D5%BA%D5%A1%D5%AF%D5%A1',
 '/wiki/%D5%A1%D5%AD',
 '/wiki/%D5%A1%D5%AD%D5%B8%D6%80%D5%AA%D5%A1%D5%AF',
 '/wiki/%D5%A1%D5%AD%D6%80',
 '/wiki/%D5%A1%D5%AE%D5%A5%D5%AC',
 '/wiki/-%D5%A

In [239]:
for url in tqdm(get_all_urls_in_category('/wiki/Category:Armenian_terms_with_usage_examples')):
    get_soup(url)

100%|██████████| 1795/1795 [00:25<00:00, 71.21it/s]  


In [265]:
def get_categories(s):
    return [a['title'].replace('Category:','').strip() for a in s.select_one('div#catlinks').select('li a')]

def get_title(s):
    return s.find('h1').text

def get_language_sections(soup):
    headings = soup.select('div#bodyContent div.mw-heading2')
    
    heading_map = {}

    for heading in headings:
        language = heading.find('h2').text
        content = []
        for sib in heading.next_siblings:
            if sib == '\n':
                continue
            if type(sib) == bs4.element.Comment:
                continue
            if sib in headings:
                break
            content.append(sib)
        heading_map[language] = content
    return heading_map

def get_heading_level(classes):
    levels = [c for c in classes if c != 'mw-heading']
    if len(levels) > 0:
        return levels[0]

def get_subsections(section):
    results = []
    result = {}
    current_header = None
    current_subsection = bs4.Tag(name='div')
    top_heading_level = None
    for tag in section:
        classes = tag.attrs.get('class', [])
        if tag.name == 'div' and 'mw-heading' in classes:
            heading_level = get_heading_level(classes)
            if not top_heading_level:
                top_heading_level = heading_level
            elif heading_level == top_heading_level and result:
                # the first time you encounter the top-level heading again...
                if len(result) > 1: # this means there were other headings in between
                    results.append(result)
                    result = {}
                else: # consecutive top-level headings means flat hierarchy
                    top_heading_level = '_' # non-existent heading
            if current_header:
                result[current_header] = current_subsection
            current_header = tag.text.replace('[edit]', '')
            current_subsection = bs4.Tag(name='div')
        else:
            # if not heading, add to current heading's subsection 
            current_subsection.append(copy.copy(tag))
    if current_header:
        result[current_header] = current_subsection
    results.append(result)
    return results

def find_element(tag, element):
    if type(tag) != bs4.Tag:
        return
    return tag.find(element)

def process_table(tag):
    if type(tag) != bs4.Tag:
        return tag
    return md(str(tag), strip=['a']).strip()

def process_pronunciation(tag):
    if type(tag) != bs4.Tag:
        return tag
    return '\n'.join([li.text for li in tag.select('li') if not 'Audio' in li.text and not li.text == ''])

def process_basic_text(tag):
    if type(tag) != bs4.Tag:
        return tag
    return tag.text.strip()

def process_definition(tag):
    if type(tag) != bs4.Tag:
        return tag
    definitions = []
    if not tag.find('ol'):
        return [{'definition': 'ERROR!!!!'}]
    for li in tag.find('ol').find_all('li'):
        if li.find('dl'):
            detail_text = li.find('dl').text
            definitions.append({'definition': li.text.replace(detail_text, '').strip(), 'detail': li.find('dl').text})
        else:
            definitions.append({'definition': li.text})
    return definitions

entries = {}

In [213]:
def process_language_section(s, language):
    language_sections = get_language_sections(s)
    if not language in language_sections:
        return {}
    section = language_sections[language]

    subsections = get_subsections(section)

    return {'language': language, 'subsections': subsections} 

In [268]:
lemma_soups = [v for k,v in soups.items() if k.startswith('/wiki/') and not '/Category:' in k]

random_soup = random.choice(lemma_soups)
print(k)
print(get_title(random_soup))

/wiki/%D5%95%D5%B0%D5%A1%D5%B6%D5%B5%D5%A1%D5%B6
վհատություն


In [272]:
process_language_section(random_soup, 'Armenian')

.
.
.
.
.
.


{'language': 'Armenian',
 'subsections': [{'Etymology': <div><p>From <span class="etyl"><a class="extiw" href="https://en.wikipedia.org/wiki/Classical_Armenian" title="w:Classical Armenian">Old Armenian</a></span> <i class="Armn mention" lang="xcl"><a class="new" href="/w/index.php?title=%D5%BE%D5%B0%D5%A1%D5%BF%D5%B8%D6%82%D5%A9%D5%AB%D6%82%D5%B6&amp;action=edit&amp;redlink=1" title="վհատութիւն (page does not exist)">վհատութիւն</a></i> <span class="mention-gloss-paren annotation-paren">(</span><span class="mention-tr tr Latn" lang="xcl-Latn">vhatutʻiwn</span><span class="mention-gloss-paren annotation-paren">)</span>.
   </p></div>,
   'Pronunciation': <div><ul><li><span class="ib-brac qualifier-brac">(</span><span class="ib-content qualifier-content"><span class="usage-label-accent"><a class="extiw" href="https://en.wikipedia.org/wiki/Eastern_Armenian" title="w:Eastern Armenian">Eastern Armenian</a></span></span><span class="ib-brac qualifier-brac">)</span> <a href="/wiki/Wiktionary:

{}

In [26]:
raw_results = pd.DataFrame([dict(title=k, idx=i+1, **r) for k,v in entries.items() for i,r in enumerate(v)]).set_index(['title', 'idx'])

# get all potential definition fields (fields that contain <ol>)
elements_by_field = (pd.DataFrame({col: {elem: raw_results[col].apply(lambda x:find_element(x,elem)).count() for elem in ['ol', 'table', 'ul']} for col in results.columns}).T
 #[lambda x:x>0][lambda x:x<90].sort_values(ascending=False).index.to_list()
)
elements_by_field

Unnamed: 0,ol,table,ul
Pronunciation,0,776,1782
Etymology 1,0,0,0
Letter,7,0,0
See also,0,10,54
Etymology 2,0,0,0
Verb,225,0,6
Etymology 3,0,0,0
Interjection,21,0,1
References,52,0,210
Etymology,0,4,0


In [27]:
elements_by_field[elements_by_field.ul>0].sort_values('ul', ascending=False)

Unnamed: 0,ol,table,ul
Pronunciation,0,776,1782
Alternative forms,0,0,521
Derived terms,0,0,416
Related terms,0,0,226
References,52,0,210
Synonyms,0,0,179
Further reading,0,0,90
See also,0,10,54
Descendants,0,0,52
Antonyms,0,0,32


In [68]:
elements_by_field[elements_by_field.table>0].sort_values('table', ascending=False)

Unnamed: 0,ol,table,ul
Declension,0,1429,0
Pronunciation,0,776,1782
Inflection,0,150,0
Conjugation,0,42,0
See also,0,10,54
Etymology,0,4,0
Pronoun,35,1,1


In [28]:
columns_by_freq = results.count().sort_values(ascending=False)
results = results[columns_by_freq.index] #[lambda x:x['Verb'].notna()]
columns_by_freq.to_frame().style.bar()

Unnamed: 0,0
Pronunciation,1782
Etymology,1624
Declension,1430
Noun,1049
Alternative forms,521
Adjective,503
Derived terms,452
References,262
Related terms,226
Verb,225


In [29]:
results['Declension'] = results['Declension'].apply(process_table)
results['Inflection'] = results['Inflection'].apply(process_table)

In [31]:
results['Declension'][lambda x:x.notna()]

title       idx
աբեթասունկ  1      *i*-type, inanimate (Eastern Armenian)\n\n\n\n|  | singular | plural |\n| --- | --- | --- |\n| nominative | աբեթասունկ (abetʻasunk) | աբեթասնկեր (abetʻasnker) |\n| dative | աբեթասնկի (abetʻasnki) | աբեթասնկերի (abetʻasnkeri) |\n| ablative | աբեթա...
ագռավ       1      *i*-type, animate (Eastern Armenian)\n\n\n\n|  | singular | plural |\n| --- | --- | --- |\n| nominative | ագռավ (agṙav) | ագռավներ (agṙavner) |\n| dative | ագռավի (agṙavi) | ագռավների (agṙavneri) |\n| ablative | ագռավից (agṙavicʻ) | ագռավներից (a...
ազատ        1      *i*-type, animate (Eastern Armenian)\n\n\n\n|  | singular | plural |\n| --- | --- | --- |\n| nominative | ազատ (azat) | ազատներ (azatner) |\n| dative | ազատի (azati) | ազատների (azatneri) |\n| ablative | ազատից (azaticʻ) | ազատներից (azatnericʻ) ...
ազատամարտ   2      nominalized, *i*-type (Eastern Armenian)\n\n\n\n|  | singular | plural |\n| --- | --- | --- |\n| nominative | ազատամարտ (azatamart) | ազատամարտներ (azatamartn

In [32]:
results['Pronunciation'] = results['Pronunciation'].apply(process_pronunciation)

In [33]:
results['Etymology'] = results['Etymology'].apply(process_basic_text)
results['Etymology'] = results['Etymology 1'].apply(process_basic_text).where(results['Etymology 1'].notna(), results['Etymology'])
results['Etymology'] = results['Etymology 2'].apply(process_basic_text).where(results['Etymology 2'].notna(), results['Etymology'])

In [35]:
parts_of_speech = [c for c in ['Verb',
 'Adverb',
 'Particle',
 'Pronoun',
 'References',
 'Suffix',
 'Proper noun',
 'Postposition',
 'Interjection',
 'Preposition',
 'Determiner',
 'Punctuation mark',
 'Numeral',
 'Prefix',
 'Article', 
 'References',
 'Suffix',
 'Proper noun',
 'Postposition',
 'Pronoun',
 'Particle',
 'Conjunction',
 'Preposition',
 'Prefix',
 'Letter',
 'Determiner',
 'Punctuation mark',
 'Numeral'] if c in results.columns]

parts_of_speech

['Verb',
 'Adverb',
 'Particle',
 'Pronoun',
 'References',
 'Suffix',
 'Proper noun',
 'Postposition',
 'Interjection',
 'Preposition',
 'Determiner',
 'Punctuation mark',
 'Numeral',
 'Prefix',
 'Article',
 'References',
 'Suffix',
 'Proper noun',
 'Postposition',
 'Pronoun',
 'Particle',
 'Conjunction',
 'Preposition',
 'Prefix',
 'Letter',
 'Determiner',
 'Punctuation mark',
 'Numeral']

In [36]:
for col in (parts_of_speech and results.columns):
    results[col] = results[col].apply(process_definition)

In [44]:
import random


pos = 'Verb' #random.choice(parts_of_speech and results.columns)

results[results[pos].notna()][pos].sample(3).apply(pd.Series).stack().apply(pd.Series).fillna('')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,definition,detail
title,idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
համարձակվել,1,0,to dare do something,չհամարձակվես ― čʻhamarjakves ― don't you dare
խոսել,1,0,to speak,բարձր խոսել ― barjr xosel ― to speak loudly\nԵս հայերեն լավ չեմ խոսում։ ― Es hayeren lav čʻem xosum. ― I don't speak Armenian well.\nՆա անգլերեն շատ լավ է խոսում։ ― Na angleren šat lav ē xosum. ― He speaks English very well.
խոսել,1,1,to talk,Ես ուզում եմ քեզ հետ մի շատ կարևոր բանի մասին խոսել։ ― Es uzum em kʻez het mi šat karewor bani masin xosel. ― I want to talk to you about something very important.
բնադրել,1,0,(of birds) to nest,Այս թռչունը բնադրում է առավելապես լեռնային շրջաններում: ― Ays tʻṙčʻunə bnadrum ē aṙavelapes leṙnayin šrǰannerum: ― This bird nests chiefly in mountainous regions.


# Scratchpad

In [60]:
lemma_soups = [(k,v) for k,v in soups.items() if k.startswith('/wiki/') and not '/Category:' in k]

In [131]:
lemma_soups = [(k,v) for k,v in soups.items() if k.startswith('/wiki/') and not '/Category:' in k]

k, random_soup = random.choice(lemma_soups)
print(k)
[e['title'] for e in random_soup.select_one('div#catlinks').select('li a')]

/wiki/%D5%BE%D5%A1%D6%80%D5%AA


['Category:Armenian terms derived from Old Armenian',
 'Category:Armenian terms with IPA pronunciation',
 'Category:Armenian lemmas',
 'Category:Armenian adjectives',
 'Category:Armenian terms with usage examples',
 'Category:Armenian adverbs',
 'Category:Old Armenian terms borrowed from Parthian',
 'Category:Old Armenian terms derived from Parthian',
 'Category:Old Armenian doublets',
 'Category:Old Armenian lemmas',
 'Category:Old Armenian nouns',
 'Category:Old Armenian terms with usage examples',
 'Category:Old Armenian adjectives',
 'Category:Armenian undefined derivations',
 'Category:Armenian links with redundant wikilinks',
 'Category:Parthian terms in nonstandard scripts',
 'Category:Old Armenian links with redundant wikilinks']

In [132]:
k, random_soup = random.choice(lemma_soups)
print(k)


/wiki/%D5%95%D5%B0%D5%A1%D5%B6%D5%B5%D5%A1%D5%B6


In [134]:
random_soup.find('h1')

<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="Armn">Օհանյան</span></h1>

In [89]:
categories = pd.Series([e['title'] for _,soup in lemma_soups for e in soup.select_one('div#catlinks').select('li a')]).value_counts()

In [120]:
pd.Series([re.match(r'([A-Z][a-z]+ )+', c[9:]).group().strip() for c in categories.index.to_list() if re.match(r'([A-Z][a-z]+ )+', c[9:])]).value_counts()[lambda x:x>2]

Armenian           227
Old Armenian       186
Requests            34
Middle Armenian     33
Pages               16
Macedonian           4
Translingual         4
Parthian             4
Entries              3
Ottoman Turkish      3
Name: count, dtype: int64