In [98]:
import random
import re
from urllib.parse import urlparse, parse_qs, parse_qsl, quote

import pandas as pd
import requests

from bs4 import BeautifulSoup

# Scheme and host that get_soup prepends to every site-relative path it fetches.
URL_ROOT = 'https://en.wiktionary.org'

def get_soup(path, parser='html.parser', timeout=30):
    """Fetch a page below URL_ROOT and parse it into a BeautifulSoup tree.

    Args:
        path: Site-relative path (including any query string) that is
            appended verbatim to URL_ROOT.
        parser: Parser name handed to BeautifulSoup. Naming it explicitly
            keeps the parse tree independent of which optional parsers are
            installed and avoids bs4's "no parser was explicitly specified"
            warning.
        timeout: Seconds to wait on the server before giving up; without a
            timeout a stalled connection blocks the kernel indefinitely.

    Returns:
        The parsed document as a BeautifulSoup object.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
    """
    res = requests.get(f"{URL_ROOT}{path}", timeout=timeout)
    # Fail loudly instead of parsing (and later caching) an error page.
    res.raise_for_status()
    return BeautifulSoup(res.text, parser)

def get_category_pages(start_url):
    """Follow a category's "next page" links and collect their pagefrom keys.

    Starting at ``start_url``, each fetched page is searched for an ``<a>``
    whose text is exactly 'next page'. The ``pagefrom`` query parameter of
    that link is decoded, re-escaped with ``quote`` and recorded, then the
    link itself is fetched. The walk ends when a page has no such link.

    Args:
        start_url: Site-relative path of the first category page.

    Returns:
        List of percent-encoded ``pagefrom`` values, one per "next page" link
        followed, in traversal order. The starting page contributes no entry.
        If a link lacks ``pagefrom``, an error line is printed and the values
        gathered so far are returned.
    """
    collected = []
    link = get_soup(start_url).find('a', string='next page')
    while link:
        href = link['href']
        params = parse_qs(urlparse(href).query)
        if 'pagefrom' not in params:
            print(f'Error! URL is {start_url}')
            break
        collected.append(quote(params['pagefrom'][0]))
        # Advance: load the linked page and look for its own "next page" link.
        link = get_soup(href).find('a', string='next page')
    return collected

def get_lemma_pages(soup):
    """List the entries linked from a category page's ``div#mw-pages`` block.

    Args:
        soup: Parsed category page exposing ``soup.css.select``.

    Returns:
        One ``{'href': ..., 'title': ...}`` dict per ``<a>`` found inside a
        list item of ``div#mw-pages``, in document order. A link missing
        either attribute raises the lookup error from the element itself.
    """
    entries = []
    for anchor in soup.css.select('div#mw-pages li a'):
        entries.append({'href': anchor['href'], 'title': anchor['title']})
    return entries

# Cache of parsed pages keyed by page title; get_random_soup reads and fills it.
soups = {}

In [99]:
# Collect href/title for every entry listed on this one category page
# (pagination is not followed here).
pages = get_lemma_pages(get_soup("/wiki/Category:South_Levantine_Arabic_terms_with_usage_examples"))
# Spot-check a single entry; no seed is set, so the pick varies between runs.
random.choice(pages)

{'href': '/wiki/%D8%A8%D8%AF%D9%84', 'title': 'بدل'}

In [121]:
def get_random_soup():
    """Return the parsed page of a randomly chosen entry from ``pages``.

    Reads the module-level ``pages`` list and the ``soups`` cache. The chosen
    entry is printed so the cell output records which page was picked. A page
    already present in ``soups`` (keyed by title) is returned without a new
    request; otherwise it is fetched with ``get_soup`` and cached.

    Returns:
        Whatever ``get_soup`` produced for the chosen page, or the cached
        object from an earlier call.
    """
    page = random.choice(pages)
    print(page)
    title = page['title']
    if title in soups:
        return soups[title]

    soup = get_soup(page['href'])
    # isinstance instead of an exact type comparison: BeautifulSoup subclasses
    # are still soups and should be cached; anything else is not stored.
    if isinstance(soup, BeautifulSoup):
        soups[title] = soup
    return soup

# Bind one randomly chosen lemma page to the global `soup` that later cells inspect.
soup = get_random_soup()

{'href': '/wiki/%D8%AC%D9%87%D8%A7%D8%B2', 'title': 'جهاز'}


In [122]:
# Inside div#bodyContent, take the first <p> that contains a .headword element
# and is a later sibling of an <h2> holding a span whose id ends in
# "Levantine_Arabic".
# NOTE(review): no None check follows, so a page without such a paragraph
# would presumably fail on the next line — confirm.
headword = soup.select_one('div#bodyContent h2:has(span[id$="Levantine_Arabic"]) ~ p:has(.headword)')
# Closest preceding <h3>/<h4> sibling of the headword paragraph.
header = headword.find_previous_sibling(['h3', 'h4'])
header_name = header.name

# First <ol> sibling after the headword paragraph; in the output below it
# holds the definitions and their usage examples.
def_ol = headword.find_next_sibling('ol')
print(def_ol.prettify())

<ol>
 <li>
  <a href="/wiki/appliance" title="appliance">
   appliance
  </a>
  ,
  <a href="/wiki/device" title="device">
   device
  </a>
  <dl>
   <dd>
    <span class="h-usage-example">
     ‏
     <i class="Arab mention e-example" lang="ajp">
      <strong class="selflink">
       جهاز
      </strong>
      <a href="/wiki/%D8%AA%D9%84%D9%81%D8%B2%D9%8A%D9%88%D9%86#South_Levantine_Arabic" title="تلفزيون">
       التلفزيون
      </a>
     </i>
     ‎‎ ―
     <i class="e-transliteration tr Latn" lang="ajp-Latn">
      jihāz it-talfizyōn
     </i>
     ―
     <span class="e-translation">
      television set
     </span>
    </span>
   </dd>
   <dd>
    <span class="h-usage-example">
     ‏
     <i class="Arab mention e-example" lang="ajp">
      <strong class="selflink">
       جهاز
      </strong>
      <a href="/wiki/%D8%AA%D9%88%D9%82%D9%8A%D8%AA#South_Levantine_Arabic" title="توقيت">
       توقيت
      </a>
     </i>
     ‎‎ ―
     <i class="e-transliteration tr Latn" lang="ajp-L

In [149]:
import yaml

# allow_unicode=True keeps the non-ASCII titles as literal characters in the
# dump, as the output below shows.
print(yaml.dump([{'href': '/wiki/l%E1%BB%8D_m%E1%BB%8D', 'title': 'lọ mọ'},
 {'href': '/wiki/lo%E1%BA%A1n_x%E1%BA%A1', 'title': 'loạn xạ'},
 {'href': '/wiki/l%E1%BB%8Fm', 'title': 'lỏm'}], allow_unicode=True))

- href: /wiki/l%E1%BB%8D_m%E1%BB%8D
  title: lọ mọ
- href: /wiki/lo%E1%BA%A1n_x%E1%BA%A1
  title: loạn xạ
- href: /wiki/l%E1%BB%8Fm
  title: lỏm



In [29]:
# Take a sample "next page" href and isolate its query string.
query = urlparse('/w/index.php?title=Category:Vietnamese_adverbs&pagefrom=D%EF%80%80ANG+KHO%EF%80%80NG%0A%C4%91ang+kh%C3%B4ng#mw-pages').query
# parse_qs decodes the value ('+' comes back as a space); quote() then
# re-escapes it, so the space is emitted as %20 in the result below.
quote(parse_qs(query)['pagefrom'][0])

'D%EF%80%80ANG%20KHO%EF%80%80NG%0A%C4%91ang%20kh%C3%B4ng'

In [79]:
# Show the parent element of the matched headword paragraph to inspect the
# markup that surrounds it.
soup.select_one('div#bodyContent h2:has(span[id$="Levantine_Arabic"]) ~ p:has(.headword)').parent

<div class="mw-parser-output"><div class="disambig-see-also"><i>See also:</i> <b class="Arab"><a href="/wiki/%D8%A3%D8%B3%D8%AA%D9%81%D8%A7%D8%AF" title="أستفاد">أستفاد</a></b>‎</div>
<div aria-labelledby="mw-toc-heading" class="toc" id="toc" role="navigation"><input class="toctogglecheckbox" id="toctogglecheckbox" role="button" style="display:none" type="checkbox"/><div class="toctitle" dir="ltr" lang="en"><h2 id="mw-toc-heading">Contents</h2><span class="toctogglespan"><label class="toctogglelabel" for="toctogglecheckbox"></label></span></div>
<ul>
<li class="toclevel-1 tocsection-1"><a href="#Arabic"><span class="tocnumber">1</span> <span class="toctext">Arabic</span></a>
<ul>
<li class="toclevel-2 tocsection-2"><a href="#Etymology"><span class="tocnumber">1.1</span> <span class="toctext">Etymology</span></a></li>
<li class="toclevel-2 tocsection-3"><a href="#Pronunciation"><span class="tocnumber">1.2</span> <span class="toctext">Pronunciation</span></a></li>
<li class="toclevel-2 t

In [69]:
# Parent element of every span under an <h2> in div#bodyContent whose id ends
# in "Levantine_Arabic".
headers = [span.parent for span in soup.select('div#bodyContent h2 span[id$="Levantine_Arabic"]')]

# NOTE(review): `header` is also assigned by the headword-inspection cell;
# running this loop rebinds that global.
for header in headers:
    print(header.span.string)
    # Walk the nodes that follow the heading and stop at the next <h2>.
    for el in header.next_siblings:
        if el.name == 'h2':
            break
        # Skip nodes without a tag name (presumably bare text between
        # elements — confirm).
        if not el.name:
            continue            
        print(el.name, el.get_text())
        print("=" * 20)

South Levantine Arabic
table Rootف ي د‎2 terms
h3 Etymology[edit]
p From Arabic اِسْتَفَادَ‎ (istafāda).

h3 Pronunciation[edit]
ul IPA(key): /is.ta.faːd/, [ɪs.taˈfæːd]
Audio (al-Lidd)(file)
h3 Verb[edit]
p استفاد • (istafād) (form X, present بستفيد‎ (bistafīd))

ol to benefit or profit from, to make use of
‏بصراحة بستفيد كتير من محاضراته.‎‎bi-ṣarāḥa bastafīd ktīr min muḥāḍarātoHonestly, I get a lot out of his lectures.
h4 Conjugation[edit]
table 

    Conjugation of استفاد (istafād)




singular



plural




1st person

2nd person

3rd person

1st person

2nd person

3rd person


past

m

استفدت (istafadt)

استفدت (istafadt)

استفاد (istafād)

استفدنا (istafadna)

استفدتو (istafadtu)

استفادو (istafādu)


f

استفدتي (istafadti)

استفادت (istafādat)


present

m

بستفيد (bastafīd)

بتستفيد (btistafīd)

بستفيد (bistafīd)

منستفيد (mnistafīd)

بتستفيدو (btistafīdu)

بستفيدو (bistafīdu)


f

بتستفيدي (btistafīdi)

بتستفيد (btistafīd)


subjunctive

m

أستفيد (ʔastafīd)

تستفيد (tistafīd)

In [93]:
# href of every link inside a list item under div#mw-pages.
# NOTE(review): the output lists category entries, so `soup` appears to have
# held a category page (not a lemma page) when this cell ran — confirm.
[a['href'] for a in soup.css.select('div#mw-pages li a')]

['/wiki/9x',
 '/wiki/A',
 '/wiki/a',
 '/wiki/a_c%C3%B2ng',
 '/wiki/A_Di_%C4%90%C3%A0',
 '/wiki/A_Di_%C4%90%C3%A0_Ph%E1%BA%ADt',
 '/wiki/a_dua',
 '/wiki/a_giao',
 '/wiki/a_ha',
 '/wiki/a_ho%C3%A0n',
 '/wiki/A_La_H%C3%A1n',
 '/wiki/A_L%E1%BB%8Bch_S%C6%A1n',
 '/wiki/A_L%E1%BB%8Bch_S%C6%A1n_%C4%90%E1%BA%AFc_L%E1%BB%99',
 '/wiki/a_l%C3%B4',
 '/wiki/a_m%C3%B3c',
 '/wiki/a_phi%E1%BA%BFn',
 '/wiki/A_Ph%C3%BA_H%C3%A3n',
 '/wiki/A_Q',
 '/wiki/a_t%C3%B2ng',
 '/wiki/A_Tu_La',
 '/wiki/%C3%A0',
 '/wiki/%C3%A0_th%E1%BA%BF_%C3%A0',
 '/wiki/%C3%A0_u%C3%B4m',
 '/wiki/%E1%BA%A2',
 '/wiki/%E1%BA%A3',
 '/wiki/%E1%BA%A2_R%E1%BA%ADp',
 '/wiki/%C3%A1',
 '/wiki/%C3%81',
 '/wiki/%C3%81_C%C4%83n_%C4%90%C3%ACnh',
 '/wiki/%C3%81_Ch%C3%A2u',
 '/wiki/%C3%81_ch%C3%A2u',
 '/wiki/%C3%81_%C4%90%C3%B4ng',
 '/wiki/%C3%A1_%C4%91%C3%B9',
 '/wiki/%C3%A1_h%E1%BA%ADu',
 '/wiki/%C3%A1_ho%C3%A1_th%E1%BA%A1ch',
 '/wiki/%C3%A1_h%C3%B3a_th%E1%BA%A1ch',
 '/wiki/%C3%A1_kh%E1%BA%A9u',
 '/wiki/%C3%A1_kh%C3%B4i',
 '/wiki/%C3%A1_kim',
 '