In [59]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd

home_url = 'https://web.archive.org/web/20170319093420/http://www.uzbek-glossary.com/'

with urllib.request.urlopen(home_url) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

cat_url_map = [{'category': ul.parent.b.string, 
                'url': ul.a.attrs['href'].split('#')[0]} for ul in soup.body('ul', class_='domains')]

cat_url_map

[{'category': 'Basic Concepts', 'url': 'bc.htm'},
 {'category': 'Space & Time', 'url': 'st.htm'},
 {'category': 'Forces & Motion', 'url': 'fm.htm'},
 {'category': 'Matter & Nature', 'url': 'mn.htm'},
 {'category': 'Life & Humans', 'url': 'lh.htm'},
 {'category': 'Perception & Cognition', 'url': 'pc.htm'},
 {'category': 'Meaning & Spirit', 'url': 'ms.htm'},
 {'category': 'Character & Emotions', 'url': 'ce.htm'},
 {'category': 'Action & Modality', 'url': 'am.htm'},
 {'category': 'Attitudes', 'url': 'at.htm'},
 {'category': 'Communauty', 'url': 'cm.htm'},
 {'category': 'Conflict & Justice', 'url': 'cj.htm'},
 {'category': 'Work & Possession', 'url': 'wp.htm'},
 {'category': 'Daily Life (I)', 'url': 'd1.htm'},
 {'category': 'Daily Life (II)', 'url': 'd2.htm'},
 {'category': 'Locutions', 'url': 'lo.htm'},
 {'category': 'Proper Names', 'url': 'pn.htm'}]

In [65]:
pd.Series([td.attrs['class'][0] for h2 in soup.find_all('h2') 
                      for td in h2.find_next_sibling('table').find_all('td')]).value_counts()

mid           2215
top           1011
bot            849
mono           807
out            620
top_ex         277
bot_ex         196
bot_bot_ex      81
mid_ex_t        76
mid_ex_s        76
mono_mid        14
mono_left       14
mono_right      14
dtype: int64

In [172]:
def extractCardsFromUrl(cat_url = 'ms.htm'):
    with urllib.request.urlopen(home_url + cat_url) as response:
        html = response.read()
    cat_soup = BeautifulSoup(html, 'html.parser')

    cards = []
    current_card = {}
    in_example = False
    example_text = []

    for h2 in cat_soup.find_all('h2'):
        #print(h2.text)
        for tr in h2.find_next_sibling('table').find_all('tr'):
            for i, td in enumerate(tr.find_all('td')):
                # item name in column 0
                if i == 0:
                    if td.text.strip():
                        #print('ITEM', td.text)
                        item_attributes = {'item': td.text.strip(), 
                                           'item_bold': bool(td.b), 
                                           'domain': h2.text.strip()}

                # desc (and example!) in column 1
                if i == 1:
                    if td.attrs['class'][0] == 'top_ex':
                        in_example = True
                        example_text = []

                    if in_example:
                        #print('EXAMPLE', td.text)
                        example_text.append(td.text.strip())
                    else:
                        if td.text.strip():
                            if 'item' in current_card:
                                cards.append(current_card)
                            #print('DESC', td.text)
                            current_card = item_attributes.copy()
                            current_card['desc'] = td.text.strip()
                            current_card['desc_bold'] = bool(td.b)
                            current_card['meaning'] = []

                    if td.attrs['class'][0] in ['bot_ex', 'bot_bot_ex']:
                        in_example = False
                        current_card['example'] = example_text

                # meaning in column 2
                if i == 2:
                    if td.text.strip():
                        current_card['meaning'].append(td.text.strip())

    result = pd.DataFrame(cards)
    result['url'] = cat_url
    
    return result

In [173]:
pagesCards = []

for page in cat_url_map[:-2]:
    print(page['category'])
    pagesCards.append(extractCardsFromUrl(page['url']))
    
AllCards = pd.concat(pagesCards).reset_index(drop=True)

Basic Concepts
Space & Time
Forces & Motion
Matter & Nature
Life & Humans
Perception & Cognition
Meaning & Spirit
Character & Emotions
Action & Modality
Attitudes
Communauty
Conflict & Justice
Work & Possession
Daily Life (I)
Daily Life (II)


In [177]:
AllCards[AllCards.desc_bold].sample(20)

Unnamed: 0,item,item_bold,domain,desc,desc_bold,meaning,example,url
7863,KO'R-,True,Acts and Results,v.int | VX+ib ~,True,"[try to X, attempt to X]","[Sigaretingizdan chekib ko'raychi., Let me try...",am.htm
6933,JINNI,True,Character,"n, adj",True,[mad; madman],"[Jinni keldi., The madman has come.]",ce.htm
9624,OILA,True,Family,n,True,"[family, household]","[Bola oila a'zosidir., The child is a member o...",cm.htm
8905,MARHAMAT,True,Behaviour,NX+ga marhamat qil-,True,[be considerate to X],"[Unga marhamat qiling!, Be considerate to him ...",at.htm
177,aralashtir-,True,Relations,v.t | NX+ni NY+ga ~,True,[mix X with Y],,bc.htm
8176,kelish-,True,Acts and Results,v.int | NX+ga ~,True,"[agree, reach an agreement on X]","[Ular bilan kelishish mumkin emas., It is not ...",am.htm
12974,GALSTUK,True,Clothing,n,True,[tie],,d1.htm
7549,QAT'IY,True,"Will, Ability and Necessity",adj,True,"[firm, resolute, determined]","[Bu yerga kelishim qat'iydir., It is definite ...",am.htm
6282,BOS-,True,Communication,v.t,True,[print],"[Bu maqolani bosish kerak., This article must ...",ms.htm
4341,AZOB,True,Health and Medicine,his qil-,True,"[feel, sense (perception,sensation,impression)]",,lh.htm


In [187]:
AllCards.groupby('item')['url'].nunique().sort_values(ascending=False)[:50]

item
O'T-              5
KO'R-             5
TUR-              5
CHIQ-             5
BOS-              5
OL-               5
BO'L-             5
JON               4
YUZ               4
BALAND            4
OG'IZ (og'z+i)    4
TUSH-             4
qaytar-           4
KEL-              4
TASHLA-           4
OCH-              4
o'chir-           4
FARQ              4
pishiq            4
BUZ-              4
YUT-              4
BO'Y              4
O'RIN (o'rn+i)    4
TORT-             4
KET-              4
YAQIN             4
ochil-            4
YOZ-              4
BOSH              4
YOQ-              4
TO'P              4
TO'G'RI           4
HAQ               4
o'tkaz-           4
BIL-              4
ajral-            4
ajrat-            4
QOL-              4
kelish-           4
QARA-             4
HISOB             4
kutil-            4
ko'chir-          4
KUT-              4
PAST              4
KIR-              4
BEKOR             3
AYLAN-            3
UZAT-             3
TO'Q           