In [7]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd

home_url = 'https://web.archive.org/web/20170319093420/http://www.uzbek-glossary.com/'

with urllib.request.urlopen(home_url) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

cat_url_map = [{'category': ul.parent.b.string, 
                'url': ul.a.attrs['href'].split('#')[0]} for ul in soup.body('ul', class_='domains')]

cat_url_map

[{'category': 'Basic Concepts', 'url': 'bc.htm'},
 {'category': 'Space & Time', 'url': 'st.htm'},
 {'category': 'Forces & Motion', 'url': 'fm.htm'},
 {'category': 'Matter & Nature', 'url': 'mn.htm'},
 {'category': 'Life & Humans', 'url': 'lh.htm'},
 {'category': 'Perception & Cognition', 'url': 'pc.htm'},
 {'category': 'Meaning & Spirit', 'url': 'ms.htm'},
 {'category': 'Character & Emotions', 'url': 'ce.htm'},
 {'category': 'Action & Modality', 'url': 'am.htm'},
 {'category': 'Attitudes', 'url': 'at.htm'},
 {'category': 'Communauty', 'url': 'cm.htm'},
 {'category': 'Conflict & Justice', 'url': 'cj.htm'},
 {'category': 'Work & Possession', 'url': 'wp.htm'},
 {'category': 'Daily Life (I)', 'url': 'd1.htm'},
 {'category': 'Daily Life (II)', 'url': 'd2.htm'},
 {'category': 'Locutions', 'url': 'lo.htm'},
 {'category': 'Proper Names', 'url': 'pn.htm'}]

In [82]:
def extractCardsFromUrl(cat_url = 'ms.htm'):
    with urllib.request.urlopen(home_url + cat_url) as response:
        html = response.read()
    cat_soup = BeautifulSoup(html, 'html.parser')

    cards = []
    current_card = {}
    in_example = False
    example_text = []

    for h2 in cat_soup.find_all('h2'):
        #print(h2.text)
        for tr in h2.find_next_sibling('table').find_all('tr'):
            for i, td in enumerate(tr.find_all('td')):
                # item name in column 0
                if i == 0:
                    if td.text.strip():
                        #print('ITEM', td.text)
                        item_attributes = {'item': td.text.strip(), 
                                           'item_bold': bool(td.b), 
                                           'domain': h2.text.strip()}

                # desc (and example!) in column 1
                if i == 1:
                    if td.attrs['class'][0] == 'top_ex':
                        in_example = True
                        example_text = []

                    if in_example:
                        #print('EXAMPLE', td.text)
                        example_text.append(td.text.strip())
                    else:
                        if td.text.strip():
                            if 'item' in current_card:
                                cards.append(current_card)
                            #print('DESC', td.text)
                            current_card = item_attributes.copy()
                            current_card['desc'] = td.text.strip()
                            current_card['desc_bold'] = bool(td.b)
                            current_card['meaning'] = []
                            current_card['example'] = []

                    if td.attrs['class'][0] in ['bot_ex', 'bot_bot_ex']:
                        in_example = False
                        current_card['example'] = example_text

                # meaning in column 2
                if i == 2:
                    if td.text.strip():
                        current_card['meaning'].append(td.text.strip())

    result = pd.DataFrame(cards)
    result['url'] = cat_url
    
    return result

In [89]:
pagesCards = []

for page in cat_url_map[:-2]:
    print(page['category'])
    pagesCards.append(extractCardsFromUrl(page['url']))
    
AllCards = pd.concat(pagesCards).reset_index(drop=True)

example_explode = AllCards.example.apply(pd.Series)
example_explode = example_explode[range(4)].rename(columns=lambda i: 'example_' + str(i+1))
AllCards = AllCards.join(example_explode)

AllCards.meaning = AllCards.meaning.apply(lambda x: ' | '.join(x))

AllCards

Basic Concepts
Space & Time
Forces & Motion
Matter & Nature
Life & Humans
Perception & Cognition
Meaning & Spirit
Character & Emotions
Action & Modality
Attitudes
Communauty
Conflict & Justice
Work & Possession
Daily Life (I)
Daily Life (II)


In [106]:
BoldCards = AllCards[AllCards.desc_bold]
BoldCards.sample(20)

Unnamed: 0,item,item_bold,domain,desc,desc_bold,meaning,example,url,example_1,example_2,example_3,example_4
11744,PUL,True,Economy,n,True,money,"[Bu kitob necha pul turadi?, How much does thi...",wp.htm,Bu kitob necha pul turadi?,How much does this book cost?,,
7172,qiziqtir-,True,Emotions,v.t,True,"interest, attract, fascinate",[],ce.htm,,,,
1551,SOAT,True,Time and Duration,~ numX+dan numY minut o'tdi,True,it is Y past X,[],st.htm,,,,
2707,bo'shat-,True,Properties,v.t,True,"empty, clear",[],mn.htm,,,,
2165,yotqiz-,True,Motion,v.t,True,"lay down | lay out, spread out",[],fm.htm,,,,
116,CHUNKI,True,Entities and Existence,conj,True,because,"[Siz majlisga borishingiz kerak, chunki u yerd...",bc.htm,"Siz majlisga borishingiz kerak, chunki u yerda...",You must go to the meeting because people are ...,,
7877,G'AYRAT,True,Acts and Results,n,True,"energy, willpower, zeal","[Uning g'ayrati bor odam., He is an energetic ...",am.htm,Uning g'ayrati bor odam.,He is an energetic man.,,
13293,ACHCHIQ,True,Food,adj,True,"bitter, hot, spicy, strong; sour","[Qalampir achchiqdir., Pepper is hot.]",d2.htm,Qalampir achchiqdir.,Pepper is hot.,,
1035,TEKIS,True,Shape,ad,True,smooth(ly); level,"[Bu qog'oz tekis., This paper is smooth., Bu y...",st.htm,Bu qog'oz tekis.,This paper is smooth.,Bu yer tekis.,This place is level.
351,MURAKKAB,True,Category and Order,adj,True,"complex, complicated","[Bu murakkab ish., This is a complicated matter.]",bc.htm,Bu murakkab ish.,This is a complicated matter.,,


In [109]:
# .to_csv('uzbek_bold.tsv',sep='\t', index=False)

BoldCards[['item', 'desc', 'domain', 'meaning'
           , 'example_1', 'example_2', 'example_3', 'example_4']].to_csv('uzbek_bold.tsv',sep='\t', index=True)