# BPCorpus: Generating corpus assets

Project repository: https://github.com/k-nem/bpcorpus

In [2391]:
import os
import re
from bs4 import BeautifulSoup
import csv

## Original files

Original HTML files were scraped from knihi.com based on author list.

The files and scraping scripts can be found in ['bpcorpus-collect' repository](https://github.com/k-nem/bpcorpus-collect).

Local path:

In [2428]:
path = '../../../parsed'

## Collect full file list

In [2389]:
def getFiles(path):
    """Walk through subfolders and retrieve HTML file list"""
    htmls = []
    for root, dirs, files in os.walk(path):
        for name in files:
            if name.endswith('.html') and 'page' not in name:
                htmls.append(root+'/'+name)
                
    htmls = sorted(htmls)
                
    return htmls

In [2390]:
allFiles = getFiles(path)
print(allFiles[:5], '\n\n', len(allFiles))

['../../../parsed/003_Ян Баршчэўскі/003_0004.html', '../../../parsed/003_Ян Баршчэўскі/003_0005.html', '../../../parsed/003_Ян Баршчэўскі/003_0007.html', '../../../parsed/003_Ян Баршчэўскі/003_0008.html', '../../../parsed/003_Ян Баршчэўскі/003_0009.html'] 

 5948


## Original location URL

In [2392]:
# get original URL from CSV download log
with open('../../../parsed/bpc_alllinks.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    links = [row for row in reader]

In [2393]:
links[3][3]

'003_0003'

In [1503]:
def getUrl(path):
    """Get original location URL"""
    nm = path.split('/')[-1].split('.')[0]
    url = 'https://knihi.com/'
    for l in links:
        if l[3] == nm:
            url = 'https://knihi.com/' + l[10]
    
    return url

In [2298]:
getUrl('../../../parsed/003_Ян Баршчэўскі/003_0007.html')

'https://knihi.com//Jan_Barsceuski/Dzvie_biarozy.html'

In [1508]:
getUrl('incorrect file name')

'https://knihi.com/'

## Filter out irrelevant files 

Not all files are relevant for the corpus, since only poetic forms need to be added. To determine the genres and other metadata I parse the HTML with regular expression and extract the values from JS/HTML style comments like these (the format is determined by the knihi.com source files):
<!-- HEADER_FIELD Authors: Ян Чачот -->
<!-- HEADER_FIELD CreationYear: 1825-1846? -->

In [2394]:
def metaP(file): 
    """Get metadata from the files with relevant genre value to determine the final file list"""
    
    # genre list
    genres = ['Верш', 'Паэма', 'Басня', 'Балада']
    
    with open(file,'r') as f:
        raw = f.read()
    
    # the string below shows that there is no table of contents (as in a collection of works), so only singular texts are retrieved
    if '<!-- TOC_BEGIN -->\n<!-- TOC_END -->' in raw:

        metadict = {}
        meta = re.findall('(?<=<!-- HEADER_FIELD ).*(?= -->)', raw)

        if meta: 
            for m in meta:
                metadict[m.split(':')[0]] = m.split(':')[1].lstrip()

        if 'StyleGenre' in metadict:

            if '/' in metadict['StyleGenre']:

                if ',' in metadict['StyleGenre']:
                    metadict['StyleGenre'] = metadict['StyleGenre'].split(',')
                    metadict['StyleGenre'] = [item.split('/')[1].capitalize() for item in metadict['StyleGenre'] if len(item.split('/')) > 1]

                elif ';' in metadict['StyleGenre']:
                    metadict['StyleGenre'] = metadict['StyleGenre'].split(';')
                    metadict['StyleGenre'] = [item.split('/')[1].capitalize() for item in metadict['StyleGenre'] if len(item.split('/')) > 1]

                else:
                    metadict['StyleGenre'] = metadict['StyleGenre'].split('/')[1].capitalize()

            if not isinstance(metadict['StyleGenre'], list):     
                if metadict['StyleGenre'] in genres:
                    
                    return metadict, file

            else:
                styleList = [item for item in metadict['StyleGenre'] if item in genres]
                if len(styleList) > 0:
                    metadict['StyleGenre'] = styleList[0]
                    
                    return metadict, file
                
def getAllP(filelist):
    """Apply getP() to a file list"""
    
    allMetaList = []
    finalFileList = []
    
    for file in filelist:
        item = metaP(file)
        if item:
            allMetaList.append(item[0])
            finalFileList.append(item[1])
        else:
            continue
            
    return allMetaList, finalFileList

In [2395]:
metaP('../../../parsed/013_Уладзіслаў Сыракомля/013_0004.html')

({'Authors': 'Уладзіслаў Сыракомля',
  'CreationYear': '[1911-1912]?',
  'FirstPublicationYear': '1913; 1932',
  'LangOrig': 'pol',
  'Pravapis': 'A2008',
  'PublicationYear': '1997',
  'StyleGenre': 'Балада',
  'Title': 'Каралі',
  'Title2': '(Пацеркі)',
  'Translation': 'Янка Купала'},
 '../../../parsed/013_Уладзіслаў Сыракомля/013_0004.html')

In [2396]:
pList = getAllP(allFiles)
pList[0][0]

{'Authors': 'Ян Баршчэўскі',
 'CreationYear': '13.07.1841',
 'Edition': 'Ян Баршчэўскі. Выбраныя творы. Менск, МФ «Беларускі кнігазбор», 1998.',
 'FirstPublicationYear': '1842',
 'LangOrig': 'pol',
 'Pravapis': 'A1957',
 'PublicationYear': '1998',
 'SectionAuthor': 'Балады',
 'StyleGenre': 'Балада',
 'Title': 'Дзве бярозы',
 'Title2': 'З народных паданняў',
 'Translation': 'Кастусь Цьвірка'}

In [2403]:
pList[1][-5:]

['../../../parsed/261_Яўгенія Янішчыц/261_0012.html',
 '../../../parsed/261_Яўгенія Янішчыц/261_0014.html',
 '../../../parsed/261_Яўгенія Янішчыц/261_0015.html',
 '../../../parsed/265_Леанід Галубовіч/265_0001.html',
 '../../../parsed/265_Леанід Галубовіч/265_0002.html']

In [2404]:
len(pList[1])

3068

This list will be used for corpus generation. Folder structure is arbitrary since metadata is located in the file.

In [1356]:
# for i in pList[0]:
#     if 'LangOrig' in i:
#         if i['LangOrig'] == 'yid':
#             print(i)

In [1357]:
# all original languages for translated works
set([i['LangOrig'] for i in pList[0] if 'LangOrig' in i])

{'deu', 'lat', 'pol', 'rus', 'ukr', 'yid'}

## Author index ID
This dictionary will be used to provide author IDs.

In [1363]:
def allAuthors(metalist):
    """Get author work count"""
    
    authors = {}

    for item in metalist:
        if 'Authors' in item.keys():
            if item['Authors'] not in authors.keys():
                authors[item['Authors']] = 1
                
            else:
                authors[item['Authors']] += 1
                
        if 'Translation' in item.keys():
            if item['Translation'] not in authors.keys():
                authors[item['Translation']] = 1
                
            else:
                authors[item['Translation']] += 1

    return authors

def authIds(authlist):
    """Generate author IDs and name dictionaries for tree population"""
    
    order = sorted(authlist, key=lambda x: str(x.split()[-1]))
    authIds = {}
    
    for i, name in enumerate(order):

        names = name.split()

        if len(names) == 2:
            forename = names[0]
            surname = names[1]
            names = {'Forename': forename, 'Surname': surname}
            authIds[name] = {'name': names, 'id': i + 1}
            
        else:
            authIds[name] = {'name': name, 'id': i + 1}
        
    return authIds 

In [2405]:
authCounts = allAuthors(pList[0])
authCounts['Цётка']

4

In [2408]:
authCounts

{'Ян Баршчэўскі': 8,
 'Кастусь Цьвірка': 1,
 'Уладзімір Мархель': 8,
 'Рыгор Барадулін': 23,
 'Ян Чачот': 3,
 'Уладзіслаў Сыракомля': 11,
 'Янка Купала': 860,
 'Кастусь Каліноўскі': 1,
 'Францішак Багушэвіч': 36,
 'Міхась Машара': 1,
 'Сяржук Сокалаў-Воюш': 2,
 'Максім Танк': 24,
 'Куба Паўтаржыцкі': 1,
 'Пятро Бітэль': 1,
 'Алег Лойка': 4,
 'Янка Лучына': 2,
 'Генадзь Тумаш': 1,
 'Цётка': 4,
 'Кароль Балінскі': 1,
 'Герман Гесэ': 1,
 'М. Ісакоўскі': 1,
 'Аляксей Кальцоў': 1,
 'Марыя Канапніцкая': 11,
 'Ян Каспровіч': 1,
 'М. Красільнікаў': 1,
 'Юзаф Крашэўскі': 2,
 'Іван Крылоў': 1,
 'Раіса Кудашава': 1,
 'Адам Міцкевіч': 9,
 'Невядомы': 6,
 'Мікалай Някрасаў': 5,
 "Валяр'ян Паляшчук": 1,
 'Іда Пілецкая': 1,
 'Карнель Уейскі': 1,
 'Дзмітрый Цэнзар': 1,
 'Фёдар Цютчаў': 1,
 'Грыцька Чупрынка': 2,
 'Тарас Шаўчэнка': 35,
 'Піліп Шкулёў': 1,
 'Л. Явалкоўская-Кашуцкая': 1,
 'Яскулка': 1,
 'Якуб Колас': 808,
 'Сяргей Гарадзецкі': 2,
 'Максім Рыльскі': 1,
 'Паўло Тычына': 1,
 'Георг Хэрвег':

In [2411]:
authDict = authIds(authCounts.keys())
[authDict['Кастусь Цьвірка'], authDict['Цётка']]

[{'name': {'Forename': 'Кастусь', 'Surname': 'Цьвірка'}, 'id': 77},
 {'name': 'Цётка', 'id': 80}]

## Check the most used tags within texts (optional)

The following code doesn't serve any function in the generation of the corpus apart from collecting information about the most used tags.

In [1368]:
def allTags(path):
    """Full tag list for a file"""
    
    with open(path,'r') as f:
        raw = f.read()
        
    pat = re.compile('(?<=(<!-- BOOK_BEGIN -->))(\s.*\s?)*(?=(<!-- BOOK_END -->))')
    tags = re.search(pat, raw)
    
    if tags: 
        soup = BeautifulSoup(tags.group(0),'html.parser')   
        taglist = [[tag.name, tag.text, tag.attrs] for tag in soup.find_all()]
    
    return taglist

In [1673]:
allTags('../../../parsed/013_Уладзіслаў Сыракомля/013_0004.html')[:10]

[['p', 'З польскай — У. Сыракомлі', {}],
 ['i', 'З польскай — У. Сыракомлі', {}],
 ['p', '\xa0', {}],
 ['p', 'Як ішоў я ў бой кіпячы,', {}],
 ['p', 'Як прашчаўся з хаткай,', {}],
 ['p', 'Тут Гануля мая з плачам:', {}],
 ['p', '«Пойдзеш гінуць, братка!', {}],
 ['p', 'Але буду я маліцца', {}],
 ['p', 'Па табе з трывогі,', {}],
 ['p', 'Ты ж прынось за то гасцінца —', {}]]

In [2412]:
def tagCounts(files):  
    """Count non-empty tags"""
    tagCount = {}
    divs = []
    sups = []
    links = []
    cen = []
    bs = []
    ps = []

    for file in files:
        with open(file,'r') as f:
            raw = f.read()

        pat = re.compile('(?<=(<!-- BOOK_BEGIN -->))(\s.*\s?)*(?=(<!-- BOOK_END -->))')
        tags = re.search(pat, raw)

        if tags:
            soup = BeautifulSoup(tags.group(0),'html.parser')   
            taglist = [[tag.name, tag.text, tag.attrs] for tag in soup.find_all()]

            for tag in taglist:
                if tag[1].strip():
                    if tag[0] == 'div':
                        if tag[2]['class']:
                            if tag[2]['class'] == ['POETRY']:
                                continue  

                            else:
                                divs.append([file, tag])

                                if tag[0] not in tagCount.keys():
                                    tagCount[tag[0]] = 1

                                else:
                                    tagCount[tag[0]] += 1

                        else:
                                divs.append([file, tag])

                                if tag[0] not in tagCount.keys():
                                    tagCount[tag[0]] = 1

                                else:
                                    tagCount[tag[0]] += 1

                    elif tag[0] == 'sup':
                        sups.append([file, tag])

                        if tag[0] not in tagCount.keys():
                            tagCount[tag[0]] = 1

                        else:
                            tagCount[tag[0]] += 1

                    elif tag[0] == 'a':
                        links.append([file, tag])

                        if tag[0] not in tagCount.keys():
                            tagCount[tag[0]] = 1

                        else:
                            tagCount[tag[0]] += 1

                    elif tag[0] == 'center':
                        cen.append([file, tag])

                        if tag[0] not in tagCount.keys():
                            tagCount[tag[0]] = 1

                        else:
                            tagCount[tag[0]] += 1

                    elif tag[0] == 'b':
                        bs.append([file, tag])

                        if tag[0] not in tagCount.keys():
                            tagCount[tag[0]] = 1

                        else:
                            tagCount[tag[0]] += 1

                    elif tag[0] == 'p':
                        if tag[1] != '\xa0':
                            ps.append([file, tag])

                        if tag[0] not in tagCount.keys():
                            tagCount[tag[0]] = 1

                        else:
                            tagCount[tag[0]] += 1

    return tagCount, divs, sups, links, cen, bs, ps

In [2413]:
tagInfo = tagCounts(pList[1])

In [2414]:
tagInfo[0]

{'p': 127450, 'sup': 101, 'b': 159, 'a': 94, 'div': 163, 'center': 42}

In [2422]:
tagInfo[1][:3]

[['../../../parsed/028_Янка Купала/028_0493.html',
  ['div', '\n1 Суомі — Фінляндыя.\n\n', {'class': ['endnote-block']}]],
 ['../../../parsed/028_Янка Купала/028_0493.html',
  ['div', '1 Суомі — Фінляндыя.\n', {'class': ['footnote']}]],
 ['../../../parsed/028_Янка Купала/028_0561.html',
  ['div',
   '\nЕўрапейскія дзяржавы пастанавілі запрасіць на міжнародную нараду ў Геную т. Леніна.\n\xa0\nЗ газет 1921 г.\n',
   {'class': ['EPIGRAPH']}]]]

In [2423]:
tagInfo[2][:3]

[['../../../parsed/003_Ян Баршчэўскі/003_0007.html', ['sup', '1', {}]],
 ['../../../parsed/028_Янка Купала/028_0493.html', ['sup', '1', {}]],
 ['../../../parsed/028_Янка Купала/028_0493.html', ['sup', '1', {}]]]

In [2424]:
tagInfo[3][:3]

[['../../../parsed/028_Янка Купала/028_0493.html',
  ['a', '1', {'href': '#footnote-1'}]],
 ['../../../parsed/028_Янка Купала/028_0493.html',
  ['a', '1', {'id': 'footnote-1'}]],
 ['../../../parsed/032_Алесь Гарун/032_0016.html',
  ['a', '42', {'href': '#footnote-1'}]]]

In [2425]:
tagInfo[4][:3]

[['../../../parsed/029_Якуб Колас/029_0401.html', ['center', '*', {}]],
 ['../../../parsed/029_Якуб Колас/029_0401.html', ['center', '*', {}]],
 ['../../../parsed/029_Якуб Колас/029_0401.html', ['center', '*', {}]]]

In [2430]:
tagInfo[5][:3]

[['../../../parsed/003_Ян Баршчэўскі/003_0007.html', ['b', 'Каментары', {}]],
 ['../../../parsed/003_Ян Баршчэўскі/003_0008.html', ['b', 'Каментары', {}]],
 ['../../../parsed/003_Ян Баршчэўскі/003_0009.html', ['b', 'Каментары', {}]]]

In [2420]:
tagInfo[6][:3]

[['../../../parsed/003_Ян Баршчэўскі/003_0007.html',
  ['p', 'Ведае вёска: так шчыра Марылю', {}]],
 ['../../../parsed/003_Ян Баршчэўскі/003_0007.html',
  ['p', 'Ясь, маладую, кахае!', {}]],
 ['../../../parsed/003_Ян Баршчэўскі/003_0007.html',
  ['p', 'З Полацка возіць ёй стужкі штохвілю,', {}]]]

In [2421]:
[[p[0], p[1][1]] for p in tagInfo[6] if p[1][1] == '* * *'][:5]

[['../../../parsed/019_Янка Лучына/019_0001.html', '* * *'],
 ['../../../parsed/019_Янка Лучына/019_0001.html', '* * *'],
 ['../../../parsed/019_Янка Лучына/019_0001.html', '* * *'],
 ['../../../parsed/019_Янка Лучына/019_0001.html', '* * *'],
 ['../../../parsed/019_Янка Лучына/019_0001.html', '* * *']]

## Build XML tree I

Generation a XML tree with TEI header and verse structure on stanza level `<lg>` (tokenization and lemmatization will be performed at a later stage and a wider context is necessary for higher accuracy of tagging). Meta elements like epigraphs, footnotes and headings are placed into `<seg>` tags so that they are not included in lemmatization

In [1390]:
def getRaw(path):
    """Open file"""
    with open(path,'r') as f:
        raw = f.read()   
    return raw

In [2307]:
def meta(raw): 
    """Get metadata (without filtering)"""

    metadict = {}
    meta = re.findall('(?<=<!-- HEADER_FIELD ).*(?= -->)', raw)

    if meta: 
        for m in meta:
            metadict[m.split(':')[0]] = m.split(':')[1].lstrip()

            
    if 'Authors' in metadict:
        if isinstance(authDict[metadict['Authors']]['name'], dict):
            metadict['Forename'] = authDict[metadict['Authors']]['name']['Forename']
            metadict['Surname'] = authDict[metadict['Authors']]['name']['Surname']
            
        metadict['aid'] = authDict[metadict['Authors']]['id']
        
        
    if 'Translation' in metadict:
        if isinstance(authDict[metadict['Translation']]['name'], dict):
            metadict['trForename'] = authDict[metadict['Translation']]['name']['Forename']
            metadict['trSurname'] = authDict[metadict['Translation']]['name']['Surname']
            
        metadict['tid'] = authDict[metadict['Translation']]['id']

    if 'StyleGenre' in metadict:

        if '/' in metadict['StyleGenre']:

            if ',' in metadict['StyleGenre']:
                metadict['StyleGenre'] = metadict['StyleGenre'].split(',')
                metadict['StyleGenre'] = [item.split('/')[1].capitalize() for item in metadict['StyleGenre'] if len(item.split('/')) > 1]

            elif ';' in metadict['StyleGenre']:
                metadict['StyleGenre'] = metadict['StyleGenre'].split(';')
                metadict['StyleGenre'] = [item.split('/')[1].capitalize() for item in metadict['StyleGenre'] if len(item.split('/')) > 1]

            else:
                metadict['StyleGenre'] = metadict['StyleGenre'].split('/')[1].capitalize()

        else:
            styleList = [item for item in metadict['StyleGenre'] if item in genres]
            metadict['StyleGenre'] = styleList[0]
            
        if isinstance(metadict['StyleGenre'], list):
            metadict['StyleGenre'] = metadict['StyleGenre'][0]
            
    attrs = ['Authors', 'CreationYear', 'Edition', 'FirstPublicationYear', 'LangOrig', 'PublicationYear', 'StyleGenre', 'Title', 'Translation', 'Forename', 'Surname', 'trForename', 'trSurname', 'tid']
    for a in attrs:
        if a not in metadict.keys():
            metadict[a] = None
    
    return metadict
                

In [2308]:
meta(getRaw('../../../parsed/013_Уладзіслаў Сыракомля/013_0004.html'))

{'Authors': 'Уладзіслаў Сыракомля',
 'CreationYear': '[1911-1912]?',
 'FirstPublicationYear': '1913; 1932',
 'LangOrig': 'pol',
 'Pravapis': 'A2008',
 'PublicationYear': '1997',
 'StyleGenre': 'Балада',
 'Title': 'Каралі',
 'Title2': '(Пацеркі)',
 'Translation': 'Янка Купала',
 'Forename': 'Уладзіслаў',
 'Surname': 'Сыракомля',
 'aid': 70,
 'trForename': 'Янка',
 'trSurname': 'Купала',
 'tid': 47,
 'Edition': None}

In [2309]:
meta(getRaw('../../../parsed/026_Цётка/026_0002.html'))

{'Authors': 'Цётка',
 'CreationYear': '1905',
 'Edition': 'Беларуская літаратура ',
 'FirstPublicationYear': '1905?',
 'Pravapis': 'A1957',
 'PublicationYear': '2004',
 'SectionAuthor': 'Вершы',
 'Source': 'скан',
 'StyleGenre': 'Верш',
 'Title': 'Вера беларуса',
 'Uploaded': '2011-08-04T12',
 'Year': '1905',
 'aid': 80,
 'LangOrig': None,
 'Translation': None,
 'Forename': None,
 'Surname': None,
 'trForename': None,
 'trSurname': None,
 'tid': None}

In [2388]:
def genTree(raw, treemeta, fileName):
    """Step 1. Generate TEI header and line group structure after collecting metadata"""
    
    #raw = getRaw(file)
    #treemeta = meta(raw)
    newtree = BeautifulSoup('', 'xml')
    newtree.append(newtree.new_tag('TEI'))
    
    # teiHeader
    
    head = newtree.new_tag('teiHeader')
    head.append(newtree.new_tag('fileDesc'))
    
    # titleStmt
    
    titlest = newtree.new_tag('titleStmt')
    
    if 'Title' in treemeta:
        if treemeta['Title']:
            titlest.append(newtree.new_tag('title'))
            titlest.title.string = treemeta['Title']
        
    if 'Title2' in treemeta: 
        if treemeta['Title2']:
            titlest.append(newtree.new_tag('subtitle'))
            titlest.subtitle.string = treemeta['Title2']
    
    titlest.append(newtree.new_tag('author'))
    titlest.author.append(newtree.new_tag('persName'))
    
    if 'Authors' in treemeta:
        if treemeta['Authors']:
            titlest.author.persName['ident'] = treemeta['aid']
            if treemeta['Forename']:
                titlest.author.persName.append(newtree.new_tag('forename'))
                titlest.author.persName.forename.string = treemeta['Forename']
                titlest.author.persName.append(newtree.new_tag('surname'))
                titlest.author.persName.surname.string = treemeta['Surname']

            else:
                nm = newtree.new_tag('name')
                nm.string = treemeta['Authors']
                titlest.author.persName.append(nm)
        
    head.fileDesc.append(titlest)
        
    # sourceDesc
    
    src = newtree.new_tag('sourceDesc')
    
    if 'Edition' in treemeta:
        if treemeta['Edition']:
            origs = newtree.new_tag('bibl')
            origs['type'] = 'originalSource'
            origs.string = treemeta['Edition']
            src.append(origs)
        
    digs = newtree.new_tag('bibl')
    digs['type'] = 'digitalSource'
    digs.append(newtree.new_tag('ptr'))
    digs.ptr['target'] = getUrl(fileName)
    src.append(digs)
    
    head.fileDesc.append(src)
    
    # profileDesc
    
    prof = newtree.new_tag('profileDesc')
    tc = newtree.new_tag('textClass')
    prof.append(tc)
    kw = newtree.new_tag('keywords')
    tc.append(kw)
    
    lu = newtree.new_tag('langUsage')
    lang = newtree.new_tag('language')
    lang['ident'] = 'be'
    lang.string = 'Беларуская'
    lu.append(lang)
    prof.append(lu)
    
    form = newtree.new_tag('term')
    form['type'] = 'form'
    form.string = 'Паэзія'
    kw.append(form)
    
    if 'StyleGenre' in treemeta:  
        if treemeta['StyleGenre']: 
            genre = newtree.new_tag('term')
            genre['type'] = 'genre'
            genre.string = treemeta['StyleGenre']
            kw.append(genre)
        
    head.fileDesc.append(prof)
    
    # textDesc
    
    txtd = newtree.new_tag('textDesc')
    
    if 'creationYear' in treemeta:
        if treemeta['creationYear']: 
            txtd.append(newtree.new_tag('creationYear'))
            txtd.creationYear['when'] = treemeta['CreationYear']
            txtd.creationYear.string = treemeta['CreationYear']
        
    if 'publicationYear' in treemeta:
        if treemeta['publicationYear']: 
            txtd.append(newtree.new_tag('publicationYear'))
            txtd.publicationYear['when'] = treemeta['PublicationYear']
            txtd.publicationYear.string = treemeta['PublicationYear']

    if 'FirstPublicationYear' in treemeta:
        if treemeta['FirstPublicationYear']: 
            txtd.append(newtree.new_tag('firstPublicationYear'))
            txtd.firstPublicationYear['when'] = treemeta['FirstPublicationYear']
            txtd.firstPublicationYear.string = treemeta['FirstPublicationYear']
    
    if 'Translation' in treemeta:
        if treemeta['Translation']:
            tra = newtree.new_tag('translator')
            tra['ident'] = treemeta['tid']
            if treemeta['trForename']:
                tra.append(newtree.new_tag('forename'))
                tra.forename.string = treemeta['trForename']
                tra.append(newtree.new_tag('surname'))
                tra.surname.string = treemeta['trSurname']

            else:
                nm = newtree.new_tag('name')
                nm.string = treemeta['Translation']
                tra.append(nm)
                
            txtd.append(tra)
        
    if 'LangOrig' in treemeta:
        if treemeta['LangOrig']:
            lNames = {'deu':['de', 'Нямецкая' ], 'lat': ['la', 'Лацінская'], 'pol': ['pl', 'Польская'], 'rus': ['ru', 'Руская'], 'ukr': ['uk', 'Украінская'], 'yid': ['yi', 'Ідыш']}

            lo = newtree.new_tag('originalLanguage')
            lo['ident'] = lNames[treemeta['LangOrig']][0]
            lo.string = lNames[treemeta['LangOrig']][1]
            txtd.append(lo)
    
    head.fileDesc.append(txtd)

    newtree.TEI.append(head)
    
    body = newtree.new_tag('body')
    newtree.TEI.append(body)
    
    # parse body
    
    pat = re.compile('(?<=(<!-- BOOK_BEGIN -->))(\s.*\s?)*(?=(<!-- BOOK_END -->))')
    textbody = re.search(pat, raw)

    soup = BeautifulSoup(textbody.group(0),'html.parser')
    tags = [[tag.name, tag.text, tag.attrs] for tag in soup.find_all()]
    
    lg = newtree.new_tag('lg')
    lg.string = ''
    
    if 'EPIGRAPH' in [tag[2]['class'][0] for tag in tags if tag[0]== 'div' and tag[2]['class']]:
        for i,item in enumerate(tags):
            if item[2]:
                if 'class' in item[2]:
                    if item[2]['class'][0] == 'CLEAR':
                        for tag in tags[:i]:
                            if tag[2]:
                                if 'class' in tag[2]:
                                    if tag[2]['class'][0] == 'EPIGRAPH':
                                        seg = newtree.new_tag('seg')
                                        seg['type'] = 'epigraph'

                                        for line in tag[1].strip().split('\n'):
                                            l = newtree.new_tag('l')
                                            l['type'] = 'ep'
                                            l.string = line
                                            seg.append(l)

                                        newtree.body.append(seg)

                        tags = tags[i+1:]
                    
    cseg = None
    
    if ['b','Каментары', {}] in tags:
        for i,item in enumerate (tags):
            if ['b','Каментары', {}] == item:
                coms = [tag[1] for tag in tags[i+1:] if tag[0] == 'p']
                cseg = newtree.new_tag('seg')
                cseg['type'] = 'footnotes'
                
                for c in [c for c in coms if c != '\xa0']:
                    l = newtree.new_tag('l')
                    dig = re.search('^\d{1,2}[\.|\)]?(?=[^\d])', c)
                        
                    if dig:
                        s = newtree.new_tag('num')
                        s.string = dig.group().strip('.)')
                        l.append(s)

                        com = newtree.new_tag('footnote')
                        com.string = c.replace(dig.group(),'').strip()
                        l.append(com)

                    else:
                        com = newtree.new_tag('footnote')
                        com.string = c
                        l.append(com)

                    cseg.append(l) 

                tags = tags[:i-1]
  

    for i,tag in enumerate(tags): 
        if i == len(tags) - 1 and lg.string != '':        
            newtree.body.append(lg)
            
        else:
            if tag[0] == 'p':   
                if tag[1] == '\xa0':
                    if lg.string:
                        newtree.body.append(lg)
                        lg = newtree.new_tag('lg')
                        lg.string = ''
                    
                elif tag[1] == '*' or tag[1] == '* * *':
                    if lg.string:
                        newtree.body.append(lg)
                        lg = newtree.new_tag('lg')
                        lg.string = ''
                        
                    seg = newtree.new_tag('seg')
                    seg.string = tag[1]
                    seg['type'] = 'divider'
                    newtree.body.append(seg)
                    
                elif tags[i + 1][0] == 'i' and tags[i + 1][1] == tag[1]:
                    if lg.string:
                        newtree.body.append(lg)
                        lg = newtree.new_tag('lg')
                        lg.string = ''
                            
                    seg = newtree.new_tag('seg')
                    seg.string = tag[1]
                    seg['type'] = 'italic'
                    newtree.body.append(seg)
                    
                elif tags[i + 1][0] == 'b' and tags[i + 1][1] == tag[1]:
                    if lg.string:
                        newtree.body.append(lg)
                        lg = newtree.new_tag('lg')
                        lg.string = ''
                            
                    seg = newtree.new_tag('seg')
                    seg.string = tag[1]
                    seg['type'] = 'header'
                    newtree.body.append(seg)
                    
                elif tag[1] == ' ':
                    continue

                else:
                    year = re.findall('[1|2]\d{3}', tag[1])
                    
                    if year:
                        if lg.string:
                            newtree.body.append(lg)
                            lg = newtree.new_tag('lg')
                            lg.string = ''
                        
                        seg = newtree.new_tag('seg')
                        seg.string = tag[1]
                        seg['type'] = 'timestamp'
                        newtree.body.append(seg)
                        
                    else: 
                        lg.string += tag[1] + '\n'
     

            elif tag[0] == 'div':
                
                if tag[2]['class'][0] == 'endnote-block' and tag[1].strip():
                    if lg.string:
                        newtree.body.append(lg)
                        
                    coms = [tag[1] for tag in tags[i+1:] if tag[0] == 'p']

                    cseg = newtree.new_tag('seg')
                    cseg['type'] = 'footnotes'

                    for c in [c for c in coms if c != '\xa0']:
                        l = newtree.new_tag('l')
                        dig = re.search('^\d{1,2}[\.|\)]?(?=[^\d])', c)

                        if dig:
                            s = newtree.new_tag('num')
                            s.string = dig.group().strip('.)')
                            l.append(s)

                            com = newtree.new_tag('footnote')
                            com.string = c.replace(dig.group(),'').strip()
                            l.append(com)

                        else:
                            com = newtree.new_tag('footnote')
                            com.string = c
                            l.append(com)


                        cseg.append(l) 

                    newtree.body.append(cseg)
                    break
                    
            elif tag[0] == 'center':
                if lg.string:
                    newtree.body.append(lg)
                    lg = newtree.new_tag('lg')
                    lg.string = ''

                seg = newtree.new_tag('seg')
                seg.string = tag[1]
                
                if tag[1] == '*' or tag[1] == '* * *':
                    seg['type'] = 'divider'
                    newtree.body.append(seg)
                    
#             elif tag[0] == 'b':
                
#                 if lg.string:
#                     newtree.body.append(lg)
#                     lg = newtree.new_tag('lg')
#                     lg.string = ''

#                 b = newtree.new_tag('seg')
#                 b.string = tag[1]
#                 b['type'] = 'heading'
#                 newtree.body.append(b)
                    
    if cseg != None:
        newtree.body.append(cseg)
    
    return newtree

In [2376]:
test1 = '../../../parsed/003_Ян Баршчэўскі/003_0007.html'
test2 = '../../../parsed/028_Янка Купала/028_0493.html'
test3 = '../../../parsed/031_Максім Багдановіч/031_0012.html'
test4 = '../../../parsed/032_Алесь Гарун/032_0016.html'
test5 = '../../../parsed/032_Алесь Гарун/032_0028.html'
test6 = '../../../parsed/032_Алесь Гарун/032_0147.html'

genTree(getRaw(test6), meta(getRaw(test6)), test6)

<?xml version="1.0" encoding="utf-8"?>
<TEI><teiHeader><fileDesc><titleStmt><title>«Сталася здарэнне дый каля крыніцы...»</title><author><persName ident="21"><forename>Алесь</forename><surname>Гарун</surname></persName></author></titleStmt><sourceDesc><bibl type="originalSource">Гарун А. Сэрцам пачуты звон</bibl><bibl type="digitalSource"><ptr target="https://knihi.com//Ales_Harun/Stalasia_zdarennie_dyj_kala_krynicy_spz.html"/></bibl></sourceDesc><profileDesc><textClass><keywords><term type="form">Паэзія</term><term type="genre">Верш</term></keywords></textClass><langUsage><language ident="be">Беларуская</language></langUsage></profileDesc><textDesc><firstPublicationYear when="1929">1929</firstPublicationYear></textDesc></fileDesc></teiHeader><body><seg type="epigraph"><l type="ep">Ja? konia poi?31.</l></seg><lg>Сталася здарэнне дый каля крыніцы
У хлопца-баламута, ў сіроткі дзявіцы.
Ясь спаткаў Марысю, што вадзіцу брала,
Ды і ну дурыці, каб памандравала.
«Забяры, Марыля, мамчына багацц

In [2049]:
getAllTags(test3)[:10]

[['div',
  '\nЦветы последние милей\nРоскошных  первенцев полей.\nА. Пушкін\n',
  {'class': ['EPIGRAPH']}],
 ['p', 'Цветы последние милей', {}],
 ['p', 'Роскошных  первенцев полей.', {}],
 ['p', 'А. Пушкін', {'class': ['sign']}],
 ['div', '', {'class': ['CLEAR']}],
 ['p', '\xa0', {}],
 ['p', 'Плакала лета, зямлю пакідаючы;', {}],
 ['p', 'Ціха ліліся сьлязінкі на поле.', {}],
 ['p', 'Але прыгожаю восеньню яснаю', {}],
 ['p', 'Там, дзе упалі яны, вырасталі', {}]]

## Generate simplified TXT

Generation a simplified TXT file for NLP processing, including POS-tagging. Only verses proper are included in this file, headings, epigraphs, footnotes are removed

In [2321]:
tree = genTree(getRaw('../../../parsed/028_Янка Купала/028_0493.html'), meta('../../../parsed/028_Янка Купала/028_0493.html'), '../../../parsed/028_Янка Купала/028_0493.html')

In [2314]:
def simpleTxt(tree):
    """Generate simple TXT from <lg> only. Meta elements are omitted"""
    
    pmarks = '.,;:!?-–—()\"[]«»0123456789' # punctuation marks
    txt = ''    
    for lg in tree.body.find_all('lg'):
        for char in lg.string.lower():
            if char in pmarks:
                continue
            else:
                txt += char
        txt += '\n'
        
    return txt

In [2320]:
simpleTxt(tree)

'на поўначы сумнай у фіншчыне дзіўнай\nракой з вадаспадам заліўся пакат\nвуоксаю рэчка завецца у фінаў\nіматрай завуць вадаспад\n\nклакочуць рагочуць іматрыны хвалі\nна цэлыя вёрсты шумгоман стаіць\nнем толечы каменны бераг як з сталі\nі зараснік хвойны маўчыць\n\nшалее іматра між каменных глыбаў\nза хваляю хвалю імчыць к нізіне\nяк хмар недаступных махнатыя скібы\nадна адну схопіць піргне\n\nадна з адной рынуцца ўглыб як магілу\nтам скруцяцца ўзнімуцца клубам дугой\nрассыплюцца пухам рассеюцца пылам\nзноў выскачаць к небу гарой\n\nзірнуць ззіхануцца сыпнуцца на скалы\nўсім дантаўскім процьмам на здзіў\nі люнуць на волю забыўшыся шалаў\nплывуць паміж пустак і ніў\n\nдругія іх зменяць і пеняцца ў зломе\nадвечністым шумам калышуць прастор\nсвабодай сваёю і роднай суомі\nсягнуць быццам хочуць да зор\n\nстаіш і глядзіш на бунтоўныя воды\nі сэрца лялеецца ў сцішнай жальбе\nўсё слухаеш нема як стогнуць нягодай\nды як бы ўсё клічуць цябе\n\nхадзі к нам бяспутнік кінь долю на свеце\nспачын век

In [1667]:
with open('../../simplified.txt', 'w') as s:
    s.write(simpleTxt(tree))

## POS-tagging

Tagging is done by UDPipe/UFAL with the use of [UD Belarusian HSE](https://github.com/UniversalDependencies/UD_Belarusian-HSE) tagger model

In [570]:
import subprocess

In [562]:
!git clone https://github.com/ufal/udpipe

fatal: destination path 'udpipe' already exists and is not an empty directory.


In [564]:
%cd ./udpipe/src
!make 

/Users/nemkovich/Studies/HSE/BPCorpus/_convert/udpipe/src
clang++ -o .build/udpipe.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=generic -fvisibility=hidden -mmacosx-version-min=10.7 -stdlib=libc++ -O3 -I. -c udpipe.cpp
clang++ -o .build/utils-options.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=generic -fvisibility=hidden -mmacosx-version-min=10.7 -stdlib=libc++ -O3 -I. -c utils/options.cpp
clang++ -o .build/morphodita-derivator-derivation_formatter.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=generic -fvisibility=hidden -mmacosx-version-min=10.7 -stdlib=libc++ -O3 -I. -c morphodita/derivator/derivation_formatter.cpp
clang++ -o .build/morphodita-derivator-derivator_dictionary.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=generic -fvisibility=hidden -mmacosx-version-min=10.7 -stdlib=libc++ -O3 -I. -c morphodita/derivator/derivator_dictionary.cpp
clang++ -o .build/morphodita-morpho-czech_morpho.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=

clang++ -o .build/parsito-transition-transition_system_projective.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=generic -fvisibility=hidden -mmacosx-version-min=10.7 -stdlib=libc++ -O3 -I. -c parsito/transition/transition_system_projective.cpp
clang++ -o .build/parsito-transition-transition_system_swap.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=generic -fvisibility=hidden -mmacosx-version-min=10.7 -stdlib=libc++ -O3 -I. -c parsito/transition/transition_system_swap.cpp
clang++ -o .build/parsito-tree-tree.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=generic -fvisibility=hidden -mmacosx-version-min=10.7 -stdlib=libc++ -O3 -I. -c parsito/tree/tree.cpp
clang++ -o .build/parsito-tree-tree_format.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=generic -fvisibility=hidden -mmacosx-version-min=10.7 -stdlib=libc++ -O3 -I. -c parsito/tree/tree_format.cpp
clang++ -o .build/parsito-tree-tree_format_conllu.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=g

clang++ -o .build/tokenizer-multiword_splitter_trainer.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=generic -fvisibility=hidden -mmacosx-version-min=10.7 -stdlib=libc++ -O3 -I. -c tokenizer/multiword_splitter_trainer.cpp
clang++ -o .build/trainer-trainer.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=generic -fvisibility=hidden -mmacosx-version-min=10.7 -stdlib=libc++ -O3 -I. -c trainer/trainer.cpp
clang++ -o .build/trainer-trainer_morphodita_parsito.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=generic -fvisibility=hidden -mmacosx-version-min=10.7 -stdlib=libc++ -O3 -I. -c trainer/trainer_morphodita_parsito.cpp
clang++ -o .build/trainer-training_failure.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=generic -fvisibility=hidden -mmacosx-version-min=10.7 -stdlib=libc++ -O3 -I. -c trainer/training_failure.cpp
clang++ -o .build/unilib-uninorms.osx-clang-normal.o -MMD -MP -std=c++11 -W -Wall -mtune=generic -fvisibility=hidden -mmacosx-version-min=10.7 -st

In [2429]:
# tagger file to be used for processing
tagger = '../../tagger/be-ud-2.7-tagger-20210115.udpipe'

In [577]:
!echo "Гэта" | ./udpipe --tokenize --tag '../../tagger/be-ud-2.7-tagger-20210115.udpipe'

Loading UDPipe model: done.
# newdoc
# newpar
# sent_id = 1
# text = Гэта
1	Гэта	гэта	PRON	_	Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing|PronType=Dem	_	_	_	SpacesAfter=\n



In [572]:
output = subprocess.check_output(["./udpipe", "--tokenize", "--tag", tagger], input=txt[0], text=True) 

In [575]:
output

'# newdoc\n# newpar\n# sent_id = 1\n# text = На поўначы сумнай, у Фіншчыне дзіўнай, Ракой з вадаспадам заліўся пакат; Вуоксаю рэчка завецца у фінаў, Іматрай завуць вадаспад.\n1\tНа\tна\tADP\tIN\t_\t_\t_\t_\t_\n2\tпоўначы\tпоўнач\tNOUN\tNN\tAnimacy=Inan|Case=Loc|Gender=Fem|Number=Sing\t_\t_\t_\t_\n3\tсумнай\tсумны\tADJ\tJJL\tCase=Gen|Degree=Pos|Gender=Fem|Number=Sing\t_\t_\t_\tSpaceAfter=No\n4\t,\t,\tPUNCT\tPUNCT\t_\t_\t_\t_\t_\n5\tу\tу\tADP\tIN\t_\t_\t_\t_\t_\n6\tФіншчыне\tФіншчына\tPROPN\tNNP\tAnimacy=Inan|Case=Loc|Gender=Fem|Number=Sing\t_\t_\t_\t_\n7\tдзіўнай\tдзіўны\tADJ\tJJL\tCase=Ins|Degree=Pos|Gender=Fem|Number=Sing\t_\t_\t_\tSpaceAfter=No\n8\t,\t,\tPUNCT\tPUNCT\t_\t_\t_\t_\tSpacesAfter=\\n\n9\tРакой\tрака\tNOUN\tNN\tAnimacy=Inan|Case=Ins|Gender=Fem|Number=Sing\t_\t_\t_\t_\n10\tз\tз\tADP\tIN\t_\t_\t_\t_\t_\n11\tвадаспадам\tвадаспад\tNOUN\tNN\tAnimacy=Inan|Case=Ins|Gender=Masc|Number=Sing\t_\t_\t_\t_\n12\tзаліўся\tзаліцца\tVERB\tVBC\tAspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|T

The output is produced in tab-separated UD [CoNLL-U format](https://universaldependencies.org/format.html)

In [1440]:
def getPos(text):
    """Transform ConNLL-U format into Python data structures"""
    
    output = subprocess.check_output(["./udpipe", "--tokenize", "--tag", tagger], input=text, text=True) 
    lemlist = [line.strip().split('\t') for line in output.split('\n') if '\t' in line]
    posTags = []

    for l in lemlist:
        if '=' in l[9]:
            space = l[9].split('=')[1]
        else:
            space = None

        posTags.append([l[1], l[2].lower(), l[3], l[4], l[5], space])
        
    return posTags

In [1442]:
getPos('лепшы')

[['лепшы',
  'лепшы',
  'ADJ',
  'ORD',
  'Case=Nom|Degree=Pos|Gender=Masc|Number=Sing',
  '\\n']]

In [2316]:
def posTag(tree):
    """Tokenize previously generated simple tree and populate it with POS tags"""
    
    newtree = tree
    lgStr = ''
    lgs = [lg for lg in tree.body.find_all_next('lg')]
    
    for lg in lgs: 
        lgStr += lg.string + '\n\n'
        
    posTags = getPos(lgStr)
    
    posLgs = [] #stanza list
    posLg = [] #stanza
    posL = [] #single line
    
    for pos in posTags:         
        if pos[5] == '\\n':
            posL.append([pos[0], pos[1], pos[2], pos[3], pos[4], None])
            posLg.append(posL)
            posL = []
            
        elif pos[5] == '\\n\\n\\n':
            posL.append([pos[0], pos[1], pos[2], pos[3], pos[4], None])
            posLg.append(posL)
            posLgs.append(posLg)
            posL = []
            posLg = []
            
        else:
            posL.append(pos)
            
    if len(posLgs) == len(lgs):        

        for i,lg in enumerate(lgs):
            lg.string = ''
            
            for posTags in posLgs[i]:
                
                line = newtree.new_tag('l')

                for pos in posTags:                     
                    if pos[2] == 'PUNCT':
                        pc = newtree.new_tag('pc')
                        pc.string = pos[0]
                        
                        if pos[5]:
                            if 'No' in pos[5]: 
                                pc['join'] = 'right'
                        
                        line.append(pc)

                    else:
                        dig = re.search('(?=[\w]*)\d{1,2}', pos[0])
                        if not dig:
                            w = newtree.new_tag('w')
                            w.string = pos[0]
                            w['lemma'] = pos[1]
                            w['pos'] = pos[2]
                            w['ppos'] = pos[3]
                            if pos[4] != '_':
                                w['msd'] = pos[4]

                            if pos[5]:
                                if 'No' in pos[5]: 
                                    w['join'] = 'right'
                                    
                            line.append(w)
                                    
                        else:
                            s = newtree.new_tag('sup')
                            s.string = dig.group()
                            
                            w = newtree.new_tag('w')
                            w.string = pos[0].replace(dig.group(),'')
                            w['lemma'] = pos[0].replace(dig.group(),'')
                            w['pos'] = pos[2]
                            w['ppos'] = pos[3]
                            if pos[4] != '_':
                                w['msd'] = pos[4]
                                
                            if pos[5]:
                                if 'No' in pos[5]: 
                                    s['join'] = 'right'
                            
                            line.append(w)
                            line.append(s)
                        
                lg.append(line)
                    
    
    return newtree

In [2318]:
tpath = '../../../parsed/013_Уладзіслаў Сыракомля/013_0004.html'
testtree = posTag(genTree(getRaw(tpath), meta(tpath), tpath))
testtree

<?xml version="1.0" encoding="utf-8"?>
<TEI><teiHeader><fileDesc><titleStmt><author><persName/></author></titleStmt><sourceDesc><bibl type="digitalSource"><ptr target="https://knihi.com//Uladzislau_Syrakomla/Karali.html"/></bibl></sourceDesc><profileDesc><textClass><keywords><term type="form">Паэзія</term></keywords></textClass><langUsage><language ident="be">Беларуская</language></langUsage></profileDesc><textDesc/></fileDesc></teiHeader><body><seg type="italic">З польскай — У. Сыракомлі</seg><lg><l><w lemma="як" msd="Degree=Pos" pos="ADV" ppos="WRB">Як</w><w lemma="ісці" msd="Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act" pos="VERB" ppos="VBC">ішоў</w><w lemma="я" msd="Case=Nom|Number=Sing|Person=1|PronType=Prs" pos="PRON" ppos="PRP">я</w><w lemma="у" pos="ADP" ppos="IN">ў</w><w lemma="бы" msd="Case=Loc|Degree=Pos|Gender=Fem|Number=Sing" pos="ADJ" ppos="JJL">бой</w><w join="right" lemma="кіпяча" msd="Aspect=Imp|Tense=Pres|VerbForm=Conv|Voice=Act" pos="

In [2319]:
with open ('../../postest.xml', 'w') as s:
    s.write(str(testtree))

## Generate search index string
Index string is the text that will be stored in FTS5 virtual table in database for search purposes.

In [1818]:
inpath = '../../../parsed/028_Янка Купала/028_0493.html'

In [1819]:
testtree = posTag(genTree(inpath)[1])

In [1820]:
def searchInd(fulltree):
    """Collect tokens and lemmas"""
    indexList = []
    
    for w in fulltree.body.find_all(['w']):
        if w.string:
            token = w.string.lower()
            if w['lemma']:
                lemma = w['lemma'].lower()
                indexList.extend([token, lemma])

            else:
                indexList.extend(token)

    indexString = ' '.join(set(indexList))      
    return indexString

In [1821]:
searchInd(testtree)

'заліцца цябе бераг глядзіш дзіўнай нас не спаўём рассеюцца казка сцішнай іматр свеце свабодай цэлыя фіншчына клакокаць нізіне бунтоўныя ў ды воляю другі кінь небу зложыць клічуць забыццё сонцам за скібы піргнуць бы долю сягнуць скруціцца стаіць зваць хвалі мы свабода адвечністым ўсё рака стаіш вясць кіпучай схопіць рынуцца да зора хмар неба быць вадзіца рассецца вёрст дзіўны нем толечы вёрсты магіла ніў ззіхануцца слухаеш глыбаў прастор вуоксаю глядзіць клікаць люнуць забыўшыся нам скруцяцца шала злажыць змяніць ніва процьм вада шум забыцці іх каменных будзем хвалю хмара зор твае на усё між цэлы шумам калысаць ўсім дантаўскі маўчаць роднай процьмам гутарка жальбе рассыплюцца іматрыны зломе бунтоўны суомі вадаспад песню пылам сцюдзёнай кіпучая шалее векавечны дам хадзець як звацца свет калышуць іматра народ завуць дамо вадзіцай хадзі гарой сцішны адзін сумнай нашы клуб пясняр скіба фінаў сыпнуцца грудзі лялеецца каменны і шалаў сцюдзёны адну зараснік клубам там быццам доля сваёя сталь 

## Generate HTML for website view
New HTML markups includes styling of semantic elements and POS-tags.

In [1928]:
def mkHtml(tree):
    """Generate HTML for website viewing"""

    htmltree = BeautifulSoup('', 'html')

    for i,item in enumerate(tree.body.find_all(['lg', 'seg'])):

        br = htmltree.new_tag('br')

        if item.name == 'lg':  
            st = htmltree.new_tag('span')
            st['class'] = 'stanza'
            st.string = ''

            for line in item.find_all('l'):
                p = htmltree.new_tag('p')
                p.string = ''

                for token in line.find_all():
                    ssp = htmltree.new_tag('span')
                    ssp['class'] = 'space'
                    ssp.string = ' '

                    if token.name == 'w':
                        wsp = htmltree.new_tag('span')
                        wsp.string = token.text
                        wsp['class'] = 'token'
                        wsp['title'] = ' '.join(token.attrs.values())
                        p.append(wsp)

                        if 'join' not in token.attrs:
                            p.append(ssp)


                    elif token.name == 'pc':
                        psp = htmltree.new_tag('span')
                        psp.string = token.text
                        psp['class'] = 'pc'
                        p.append(psp)

                        if 'join' not in token.attrs:
                            p.append(ssp)

                    elif token.name == 'sup':
                        sup = htmltree.new_tag('sup')
                        sup.string = token.text
                        p.append(sup)

                        if 'join' not in token.attrs:
                            p.append(ssp)

                st.append(p)

            htmltree.append(st)
            htmltree.append(br)

        if item.name == 'seg':
            seg = htmltree.new_tag('span')

            if 'type' in item.attrs:
                seg['class'] = item['type']
                
            if seg['class'] == 'footnotes':
                p = htmltree.new_tag('p')
                p.string = ''
                for item in tree.body.find_all(['num','footnote']):
                    if item.name == 'num':
                        sup = htmltree.new_tag('sup')
                        sup.string = item.text
                        p.append(sup)
                        p.append(ssp)

                    elif item.name == 'footnote':
                        ftnt = htmltree.new_tag('span')
                        ftnt['class'] = 'footnote'
                        ftnt.string = item.text
                        p.append(ftnt)
                        seg.append(p)
                        p = htmltree.new_tag('p')
                        p.string = ''
                        
            elif seg['class'] == 'epigraph': 
                for line in item.find_all('l'):
                    p = htmltree.new_tag('p')
                    p.string = line.text
                    seg.append(p)
                
            else:
                seg.string = item.text

            htmltree.append(seg)
            htmltree.append(br)
            
    return htmltree

In [2014]:
html = mkHtml(posTag(genTree('../../../parsed/019_Янка Лучына/019_0001.html')[1]))

In [2015]:
html.find('span', {'class' : 'epigraph'})

In [2016]:
with open('../../html2.html', 'w', encoding='utf-8') as h:
    h.write(str(html))

## Create database

In [1935]:
import sqlite3
from sqlite3 import Error

In [1938]:
dbfile = '../../bpc.sqlite3'

In [1941]:
def createDb(dbfile):
    """Create DB"""
    conn = None
    try:
        conn = sqlite3.connect(dbfile)
        print(sqlite3.version)
    except Error as e:
        print(e)
    finally:
        if conn:
            conn.close()

In [1940]:
createDb(dbfile)

2.6.0


In [2089]:
def connect(dbfile):
    """DB connection"""
    
    conn = None
    try:
        conn = sqlite3.connect(dbfile, isolation_level = None)
        return conn
    except Error as e:
        print(e)

    return conn

In [2090]:
def execute(dbfile, sql):
    """Execute request"""
    
    cur = connect(dbfile).cursor()
    cur.execute(sql) 
    cur.close()
    connect(dbfile).close() 

In [2091]:
def getData(dbfile, sql):
    """Retrieve DB rows"""
    
    conn = sqlite3.connect(dbfile)
    cur = conn.cursor()
    cur.execute(sql) 
    rows = cur.fetchall()
    cur.close()
    connect(dbfile).close() 
    
    if rows:
        return rows

## Create DB tables

### `authors` table

Schema:
- id - primary key
- name
- forename
- surname
- life_born
- life_died
- gender
- wiki_url

4 first values will be populated from `authDict`.

In [1937]:
authDict['Максім Багдановіч']

{'name': {'Forename': 'Максім', 'Surname': 'Багдановіч'}, 'id': 2}

In [1948]:
schA = 'CREATE TABLE IF NOT EXISTS authors (id integer PRIMARY KEY, \
                                            name text NOT NULL, \
                                            forename text, \
                                            surname text, \
                                            life_born integer, \
                                            life_died integer, \
                                            gender integer, \
                                            wiki_url text);'

In [1951]:
# create authors table
execute(dbfile, schA)

In [1958]:
getData(dbfile, 'PRAGMA table_info(authors)')

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 1, None, 0),
 (2, 'forename', 'TEXT', 0, None, 0),
 (3, 'surname', 'TEXT', 0, None, 0),
 (4, 'life_born', 'INTEGER', 0, None, 0),
 (5, 'life_died', 'INTEGER', 0, None, 0),
 (6, 'gender', 'INTEGER', 0, None, 0),
 (7, 'wiki_url', 'TEXT', 0, None, 0)]

In [1982]:
authDict['Францішак Багушэвіч']

{'name': {'Forename': 'Францішак', 'Surname': 'Багушэвіч'}, 'id': 3}

In [1979]:
def auTab(adict):
    """Populate authors table"""
    
    conn = sqlite3.connect(dbfile, isolation_level = None)
    cur = conn.cursor()
    cur.execute('delete from authors') # reset table 

    for a in [(adict[i]['id'], i, adict[i]['name']) for i in adict.keys()]:
        aid = a[0]
        name = a[1]
        if isinstance(a[2], dict):
            forename = a[2]['Forename']
            surname = a[2]['Surname'] 
            cur.execute('insert into authors (id, name, forename, surname) values (?, ?, ?, ?)', \
                        (aid, name, forename, surname))
        else:
            cur.execute('insert into authors (id, name) values (?, ?)', (aid, name))
            
    print('+')

In [1980]:
auTab(authDict)

+


In [2086]:
getData(dbfile, 'SELECT * FROM authors WHERE id = 3;')

[(3, 'Францішак Багушэвіч', 'Францішак', 'Багушэвіч', None, None, None, None)]

### `text_meta` table

- id - primary key
- author_id - foreign key `authors (id)`
- author_name
- title
- editions - bibl. information
- genre
- pub_year - publication year
- cr_year - creation year
- fp_year - first published
- tr_author_id - translator, foreign key `authors (id)`
- tr_author_name
- tr_lang - original language, translation only
- orfl - original file name for debugging

This table will be populated with `meta()` function.

In [2058]:
lNames = {'deu':['de', 'Нямецкая' ], \
          'lat': ['la', 'Лацінская'], \
          'pol': ['pl', 'Польская'], \
          'rus': ['ru', 'Руская'], \
          'ukr': ['uk', 'Украінская'], \
          'yid': ['yi', 'Ідыш']}

In [2059]:
schTm = 'CREATE TABLE IF NOT EXISTS text_meta (id integer PRIMARY KEY, \
                                                author_id integer, \
                                                author_name text, \
                                                title text, \
                                                edition text, \
                                                genre text, \
                                                pub_year integer, \
                                                cr_year integer, \
                                                fp_year integer, \
                                                tr_author_id integer, \
                                                tr_author_name text, \
                                                tr_lang text, \
                                                orfl text, \
                                                FOREIGN KEY (author_id) REFERENCES authors (id), \
                                                FOREIGN KEY (tr_author_id) REFERENCES authors (id));'

In [2060]:
# create meta table
execute(dbfile, schTm)

In [2061]:
getData(dbfile, 'PRAGMA table_info(text_meta)')

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'author_id', 'INTEGER', 0, None, 0),
 (2, 'author_name', 'TEXT', 0, None, 0),
 (3, 'title', 'TEXT', 0, None, 0),
 (4, 'edition', 'TEXT', 0, None, 0),
 (5, 'genre', 'TEXT', 0, None, 0),
 (6, 'pub_year', 'INTEGER', 0, None, 0),
 (7, 'cr_year', 'INTEGER', 0, None, 0),
 (8, 'fp_year', 'INTEGER', 0, None, 0),
 (9, 'tr_author_id', 'INTEGER', 0, None, 0),
 (10, 'tr_author_name', 'TEXT', 0, None, 0),
 (11, 'tr_lang', 'TEXT', 0, None, 0),
 (12, 'orfl', 'TEXT', 0, None, 0)]

### `text_file` table

- id - primary key, foreign key `text_meta (id)`
- xml
- txt 
- html

This table contains all file assets.

In [2002]:
schTf = 'CREATE TABLE IF NOT EXISTS text_files (id integer NOT NULL PRIMARY KEY, \
                                                xml text, \
                                                txt text, \
                                                html text, \
                                                FOREIGN KEY (id) REFERENCES text_meta (id));'

In [2003]:
# create file table
execute(dbfile, schTf)

In [2004]:
getData(dbfile, 'PRAGMA table_info(text_files)')

[(0, 'id', 'INTEGER', 1, None, 1),
 (1, 'xml', 'TEXT', 0, None, 0),
 (2, 'txt', 'TEXT', 0, None, 0),
 (3, 'html', 'TEXT', 0, None, 0)]

### Virtual search index table

The table will use SQLite3 FTS5 extention.

- id - foreign key `text_meta (id)`
- author
- title
- wordlist - string created by `searchInd()` function

In [2005]:
vt = 'CREATE VIRTUAL TABLE search USING fts5(id, author, title, wordlist);'

In [2006]:
execute(dbfile, vt)

In [2085]:
getData(dbfile, 'PRAGMA table_info(search)')

[(0, 'id', '', 0, None, 0),
 (1, 'author', '', 0, None, 0),
 (2, 'title', '', 0, None, 0),
 (3, 'wordlist', '', 0, None, 0)]

# Corpus database population

The results of the previously defined functions are generated for every file and inserted into database.

- `meta()` is the source of `text_meta` table values
- `genTree() -> posTag()`, `simpleTxt()`, `mkHtml()` are written to `text_files` as well as to OS file system
- `searchInd()` and some `meta()` values are stored in `search` virtual table

In [2052]:
import os.path
from datetime import datetime

In [2055]:
folders = ['../../assets', '../../assets/xml', '../../assets/txt', '../../assets/html']

In [2338]:
for f in folders:
    if not os.path.isdir(f):
        os.mkdir(f)

In [2111]:
# test values
execute(dbfile, 'INSERT INTO text_meta (id, orfl) values  (2, "031_0012")')

In [2112]:
execute(dbfile, 'delete from text_meta')

In [2278]:
def addedFiles(dbfile):
    """Get last ID and added files list"""
    rows = getData(dbfile, 'select * from text_meta order by id desc')
    
    if rows:
        lastId = rows[0][0]
        files = [row[12] for row in rows]
        
    else:
        lastId = 0
        files = []
    
    return lastId, files

In [2279]:
addedFiles(dbfile)

(20,
 ['003_0011',
  '003_0010',
  '003_0009',
  '003_0008',
  '003_0007',
  '013_0008',
  '013_0006',
  '013_0005',
  '013_0004',
  '013_0003',
  '013_0002',
  '013_0001',
  '007_0004',
  '007_0002',
  '007_0001',
  '003_0011',
  '003_0010',
  '003_0009',
  '003_0008',
  '003_0007'])

In [2323]:
meta(getRaw(test1))

{'Authors': 'Ян Баршчэўскі',
 'CreationYear': '13.07.1841',
 'Edition': 'Ян Баршчэўскі. Выбраныя творы. Менск, МФ «Беларускі кнігазбор», 1998.',
 'FirstPublicationYear': '1842',
 'LangOrig': 'pol',
 'Pravapis': 'A1957',
 'PublicationYear': '1998',
 'SectionAuthor': 'Балады',
 'StyleGenre': 'Балада',
 'Title': 'Дзве бярозы',
 'Title2': 'З народных паданняў',
 'Translation': 'Кастусь Цьвірка',
 'Forename': 'Ян',
 'Surname': 'Баршчэўскі',
 'aid': 7,
 'trForename': 'Кастусь',
 'trSurname': 'Цьвірка',
 'tid': 77}

In [2337]:
def populate(filelist, dbfile):
    
    added = addedFiles(dbfile)
    
    txid = added[0] + 1
    done = added[1]
    
    cur = connect(dbfile).cursor()
    
    for file in filelist:

        fname = None
        fname = file.split('/')[-1].split('.')[0]
        
        raw = None
        metadict = None
        tree = None
        txt = None
        xml = None
        search = None
        html = None

        if fname not in done:

            # generate files
            
            raw = getRaw(file)
            metadict = meta(raw)
            tree = genTree(raw, metadict, file)
            txt = simpleTxt(tree)
            xml = posTag(tree)
            search = searchInd(xml)
            html = mkHtml(xml)
            
            # text_meta
            
            metaTuple = (txid, \
                            metadict['aid'], \
                            metadict['Authors'], \
                            metadict['Title'], \
                            metadict['Edition'], \
                            metadict['StyleGenre'], \
                            metadict['PublicationYear'], \
                            metadict['CreationYear'], \
                            metadict['FirstPublicationYear'], \
                            metadict['tid'], \
                            metadict['Translation'], \
                            metadict['LangOrig'], \
                            fname)

            cur.execute('INSERT INTO text_meta ( id, \
                                                author_id, \
                                                author_name, \
                                                title, \
                                                edition, \
                                                genre, \
                                                pub_year, \
                                                cr_year, \
                                                fp_year, \
                                                tr_author_id, \
                                                tr_author_name, \
                                                tr_lang, \
                                                orfl) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', metaTuple)
            
            
            # text_files
            
            fileTuple = (txid, str(xml), str(txt), str(html))
            
            cur.execute('INSERT INTO text_files ( id, \
                                                xml, \
                                                txt, \
                                                html ) values (?, ?, ?, ?)', fileTuple)
            # search
            
            searchTuple = (txid, metadict['Authors'].lower(), metadict['Title'].lower(), search)
            
            cur.execute('INSERT INTO search ( id, \
                                                author, \
                                                title, \
                                                wordlist ) values (?, ?, ?, ?)', searchTuple)
            
            
            # save files
            
            with open('../../assets/xml/' + str(txid) + '.xml', 'w') as xmlfile:
                xmlfile.write(str(xml))
                
            with open('../../assets/txt/' + str(txid) + '.txt', 'w') as txtfile:
                txtfile.write(str(txt))
                
            with open('../../assets/html/' + str(txid) + '.html', 'w') as htmlfile:
                htmlfile.write(str(html))
            
            
            txid += 1
            done.append(fname)
            

    cur.close()
    connect(dbfile).close() 

In [2387]:
populate(pList[1][:3068], dbfile)

In [2355]:
texts = pList[1][:3068]
len(texts)

3068

In [2373]:
pList[1][1869]

'../../../parsed/032_Алесь Гарун/032_0147.html'

In [2328]:
for i,f in enumerate(pList[1]):
    if '032_0015' in f:
        print(pList[1][i + 1], i)

../../../parsed/032_Алесь Гарун/032_0016.html 1754
../../../parsed/selected/032_0016.html 3123


In [2358]:
for i,f in enumerate(pList[1]):
    if '032_0015' in f:
        print(pList[1][i + 1], i)

-----

## Search

In [2382]:
req = 'SELECT id FROM search WHERE wordlist MATCH "суомі";'

In [2385]:
results = [r[0] for r in getData(dbfile, req)]
results

[490, 747]