# Russian Synodal Bible (1885)

## Import

### Libraries

In [1]:
import os
import re
import json
import time
import numpy as np
import pandas as pd 
from lxml import etree
import natasha
from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger, NewsSyntaxParser, NewsNERTagger, PER, NamesExtractor, Doc
from bs4 import BeautifulSoup
import requests

In [2]:
# Build the natasha pipeline components once so they can be reused for every text.
segmenter = Segmenter()
morph_vocab = MorphVocab()
# a single embedding instance is shared by the morph tagger, syntax parser and NER tagger
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
names_extractor = NamesExtractor(morph_vocab)

### Definitions

In [41]:
# input / output locations
texts = './texts/fiction/'
bibleTXT = './texts/bible/sinodalnyi-perevod.txt'
booksJSON = './texts/bible/booksDict.json'
bibleJSON = './texts/bible/bible.json'
bibleXML = './texts/bible/bible.xml'

# column layouts for the library, token and Bible tables
libCols = ['author', 'pub_year', 'title', 'text']
tokenOHCO = ['title', 'part_num', 'para_num', 'sent_num', 'token_num']
bibleOHCO = ['test', 'book', 'chap', 'verse']
tokenCols = [
    'p_id', 'start', 'stop', 'text', 'token_id', 'head_id', 'rel', 'pos', 'lemma',
    'anim', 'aspect', 'case', 'degree', 'gender', 'mood', 'number', 'person',
    'tense', 'verb_form', 'voice',
]

# Header-line patterns. Raw strings keep \s and \d as regex escapes instead of
# relying on Python passing unrecognised string escapes through unchanged.
chap_lines_re = r'^===\s(\d{1,3})\s===$'   # e.g. "=== 12 ===" -> captures "12"
book_lines_re = r'^==\s(.+)\s==$'          # e.g. "== Name ==" -> captures "Name"
# title text used to locate the start of the New Testament
matt_name = 'От Матфея святое благовествование'

In [4]:
# Load the book lookup table. The encoding is pinned to UTF-8 (the JSON default)
# so the result does not depend on the platform's locale encoding.
with open(booksJSON, encoding='utf-8') as json_file:
    booksDict = json.load(json_file)

## Pre-Processing

### Import Text into DF

In [None]:
# Read the raw text line by line; the source file is opened as windows-1251.
with open(bibleTXT, encoding='windows-1251') as source_file:
    bibleText = source_file.readlines()

# one row per source line, held in a single 'text' column
bibliiaDf = (
    pd.DataFrame(bibleText)
    .rename(columns={0: 'text'})
    .dropna()
)
bibliiaDf

### Tidy Up

In [None]:
# Remove newlines and selected punctuation ([, ], _, -) from every line, then drop
# the lines that end up empty, keeping only header and clean text lines.
# The regex flag is given explicitly on each call: pandas >= 2.0 treats the pattern
# as a literal string by default, which would silently leave the text untouched.
bibliiaDf = bibliiaDf.assign(
    text=bibliiaDf['text']
    .str.replace('\n', '', regex=False)
    .str.replace(r'[\[\]_-]', '', regex=True)
)
# .copy() so the later column assignments work on an independent frame, not a slice
bibliiaDf = bibliiaDf.loc[bibliiaDf.text != ''].copy()
bibliiaDf

### Find Parts

In [None]:
# Locate the structural rows of the text by their index labels:
#   chap_lines - rows matching the chapter-header pattern
#   book_lines - rows matching the book-header pattern
#   test_line  - rows containing the Matthew title, used as the OT/NT boundary
chapter_mask = bibliiaDf.text.str.contains(chap_lines_re)
book_mask = bibliiaDf.text.str.contains(book_lines_re)
matthew_mask = bibliiaDf.text.str.contains(matt_name)

chap_lines = bibliiaDf[chapter_mask].index
book_lines = bibliiaDf[book_mask].index
test_line = bibliiaDf[matthew_mask].index

# all header labels in one sorted Index, kept for dropping those rows later
title_lines = test_line.append([chap_lines, book_lines]).sort_values()
title_lines

### Assign OHCO Labels

In [None]:
# ['test']: rows before the first Matthew line are labelled 'O', all others 'N'
bibliiaDf[bibleOHCO[0]] = np.where(bibliiaDf.index < test_line[0], 'O', 'N')
# ['book']: title captured on book-header rows only (other rows stay NaN until the fill below);
# expand=False returns a Series, which is what a single-column assignment expects
bibliiaDf[bibleOHCO[1]] = bibliiaDf.loc[book_lines, 'text'].str.extract(book_lines_re, expand=False)
# ['chap']: chapter number captured on chapter-header rows only
bibliiaDf[bibleOHCO[2]] = bibliiaDf.loc[chap_lines, 'text'].str.extract(chap_lines_re, expand=False)
# ['verse']: split the leading verse number from the rest of the line at the first space.
# `n` must be passed by keyword: positional use is rejected by pandas >= 2.0.
verse_and_text = bibliiaDf['text'].str.split(' ', n=1, expand=True)
bibliiaDf[bibleOHCO[3]] = verse_and_text[0]
bibliiaDf['text'] = verse_and_text[1]
# fill book and chapter labels down to the rows below each header
bibliiaDf[bibleOHCO[1:3]] = bibliiaDf[bibleOHCO[1:3]].ffill()
# drop the header rows and renumber what is left from 1 to give each verse an id
bibliiaDf = bibliiaDf.drop(title_lines, axis=0)
bibliiaDf.index = range(1, len(bibliiaDf) + 1)
bibliiaDf.index.name = 'v_id'
# reorder columns
bibliiaDf = bibliiaDf[['test', 'book', 'chap', 'verse', 'text']]
bibliiaDf

### Swap Full Book Name for Abbreviation

In [None]:
# (1-based position, full book title) pairs in order of first appearance
book_nums = list(enumerate(bibliiaDf.book.unique(), start=1))

In [None]:
# Swap every full book title for the 'eng_abbr' stored in booksDict under the
# book's 1-based position (as a string key).
# A title -> abbreviation dict applied with .map() handles each row independently,
# so it does not rely on a book's rows being contiguous and avoids scanning the
# whole column once per book.
abbr_by_title = {title: booksDict[str(num)]['eng_abbr'] for num, title in book_nums}
bibliiaDf['book'] = bibliiaDf['book'].map(abbr_by_title)
bibliiaDf.sample(10)

### Export to JSON

In [None]:
# write the verse table to bibleJSON, keyed by the frame's index labels (orient='index')
bibliiaDf.to_json(bibleJSON, orient='index')

### Make XML

In [None]:
# create the <bible> root element and print it and its tag as a quick check
root = etree.Element("bible")
print(root)
print(root.tag)
#bibleTree = etree.ElementTree()

In [None]:
# TODO: build the per-verse XML here; `pass` keeps the unfinished loop valid Python
for v in BibDf.index.to_list():
    pass

In [None]:
BibDf

In [None]:
# add an empty <testament> child under the root
root.append(etree.Element("testament"))

In [None]:
# NOTE(review): the tag name is an empty string, which looks like an unfinished
# placeholder; lxml is expected to reject it — confirm the intended tag
root.append(etree.Element(""))

In [None]:
# serialise the tree built so far, indented for reading
etree.tostring(root, pretty_print=True)

## Processing

In [5]:
# reload the verse table from bibleJSON (the file written by the export step)
BibDf = pd.read_json(bibleJSON, orient='index')
BibDf.index.name = 'v_id'
# text-only and label-only (bibleOHCO columns) selections of the same rows
BibTextDf = BibDf[['text']]
BibLibDf = BibDf[bibleOHCO]
BibDf

Unnamed: 0_level_0,test,book,chap,verse,text
v_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,O,Gen,1,1,В начале сотворил Бог небо и землю.
2,O,Gen,1,2,"Земля же была безвидна и пуста, и тьма над без..."
3,O,Gen,1,3,И сказал Бог: да будет свет. И стал свет.
4,O,Gen,1,4,"И увидел Бог свет, что он хорош, и отделил Бог..."
5,O,Gen,1,5,"И назвал Бог свет днем, а тьму ночью. И был ве..."
...,...,...,...,...,...
37106,N,Rev,22,17,И Дух и невеста говорят: прииди! И слышавший д...
37107,N,Rev,22,18,И я также свидетельствую всякому слышащему сло...
37108,N,Rev,22,19,и если кто отнимет что от слов книги пророчест...
37109,N,Rev,22,20,"Свидетельствующий сие говорит: ей, гряду скоро..."


In [6]:
# Label -> ordinal lookups: each testament / book label is mapped to the 0-based
# position at which it first appears in BibDf. Note that booksDict is rebound here.
testsDict = {label: position for position, label in enumerate(BibDf.test.unique())}
booksDict = {label: position for position, label in enumerate(BibDf.book.unique())}

In [7]:
# One row per testament, with all of its verse text joined by single spaces.
# Grouping by the bare column name (not a one-element list) keeps each group key
# a plain label; with a list, pandas >= 2.0 yields 1-tuples such as ('N',).
TestsDf = pd.DataFrame(
    [(test, ' '.join(verses)) for test, verses in BibDf.groupby(bibleOHCO[0]).text],
    columns=['test', 'text'],
)
# number the rows from 1, sized from the frame itself
TestsDf = TestsDf.set_index(np.arange(1, len(TestsDf) + 1))
TestsDf.index.name = 't_id'
TestsDf

Unnamed: 0_level_0,test,text
t_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,N,"Родословие Иисуса Христа, Сына Давидова, Сына ..."
2,O,В начале сотворил Бог небо и землю. Земля же б...


In [8]:
# Join the verse text of every (test, book) group; each group key arrives as a tuple
# and is stored in a single column named ('test', 'book') for now.
BooksDf = pd.DataFrame([(x, ' '.join(y)) for (x,y) in BibDf.groupby(bibleOHCO[:2]).text], columns=[('test', 'book'), 'text'])
# unpack the key tuples into separate 'test' and 'book' columns, then drop the tuple column
BooksDf[['test','book']] = pd.DataFrame(list(BooksDf[('test', 'book')]), index=BooksDf.index, columns=bibleOHCO[:2])
del BooksDf[('test', 'book')]
# swap the labels for their ordinal codes so the sort follows the order held in testsDict / booksDict
BooksDf = BooksDf.replace({"test": testsDict, "book": booksDict}).sort_values(by=bibleOHCO[:2], ascending=[True, True])
# renumber the rows from 1; the new index is sized by the number of distinct book labels in BibDf
BooksDf = BooksDf.reset_index().drop(['index'], axis=1).set_index(np.arange(1,len(BibDf.book.unique())+1))
BooksDf.index.name = 'b_id'
# translate the ordinal codes back into their labels
BooksDf = BooksDf.replace({"test":dict(enumerate(BibDf.test.unique())), "book":dict(enumerate(BibDf.book.unique()))})
#BooksDf = BooksDf.reset_index().set_index(bibleOHCO[:2])
BooksDf 

Unnamed: 0_level_0,text,test,book
b_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,В начале сотворил Бог небо и землю. Земля же б...,O,Gen
2,"Вот имена сынов Израилевых, которые вошли в Е...",O,Ex
3,И воззвал Господь к Моисею и сказал ему из ски...,O,Lev
4,"И сказал Господь Моисею в пустыне Синайской, в...",O,Num
5,"Сии суть слова, которые говорил Моисей всем Из...",O,Deut
...,...,...,...
72,"Павел, волею Божиею Апостол Иисуса Христа, по ...",N,2Tim
73,"Павел, раб Божий, Апостол же Иисуса Христа, по...",N,Tit
74,"Павел, узник Иисуса Христа, и Тимофей брат, Фи...",N,Philem
75,"Бог, многократно и многообразно говоривший изд...",N,Heb


In [9]:
# Join the verse text of every (test, book, chap) group; each group key arrives as a tuple
# and is stored in a single column named ('test', 'book', 'chap') for now.
ChapsDf = pd.DataFrame([(x, ' '.join(y)) for (x,y) in BibDf.groupby(bibleOHCO[:3]).text], columns=[('test', 'book', 'chap'), 'text'])
# unpack the key tuples into separate 'test', 'book' and 'chap' columns, then drop the tuple column
ChapsDf[['test','book','chap']] = pd.DataFrame(list(ChapsDf[('test', 'book', 'chap')]), index=ChapsDf.index, columns=bibleOHCO[:3])
del ChapsDf[('test', 'book', 'chap')]
# swap testament / book labels for their ordinal codes so the sort follows the order held in testsDict / booksDict
ChapsDf = ChapsDf.replace({"test": testsDict, "book": booksDict}).sort_values(by=bibleOHCO[:3], ascending=[True, True, True])
# renumber the rows from 1
ChapsDf = ChapsDf.reset_index().drop(['index'], axis=1).set_index(np.arange(1,len(ChapsDf)+1))
ChapsDf.index.name = 'c_id'
# translate the ordinal codes back into their labels
ChapsDf = ChapsDf.replace({"test":dict(enumerate(BibDf.test.unique())), "book":dict(enumerate(BibDf.book.unique()))})
#ChapsDf = ChapsDf.reset_index().set_index(bibleOHCO[:3])
ChapsDf

Unnamed: 0_level_0,text,test,book,chap
c_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,В начале сотворил Бог небо и землю. Земля же б...,O,Gen,1
2,Так совершены небо и земля и все воинство их. ...,O,Gen,2
3,"Змей был хитрее всех зверей полевых, которых с...",O,Gen,3
4,"Адам познал Еву, жену свою; и она зачала, и ро...",O,Gen,4
5,Вот родословие Адама: когда Бог сотворил челов...,O,Gen,5
...,...,...,...,...
1357,"После сего я увидел иного Ангела, сходящего с ...",N,Rev,18
1358,После сего я услышал на небе громкий голос как...,N,Rev,19
1359,"И увидел я Ангела, сходящего с неба, который и...",N,Rev,20
1360,"И увидел я новое небо и новую землю, ибо прежн...",N,Rev,21


In [32]:
# TokenDf column name -> key under which natasha stores that morphological feature
_FEATURE_KEYS = {
    'anim': 'Animacy',
    'aspect': 'Aspect',
    'case': 'Case',
    'degree': 'Degree',
    'gender': 'Gender',
    'mood': 'Mood',
    'number': 'Number',
    'person': 'Person',
    'tense': 'Tense',
    'verb_form': 'VerbForm',
    'voice': 'Voice',
}


def _token_record(an_id, token):
    """Flatten one natasha token into a dict keyed by TokenDf column names."""
    # a token without morphological features yields None for every feature column
    feats = token.feats or {}
    record = {
        'p_id': an_id,
        'start': token.start,
        'stop': token.stop,
        'text': token.text,
        'token_id': token.id,
        'head_id': token.head_id,
        'rel': token.rel,
        'pos': token.pos,
        'lemma': token.lemma,
    }
    record.update({column: feats.get(key) for column, key in _FEATURE_KEYS.items()})
    return record


def nat_parse(textDf=BibTextDf, textCol='text', columns=tokenCols):
    """Run natasha over every text in ``textDf`` and return one row per token.

    Each text is segmented, morph-tagged, lemmatized, syntax-parsed and NER-tagged
    with the module-level natasha components. Tokens tagged ``PUNCT`` are left out.

    Parameters
    ----------
    textDf : DataFrame whose index labels identify the texts.
    textCol : name of the column in ``textDf`` that holds the text to parse.
    columns : columns (and their order) of the returned frame.

    Returns
    -------
    DataFrame with ``columns``; ``p_id`` carries the index label of the source
    text. Row labels restart at 0 for every text. Feature columns are None where
    the token has no such feature. An empty frame with ``columns`` is returned
    when no tokens were produced.
    """
    frames = []
    for an_id, text in textDf[textCol].items():
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
        doc.parse_syntax(syntax_parser)
        # NOTE: the NER spans are not read anywhere below
        doc.tag_ner(ner_tagger)

        rows = [_token_record(an_id, token) for token in doc.tokens if token.pos != 'PUNCT']
        # Skip texts with nothing but punctuation: previously such a text either
        # raised NameError or re-appended the previous text's rows.
        if rows:
            frames.append(pd.DataFrame(rows, columns=columns))

    if not frames:
        return pd.DataFrame(columns=columns)
    # concatenate once at the end instead of copying a growing frame on every text
    return pd.concat(frames)

In [39]:
# trial run of the parser on the first 500 rows only
TestTokenDf = nat_parse(BibTextDf.iloc[:500])

In [None]:
TestTokenDf.set_index(['p_id','token_id']).sample(50)

In [42]:
%%time
# nat_parse() produces a 'token_id' column, not 'token_num', so index on the
# columns that actually exist (same index as used for TestTokenDf above)
TokenDf = nat_parse(BibTextDf).set_index(['p_id', 'token_id'])

KeyboardInterrupt: 

In [43]:
TokenDf.sample(50)

Unnamed: 0,v_id,token_num,start,stop,text,token_id,head_id,rel,pos,lemma,...,aspect,case,degree,gender,mood,number,person,tense,verb_form,voice
656,402,12,4109,4110,и,22_16,22_18,cc,CCONJ,и,...,,,,,,,,,,
88,363,17,604,609,Гофир,3_30,3_6,conj,PROPN,гофир,...,,Nom,,Masc,,Sing,,,,
234,1335,40,1557,1566,истинному,7_48,7_46,conj,ADJ,истинный,...,,Dat,Pos,Masc,,Sing,,,,
1026,1167,21,5914,5917,Его,61_27,61_26,nmod,PRON,он,...,,Gen,,Masc,,Sing,3.0,,,
288,171,37,1756,1761,моего,5_45,5_44,det,DET,мой,...,,Gen,,Masc,,Sing,,,,
141,714,8,876,886,показались,13_13,13_5,conj,VERB,показаться,...,Perf,,,,Ind,Plur,,Past,Fin,Mid
516,892,5,2981,2987,четыре,18_6,18_4,conj,NUM,четыре,...,,Nom,,,,,,,,
265,1127,11,1606,1609,они,12_14,12_15,nsubj,PRON,они,...,,Nom,,,,Plur,3.0,,,
123,993,10,718,730,подстерегать,8_12,8_7,advcl,VERB,подстерегать,...,Imp,,,,,,,,Inf,Mid
422,250,5,2524,2528,руки,22_7,22_6,obj,NOUN,рука,...,,Acc,,Fem,,Plur,,,,


In [None]:
%%time
nat_parse(BibTextDf.iloc[:321])#.set_index(['v_id', 'token_num'])

In [44]:
# persist the token table so the parse does not have to be repeated
TokenDf.to_pickle('./proc/BibleTokenDf.pkl')