# Russian Synodal Bible (1885)

## Import

### Libraries

In [2]:
import os
from itertools import chain
from backend import *
import matplotlib
import string

### Definitions

In [3]:
bibleTXT = '../texts/bible/sinodalnyi-perevod.txt' 
booksJSON = './proc/booksDict.json'
bibleJSON = '../texts/bible/bible.json'
bibleIdJSON = '../texts/bible/bibleID.json'
bibleXML = '../texts/bible/bible.xml'
bibFrazyTXT = '../texts/bible/Dubrovina_Slovar_Bibleyskikh_Frazeologizmov.txt'

bibleOHCO = ['test', 'book', 'chap', 'verse']
tokenCols = ['p_id', 'start', 'stop', 'text', 'token_id', 'head_id', 'rel', 'pos', 'lemma', 'anim', 'aspect', 'case', 'degree', 'gender', 'mood', 'number', 'person', 'tense', 'verb_form', 'voice']

chap_lines_re = '^===\s(\d{1,3})\s===$'
book_lines_re = '^==\s(.+)\s==$'
matt_name = 'От Матфея святое благовествование'

In [4]:
with open(booksJSON) as json_file: 
    booksDict = json.load(json_file)

## Pre-Processing

### Import Text into DF

In [None]:
with open(bibleTXT, 'r', encoding='windows-1251') as f: 
    bibleText = f.readlines()

bibliiaDf = pd.DataFrame(bibleText).rename(columns={0:'text'}).dropna()
bibliiaDf

### Tidy Up

In [None]:
# remove blank lines and select puncutation, keeping only header & clean text lines
bibliiaDf.loc[:,'text'] = bibliiaDf.loc[:,'text'].str.replace(r'\n|^\s$', '')

#unwanted_parts = bibliiaDf.loc[bibliiaDf.text.str.contains('Предисловие|Примечания')].index.tolist() + bibliiaDf.loc[bibliiaDf.text.str.contains('^[^\d|^=]')].index.tolist() + [x for x in range(21483,21490)] + [x for x in range(16571,16577)]

# concat list of unwanted sections' indices
get_rid = chain(range(13259,13261), range(15172,15175), range(16571,16578), range(21483,21491))

# drop notes section from Book of Job
bibliiaDf = bibliiaDf.drop(index=[x for x in get_rid])

# rename foreword section for Esther and Wisdom of Jesus to match chapter style
#bibliiaDf.loc[:,'text'] = bibliiaDf.loc[:,'text'].str.replace('=== Предисловие ===', '=== 0 ===')

# remove various errata and notes marks
bibliiaDf.loc[:,'text'] = bibliiaDf.loc[:,'text'].str.replace('\]|\[|_|-|', '')
bibliiaDf = bibliiaDf.loc[bibliiaDf.text != '']#.reset_index().drop(columns="index")

bibliiaDf

In [None]:
#bibliiaDf.loc[13259:13260] # MOLITVA and gospodi vsederzhiteliu
#bibliiaDf.loc[15172:15174] # predislovie for Esfira
#bibliiaDf.loc[16571:16577] # notes for Job
#bibliiaDf.loc[21483:21490] # предисловие for WisdJes

### Find Parts

In [None]:
# find df rows containing all the book and chapter title lines 
# as well as the OT/NT split on the 1st page of Matthew
chap_lines = bibliiaDf.loc[bibliiaDf.text.str.contains(chap_lines_re)].index
book_lines = bibliiaDf.loc[bibliiaDf.text.str.contains(book_lines_re)].index
test_line = bibliiaDf.loc[bibliiaDf.text.str.contains(matt_name)].index

# combine the title lines for extraction later
title_lines = test_line.append([chap_lines, book_lines]).sort_values()

### Assign OHCO Labels

In [None]:
bibliiaDf.text.str.split(pat=' ', n=0)

In [None]:
bibliiaDf

In [None]:
# ['test']
bibliiaDf[bibleOHCO[0]] = np.where(bibliiaDf.index<test_line[0], 1, 2)
# ['book']
bibliiaDf[bibleOHCO[1]] = bibliiaDf.loc[book_lines].text.str.extract(book_lines_re)
# ['chap']
bibliiaDf[bibleOHCO[2]] = bibliiaDf.loc[chap_lines].text.str.extract(chap_lines_re)
# ['verse'] by splitting verse num at beginning from text in current verse lines
bibliiaDf[[bibleOHCO[3], 'text']] = bibliiaDf.text.str.split(' ', n=1, expand=True)
bibliiaDf

In [None]:
# ['test']
#bibliiaDf[bibleOHCO[0]] = np.where(bibliiaDf.index<test_line[0], 1, 2)
# ['book']
#bibliiaDf[bibleOHCO[1]] = bibliiaDf.loc[book_lines].text.str.extract(book_lines_re)
# ['chap']
#bibliiaDf[bibleOHCO[2]] = bibliiaDf.loc[chap_lines].text.str.extract(chap_lines_re)
# ['verse'] by splitting verse num at beginning from text in current verse lines
#bibliiaDf[[bibleOHCO[3],'text']] = bibliiaDf.text.str.split(pat=' ', n=1).iloc[:, [0, 1]]

# fill in book and chapter titles to cells below them
bibliiaDf[bibleOHCO[1:3]] = bibliiaDf[['book','chap']].ffill()

# transfer book numbers onto book titles
#bibliiaDf[bibleOHCO[1]] = bibliiaDf[bibleOHCO[1]].map(dict([(y,x) for (x,y) in enumerate(bibliiaDf.book.unique(), start=1)]))

# drop title lines and reset index to give verse id num 
bibliiaDf = bibliiaDf.drop(title_lines, axis=0).reset_index()#.set_index(bibleOHCO)
bibliiaDf.index = range(1,len(bibliiaDf)+1)
bibliiaDf.index.name = 'v_id'

# reorder columns
bibliiaDf = bibliiaDf[['test', 'book', 'chap', 'verse', 'text']]
bibliiaDf

### Swap Russian Book Name for English Equivalent

In [None]:
booksDict.keys()
total_books = 0
for book in booksDict.keys():
    total_books += sum(booksDict[book]['chap_verse'].values())
print(total_books)

In [None]:
book_nums = [(x,y) for x,y in enumerate(bibliiaDf.book.unique())][1:]
bibliiaDf['book_en'] = [booksDict[str(x)]['en_name'] for x,y in book_nums for name in bibliiaDf.book if y == name]

In [None]:
bibliiaDf.loc[~bibliiaDf.isnull().any(axis=1)].reset_index()

### Export to JSON

In [None]:
bibliiaDf.to_json(bibleJSON, orient='index')

## Processing

### OHCO DFs

#### Dictionaries for testament and book IDs

In [None]:
#testsDict = dict(enumerate(bibliiaDf.test.unique()))
testsDict = dict([(value, key) for key, value in dict(enumerate(bibliiaDf.test.unique())).items()])
booksDict = dict([(value, key) for key, value in dict(enumerate(bibliiaDf.book.unique())).items()])

#### Testaments

In [None]:
# make df of testament texts

TestsDf = pd.DataFrame([(x, ' '.join(y)) for (x,y) in bibliiaDf.\
    groupby('test', sort=False).text], columns=['test', 'text']).\
        set_index(np.arange(1,len(bibliiaDf.test.unique())+1))
TestsDf.index.name = 't_id'
TestsDf 

#### Books

In [None]:
## make Df of book texts

BooksDf = pd.DataFrame([(x, ' '.join(y)) for (x,y) in bibliiaDf.\
    groupby('book', sort=False).text], columns=['book', 'text'])\
    .set_index(np.arange(1,len(bibliiaDf.book.unique())+1))
BooksDf.index.name = 'b_id'

BooksDf 

#### Chapters

In [None]:
# grab book name and chapter number from main df
ChapsDf = pd.DataFrame([(x, ' '.join(y)) for (x,y) in bibliiaDf.groupby(bibleOHCO[1:3], sort=False).text], columns=[('book', 'chap'), 'text'])
ChapsDf[['book','chap']] = pd.DataFrame(list(ChapsDf[('book', 'chap')]), index=ChapsDf.index, columns=bibleOHCO[1:3])
del ChapsDf[('book', 'chap')]
#ChapsDf = ChapsDf.replace({"book": booksDict}).sort_values(by=bibleOHCO[1:3], ascending=[True, True])

ChapsDf = ChapsDf.reset_index().drop(['index'], axis=1).set_index(np.arange(1,len(ChapsDf)+1))
ChapsDf.index.name = 'c_id'

#ChapsDf = ChapsDf.replace({"book":dict(enumerate(bibliiaDf.book.unique()))})
#ChapsDf = ChapsDf.reset_index().set_index(bibleOHCO[:3])

ChapsDf

In [None]:
# make t_id and b_id columns with TestsDf and BooksDf IDs respectively
bibliiaDf['t_id'] = bibliiaDf['test'].map(TestsDf.reset_index().set_index('test')['t_id'])
bibliiaDf['b_id'] = bibliiaDf['book'].map(BooksDf.reset_index().set_index('book')['b_id'])

# merge existing bibliiaDf with Chapsdf for c_id column (because it's based on multiple matching columns)
bibliiaDf = pd.merge(bibliiaDf, ChapsDf.reset_index()[['book', 'chap', 'c_id']], on=['book', 'chap'], how='left')

# remake index
bibliiaDf = bibliiaDf.reset_index().drop(['index'], axis=1).set_index(np.arange(1,len(bibliiaDf)+1))
bibliiaDf.index.name = 'v_id'

bibliiaDf

In [None]:
bibliiaDf.to_json(bibleIdJSON, orient='index')

### Make XML

In [5]:
bibliiaDf = pd.read_json(bibleIdJSON, orient='index')

In [6]:
bibliiaDf#.rename(columns={'book':'book_ru'})

Unnamed: 0,test,book,chap,verse,text,t_id,b_id,c_id_x,book_en,c_id_y
1,1,Бытие,1,1,В начале сотворил Бог небо и землю.,1,1,1,Genesis,1
2,1,Бытие,1,2,"Земля же была безвидна и пуста, и тьма над без...",1,1,1,Genesis,1
3,1,Бытие,1,3,И сказал Бог: да будет свет. И стал свет.,1,1,1,Genesis,1
4,1,Бытие,1,4,"И увидел Бог свет, что он хорош, и отделил Бог...",1,1,1,Genesis,1
5,1,Бытие,1,5,"И назвал Бог свет днем, а тьму ночью. И был ве...",1,1,1,Genesis,1
...,...,...,...,...,...,...,...,...,...,...
37082,2,Откровение святого Иоанна Богослова,22,17,И Дух и невеста говорят: прииди! И слышавший д...,2,77,1361,Book of Revelation,1361
37083,2,Откровение святого Иоанна Богослова,22,18,И я также свидетельствую всякому слышащему сло...,2,77,1361,Book of Revelation,1361
37084,2,Откровение святого Иоанна Богослова,22,19,и если кто отнимет что от слов книги пророчест...,2,77,1361,Book of Revelation,1361
37085,2,Откровение святого Иоанна Богослова,22,20,"Свидетельствующий сие говорит: ей, гряду скоро...",2,77,1361,Book of Revelation,1361


In [None]:
bookRefDict = {}
for book in booksDict.items():
    cleanDict = {int(book[0]):book[1]}
    cleanDict[int(book[0])]['chap_verse'] = bibliiaDf.loc[bibliiaDf.b_id == int(book[0])].groupby('chap').max().to_dict()['verse']
    #print(verseDict)
    #cleanDict.update(verseDict)
    #print(cleanDict)
    bookRefDict.update(cleanDict)
#bookRefDict

In [None]:
with open('../site/data/booksDict.json', "w") as outfile:
    json.dump(bookRefDict, outfile)
    #json.dump('../site/data/booksDict.json', outfile)

In [None]:
root = etree.Element("bible")
print(root.tag)
t = b = c = v = 1
#TestList = [x for x in bibliiaDf.test.unique()]
#for test in range(len(TestList)): 
#    root.append(etree.Element("t", n=str(t), name=TestList[test]))
BookList = list(bibliiaDf.book.unique())
for book in range(len(BookList)): 
    root.append(etree.Element("b", n=str(b), name_ru=BookList[book], name_en=bibliiaDf.loc[bibliiaDf.book==BookList[book]].book_en.unique()[0]))
    ChapList = list(bibliiaDf.loc[bibliiaDf['book'] == BookList[book]].chap.unique())
    for chap in range(len(ChapList)): 
        root[book].append(etree.Element("c", n=str(c), name=str(chap+1)))
        VerseList = list(bibliiaDf.loc[(bibliiaDf['book'] == BookList[book]) & (bibliiaDf['chap'] == ChapList[chap])].index)
        for verse in range(len(VerseList)): 
            root[book][chap].append(etree.Element("v", n=str(v), name=str(verse+1)))
            verse_text = bibliiaDf.loc[v].text
            root[book][chap][verse].text = verse_text
            v+=1
        c+=1
    b+=1
#t+=1
#print(etree.tostring(root, pretty_print=True, xml_declaration=True))
etree.ElementTree(root).write(bibleXML, pretty_print=True, xml_declaration=True, encoding='windows-1251')

## Token DFs

In [57]:
bibliiaDf = pd.read_json(bibleIdJSON, orient='index')

In [58]:
bibliiaDf.index.name = 'v_id'
bibliiaDf = bibliiaDf.drop(columns='c_id_y')
bibliiaDf = bibliiaDf.rename(columns={'c_id_x':'c_id'})
bibliiaDf#.reset_index()

Unnamed: 0_level_0,test,book,chap,verse,text,t_id,b_id,c_id,book_en
v_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,Бытие,1,1,В начале сотворил Бог небо и землю.,1,1,1,Genesis
2,1,Бытие,1,2,"Земля же была безвидна и пуста, и тьма над без...",1,1,1,Genesis
3,1,Бытие,1,3,И сказал Бог: да будет свет. И стал свет.,1,1,1,Genesis
4,1,Бытие,1,4,"И увидел Бог свет, что он хорош, и отделил Бог...",1,1,1,Genesis
5,1,Бытие,1,5,"И назвал Бог свет днем, а тьму ночью. И был ве...",1,1,1,Genesis
...,...,...,...,...,...,...,...,...,...
37082,2,Откровение святого Иоанна Богослова,22,17,И Дух и невеста говорят: прииди! И слышавший д...,2,77,1361,Book of Revelation
37083,2,Откровение святого Иоанна Богослова,22,18,И я также свидетельствую всякому слышащему сло...,2,77,1361,Book of Revelation
37084,2,Откровение святого Иоанна Богослова,22,19,и если кто отнимет что от слов книги пророчест...,2,77,1361,Book of Revelation
37085,2,Откровение святого Иоанна Богослова,22,20,"Свидетельствующий сие говорит: ей, гряду скоро...",2,77,1361,Book of Revelation


In [81]:
bibMetaDf = bibliiaDf.reset_index().groupby('book_en').v_id.min().to_frame().join(bibliiaDf.reset_index().groupby('book_en').v_id.max().to_frame(), lsuffix='_min', rsuffix='_max')
bibMetaDf['num_chaps'] = bibliiaDf.groupby('book_en').chap.max()
bibMetaDf['b_id'] = bibliiaDf.groupby('book_en').b_id.first()
bibMetaDf['num_verses'] = bibliiaDf.groupby('book_en').verse.count()
#bibMetaDf['num_words'] = bibliiaDf.groupby('book_en').text.count()
bibMetaDf['ru_name'] = bibliiaDf.groupby('book_en').book.first()
bibMetaDf

Unnamed: 0_level_0,v_id_min,v_id_max,num_chaps,b_id,num_verses,ru_name
book_en,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1 Chronicles,10257,11198,29,13,942,Первая книга Паралипоменон
1 Kings,8722,9537,22,11,816,Третья книга Царств
1 Maccabees,26597,27520,16,47,924,Первая книга Маккавейская
1 Samuel,7216,8026,31,9,811,Первая книга Царств
2 Chronicles,11199,12020,36,14,822,Вторая книга Паралипоменон
...,...,...,...,...,...,...
Third Epistle of John,34309,34323,1,61,15,Третье соборное послание святого апостола Иоанна
Tobit,13149,13392,14,18,244,Книга Товита
Wisdom of Solomon,18759,19198,19,26,440,Книга Премудрости Соломона
Zechariah,26331,26541,14,45,211,Книга пророка Захарии


In [61]:
#bibliiaDf = pd.read_json(bibleJSON, orient='index')
#bibliiaDf.index.name = 'v_id'
BibTextDf = bibliiaDf[['text']]
#BibLibDf = bibliiaDf[bibleOHCO]
bibliiaDf

Unnamed: 0_level_0,test,book,chap,verse,text,t_id,b_id,c_id,book_en
v_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,Бытие,1,1,В начале сотворил Бог небо и землю.,1,1,1,Genesis
2,1,Бытие,1,2,"Земля же была безвидна и пуста, и тьма над без...",1,1,1,Genesis
3,1,Бытие,1,3,И сказал Бог: да будет свет. И стал свет.,1,1,1,Genesis
4,1,Бытие,1,4,"И увидел Бог свет, что он хорош, и отделил Бог...",1,1,1,Genesis
5,1,Бытие,1,5,"И назвал Бог свет днем, а тьму ночью. И был ве...",1,1,1,Genesis
...,...,...,...,...,...,...,...,...,...
37082,2,Откровение святого Иоанна Богослова,22,17,И Дух и невеста говорят: прииди! И слышавший д...,2,77,1361,Book of Revelation
37083,2,Откровение святого Иоанна Богослова,22,18,И я также свидетельствую всякому слышащему сло...,2,77,1361,Book of Revelation
37084,2,Откровение святого Иоанна Богослова,22,19,и если кто отнимет что от слов книги пророчест...,2,77,1361,Book of Revelation
37085,2,Откровение святого Иоанна Богослова,22,20,"Свидетельствующий сие говорит: ей, гряду скоро...",2,77,1361,Book of Revelation


In [None]:
BibTextDf

In [None]:
%%time
VerseTokenDf = nat_parse(BibTextDf)

In [None]:
VerseTokenDf

In [None]:
VerseTokenDf.to_pickle('./proc/BibleTokenDf.pkl')

In [59]:
VerseTokenDf = pd.read_pickle('./proc/BibleTokenDf.pkl')
VerseTokenDf['stopword'] = ~VerseTokenDf.loc[:, tokenCols[9:]].any(axis=1)
VerseTokenDfIdx = pd.Index(range(1, (VerseTokenDf.shape[0]+1)), name='id')
VerseTokenDf.index = VerseTokenDfIdx# = TokenDf.set_index(['p_id', 'token_id'])

In [60]:
VerseTokenDf

Unnamed: 0_level_0,p_id,start,stop,text,token_id,head_id,rel,pos,lemma,anim,...,case,degree,gender,mood,number,person,tense,verb_form,voice,stopword
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,1,В,1_1,1_2,case,ADP,в,,...,,,,,,,,,,True
2,1,2,8,начале,1_2,1_3,obl,NOUN,начало,Inan,...,Loc,,Neut,,Sing,,,,,False
3,1,9,17,сотворил,1_3,1_0,root,VERB,сотворить,,...,,,Masc,Ind,Sing,,Past,Fin,Act,False
4,1,18,21,Бог,1_4,1_3,nsubj,PROPN,бог,Anim,...,Nom,,Masc,,Sing,,,,,False
5,1,22,26,небо,1_5,1_3,obj,NOUN,небо,Inan,...,Acc,,Neut,,Sing,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
675064,37086,32,38,Христа,1_5,1_4,flat:name,PROPN,христос,Anim,...,Gen,,Masc,,Sing,,,,,False
675065,37086,39,41,со,1_6,1_8,case,ADP,с,,...,,,,,,,,,,True
675066,37086,42,47,всеми,1_7,1_8,det,DET,весь,,...,Ins,,,,Plur,,,,,False
675067,37086,48,52,вами,1_8,1_4,nmod,PRON,вы,,...,Ins,,,,Plur,2,,,,False


In [None]:
# stopword counts

VerseTokenDf.loc[TokenDf.stopword == True].lemma.value_counts()[:25]

In [None]:
VerseTokenDf.loc[TokenDf[tokenCols[9]] == 'Anim'].lemma.value_counts().to_frame()

In [None]:
GetRankDf(VerseTokenDf)#.plot(x='rank')

In [None]:
GetRankDf(VerseTokenDf, 'pos')

In [None]:
VocabDf = VerseTokenDf.loc[VerseTokenDf.stopword != True].lemma.value_counts().to_frame().reset_index()
VocabDf.columns = ['lemma', 'count']
VocabDf.index = VocabDf.index.map(lambda x: x + 1).rename('rank')
VocabDf.iloc[:25]#.plot(x='rank')

In [None]:
bibliiaType = {'test':str, 'book':str, 'chap':int, 'verse':int}
bibliiaDf = pd.read_json(bibleJSON, orient='index', dtype=bibliiaType)
bibliiaDf.index.name = 'p_id'
#bibliiaDf.verse = bibliiaDf.verse.astype(int)
p_idDict = bibliiaDf.drop('text', axis=1).to_dict(orient='index')
bibliiaDf

In [None]:
# dictionary with verse lines in format 
# {book: {0: first_line, last_line}, {chap_num: (first_chap_line, last_chap_line)}}
PartLineNumsDict = {}
for test_name in bibliiaDf.test.unique():
    t_min = bibliiaDf.loc[bibliiaDf['test'] == test_name].reset_index().p_id.min()
    t_max = bibliiaDf.loc[bibliiaDf['test'] == test_name].reset_index().p_id.max()
    print(f"{test_name}: {t_min} - {t_max}")
    for book_name in bibliiaDf.book.unique():
        b_min = bibliiaDf.loc[bibliiaDf['book'] == book_name].reset_index().p_id.min()
        b_max = bibliiaDf.loc[bibliiaDf['book'] == book_name].reset_index().p_id.max()
        PartLineNumsDict.update({book_name: {0: (b_min, b_max)}})
        #print(f"{book_name}: {b_min} - {b_max}")
        for chap_name in bibliiaDf.loc[bibliiaDf['book'] == book_name].chap.unique():
            c_min = bibliiaDf.loc[(bibliiaDf['book'] == book_name) & (bibliiaDf['chap'] == chap_name)].reset_index().p_id.min()
            c_max = bibliiaDf.loc[(bibliiaDf['book'] == book_name) & (bibliiaDf['chap'] == chap_name)].reset_index().p_id.max()
            PartLineNumsDict[book_name][chap_name] = (c_min, c_max)
            #    print(f"{chap_name}: {c_min} - {c_max}")
#print(PartLineNumsDict)

In [None]:
TokenDf.loc[(TokenDf['p_id'] >= count_start) & (TokenDf['p_id'] <= count_stop)].lemma.value_counts()

In [None]:
[x for x in PartLineNumsDict]

### Manual TF-IDF

In [None]:
# TF-IDF

TfidfDf = TokenDf.reindex(columns=['text', 'lemma', 'tfidf_bible', 'tfidf_book', 'tfidf_chap'])
#vocab = TokenDf.loc[TokenDf['p_id'] <= 10, 'lemma'].unique().tolist()

def getTFIDF(count_start=0, count_stop=(TokenDf.shape[0]+1)):

    return TFIDF

count_start = 0
count_stop = (TokenDf.shape[0]+1)

TfidfCountDf = TokenDf.loc[(TokenDf['p_id'] >= count_start) & (TokenDf['p_id'] <= count_stop)][['p_id', 'lemma']]
VocabCountDict = TfidfCountDf.value_counts('lemma').to_dict()

total_words = sum(VocabCountDict.values())
num_docs = TfidfCountDf.p_id.unique().size
    
for lemma in VocabCountDict.keys(): 

    instances = VocabCountDict.get(lemma)
    docsw_word = TfidfCountDf.loc[TfidfCountDf['lemma'] == lemma].p_id.unique().size

    TF = instances / total_words
    IDF = num_docs / docsw_word
    TFIDF = TF * IDF

    TfidfDf['tfidf_bible'].loc[TfidfDf['lemma'] == lemma] = TFIDF

#TfidfDf['tfidf_bible'] = TokenDf

# PartLineNumsDict = {book: {0: first_line, last_line}, {chap_num: (first_chap_line, last_chap_line)}}

TfidfDf

In [None]:
TfidfDf.to_pickle('./proc/TfidfDf.pkl')

In [None]:
TfidfDf = pd.read_pickle('./proc/TfidfDf.pkl')
TfidfDf

In [None]:
GetRankDf(TokenDf)

In [None]:
TfidfDf.sort_values('tfidf_bible', ascending=False).lemma.unique()[:20]

## NER-supplemented Bible text

pronominal references swapped with names

## Bible Phrase Dictionary

In [None]:
with open(bibFrazyTXT) as bibleFrazy: 
    bibleFrazyLines = bibleFrazy.readlines()

In [None]:
BibleFrazyDf = pd.DataFrame(bibleFrazyLines).rename(columns={0:'fraza'})
BibleFrazyDf.loc[:,'fraza'] = BibleFrazyDf.loc[:,'fraza'].apply(lambda x: x.strip().strip('\n').translate(str.maketrans('', '', string.punctuation)))
BibleFrazyDf = BibleFrazyDf.loc[BibleFrazyDf.fraza != '']

In [None]:
frazyList = BibleFrazyDf.loc[30189:].fraza.to_list()
FrazyDf = pd.DataFrame(data=frazyList).rename(columns={0:'fraza'})
FrazyDf.index.name = 'f_id'
FrazyDf

In [None]:
FrazyDf.iloc[7]

In [None]:
%%capture
FrazyTokenDf = nat_parse(FrazyDf, textCol='fraza')
FrazyTokenDf['stopword'] = ~FrazyTokenDf.loc[:, tokenCols[9:]].any(axis=1)
FrazyTokenDf = FrazyTokenDf.set_index('p_id')
#FrazyTokenDf.index.name = 'p_token'
#FrazyTokenDf = FrazyTokenDf.reset_index()

In [None]:
FrazyTokenDf#.set_index(['p_id', 'token_id'])

In [None]:
pos_lists = []
for x in range(0,(FrazyTokenDf.p_id.max()+1)):
    pos_lists.append(FrazyTokenDf.loc[(FrazyTokenDf.p_id == x) & (FrazyTokenDf.stopword == False)].lemma.unique())

In [None]:
pos_lists

In [None]:
#bibliiaDf.loc[(bibliiaDf.test == 'N')]
bibliiaDf#.loc[(bibliiaDf.book == 'Lk') & (bibliiaDf.chap == 10) & (bibliiaDf.verse == '42')]

In [None]:
bibliiaDf.loc[31412].text

In [None]:
#phraseList = [x.tolist() for x in pos_lists]
for verse_id in range(31400,31450):
    print(f"verse_id: {verse_id}")
    testVerse = TokenDf.loc[(TokenDf.p_id == verse_id) & (TokenDf.stopword == False)].lemma.tolist()
    print(f"testVerse: {testVerse}")
    for phrase in pos_lists:
        print(f"phrase: {phrase}")
        i = 0
        for word in phrase:
            if word in testVerse:
                i+=1
        print(f"{verse_id} : {i} / {phrase.size}")

In [None]:
TokenDf.loc[TokenDf.p_id == 31400]

In [None]:
FrazyDf.loc[FrazyDf.p_id == 31400].lemma.tolist()

In [None]:
testPhrase = FrazyDf.loc[FrazyDf.p_id == 22].lemma.unique()

In [None]:
testVerse = TokenDf.loc[TokenDf.p_id == 31412].lemma.tolist()

In [None]:
i = 0
print(testPhrase.tolist())
print(testVerse)
for x in testPhrase.tolist():
    if x in testVerse:
        i+=1
print(f"i = {i}")
if i == testPhrase.size:
    print("phrase in verse")

In [None]:
for x in pos_lists[20:25]:
    print(x)
    for y in TokenDf.p_id:
        #print(TokenDf.loc[TokenDf.p_id == y].lemma.unique())
        #if x in TokenDf.loc[TokenDf.p_id == y].lemma.unique():
            #print(f"{x} in {y}")

In [None]:
FrazyAbbrsDf = BibleFrazyDf.iloc[:105].fraza.str.split(' – ', expand=True).rename(columns={0:"abbr", 1:"fraza"})
FrazyAbbrsDf.loc[:,'abbr'] = FrazyAbbrsDf.loc[:,'abbr'].apply(lambda x: x.strip())
FrazyAbbrsDf.loc[:,'fraza'] = FrazyAbbrsDf.loc[:,'fraza'].apply(lambda x: x.strip())
FrazyAbbrsDict = FrazyAbbrsDf.set_index('abbr').to_dict().get('fraza')
FrazyAbbrsDict