In [1]:
### Prerequisites
# most of the packages are available via anaconda
# some of them require to be installed separately

# some of the packages (especially nltk) are necessary only for preprocessing
# feel free to ignore them and move directly into later sections

import os
import pandas as pd
import numpy as np
import nltk
import pickle
import json
import sys
import re

import csv
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET

from gensim import corpora, models, similarities
### drawing on a basic tutorial: https://radimrehurek.com/gensim/tut1.html#corpus-streaming-one-document-at-a-time
### we use it here basically only for generating the vector model

import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer

### for plotting
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
### import cufflinks as cf - to connect plotly and pandas

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe

# a few other packages are imported in the section on visualizing distances

## To start with preprocessed CSEL and PL


In [2]:
# Load the preprocessed corpus dataframe from its pickle and preview the first rows.
CSEL_modified_df = pd.read_pickle("data/CSEL_modified_df.pkl")
CSEL_modified_df.head(5)

Unnamed: 0,file_name,author,title,text
0,stoa0045a.stoa001.opp-lat1.xml,unknown,"Epistulae Imperatorum Pontificum Aliorum, I",\n\n\n (1.) QVAE GESTA SUNT INTER LIBERIUM ET ...
1,stoa0045a.stoa001.opp-lat2.xml,unknown,"Epistulae Imperatorum Pontificum Aliorum, II",\n\n\n\n(105.)\n\n\n\n SACRO ET BEA...
2,stoa0162.stoa004.opp-lat3.xml,Hieronymus,"Epistulae, 120-154",\n\n\n\nCXXI. AD ALGASIAM LIBER QVAESTIONUM UN...
3,stoa0162.stoa004.opp-lat2.xml,Hieronymus,"Epistulae, 71-120",\n\n\n\n\nLXXI. AD LUCINUM BAETICUM. \n\n\n\nN...
4,stoa0162.stoa004.opp-lat1.xml,Hieronymus,"Epistulae, 1-70",\n\n\nS. EVSEBII HIERONVMI EPISTVLAE I-LXX. \n...


In [3]:
# Take the text stored under index label 0 and show its first 400 characters.
first_text = CSEL_modified_df["text"][0]
first_text[:400]

'\n\n\n (1.) QVAE GESTA SUNT INTER LIBERIUM ET FELICEM EPISCOPOS.  \n\nTemporibus Constantii imperatoris filii Constantini durior \n            orta est persecutio Christianorum ab impiis haereticis Arrianis \n            annitente Constantio, qui et Athanasium episcopum resistentem \n            haereticis persecutus est et, ut damnaretur ab omnibus episcopis, \n            imperauit. quod etiam metu princ'

In [4]:
### FUNCTION: replace Roman numbers with arabic

def roman_to_int(n):
    """Convert a Roman numeral string to an integer.

    The input is upper-cased first, so lower-case numerals are accepted.
    Symbols are consumed greedily from the left, largest value first;
    scanning stops contributing at the first position where no further
    symbol fits, so an empty string (or text that does not start with a
    numeral letter) gives 0.
    """
    text = str(n.upper())
    values = (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1)
    symbols = ('M', 'CM', 'D', 'CD', 'C', 'XC', 'L', 'XL', 'X', 'IX', 'V', 'IV', 'I')
    total = 0
    position = 0
    for value, symbol in zip(values, symbols):
        width = len(symbol)
        # take this symbol as often as it repeats at the current position
        while text.startswith(symbol, position):
            total += value
            position += width
    return total

In [2]:
### set up the credentials
# Build service-account credentials from a local JSON key file and authorize
# a gspread client with them; `gc` is used by the following cells to open spreadsheets.
from oauth2client.service_account import ServiceAccountCredentials
# API scopes requested for the credentials
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
# the key file name is given without a directory, i.e. relative to the working directory
creds = ServiceAccountCredentials.from_json_keyfile_name('jupgsheets.json', scope)
gc = gspread.authorize(creds)

In [3]:
### open the spreadsheet by its key
sh = gc.open_by_key("1byVbRwnQ058--0ud1_NBt2xgIhuiV7LK1BmCm67CTxA")
### open particular worksheet (selected by its title)
worksheet = sh.worksheet("AbbrevConverter")

In [7]:
### read the worksheet into a dataframe and keep only the row slice 6:82
### (presumably the rows that describe the biblical books - confirm against the sheet)
abbrevConverter_df = get_as_dataframe(worksheet)[6:82]

In [8]:
# preview the first rows of the abbreviation converter table
abbrevConverter_df.head(5)

Unnamed: 0,Book_ID,SBL_handbook_of_style,CSEL_VK,CSEL_OGL,English_full,Latin1,Latin2,Latin_full,Czech,Czech_full,...,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33
6,1.0,Gen,Gen.,,Genesis,Gn,Gn,Genesis,Gn,Genesis,...,,,,,,,,,,
7,2.0,Exod,Exo.,,Exodus,Ex,Ex,Exodus,Ex,Exodus,...,,,,,,,,,,
8,3.0,Lev,Lev.,,Leviticus,Lv,Lv,Leviticus,Lv,Leviticus,...,,,,,,,,,,
9,4.0,Num,Num.,,Numbers,Nm,Nm,Numeri,Nu,Numeri,...,,,,,,,,,,
10,5.0,Deut,Deut.,,Deuteronomy,Dt,Dt,Deuteronomium,Dt,Deuteronomium,...,,,,,,,,,,


In [None]:
# Index the converter table by the CSEL abbreviation so that a book's row can be
# fetched with `.loc[<abbreviation>]`.
# `drop=False` keeps "CSEL_VK" available as a regular column too (it is read
# again when the list of book abbreviations is built), and the index is left
# unnamed so that "CSEL_VK" refers unambiguously to the column.
# Reassigning instead of `inplace=True` keeps the cell safe to run more than once.
abbrevConverter_df = abbrevConverter_df.set_index("CSEL_VK", drop=False).rename_axis(None)

In [35]:
# Example lookup: genre classification of one book, addressed by its CSEL abbreviation.
# The abbreviation is written out explicitly so the cell does not depend on a
# loop variable (`word`) that only exists after the extraction loop has been run.
abbrevConverter_df.loc["Luke", "Classification_genre_DZ1"]

'historical'

In [9]:
### the list of biblical books abbreviations we will use
# read from the "CSEL_VK" column, which therefore has to be available as a
# regular column of the converter table when this cell runs

bib_books = abbrevConverter_df["CSEL_VK"].tolist()
print(bib_books[:10])

['Gen.', 'Exo.', 'Lev.', 'Num.', 'Deut.', 'Josh.', 'Judg.', 'Ruth', '1Sam.', '2Sam.']


In [10]:
# another look at the first rows of the corpus dataframe
CSEL_modified_df.head(5)

Unnamed: 0,file_name,author,title,text
0,stoa0045a.stoa001.opp-lat1.xml,unknown,"Epistulae Imperatorum Pontificum Aliorum, I",\n\n\n (1.) QVAE GESTA SUNT INTER LIBERIUM ET ...
1,stoa0045a.stoa001.opp-lat2.xml,unknown,"Epistulae Imperatorum Pontificum Aliorum, II",\n\n\n\n(105.)\n\n\n\n SACRO ET BEA...
2,stoa0162.stoa004.opp-lat3.xml,Hieronymus,"Epistulae, 120-154",\n\n\n\nCXXI. AD ALGASIAM LIBER QVAESTIONUM UN...
3,stoa0162.stoa004.opp-lat2.xml,Hieronymus,"Epistulae, 71-120",\n\n\n\n\nLXXI. AD LUCINUM BAETICUM. \n\n\n\nN...
4,stoa0162.stoa004.opp-lat1.xml,Hieronymus,"Epistulae, 1-70",\n\n\nS. EVSEBII HIERONVMI EPISTVLAE I-LXX. \n...


In [4]:
# Rebind `sh` and `worksheet` to a second spreadsheet (opened by URL) and its
# "CSEL_overview" tab; the cells below write to this worksheet.
sh = gc.open_by_url('https://docs.google.com/spreadsheets/d/1vWUsY1moh6b5A27YDxPSuibayXNLRUV97Hz9TICyBBo/edit?usp=sharing')
# the commented-out call creates the tab - presumably only needed the first time
###worksheet = sh.add_worksheet(title="CSEL_overview", rows=1, cols=4)
worksheet = sh.worksheet("CSEL_overview")

In [12]:
# remove the current contents of the selected worksheet before the overview is written to it
worksheet.clear()

{'spreadsheetId': '1vWUsY1moh6b5A27YDxPSuibayXNLRUV97Hz9TICyBBo',
 'clearedRange': 'CSEL_overview!A1:D203'}

In [14]:
### one overview row per text: file name, author, title and number of whitespace-separated words
overview = []
overview_series = (CSEL_modified_df[column] for column in ("file_name", "author", "title", "text"))
for file_name, author, title, text in zip(*overview_series):
    word_count = len(text.split())
    overview.append([file_name, author, title, word_count])

In [15]:
### export the overview into spreadsheet
# (the call has to start at column 0 - an indented top-level statement is an IndentationError)
set_with_dataframe(worksheet, pd.DataFrame(overview))

In [44]:
### scan every text for abbreviations of biblical books and parse the chapter and
### verse tokens that follow each hit; one record per hit is collected in `bib_abbr`

def _token_at(tokens, position):
    """Return tokens[position], or "" when position falls outside the list.

    Negative positions count as outside as well, so asking for the token
    before the first word yields "" instead of wrapping around to the end
    of the text.
    """
    if 0 <= position < len(tokens):
        return tokens[position]
    return ""

# the patterns do not change between iterations, so they are compiled once
# NOTE(review): because of IGNORECASE, short words made only of numeral letters
# (e.g. "di", "c", "d") are converted to numbers as well - confirm this is intended
_roman_chapter_re = re.compile(r'(^(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})(\b))', flags=re.IGNORECASE)
_chapter_verse_re = re.compile(r'(\d+)(\.|\,|\:)(\d+)', flags=re.IGNORECASE)
_chapter_re = re.compile(r'(\d+)', flags=re.IGNORECASE)
_verse_re = re.compile(r'((\d+)(\-\d+)?)', flags=re.IGNORECASE)
_verse_range_re = re.compile(r'(\d+)(\-)(\d+)')
# set membership instead of scanning the list for every single word of the corpus
_bib_books_lookup = set(bib_books)

bib_abbr = []
# go through every text one by one
for file_name, author, title, text in zip(CSEL_modified_df["file_name"], CSEL_modified_df["author"], CSEL_modified_df["title"], CSEL_modified_df["text"]):
    # find all matches between words in the text and abbr. of biblical books
    author_text_split = text.split()
    for num, word in enumerate(author_text_split):
        if word not in _bib_books_lookup:
            continue
        # neighbouring tokens; "" when the abbreviation sits at the edge of the text
        word_plus_one = _token_at(author_text_split, num + 1)
        word_plus_two = _token_at(author_text_split, num + 2)
        word_minus_one = _token_at(author_text_split, num - 1)
        words_original = " ".join(token for token in (word, word_plus_one, word_plus_two) if token)
        # skip an explicit chapter marker ("Gen. cap. 5 ...") and shift by one token
        if word_plus_one in ("cap.", "c."):
            word_plus_one = _token_at(author_text_split, num + 2)
            word_plus_two = _token_at(author_text_split, num + 3)
        # identify any roman number in chapters and replace it by arabic
        roman = _roman_chapter_re.search(word_plus_one)
        if roman is not None and roman.group(1) != "":
            found = roman.group(1)
            word_plus_one = word_plus_one.replace(found, str(roman_to_int(found)))
        # chapter and verse written as one token ("5.7", "5,7", "5:7")
        chapterverse = _chapter_verse_re.search(word_plus_one)
        if chapterverse is not None:
            word_plus_one = chapterverse.group(1)
            word_plus_two = chapterverse.group(3)
        # chapter: the token has to start with a digit
        chapter = _chapter_re.search(word_plus_one) if word_plus_one[:1].isdigit() else None
        if chapter is None:
            word_plus_one = ""
            precision_type = "no_chapter_"
        else:
            word_plus_one = chapter.group(1)
            precision_type = "chapter_"
        # strip a leading verse marker ("v.12" -> "12"); str methods return a
        # new string, so the result has to be assigned back
        if word_plus_two.startswith("v."):
            word_plus_two = word_plus_two[len("v."):]
        # verse: again the token has to start with a digit; may be a range "7-14"
        verse = _verse_re.search(word_plus_two) if word_plus_two[:1].isdigit() else None
        if verse is None:
            word_plus_two = ""
            precision_type = precision_type + "no_verse"
        else:
            word_plus_two = verse.group(1)
            precision_type = precision_type + "verse"
        # metadata of the book from the converter table (one row lookup per hit)
        book_row = abbrevConverter_df.loc[word]
        genre = book_row["Classification_genre_DZ1"]
        testament = book_row["Classification_testament"]
        czech = book_row["Czech"]
        # a reference introduced by "cf." is counted as an allusion, anything else as a citation
        if word_minus_one.lower() == "cf.":
            citation_or_allusion = "a"
        else:
            citation_or_allusion = "c"
        ### to match instances with more verses (Gen. 5.7-14)
        more_verses = _verse_range_re.match(word_plus_two)
        if more_verses is not None:
            beginning = int(more_verses.group(1))
            end = int(more_verses.group(3))
            words_modified = "_".join([word.replace(".","").lower(), word_plus_one, str(beginning) + "to" + str(end)])
            bib_abbr.append([file_name, author, title, word_minus_one, words_original, citation_or_allusion, words_modified, "child", precision_type, word, word_plus_one, str(beginning), str(end), czech, genre, testament])
        else:
            words_modified = "_".join([word.replace(".","").lower(), word_plus_one, word_plus_two])
            bib_abbr.append([file_name, author, title, word_minus_one, words_original, citation_or_allusion, words_modified, "single", precision_type, word, word_plus_one, word_plus_two,  0, czech, genre, testament])

In [49]:
### turn the collected records into a dataframe - one named column per record field
bib_abbr_df = pd.DataFrame(
    bib_abbr,
    columns=[
        "file_name",
        "author",
        "Work",
        "word_before",
        "text_as_found",
        "Citation_or_allusion",
        "condenced_expression",
        "Composite_type",
        "completion?",
        "CSEL_VK",
        "Composite_reference_chapter",
        "Composite_reference_firstverse",
        "Composite_reference_lastverse",
        "Czech",
        "Classification_genre_DZ1",
        "Classification_testament",
    ],
)
bib_abbr_df.head(5)

Unnamed: 0,file_name,author,Work,word_before,text_as_found,Citation_or_allusion,condenced_expression,Composite_type,completion?,CSEL_VK,Composite_reference_chapter,Composite_reference_firstverse,Composite_reference_lastverse,Czech,Classification_genre_DZ1,Classification_testament
0,stoa0045a.stoa001.opp-lat1.xml,unknown,"Epistulae Imperatorum Pontificum Aliorum, I",1,"Luke 12, 20",c,luke_12_20,single,chapter_verse,Luke,12,20,0,L,historical,NT
1,stoa0045a.stoa001.opp-lat1.xml,unknown,"Epistulae Imperatorum Pontificum Aliorum, I",cf.,"Acts 9, 15",a,acts_9_15,single,chapter_verse,Acts,9,15,0,Sk,historical,NT
2,stoa0045a.stoa001.opp-lat1.xml,unknown,"Epistulae Imperatorum Pontificum Aliorum, I",6,Gal. 1. 8,c,gal_1_8,single,chapter_verse,Gal.,1,8,0,Ga,admonishing/normative,NT
3,stoa0045a.stoa001.opp-lat1.xml,unknown,"Epistulae Imperatorum Pontificum Aliorum, I",9,"Gal. 2, 18",c,gal_2_18,single,chapter_verse,Gal.,2,18,0,Ga,admonishing/normative,NT
4,stoa0045a.stoa001.opp-lat1.xml,unknown,"Epistulae Imperatorum Pontificum Aliorum, I",14,John 14. 27,c,john_14_27,single,chapter_verse,John,14,27,0,J,historical,NT


In [47]:
#  save the dataframe of extracted references to disk as a pickle
with open("data/CSEL_bib_abbr.pkl", "wb") as f:
    pickle.dump(bib_abbr_df, f)

In [5]:
### load the saved data back - the notebook can be resumed from here
# the `with` block closes the file handle again once the data is loaded
# NOTE: unpickling can execute arbitrary code - only load files you created yourself
with open("data/CSEL_bib_abbr.pkl", "rb") as f:
    bib_abbr_df = pickle.load(f)

In [18]:
# create a new tab "CSEL_abbr_data" in the spreadsheet currently bound to `sh`
# NOTE(review): presumably fails when a tab with this title already exists - skip on re-runs / confirm
worksheet = sh.add_worksheet(title="CSEL_abbr_data", rows=1, cols=10)

In [6]:
# select the "CSEL_abbr_data" tab by its title
worksheet = sh.worksheet("CSEL_abbr_data")

In [7]:
### export the data into spreadsheet
# `bib_abbr_df` already is a dataframe, so it is passed on directly
set_with_dataframe(worksheet, bib_abbr_df)

In [None]:
### Overview

In [20]:
# column names of the reference table
bib_abbr_df.columns.tolist()

['file_name',
 'author',
 'Work',
 'word_before',
 'text_as_found',
 'Citation_or_allusion',
 'condenced_expression',
 'Composite_type',
 'completion?',
 'CSEL_VK',
 'Composite_reference_chapter',
 'Composite_reference_firstverse',
 'Composite_reference_lastverse',
 'Czech',
 'Classification_genre_DZ1',
 'Classification_testament']

In [8]:
# total number of extracted references (rows)
len(bib_abbr_df)

27053

In [16]:
# references for which both a chapter and a verse were recognised
int((bib_abbr_df["completion?"] == "chapter_verse").sum())

24694

In [12]:
# references with a chapter but without a verse
int((bib_abbr_df["completion?"] == "chapter_no_verse").sum())

961

In [13]:
# references that consist of the book abbreviation only
int((bib_abbr_df["completion?"] == "no_chapter_no_verse").sum())

1112

In [15]:
# references marked "child", i.e. those whose verse part is a range
int((bib_abbr_df["Composite_type"] == "child").sum())

1739

In [50]:
# One space-separated string per author, built from the second field of each of
# that author's entries.
# NOTE(review): `authors_bib_abbr` is not defined in the cells shown here -
# presumably it comes from an earlier session; confirm before a fresh run.
authors_bib_books = []
for author in authors_bib_abbr:
    authors_bib_books.append(" ".join(element[1] for element in author))

In [41]:
#  save the per-author reference strings to disk as a pickle
with open("data/CSEL_PL_bib_abbr.pkl", "wb") as f:
    pickle.dump(authors_bib_books, f)

In [6]:
# load the per-author reference strings back from disk;
# the `with` block closes the file handle again once the data is loaded
# NOTE: unpickling can execute arbitrary code - only load files you created yourself
with open("data/CSEL_PL_bib_abbr.pkl", "rb") as f:
    authors_bib_books = pickle.load(f)

In [7]:
### how many elements we have (whitespace-separated tokens, summed over all authors)
sum(len(entry.split()) for entry in authors_bib_books)

49981

In [39]:
import itertools

### three regex views of every author's reference string (one list of matches per author)
book_chapter_verses = [re.findall(r'\w+\_\d+\_\d+', author) for author in authors_bib_books]
book_chapters = [re.findall(r'\w+\_\d+.', author) for author in authors_bib_books] ### or add "\b" to match chapters only
books_only = [re.findall(r'\w+\_\_\b', author) for author in authors_bib_books]

### totals over all authors
books_only_total = sum(1 for _ in itertools.chain.from_iterable(books_only))
book_chapters_total = sum(1 for _ in itertools.chain.from_iterable(book_chapters))
book_chapter_verses_total = sum(1 for _ in itertools.chain.from_iterable(book_chapter_verses))
print("Book abbr. only: ", books_only_total, ", book and chapters: ", book_chapters_total, "book, chapter and verses: ", book_chapter_verses_total)




Book abbr. only:  7360 , book and chapters:  42621 book, chapter and verses:  38263


In [68]:
### per author: number of matches for each of the three patterns
### (any abbreviation / abbreviation + chapter / abbreviation + chapter + verse)
count_patterns = (r'\w+.', r'\w+\_\d+.', r'\w+\_\d+\_\d+')
authors_overview = [
    [len(re.findall(pattern, author)) for pattern in count_patterns]
    for author in authors_bib_books
]
authors_overview_df = pd.DataFrame(authors_overview, columns=["all_book_abbr", "all_book_chapters", "book_ch_v"])

In [69]:
### combine author names, total word counts and the abbreviation counts into one
### overview table, sort it by word count and save it as pickle and csv
# the `with` blocks close each file handle as soon as its content is loaded
# NOTE: unpickling can execute arbitrary code - only load files you created yourself
with open("data/authors_dict.pickle", "rb") as f:
    authors_dict = pickle.load(f)
authors_df = pd.DataFrame.from_dict(authors_dict, orient="index")

with open('data/authors_word_counts.pickle', 'rb') as f:
    authors_word_counts = pickle.load(f)
authors_word_counts_df = pd.DataFrame(authors_word_counts, columns=["word_counts"])

# side-by-side concatenation, then sorting into a new frame
# (no in-place mutation, so the cell gives the same result when re-run)
authors_data_overview = pd.concat(
    [authors_df, authors_word_counts_df, authors_overview_df], axis=1
).sort_values("word_counts", ascending=False)

with open("data/authors_data_overview.pkl", "wb") as f:
    pickle.dump(authors_data_overview, f)
authors_data_overview.to_csv("data/authors_data_overview.csv")

authors_data_overview

Unnamed: 0,0,word_counts,all_book_abbr,all_book_chapters,book_ch_v
68,unknown,8669518,4500,2235,1198
2,Augustinus,7578566,18798,18386,17784
30,Hieronymus,2773401,4120,3772,3514
28,Gregorius I Magnus,2131024,1211,357,224
0,Ambrosius,1918521,5613,4606,4390
33,Isidorus Hispalensis,1878572,1224,762,123
4,Boethius,1045805,53,26,11
36,Leo I Magnus,884185,272,132,68
31,Hilarius,816962,636,293,254
12,Cyprianus,746357,1224,1084,1013


In [102]:
### grouped bar chart: total word counts (right axis) against the three
### abbreviation counts (left axis) per author

# rows 1..23 of the overview (sorted by word count); the first row is left out
authors_plotted = authors_data_overview[1:24]
# column 0 holds the author names
author_names = authors_plotted[0]


def _abbr_bar(column, label):
    """Return a narrow bar trace for one count column, drawn against the second y axis."""
    return go.Bar(
        y=authors_plotted[column],
        x=author_names,
        name=label,
        yaxis="y2",
        width=0.15,
    )


data = [
    # wide grey background bars: total number of words per author
    go.Bar(
        y=authors_plotted["word_counts"],
        x=author_names,
        name="Total number of words",
        width=0.95,
        marker=dict(color='rgba(204,204,204,1)'),
    ),
    _abbr_bar("all_book_abbr", "All biblical books abbreviations"),
    _abbr_bar("all_book_chapters", "Biblical books abbr. and chapters"),
    _abbr_bar("book_ch_v", "Bib. books abbr., chapters and verses"),
]

layout = go.Layout(
    barmode='group',
    title='Word counts and biblical books abbreviations counts',
    yaxis=dict(title="word counts", side="right"),
    yaxis2=dict(title="abbreviations counts", overlaying="y", side="left"),
    autosize=False,
    width=700,
    height=500,
    # a plain dict instead of the deprecated go.Margin; the large bottom
    # margin leaves room for the author names under the x axis
    margin=dict(b=200),
    legend=dict(
        x=0.5,
        y=0.8,
        font=dict(family='sans-serif', size=12, color='#000'),
    ),
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='authors_data_overview')

In [33]:
# show all book_chapter_verse matches of the first entry
book_chapter_verses[0]

['luke_4_33',
 'isa_1_3',
 'acts_9_34',
 'acts_8_10',
 'deut_32_49',
 'deut_6_5',
 'sap_1_4',
 'john_4_8',
 'gen_32_2',
 'deut_33_23',
 'deut_19_13',
 'luke_7_57',
 'john_16_3',
 'luke_2_38',
 'john_8_11',
 'rom_3_28',
 'john_5_44',
 'luke_7_41',
 'rom_7_14',
 'john_4_16',
 'luke_2_35',
 'gen_48_22',
 'dan_3_18',
 'luke_8_5',
 'acts_28_27',
 'rom_5_5',
 'sap_7_12',
 'john_10_38',
 'john_3_2',
 'john_1_1',
 'rom_1_24',
 'john_8_42',
 'john_16_27',
 'rom_8_32',
 'john_5_20',
 'mark_10_18',
 'luke_22_42',
 'john_10_3',
 'john_5_23',
 'deut_21_23',
 'luke_23_43',
 'john_20_12',
 'john_2_18',
 'sap_8_13',
 'john_1_18',
 'acts_2_6',
 'luke_4_51',
 'mark_1_13',
 'rom_1_3',
 'deut_6_4',
 'luke_9_26',
 'matt_25_31',
 'mark_8_38',
 'sap_7_26',
 'mark_1_25',
 'luke_4_41',
 'gen_5_11',
 'mark_3_17',
 'john_17_21',
 'john_10_17',
 'john_6_58',
 'john_3_13',
 'john_5_21',
 'rom_4_24',
 'john_5_26',
 'john_6_58',
 'john_17_5',
 'john_13_31',
 'john_1_3',
 'rom_9_5',
 'sap_7_37',
 'rom_11_35',
 'rom_1

In [52]:
### glue each author's book_chapter_verse matches back into one space-separated string
authors_b_c_v = [" ".join(matches) for matches in book_chapter_verses]
### to look at first instance
authors_b_c_v[0]

'luke_4_33 isa_1_3 acts_9_34 acts_8_10 deut_32_49 deut_6_5 sap_1_4 john_4_8 gen_32_2 deut_33_23 deut_19_13 luke_7_57 john_16_3 luke_2_38 john_8_11 rom_3_28 john_5_44 luke_7_41 rom_7_14 john_4_16 luke_2_35 gen_48_22 dan_3_18 luke_8_5 acts_28_27 rom_5_5 sap_7_12 john_10_38 john_3_2 john_1_1 rom_1_24 john_8_42 john_16_27 rom_8_32 john_5_20 mark_10_18 luke_22_42 john_10_3 john_5_23 deut_21_23 luke_23_43 john_20_12 john_2_18 sap_8_13 john_1_18 acts_2_6 luke_4_51 mark_1_13 rom_1_3 deut_6_4 luke_9_26 matt_25_31 mark_8_38 sap_7_26 mark_1_25 luke_4_41 gen_5_11 mark_3_17 john_17_21 john_10_17 john_6_58 john_3_13 john_5_21 rom_4_24 john_5_26 john_6_58 john_17_5 john_13_31 john_1_3 rom_9_5 sap_7_37 rom_11_35 rom_11_35 rom_1_25 john_4_2 john_17_1 john_8_17 john_14_12 john_5_22 gen_11_7 phil_2_6 rom_5_19 john_7_8 john_13_35 john_13_31 john_12_45 john_7_28 john_12_49 john_6_17 john_12_49 john_8_38 matt_11_27 luke_17_31 luke_19_43 rom_11_20 gen_18_21 tit_1_2 dan_13_42 luke_12_10 john_20_22 rom_5_5 rom

In [53]:
#  save the per-author book_chapter_verse strings to disk as a pickle
with open("data/CSEL_PL_b_c_v.pkl", "wb") as f:
    pickle.dump(authors_b_c_v, f)

In [45]:
def repl(m):
    """Replacement callback for re.sub: keep only the first captured group of the match."""
    first_group = m.groups()[0]
    return first_group

### reduce every match to its first captured group ("book_chapter"):
### join an author's matches into one string and substitute via `repl`
authors_b_c = [
    re.sub(r'(\w+_\d+)(\_)(\d+)?', repl, " ".join(matches))
    for matches in book_chapters
]
authors_b_c[0]

'luke_4  isa_1  acts_9  acts_8  deut_32  job_10 gen_3 jude_7 deut_6  sap_1  luke_7 john_1 john_7 john_4  gen_32  deut_33  deut_19  john_11 rom_12 luke_7  john_16  luke_2  john_1 gen_43 john_8  rom_3  john_5  luke_111 luke_7  john_3 rom_7  john_4  luke_2  gen_48  deut_32 dan_3  dan_3 luke_8  job_100 rom_12 acts_4 acts_28  john_1 rom_5  john_1 sap_7  john_10  john_3  john_1  rom_1  john_8  john_16  rom_8  john_5  john_8 mark_10  luke_22  phil_2 john_10  john_5  deut_21  luke_23  john_20  john__7  john_2  john_500 sap_8  john_1  rom_2 acts_2  luke_4  mark_1  rom_1  john_14 deut_6  luke_9  matt_25  mark_8  sap_7  mark_1  luke_4  acts_7 gen_5  john_3 mark_3  john_17  mark_6 john_10  john_6  john_3  john_5  rom_4  john_5  john_6 john_6  john_17  john_13  john_1  rom_9  rom_11 sap_7  rom_11  sap__37  rom_11  gen_1 rom_1  john_5 john_4  john_17  acts_4 john_8  john_14  john_5  luke_1 john_17 john_17 rom_8 gen_11  phil_2  rom_5  john_7  john_13  john_16 john_8 john_13  john_1 rom_1 john_12  joh

In [46]:
#  save the per-author book_chapter strings to disk as a pickle
with open("data/CSEL_PL_b_c.pkl", "wb") as f:
    pickle.dump(authors_b_c, f)