In [0]:
### Prerequisties
# most of the packages are available via anaconda
# some of them require to be installed separately

# some of the packages (especially nltk) are necessary only for preprocessing
# feel free to ignore them and move directly into later sections

import os
import pandas as pd
import numpy as np
import nltk
import pickle
import json
import sys
import re

import csv

from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET

### for plotting
import matplotlib.pyplot as plt

import gspread

from gspread_dataframe import get_as_dataframe, set_with_dataframe

# a few other packages are imported in the section on visualizing distances

In [1]:
### to establish connection with google drive
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
### to work with GoogleSheets

from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials

gc = gspread.authorize(GoogleCredentials.get_application_default())

# To start with parsed data

In [0]:
### read the file from GoogleDrive

### path in the VK drive - other users have to modify accordingly...
CSEL_raw_df = pd.read_csv("/content/gdrive/My Drive/aaa_computation/AUCTORITATES/AUCTORITATES B - extraction of biblical references from footnotes/v3.3 (implementation of abbrs from JarKub)/data/CSEL_raw.csv", names=["file_name", "author", "title", "volume","text"])


In [10]:
CSEL_raw_df

Unnamed: 0,file_name,author,title,volume,text
0,stoa0045a.stoa001.opp-lat1.xml,,"Epistulae Imperatorum Pontificum Aliorum, I",unknown,\n\n\n (1.) QVAE GESTA SUNT INTER LIBERIUM ET ...
1,stoa0045a.stoa001.opp-lat2.xml,,"Epistulae Imperatorum Pontificum Aliorum, II",unknown,\n\n\n\n(105.)\n\n\n\n SACRO ET BEA...
2,stoa0162.stoa004.opp-lat3.xml,Hieronymus,"Epistulae, 120-154",unknown,\n\n\n\nCXXI. AD ALGASIAM LIBER QVAESTIONUM UN...
3,stoa0162.stoa004.opp-lat2.xml,Hieronymus,"Epistulae, 71-120",unknown,\n\n\n\n\nLXXI. AD LUCINUM BAETICUM. \n\n\n\nN...
4,stoa0162.stoa004.opp-lat1.xml,Hieronymus,"Epistulae, 1-70",unknown,\n\n\nS. EVSEBII HIERONVMI EPISTVLAE I-LXX. \n...
5,stoa0162.stoa024.opp-lat1.xml,Hieronymus,In Hieremiam Prophetam,unknown,\n\n\n\n\nPROLOGUS. \n\n\n\n Post explanation...
6,stoa0215a.stoa002.opp-lat1.xml,Optatus Milevitanus,Contra Parmenianum Donatistam,unknown,\n\n\n\nAppendix decem monumentorum ueterum ad...
7,stoa0215a.stoa001.opp-lat1.xml,Optatus Milevitanus,Contra Parmenianum Donatistam,unknown,\n\n\n\nLIBER PRIMUS. \n\n\nCunctos nos christ...
8,stoa0089a.stoa002.opp-lat1.xml,Claudianus Mamertus,De Statu Animae,unknown,\n\n\nCLAVDIANI MAMERTI EPISTVLAE DVAE. EPISTV...
9,stoa0089a.stoa001.opp-lat1.xml,Claudianus Mamertus,De Statu Animae,unknown,"\n\n\n\nEPISTVLA FAVSTI. \n\nQuaeris a me, reu..."


In [0]:
authors_kuben = get_as_dataframe(gc.open_by_url('https://docs.google.com/spreadsheets/d/1F9EKced2JVW4a34Wu3XYLZ-kaZGP6Qt3Nh0GQjdp3to/edit?usp=sharing').worksheet("Authors"))

In [0]:
authors_kuben

In [9]:
### what unique authors are hidden behind the tag author

print(CSEL_raw_df["author"].unique())

[nan 'Hieronymus' 'Optatus Milevitanus' 'Claudianus Mamertus'
 'Paulinus Petricordiae' 'Ennodius' 'Lucifer Calaritanus'
 'Hilarius Pictaviensis' 'Cyprianus' 'Sulpicius Severus' 'Iuvencius'
 'Anonymous' 'Paulinus Nolanus' 'Pseudo-Hilarius' 'Tertullianus'
 'Minutius Felix' 'Eucherius' 'Sedulius' 'Salvianus' 'Arnobius'
 'Pseudo-Cyprianus' 'Boethius' 'Victor Vitensis'
 'Victorinus Petavionensis' 'Paulinus\n\t\t\t\t\t\t\t\tPetricordiae'
 'Lactantius' 'Pseudo-Augustinus' 'Orosius' 'Priscillianus'
 'Flavius Iosephus' 'Ambrosius' 'Evagrius' 'Augustinus' 'Rufinus'
 'Faustus Reiensis' 'Commodianus' 'Cassianus' 'Iohannes Cassianus'
 'Filastrius' 'Eugippius']


In [6]:
len(CSEL_raw_df["author"].unique())

40

In [0]:
### transform the author column to string type
CSEL_raw_df["author"] = CSEL_raw_df["author"].astype(str)

In [8]:
### replace disturbing characters from authors' names
CSEL_raw_df["author"] = CSEL_raw_df["author"].replace({
    "\n" : " ",
    "\t" : "",
    "nan" : "unknown",
    "Anonymous" : "unknown",
    "Iohannes Cassianus" : "Cassianus",   
    }, regex=True)
sorted(CSEL_raw_df["author"].unique().tolist())

['Ambrosius',
 'Arnobius',
 'Augustinus',
 'Boethius',
 'Cassianus',
 'Claudianus Mamertus',
 'Commodianus',
 'Cyprianus',
 'Ennodius',
 'Eucherius',
 'Eugippius',
 'Evagrius',
 'Faustus Reiensis',
 'Filastrius',
 'Flavius Iosephus',
 'Hieronymus',
 'Hilarius Pictaviensis',
 'Iuvencius',
 'Lactantius',
 'Lucifer Calaritanus',
 'Minutius Felix',
 'Optatus Milevitanus',
 'Orosius',
 'Paulinus Nolanus',
 'Paulinus Petricordiae',
 'Priscillianus',
 'Pseudo-Augustinus',
 'Pseudo-Cyprianus',
 'Pseudo-Hilarius',
 'Rufinus',
 'Salvianus',
 'Sedulius',
 'Sulpicius Severus',
 'Tertullianus',
 'Victor Vitensis',
 'Victorinus Petavionensis',
 'unknown']

In [9]:
len(CSEL_raw_df)

278

In [10]:
CSEL_raw_df[CSEL_raw_df["author"]=="unknown"]["file_name"].tolist()

['stoa0045a.stoa001.opp-lat1.xml',
 'stoa0045a.stoa001.opp-lat2.xml',
 'stoa0227b.stoa001.opp-lat1.xml',
 'stoa0329.stoa001.opp-lat1.xml',
 'stoa0028e.stoa001.opp-lat1.xml',
 'stoa0028e.stoa001.opp-lat2.xml',
 'stoa0329c.stoa001.opp-lat1.xml',
 'stoa0117p.stoa001.opp-lat1.xml',
 'stoa0054.stoa001a.opp-lat1.xml',
 'stoa0007.stoa002.opp-lat1.xml',
 'stoa0111.stoa001.opp-lat3.xml',
 'stoa0278a.stoa001.opp-lat1.xml']

In [11]:
### merge rows with the same author
CSEL_grouped_df = CSEL_raw_df.groupby(["author"]).sum()
CSEL_grouped_df.reset_index(level=[0], inplace=True)
CSEL_grouped_df.head(5)

Unnamed: 0,author,file_name,title,volume,text
0,Ambrosius,stoa0022.stoa021.opp-lat1.xmlstoa0022.stoa019....,HexameronDe IacobDe IacobDe IacobHexameronExpo...,unknownunknownunknownunknownunknownunknownunkn...,\n\n\n\n[LIBER PRIMVS.]\n\n De paradiso in sup...
1,Arnobius,stoa0034.stoa001.opp-lat1.xml,Adversus Nationes,unknown,\n\n\n\nLIBER PRIMVS\n\nQuoniam comperi nonnul...
2,Augustinus,stoa0120d.stoa001.opp-lat1.xmlstoa0040.stoa056...,Contra FelicemDe Perfectione Iustitiae Hominis...,unknownunknownunknownunknownunknownunknownunkn...,\n\n\n1. EUODII DE FIDE CONTRA MANICHAEOS. \n\...
3,Boethius,stoa0058.stoa001.opp-lat3.xmlstoa0058.stoa007....,De Consolatione PhilosophiaeCommentaria in Por...,unknownunknownunknown,\n\n\nANICII MANLII SEVERINI BOETHII\n\n\nPHIL...
4,Cassianus,stoa0076c.stoa002.opp-lat2.xmlstoa0076c.stoa00...,InstitutionesInstitutionesConlationes,unknownunknownunknown,\n\n\n\nPRAEFATIO. \n\n\nUeteris instrumenti n...


In [12]:
len(CSEL_grouped_df)

37

In [0]:
authors_word_counts = {}
for author, text in zip(CSEL_grouped_df["author"].tolist(), CSEL_grouped_df["text"].tolist()):
    authors_word_counts[author] = len(text.split())

In [14]:
authors_word_counts

{'Ambrosius': 885580,
 'Arnobius': 93920,
 'Augustinus': 3523400,
 'Boethius': 155451,
 'Cassianus': 337862,
 'Claudianus Mamertus': 62407,
 'Commodianus': 63889,
 'Cyprianus': 691880,
 'Ennodius': 173599,
 'Eucherius': 55442,
 'Eugippius': 370282,
 'Evagrius': 20585,
 'Faustus Reiensis': 140744,
 'Filastrius': 40146,
 'Flavius Iosephus': 47413,
 'Hieronymus': 740652,
 'Hilarius Pictaviensis': 80588,
 'Iuvencius': 44853,
 'Lactantius': 376422,
 'Lucifer Calaritanus': 105594,
 'Minutius Felix': 39577,
 'Optatus Milevitanus': 76746,
 'Orosius': 242516,
 'Paulinus Nolanus': 264778,
 'Paulinus Petricordiae': 125985,
 'Priscillianus': 46662,
 'Pseudo-Augustinus': 155884,
 'Pseudo-Cyprianus': 110456,
 'Pseudo-Hilarius': 1195,
 'Rufinus': 41078,
 'Salvianus': 106197,
 'Sedulius': 112702,
 'Sulpicius Severus': 89301,
 'Tertullianus': 349441,
 'Victor Vitensis': 39620,
 'Victorinus Petavionensis': 44145,
 'unknown': 341418}

In [15]:
### check the number of words
len((" ".join(CSEL_grouped_df["text"].tolist())).split())

10198410

# Bibl Abbreviations

In [0]:
abbr_kase_df = get_as_dataframe(gc.open_by_url('https://docs.google.com/spreadsheets/d/1vWUsY1moh6b5A27YDxPSuibayXNLRUV97Hz9TICyBBo/edit?usp=sharing').worksheet("replace_abbr"))

In [40]:
repl_tuple = list(abbr_kase_df.itertuples(index=False, name=None))
print(repl_tuple[:10])

[('Gn ', 'Gen. '), ('Genes.', 'Gen.'), ('Ex.', 'Exo.'), ('Lv.', 'Lev.'), ('Nm.', 'Num.'), ('Dt.', 'Deut'), ('I Sm.', '1Sam.'), ('1Sm.', '1Sam.'), ('1 Sam.', '1Sam.'), ('II Sm.', '2Sam.')]


In [41]:
len(repl_tuple)

150

In [17]:
### upload abbreviatians variants as found by Mr. Kuben into a dataframe

abbr_kuben = get_as_dataframe(gc.open_by_url("https://docs.google.com/spreadsheets/d/15_acFNFaTUgTW4DODk4hdMcgvGeyQlTbx783gTrEXhI/edit?usp=sharing").worksheet("List 1"))
abbr_kuben.head(5)

Unnamed: 0,Book_ID,Full_name,Abbr1,Abbr2,Abbr3,Abbr4,Abbr5,Abbr6,Unnamed: 8,Unnamed: 9,...,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
0,1.0,Genesis,Gen.,Genes.,Gon.,Qen.,Oen.,,,,...,,,,,,,,,,
1,2.0,Exodus,Ex.,Exod.,Ei.,E*.,Er.,EL,Eiod.,Ezod.,...,,,,,,,,,,
2,3.0,Leviticus,Levit.,Leuit.,Lev.,LeY.,Leu.,Len.,Leoit.,,...,,,,,,,,,,
3,4.0,Numerorum,Num.,Nam.,Nom.,NUID.,Nurn.,NumBr.,Nomer.,Nuin.,...,,,,,,,,,,
4,5.0,Deuteronomii,Deut.,Dent.,Deot.,Deat.,Deuteron.,Deuter.,Deuteroii.,Deater.,...,,,,,,,,,,


In [18]:
### upload the table of different forms of biblical abbr (from DZ)

abbr_conv_df = get_as_dataframe(gc.open_by_key("1byVbRwnQ058--0ud1_NBt2xgIhuiV7LK1BmCm67CTxA").worksheet("AbbrevConverter"))
abbr_conv_df[6:11]

Unnamed: 0,Book_ID,SBL_handbook_of_style,CSEL_VK,CSEL_OGL,English_full,Latin1,Latin2,Latin_full,Czech,Czech_full,...,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33
6,1.0,Gen,Gen.,,Genesis,Gn,Gn,Genesis,Gn,Genesis,...,,,,,,,,,,
7,2.0,Exod,Exo.,,Exodus,Ex,Ex,Exodus,Ex,Exodus,...,,,,,,,,,,
8,3.0,Lev,Lev.,,Leviticus,Lv,Lv,Leviticus,Lv,Leviticus,...,,,,,,,,,,
9,4.0,Num,Num.,,Numbers,Nm,Nm,Numeri,Nu,Numeri,...,,,,,,,,,,
10,5.0,Deut,Deut.,,Deuteronomy,Dt,Dt,Deuteronomium,Dt,Deuteronomium,...,,,,,,,,,,


In [19]:
### we can navigate  through the converter by "Book_ID" variable

abbr_conv_df.loc[abbr_conv_df["Book_ID"]==5, "CSEL_VK"].values[0]

'Deut.'

In [42]:
for row in abbr_kuben.values.tolist():
  if row[0]>0:
    replaced_abbr = abbr_conv_df.loc[abbr_conv_df["Book_ID"]==float(row[0]), "CSEL_VK"].values[0]
    for element in row[2:]:
      if "nan" not in str(element):
        tuple_pair = (element, replaced_abbr)
        if tuple_pair not in repl_tuple:
          repl_tuple.append(tuple_pair)
len(repl_tuple)

453

In [43]:
print(repl_tuple)

[('Gn ', 'Gen. '), ('Genes.', 'Gen.'), ('Ex.', 'Exo.'), ('Lv.', 'Lev.'), ('Nm.', 'Num.'), ('Dt.', 'Deut'), ('I Sm.', '1Sam.'), ('1Sm.', '1Sam.'), ('1 Sam.', '1Sam.'), ('II Sm.', '2Sam.'), ('2 Sam.', '2Sam.'), ('Ios.', 'Josh.'), ('Idc.', 'Judg.'), ('Jdg.', 'Judg.'), ('1 Reg.', '1Kgs.'), ('II Reg.', '2Kgs.'), ('2 Reg.', '2Kgs.'), ('2 Rg.', '2Kgs.'), ('II Rg.', '2Kgs.'), ('2Rg.', '2Kgs.'), ('2 Ki.', '2Kgs.'), (' II Ki.', '2Kgs.'), ('I Reg.', '1Kgs.'), ('1 Rg.', '1Kgs.'), ('I Rg.', '1Kgs.'), ('1Rg.', '1Kgs.'), ('1 Ki.', '1Kgs.'), ('I Ki.', '1Kgs.'), ('2 Ch.', '2Chr.'), ('II Ch.', '2Chr.'), ('2 Chr.', '2Chr.'), ('II Chr.', '2Chr.'), ('2 Par.', '2Chr.'), ('II Par.', '2Chr.'), ('1 Ch.', '1Chr.'), ('I Ch.', '1Chr.'), ('1 Chr.', '1Chr.'), ('I Chr.', '1Chr.'), ('1 Par.', '1Chr.'), ('I Par.', '1Chr.'), ('Est.', 'Esth.'), ('Hiob', 'Job'), ('Iob', 'Job'), ('PB.', 'Psa.'), ('Psal.', 'Psa.'), ('Ps.', 'Psa.'), ('Ecc.', 'Eccl.'), ('Ecl.', 'Eccl.'), ('Prov.', 'Pro.'), ('Prou.', 'Pro.'), ('Prv.', 'Pro.')

In [0]:
repl_tuple.to_pickle ???



In [0]:
### we will only replace " " with "_"for now

In [0]:
CSEL_raw_df["text"].replace(replace_abbr, regex=True, inplace=True)

In [45]:
CSEL_raw_df["text"]

0      \n\n\n (1.) QVAE GESTA SUNT INTER LIBERIUM ET ...
1      \n\n\n\n(105.)\n\n\n\n            SACRO ET BEA...
2      \n\n\n\nCXXI. AD ALGASIAM LIBER QVAESTIONUM UN...
3      \n\n\n\n\nLXXI. AD LUCINUM BAETICUM. \n\n\n\nN...
4      \n\n\nS. EVSEBII HIERONVMI EPISTVLAE I-LXX. \n...
5      \n\n\n\n\nPROLOGUS. \n\n\n\n  Post explanation...
6      \n\n\n\nAppendix decem monumentorum ueterum ad...
7      \n\n\n\nLIBER PRIMUS. \n\n\nCunctos nos christ...
8      \n\n\nCLAVDIANI MAMERTI EPISTVLAE DVAE. EPISTV...
9      \n\n\n\nEPISTVLA FAVSTI. \n\nQuaeris a me, reu...
10     \n\n\nPOMPONII UERSUS AD GRATIAM DOMINI. \n\n\...
11     \n\n\nV. [EVCHARISTICVM DE VITA SVA.] IN NOMIN...
12     \n\n\nX. BENEDICTIO CEREI. \n\nDignum et iustu...
13     \n\n\nIIII. DE VITA BEATI ANTONI MONACHI. PRAE...
14     \n\n\nDICTIONES. \n\n\n\nI. DICTIO IN NATALI L...
15     \n\n\nMAGNI FELICIS ENNODI CARMINA. \n\n\n LIB...
16     \n\n\nVII. PRAECEPTVM QVANDO IVSSI SVNT OMNES ...
17     \n\n\nVIII. PETITORIVM Q

In [0]:
### saving the dataframe into a pickle file
CSEL_raw_df.to_pickle("data/CSEL_modified_df.pkl")