# Data preparation and ingestion

In [10]:
# Imports
import pandas as pd
import numpy as np
from tqdm.notebook import trange, tqdm
from elasticsearch import Elasticsearch
from statistics import median
from transformers import BigBirdModel, BigBirdTokenizer
import matplotlib.pyplot as plt

## Load the data

In [11]:
# Load in Pandas df
data_path = "../data/ancient_sources.csv.gz"
df = pd.read_csv(data_path, compression="gzip")
df.head()

Unnamed: 0,author,title,section,text
0,P. Ovidius Naso,The Epistles of Ovid,"Penelope to Ulysses, carte 53",Now corn grows where once Troy stood; and the ...
1,P. Ovidius Naso,Amores,"Liber Primus, ELEGIA 1","ELEGIA 1 Quemadmodum a Cupidine, pro bellis am..."
2,E.C. Marchant,Commentary on Thucydides: Book 7,"book 7, chapter 1, section 1",Ὁ —the art. added because this is a continuati...
3,E.C. Marchant,Commentary on Thucydides: Book 6,"book 6, chapter 1, section 1",ἐβούλοντο —‘the word is here (as in Xen. Hel. ...
4,E.C. Marchant,Commentary on Thucydides: Book 3,"book 3, chapter 1, section 1","θέρους —Thuc. divides the year into θέρος , co..."


In [12]:
# Check size
df.shape

(111752, 4)

## Data cleaning and EDA

In [13]:
# Create a function that counts number of words of a text
def count_words(text):
    return len(str(text).split())

# Append a column with the number of words of the texts
df["num_words"] = df["text"].apply(count_words)

# Show first rows
df.head()

Unnamed: 0,author,title,section,text,num_words
0,P. Ovidius Naso,The Epistles of Ovid,"Penelope to Ulysses, carte 53",Now corn grows where once Troy stood; and the ...,684
1,P. Ovidius Naso,Amores,"Liber Primus, ELEGIA 1","ELEGIA 1 Quemadmodum a Cupidine, pro bellis am...",287
2,E.C. Marchant,Commentary on Thucydides: Book 7,"book 7, chapter 1, section 1",Ὁ —the art. added because this is a continuati...,514
3,E.C. Marchant,Commentary on Thucydides: Book 6,"book 6, chapter 1, section 1",ἐβούλοντο —‘the word is here (as in Xen. Hel. ...,739
4,E.C. Marchant,Commentary on Thucydides: Book 3,"book 3, chapter 1, section 1","θέρους —Thuc. divides the year into θέρος , co...",50


In [14]:
df["num_words"].describe()

count    111752.000000
mean        173.964663
std         363.657284
min           1.000000
25%          56.000000
50%          92.000000
75%         166.000000
max       31177.000000
Name: num_words, dtype: float64

In [23]:
df_nans_text = df[df["text"].isna()]

In [25]:
df_nans_text

Unnamed: 0,author,title,section,text,num_words
3339,Quintilian,"Institutio Oratoria, Book 8",", chapter pr, section 1",,1
32225,M. Tullius Cicero,On the Agrarian Law,"text Agr., chapter 1, section 1",,1
44168,Plutarch,An vitiositas ad infelicitatem sufficia,section intro,,1
44189,Plutarch,An vitiositas ad infelicitatem sufficia,,,1
44210,Plutarch,An vitiositas ad infelicitatem sufficia,section 2,,1
...,...,...,...,...,...
71142,Plutarch,Apophthegmata Laconica,,,1
71163,Plutarch,Apophthegmata Laconica,", section 69",,1
71184,Plutarch,Apophthegmata Laconica,,,1
71205,Plutarch,Apophthegmata Laconica,", section 71",,1


## Generate embeddings for vector search

In [None]:
# Transform into a list of dicts
df_dict = df.to_dict(orient="records")

In [62]:
# Remove outliers
#df_dict.pop(4782)
#df_dict.pop(88955)
#df_dict.pop(4758)
df_dict.pop(75268)

# Calculate the longest and mean text in our data
counts = []
for src in tqdm(df_dict):
    if type(src["text"]) != str:
        src["text"] = str(src["text"])
        
    counts.append(len(src["text"].split()))

max_words = max(counts)
median_words = median(counts)

print(f"The longest text in our dataset contains {max_words} words, and the median number of words per text is {median_words}")

  0%|          | 0/111745 [00:00<?, ?it/s]

The longest text in our dataset contains 14981 words, and the median number of words per text is 92


In [64]:
np.argmax(counts)
df_dict[74688]

{'author': 'Diogenes Laertius',
 'title': 'Lives of Eminent Philosophers',
 'section': 'BOOK III, PLATO (427-347 B.C.)',
 'text': 'PLATO (427-347 B.C.) Plato was the son of Ariston and a citizen of Athens. His mother was Perictione (or Potone), who traced back her descent to Solon. For Solon had a brother, Dropides; he was the father of Critias, who was the father of Callaeschrus, who was the father of Critias, one of the Thirty, as well as of Glaucon, who was the father of Charmides and Perictione. Thus Plato, the son of this Perictione and Ariston, was in the sixth generation from Solon. And Solon traced his descent to Neleus and Poseidon. His father too is said to be in the direct line from Codrus, the son of Melanthus, and, according to Thrasylus, Codrus and Melanthus also trace their descent from Poseidon. [ 2 ] Speusippus in the work entitled Plato\'s Funeral Feast , Clearchus in his Encomium on Plato , and Anaxilaïdes in his second book On Philosophers , tell us that there was a

In [6]:
# Initialize tokenizer and model
model_name = "google/bigbird-roberta-base"
tokenizer = BigBirdTokenizer.from_pretrained(model_name)
model = BigBirdModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/846k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]