

**Make sure that you are connected to the kernel associated with our virtual environment . Go to `Kernel` -> `Change kernel` and choose `pipa_venv`.**

In [22]:
import pandas as pd
import requests
import re

import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

In [23]:
s = sddk.cloudSession("sciencedata.dk")
# establish connection with gogglesheets...
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
credentials = service_account.Credentials.from_service_account_info(file_data)
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
PIPA_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1rV4t0_UV_wcx--UAHVwkqB8Wa_5n9mnpV05yGG1OHqk/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


# Load the main dataset of ancient Greek texts

In [2]:
# find local version of the input dataset
local_paths = !find ~/Projects -name "LIRE_v1-0.json"
print(local_paths)

['/Users/kasev/Projects/paul/data/large_files/LIRE_v1-0.json']


In [3]:
# LAGT (v1.0.1) dataset directly from Zenodo
try:
    LAGT = pd.read_json(local_paths[0])
except:
    resp = requests.get("https://zenodo.org/record/4971946/files/LAGT_v1-0.json?download=1")
    # save it for next time
    LAGT = pd.DataFrame(resp.json())
    LAGT.to_json("../data/large_files/LIRE_v1-0.json")

In [4]:
# get some overview of the shape of the data
LAGT.head(5)

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences
0,tlg0001.tlg001.perseus-grc2.xml,Apollonius Rhodius,Argonautica,38822,tlg0001,tlg0001.tlg001,3 B.C.,-2.5,{'-2.5': 1},-2.5,pagan,Epici/-ae,"ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μ...",3252,"[[ἄρχω, Φοῖβος, παλαιγενής, κλέος, φώς, μιμνήσ..."
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,150118,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,Historici/-ae,Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν...,6068,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ..."
2,tlg0004.tlg001.perseus-grc1.xml,Diogenes Laertius,Lives of Eminent Philosophers,110763,tlg0004,tlg0004.tlg001,A.D. 3,2.5,{'2.5': 1},,,Biographi,Τὸ τῆς φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ βαρβάρ...,10245,"[[φιλοσοφία, ἔργον, ἔνιοι, φημί, βάρβαρος, ἄρχ..."
3,tlg0005.tlg001.perseus-grc1.xml,Theocritus,Idylls,19200,tlg0005,tlg0005.tlg001,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Bucolici,"̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ πίτυς αἰπόλε τήνα,...",1982,"[[αδύ, ψιθύρισμα, πίτυς, αἰπόλος, τῆνος, πηγή,..."
4,tlg0005.tlg002.perseus-grc1.xml,Theocritus,Epigrams,1734,tlg0005,tlg0005.tlg002,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Bucolici,τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυκνος ἐκείνα ἕ...,152,"[[ῥόδον, δροσόεντα, κατάπυκνος, ἐκεῖνος, ἕρπυλ..."


In [5]:
def flat_lemmata(sentences):
    lemmata = [word for sent in sentences for word in sent]
    return lemmata
LAGT["lemmata"] = LAGT["lemmatized_sentences"].apply(flat_lemmata)

LAGT["lemmata_wordcount"] = LAGT["lemmata"].apply(lambda x: len(x))
LAGT["lemmata_wordcount"].sum() # previously we had 13925726, then 13713183, finally 14756899

14383627

# Extract subcorpus



In [6]:
### how many documents we have
len(LAGT)

1457

In [33]:
cgl = LAGT[(LAGT["date_avr"]>-5) & (LAGT["date_avr"]<-3)].copy()

In [34]:
cgl["lemmata_wordcount"].sum()

1637456

# Subcorpora

In [35]:
# However in the case of Aristotle, we are interested only in a subselection of works associated with his name
# we coded them manually in the "include" column
c_aristotelicum_coded = pd.read_csv("../data/c_aristotelicum_OVERVIEW.csv")

In [36]:
c_arist_doc_ids = c_aristotelicum_coded[c_aristotelicum_coded["include?"] == "y"]["doc_id"].tolist()
c_arist_doc_ids[:10]

['tlg0086.tlg001',
 'tlg0086.tlg002',
 'tlg0086.tlg005',
 'tlg0086.tlg006',
 'tlg0086.tlg008',
 'tlg0086.tlg009',
 'tlg0086.tlg010',
 'tlg0086.tlg014',
 'tlg0086.tlg016',
 'tlg0086.tlg017']

In [37]:
def assign_subcorpus(row):
    subcorp = None
    if row["author_id"] == "tlg0086":
        if row["doc_id"] in c_arist_doc_ids:
            subcorp = "c_aristotelicum"
    if row["author_id"] == "tlg0627":
        subcorp = "c_hippocraticum"
    if row["author_id"] == "tlg0059":
        subcorp = "c_platonicum"
    return subcorp

In [38]:
cgl["subcorpus"] = cgl.apply(assign_subcorpus, axis=1)

In [39]:
# test
cgl[cgl["subcorpus"]=="c_aristotelicum"]


Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences,lemmata,lemmata_wordcount,subcorpus
685,tlg0086.tlg001.1st1K-grc2.xml,Aristotle,Aristotelis Analytica Priora et Posteriora,59614,tlg0086,tlg0086.tlg001,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,Πρῶτον εἰπεῖν περὶ τί καὶ τίνος ἐστὶν ἡ σκέψις...,3710,"[[πρῶτος, λέγω, εἰμί, σκέψις, ἀπόδειξις, ἐπιστ...","[πρῶτος, λέγω, εἰμί, σκέψις, ἀπόδειξις, ἐπιστή...",24680,c_aristotelicum
686,tlg0086.tlg002.1st1K-grc2.xml,Aristotle,De anima,20912,tlg0086,tlg0086.tlg002,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,Τῶν καλῶν καὶ τιμίων τὴν εἴδησιν ὑπολαμβάνοντε...,1317,"[[καλός, τίμιος, εἶδος, ὑπολαμβάνω, ἕτερος, ἕτ...","[καλός, τίμιος, εἶδος, ὑπολαμβάνω, ἕτερος, ἕτε...",9395,c_aristotelicum
688,tlg0086.tlg005.1st1K-grc1.xml,Aristotle,De caelo,30794,tlg0086,tlg0086.tlg005,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,Ἡ περὶ φύσεως ἐπιστήμη σχεδὸν ἡ πλείστη φαίνετ...,2276,"[[φύσις, ἐπιστήμη, πλεῖστος, φαίνω, σῶμα, μέγε...","[φύσις, ἐπιστήμη, πλεῖστος, φαίνω, σῶμα, μέγεθ...",13437,c_aristotelicum
689,tlg0086.tlg006.1st1K-grc1.xml,Aristotle,Categoriae,10316,tlg0086,tlg0086.tlg006,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,"ὉΜΩΝΥΜΑ λέγεται ὧν ὄνομα μόνον κοινόν, ὁ δὲ κα...",660,"[[ὁμώνυμος, λέγω, ὄνομα, μόνος, κοινός, ὄνομα,...","[ὁμώνυμος, λέγω, ὄνομα, μόνος, κοινός, ὄνομα, ...",4678,c_aristotelicum
690,tlg0086.tlg008.1st1K-grc1.xml,Aristotle,De divinatione per somnum,1194,tlg0086,tlg0086.tlg008,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,Περὶ δὲ τῆς μαντικῆς τῆς ἐν τοῖς ὕπνοις γινομέ...,68,"[[μαντικός, ὕπνος, γίγνομαι, λέγω, συμβαίνω, ἐ...","[μαντικός, ὕπνος, γίγνομαι, λέγω, συμβαίνω, ἐν...",549,c_aristotelicum
691,tlg0086.tlg009.perseus-grc1.xml,Aristotle,Eudemian Ethics (Greek). Machine readable text,26345,tlg0086,tlg0086.tlg009,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,ὁ μὲν ἐν Δήλῳ παρὰ τῷ θεῷ τὴν αὑτοῦ γνώμην ἀπ...,1683,"[[δῆλος, θεός, ἑαυτοῦ, γνώμη, ἀποφηνάμενος, συ...","[δῆλος, θεός, ἑαυτοῦ, γνώμη, ἀποφηνάμενος, συγ...",11564,c_aristotelicum
692,tlg0086.tlg010.perseus-grc1.xml,Aristotle,Nicomachean Ethics,56620,tlg0086,tlg0086.tlg010,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,"πᾶσα τέχνη καὶ πᾶσα μέθοδος, ὁμοίως δὲ πρᾶξίς ...",3920,"[[τέχνη, πᾶς, μέθοδος, πρᾶξις, προαίρεσις, ἀγα...","[τέχνη, πᾶς, μέθοδος, πρᾶξις, προαίρεσις, ἀγαθ...",25082,c_aristotelicum
694,tlg0086.tlg014.1st1K-grc1.xml,Aristotle,Historia animalium,93923,tlg0086,tlg0086.tlg014,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,ΤΩΝ ἐν τοῖς ζῴοις μορίων τὰ μέν ἐστιν ἀσύνθετα...,6337,"[[ζῷον, μόριον, εἰμί, ἀσύνθετος, ὅσος, διαιρέω...","[ζῷον, μόριον, εἰμί, ἀσύνθετος, ὅσος, διαιρέω,...",43967,c_aristotelicum
696,tlg0086.tlg016.1st1K-grc1.xml,Aristotle,De insomniis,2436,tlg0086,tlg0086.tlg016,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,"Μετὰ δὲ ταῦτα περὶ ἐνυπνίου ζητητέον, καὶ πρῶτ...",129,"[[ἐνύπνιον, ζητητέος, πρῶτος, ψυχή, φαίνω, νοη...","[ἐνύπνιον, ζητητέος, πρῶτος, ψυχή, φαίνω, νοητ...",1095,c_aristotelicum
697,tlg0086.tlg017.1st1K-grc1.xml,Aristotle,De interpretatione,6271,tlg0086,tlg0086.tlg017,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,"ΠΡΩΤΟΝ δεῖ θέσθαι τί ὄνομα καὶ τί ῥῆμα, ἔπειτα...",396,"[[πρῶτος, δεῖ, τίθημι, ὄνομα, ῥῆμα, ἀπόφασις, ...","[πρῶτος, δεῖ, τίθημι, ὄνομα, ῥῆμα, ἀπόφασις, κ...",2632,c_aristotelicum


# Testing replacements

In [10]:
cgl_list = []
for list_element in cgl["lemmata"].tolist():
    cgl_list.extend(list_element)

In [14]:
keyterm_patterns = [("^λ[υ|ύ]π.+", "λύπ*"), ("[α|ά|ἀ|ἄ]λγ.+", "ἄλγ*"), ("^[ὀ|ὠ]δ[ύ|υ]ν.", "ὀδύν*"), ("^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+", "πόνο*")]

In [16]:
# unique word forms
matches = []
for pattern_tuple in keyterm_patterns:
    r = re.compile(pattern_tuple[0])
    matches.extend([(pattern_tuple[0], pattern_tuple[1], match, cgl_list.count(match)) for match in list(filter(r.search, list(set(cgl_list))))])
matches_df = pd.DataFrame(matches, columns=["pattern", "replacement", "match", "cgl",]) #, "translation"])
matches_df

Unnamed: 0,pattern,replacement,match,cgl
0,^λ[υ|ύ]π.+,λύπ*,λυποδίαν,1
1,^λ[υ|ύ]π.+,λύπ*,λυπητικός,1
2,^λ[υ|ύ]π.+,λύπ*,λυπούσας,1
3,^λ[υ|ύ]π.+,λύπ*,λυπρᾷ,1
4,^λ[υ|ύ]π.+,λύπ*,λυπεῖταἰ,1
...,...,...,...,...
222,^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+,πόνο*,πονεέτω,4
223,^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+,πόνο*,πονήσεται,1
224,^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+,πόνο*,πονοῦσʼ,1
225,^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+,πόνο*,πονοῦντος,2


In [40]:
# set_with_dataframe(PIPA_data.add_worksheet("matches_20220517", 1,1), matches_df)

# Make the actual replacement within the texts

In [41]:
# use these regular expressions to make replacements in the list of lemmata
def replacer_word_list(pattern, product, word_list):
    return [re.sub(pattern, product, word) for word in word_list]

def replace_keywords(list_of_words, list_of_tuples):
    for pattern in list_of_tuples:
        list_of_words = replacer_word_list(pattern[0], pattern[1], list_of_words)
    return list_of_words

In [42]:
### test (includes artificial words):
word_list_test = ['βοοκ', 'πᾶς', 'μέλυπρᾷ', "ἄλγτέχνη",'τέχνη' ,'πᾶς', 'μέθοδος', 'ὅμοιος', "λύπη",'πρᾶξίς', 'προαίρεσις', 'ἀγαθός', 'ἐφίημι']
replace_keywords(word_list_test, keyterm_patterns)

['βοοκ',
 'πᾶς',
 'μέλυπρᾷ',
 'ἄλγ*',
 'τέχνη',
 'πᾶς',
 'μέθοδος',
 'ὅμοιος',
 'λύπ*',
 'πρᾶξίς',
 'προαίρεσις',
 'ἀγαθός',
 'ἐφίημι']

In [43]:
# apply the replacement on the level of individual words
cgl["lemmata_repl"] = cgl["lemmata"].apply(lambda x: replace_keywords(x, keyterm_patterns))

In [44]:
# apply the replacement on the sentences
def replace_in_sentences(list_of_sentences):
    return [replace_keywords(sentence, keyterm_patterns) for sentence in list_of_sentences]

cgl["lemmatized_sentences_repl"] = cgl["lemmatized_sentences"].apply(replace_in_sentences)

# Export the subcorpus for future usage

In [45]:
cgl.to_json("../data/large_data/cgl_repl.json")