In [1]:
import pandas as pd
import requests
import re

In [23]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

s = sddk.cloudSession("sciencedata.dk")
# establish connection with gogglesheets...
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
credentials = service_account.Credentials.from_service_account_info(file_data)
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
PIPA_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1rV4t0_UV_wcx--UAHVwkqB8Wa_5n9mnpV05yGG1OHqk/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


# Load the main dataset of ancient Greek texts

In [2]:
# find local version of the input dataset
local_paths = !find ~/Projects -name "LIRE_v1-0.json"
print(local_paths)

['/Users/kasev/Projects/paul/data/large_files/LIRE_v1-0.json']


In [3]:
# LAGT (v1.0.1) dataset directly from Zenodo
try:
    LAGT = pd.read_json(local_paths[0])
except:
    resp = requests.get("https://zenodo.org/record/4971946/files/LAGT_v1-0.json?download=1")
    # save it for next time
    LAGT = pd.DataFrame(resp.json())
    LAGT.to_json("../data/large_files/LIRE_v1-0.json")

In [5]:
# get some overview of the shape of the data
LAGT.head(5)

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences
0,tlg0001.tlg001.perseus-grc2.xml,Apollonius Rhodius,Argonautica,38822,tlg0001,tlg0001.tlg001,3 B.C.,-2.5,{'-2.5': 1},-2.5,pagan,Epici/-ae,"ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μ...",3252,"[[ἄρχω, Φοῖβος, παλαιγενής, κλέος, φώς, μιμνήσ..."
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,150118,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,Historici/-ae,Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν...,6068,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ..."
2,tlg0004.tlg001.perseus-grc1.xml,Diogenes Laertius,Lives of Eminent Philosophers,110763,tlg0004,tlg0004.tlg001,A.D. 3,2.5,{'2.5': 1},,,Biographi,Τὸ τῆς φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ βαρβάρ...,10245,"[[φιλοσοφία, ἔργον, ἔνιοι, φημί, βάρβαρος, ἄρχ..."
3,tlg0005.tlg001.perseus-grc1.xml,Theocritus,Idylls,19200,tlg0005,tlg0005.tlg001,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Bucolici,"̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ πίτυς αἰπόλε τήνα,...",1982,"[[αδύ, ψιθύρισμα, πίτυς, αἰπόλος, τῆνος, πηγή,..."
4,tlg0005.tlg002.perseus-grc1.xml,Theocritus,Epigrams,1734,tlg0005,tlg0005.tlg002,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Bucolici,τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυκνος ἐκείνα ἕ...,152,"[[ῥόδον, δροσόεντα, κατάπυκνος, ἐκεῖνος, ἕρπυλ..."


Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences
0,tlg0001.tlg001.perseus-grc2.xml,Apollonius Rhodius,Argonautica,38822,tlg0001,tlg0001.tlg001,3 B.C.,-2.5,{'-2.5': 1},-2.5,pagan,Epici/-ae,"ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μ...",3252,"[[ἄρχω, Φοῖβος, παλαιγενής, κλέος, φώς, μιμνήσ..."
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,150118,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,Historici/-ae,Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν...,6068,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ..."
2,tlg0004.tlg001.perseus-grc1.xml,Diogenes Laertius,Lives of Eminent Philosophers,110763,tlg0004,tlg0004.tlg001,A.D. 3,2.5,{'2.5': 1},,,Biographi,Τὸ τῆς φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ βαρβάρ...,10245,"[[φιλοσοφία, ἔργον, ἔνιοι, φημί, βάρβαρος, ἄρχ..."
3,tlg0005.tlg001.perseus-grc1.xml,Theocritus,Idylls,19200,tlg0005,tlg0005.tlg001,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Bucolici,"̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ πίτυς αἰπόλε τήνα,...",1982,"[[αδύ, ψιθύρισμα, πίτυς, αἰπόλος, τῆνος, πηγή,..."
4,tlg0005.tlg002.perseus-grc1.xml,Theocritus,Epigrams,1734,tlg0005,tlg0005.tlg002,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Bucolici,τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυκνος ἐκείνα ἕ...,152,"[[ῥόδον, δροσόεντα, κατάπυκνος, ἐκεῖνος, ἕρπυλ..."


In [6]:
def flat_lemmata(sentences):
    lemmata = [word for sent in sentences for word in sent]
    return lemmata
LAGT["lemmata"] = LAGT["lemmatized_sentences"].apply(flat_lemmata)

LAGT["lemmata_wordcount"] = LAGT["lemmata"].apply(lambda x: len(x))
LAGT["lemmata_wordcount"].sum() # previously we had 13925726, then 13713183, finally 14756899

14383627

# Extract subcorpus



In [7]:
### how many documents we have
len(LAGT)

1457

In [15]:
LAGT[(LAGT["date_avr"]==-5)]


Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences,lemmata,lemmata_wordcount
208,tlg0013.tlg004.perseus-grc2.xml,Homeric hymn,Hymn 4 To Hermes,4031,tlg0013herm,tlg0013.tlg004,8-6 B.C.,-5.0,"{'-7.5': 0.33330000000000004, '-6.5': 0.333300...",-7.0,pagan,[],"Ἑρμῆν ὕμνει, Μοῦσα, Διὸς καὶ Μαιάδος υἱόν, Κυλ...",332,"[[Ἑρμῆς, ὑμνέω, Μοῦσα, Ζεύς, Μαιάς, υἱός, Κυλλ...","[Ἑρμῆς, ὑμνέω, Μοῦσα, Ζεύς, Μαιάς, υἱός, Κυλλή...",2562
211,tlg0013.tlg007.perseus-grc2.xml,Homeric hymn,Hymn 7 To Dionysus,425,tlg0013dyo2,tlg0013.tlg007,8-6 B.C.,-5.0,"{'-7.5': 0.33330000000000004, '-6.5': 0.333300...",-7.0,pagan,[],"ἀμφὶ Διώνυσον, Σεμέλης ἐρικυδέος υἱόν, μνήσομα...",42,"[[Διόνυσος, σεμέλης, ἐρικυδής, υἱός, μιμνήσκω,...","[Διόνυσος, σεμέλης, ἐρικυδής, υἱός, μιμνήσκω, ...",262
223,tlg0013.tlg019.perseus-grc2.xml,Homeric hymn,Hymn 19 to Pan,336,tlg0013pan,tlg0013.tlg019,8-6 B.C.,-5.0,"{'-7.5': 0.33330000000000004, '-6.5': 0.333300...",-7.0,pagan,[],"ἀμφί μοι Ἑρμείαο φίλον γόνον ἔννεπε, Μοῦσα, αἰ...",19,"[[Ἑρμῆς, φίλος, γόνος, ἐνέπω, Μοῦσα, αἰγιπόδης...","[Ἑρμῆς, φίλος, γόνος, ἐνέπω, Μοῦσα, αἰγιπόδης,...",220
426,tlg0033.tlg001.perseus-grc2.xml,Pindar,Odes (Greek). Machine readable text - O.,5875,tlg0033,tlg0033.tlg001,6-5 B.C.,-5.0,"{'-5.5': 0.5, '-4.5': 0.5}",-4.5,pagan,Lyrici/-ae,"ἄριστον μὲν ὕδωρ, ὁ δὲ χρυσὸς αἰθόμενον πῦρ ἅ...",427,"[[ἀγαθός, ὕδωρ, χρυσός, αἴθω, πῦρ, διαπρέπω, ν...","[ἀγαθός, ὕδωρ, χρυσός, αἴθω, πῦρ, διαπρέπω, νύ...",4025
427,tlg0033.tlg002.perseus-grc2.xml,Pindar,Odes (Greek). Machine readable text - P.,7427,tlg0033,tlg0033.tlg002,6-5 B.C.,-5.0,"{'-5.5': 0.5, '-4.5': 0.5}",-4.5,pagan,Lyrici/-ae,"χρυσέα φόρμιγξ, Ἀπόλλωνος καὶ ἰοπλοκάμων σύνδ...",595,"[[χρύσεος, φόρμιγξ, Ἀπόλλων, ἰοπλοκάμων, σύνδι...","[χρύσεος, φόρμιγξ, Ἀπόλλων, ἰοπλοκάμων, σύνδικ...",5152
428,tlg0033.tlg003.perseus-grc2.xml,Pindar,Odes (Greek). Machine readable text - N.,4944,tlg0033,tlg0033.tlg003,6-5 B.C.,-5.0,"{'-5.5': 0.5, '-4.5': 0.5}",-4.5,pagan,Lyrici/-ae,"ἄμπνευμα σεμνὸν Ἀλφεοῦ, κλεινᾶν Συρακοσσᾶν θά...",402,"[[ἀνάπνευμα, σεμνός, ἀλφεοῦ, κλεινός, συρακοσσ...","[ἀνάπνευμα, σεμνός, ἀλφεοῦ, κλεινός, συρακοσσᾶ...",3393
429,tlg0033.tlg004.perseus-grc2.xml,Pindar,Odes (Greek). Machine readable text - I.,2899,tlg0033,tlg0033.tlg004,6-5 B.C.,-5.0,"{'-5.5': 0.5, '-4.5': 0.5}",-4.5,pagan,Lyrici/-ae,"μᾶτερ ἐμά, τὸ τεόν, χρύσασπι Θήβα, πρᾶγμα καὶ...",222,"[[μήτηρ, ἐμός, τεός, χρύσασπις, θήβα, πρᾶγμα, ...","[μήτηρ, ἐμός, τεός, χρύσασπις, θήβα, πρᾶγμα, ἀ...",1989
678,tlg0085.tlg001.perseus-grc2.xml,Aeschylus,Suppliant Maidens,4977,tlg0085,tlg0085.tlg001,6-5 B.C.,-5.0,"{'-5.5': 0.5, '-4.5': 0.5}",-4.5,pagan,Tragici,Ζεὺς μὲν ἀφίκτωρ ἐπίδοι προφρόνως στόλον ἡμέτε...,527,"[[Ζεύς, ἀφίκτωρ, ἐπεῖδον, στόλος, ἡμέτερος, νή...","[Ζεύς, ἀφίκτωρ, ἐπεῖδον, στόλος, ἡμέτερος, νήι...",3276
679,tlg0085.tlg002.perseus-grc2.xml,Aeschylus,Persians,5221,tlg0085,tlg0085.tlg002,6-5 B.C.,-5.0,"{'-5.5': 0.5, '-4.5': 0.5}",-4.5,pagan,Tragici,Τάδε μὲν Περσῶν τῶν οἰχομένων Ἑλλάδʼ ἐς αἶαν π...,481,"[[Πέρσης, οἴχομαι, Ἑλλάς, αἶα, πιστός, καλέω, ...","[Πέρσης, οἴχομαι, Ἑλλάς, αἶα, πιστός, καλέω, ἀ...",3482
680,tlg0085.tlg003.perseus-grc2.xml,Aeschylus,Prometheus Bound,5943,tlg0085,tlg0085.tlg003,6-5 B.C.,-5.0,"{'-5.5': 0.5, '-4.5': 0.5}",-4.5,pagan,Tragici,"Χθονὸς μὲν ἐς τηλουρὸν ἥκομεν πέδον, Σκύθην ἐς...",590,"[[χθών, τηλουρός, ἥκω, πέδον, Σκύθης, οἷμον, ἄ...","[χθών, τηλουρός, ἥκω, πέδον, Σκύθης, οἷμον, ἄβ...",3586


In [16]:
cgl = LAGT[(LAGT["date_avr"]>=-5) & (LAGT["date_avr"]<-3)].copy()

In [17]:
cgl["lemmata_wordcount"].sum()

1680907

# Subcorpora

In [19]:
# However in the case of Aristotle, we are interested only in a subselection of works associated with his name
# we coded them manually in the "include" column
c_aristotelicum_coded = pd.read_csv("../data/c_aristotelicum_OVERVIEW.csv")

In [20]:
c_arist_doc_ids = c_aristotelicum_coded[c_aristotelicum_coded["include?"] == "y"]["doc_id"].tolist()
c_arist_doc_ids[:10]

['tlg0086.tlg001',
 'tlg0086.tlg002',
 'tlg0086.tlg005',
 'tlg0086.tlg006',
 'tlg0086.tlg008',
 'tlg0086.tlg009',
 'tlg0086.tlg010',
 'tlg0086.tlg014',
 'tlg0086.tlg016',
 'tlg0086.tlg017']

In [21]:
def assign_subcorpus(row):
    subcorp = None
    if row["author_id"] == "tlg0086":
        if row["doc_id"] in c_arist_doc_ids:
            subcorp = "c_aristotelicum"
    if row["author_id"] == "tlg0627":
        subcorp = "c_hippocraticum"
    if row["author_id"] == "tlg0059":
        subcorp = "c_platonicum"
    return subcorp

In [22]:
cgl["subcorpus"] = cgl.apply(assign_subcorpus, axis=1)

In [23]:
# test
cgl[cgl["subcorpus"]=="c_aristotelicum"]


Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences,lemmata,lemmata_wordcount,subcorpus
685,tlg0086.tlg001.1st1K-grc2.xml,Aristotle,Aristotelis Analytica Priora et Posteriora,59614,tlg0086,tlg0086.tlg001,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,Πρῶτον εἰπεῖν περὶ τί καὶ τίνος ἐστὶν ἡ σκέψις...,3710,"[[πρῶτος, λέγω, εἰμί, σκέψις, ἀπόδειξις, ἐπιστ...","[πρῶτος, λέγω, εἰμί, σκέψις, ἀπόδειξις, ἐπιστή...",24680,c_aristotelicum
686,tlg0086.tlg002.1st1K-grc2.xml,Aristotle,De anima,20912,tlg0086,tlg0086.tlg002,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,Τῶν καλῶν καὶ τιμίων τὴν εἴδησιν ὑπολαμβάνοντε...,1317,"[[καλός, τίμιος, εἶδος, ὑπολαμβάνω, ἕτερος, ἕτ...","[καλός, τίμιος, εἶδος, ὑπολαμβάνω, ἕτερος, ἕτε...",9395,c_aristotelicum
688,tlg0086.tlg005.1st1K-grc1.xml,Aristotle,De caelo,30794,tlg0086,tlg0086.tlg005,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,Ἡ περὶ φύσεως ἐπιστήμη σχεδὸν ἡ πλείστη φαίνετ...,2276,"[[φύσις, ἐπιστήμη, πλεῖστος, φαίνω, σῶμα, μέγε...","[φύσις, ἐπιστήμη, πλεῖστος, φαίνω, σῶμα, μέγεθ...",13437,c_aristotelicum
689,tlg0086.tlg006.1st1K-grc1.xml,Aristotle,Categoriae,10316,tlg0086,tlg0086.tlg006,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,"ὉΜΩΝΥΜΑ λέγεται ὧν ὄνομα μόνον κοινόν, ὁ δὲ κα...",660,"[[ὁμώνυμος, λέγω, ὄνομα, μόνος, κοινός, ὄνομα,...","[ὁμώνυμος, λέγω, ὄνομα, μόνος, κοινός, ὄνομα, ...",4678,c_aristotelicum
690,tlg0086.tlg008.1st1K-grc1.xml,Aristotle,De divinatione per somnum,1194,tlg0086,tlg0086.tlg008,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,Περὶ δὲ τῆς μαντικῆς τῆς ἐν τοῖς ὕπνοις γινομέ...,68,"[[μαντικός, ὕπνος, γίγνομαι, λέγω, συμβαίνω, ἐ...","[μαντικός, ὕπνος, γίγνομαι, λέγω, συμβαίνω, ἐν...",549,c_aristotelicum
691,tlg0086.tlg009.perseus-grc1.xml,Aristotle,Eudemian Ethics (Greek). Machine readable text,26345,tlg0086,tlg0086.tlg009,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,ὁ μὲν ἐν Δήλῳ παρὰ τῷ θεῷ τὴν αὑτοῦ γνώμην ἀπ...,1683,"[[δῆλος, θεός, ἑαυτοῦ, γνώμη, ἀποφηνάμενος, συ...","[δῆλος, θεός, ἑαυτοῦ, γνώμη, ἀποφηνάμενος, συγ...",11564,c_aristotelicum
692,tlg0086.tlg010.perseus-grc1.xml,Aristotle,Nicomachean Ethics,56620,tlg0086,tlg0086.tlg010,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,"πᾶσα τέχνη καὶ πᾶσα μέθοδος, ὁμοίως δὲ πρᾶξίς ...",3920,"[[τέχνη, πᾶς, μέθοδος, πρᾶξις, προαίρεσις, ἀγα...","[τέχνη, πᾶς, μέθοδος, πρᾶξις, προαίρεσις, ἀγαθ...",25082,c_aristotelicum
694,tlg0086.tlg014.1st1K-grc1.xml,Aristotle,Historia animalium,93923,tlg0086,tlg0086.tlg014,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,ΤΩΝ ἐν τοῖς ζῴοις μορίων τὰ μέν ἐστιν ἀσύνθετα...,6337,"[[ζῷον, μόριον, εἰμί, ἀσύνθετος, ὅσος, διαιρέω...","[ζῷον, μόριον, εἰμί, ἀσύνθετος, ὅσος, διαιρέω,...",43967,c_aristotelicum
696,tlg0086.tlg016.1st1K-grc1.xml,Aristotle,De insomniis,2436,tlg0086,tlg0086.tlg016,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,"Μετὰ δὲ ταῦτα περὶ ἐνυπνίου ζητητέον, καὶ πρῶτ...",129,"[[ἐνύπνιον, ζητητέος, πρῶτος, ψυχή, φαίνω, νοη...","[ἐνύπνιον, ζητητέος, πρῶτος, ψυχή, φαίνω, νοητ...",1095,c_aristotelicum
697,tlg0086.tlg017.1st1K-grc1.xml,Aristotle,De interpretatione,6271,tlg0086,tlg0086.tlg017,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,"ΠΡΩΤΟΝ δεῖ θέσθαι τί ὄνομα καὶ τί ῥῆμα, ἔπειτα...",396,"[[πρῶτος, δεῖ, τίθημι, ὄνομα, ῥῆμα, ἀπόφασις, ...","[πρῶτος, δεῖ, τίθημι, ὄνομα, ῥῆμα, ἀπόφασις, κ...",2632,c_aristotelicum


# Testing replacements

In [24]:
cgl_list = []
for list_element in cgl["lemmata"].tolist():
    cgl_list.extend(list_element)

In [25]:
keyterm_patterns = [("^λ[υ|ύ]π.+", "λύπ*"), ("[α|ά|ἀ|ἄ]λγ.+", "ἄλγ*"), ("^[ὀ|ὠ]δ[ύ|υ]ν.", "ὀδύν*"), ("^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+", "πόνο*")]

In [26]:
# unique word forms
matches = []
for pattern_tuple in keyterm_patterns:
    r = re.compile(pattern_tuple[0])
    matches.extend([(pattern_tuple[0], pattern_tuple[1], match, cgl_list.count(match)) for match in list(filter(r.search, list(set(cgl_list))))])
matches_df = pd.DataFrame(matches, columns=["pattern", "replacement", "match", "cgl",]) #, "translation"])
matches_df

Unnamed: 0,pattern,replacement,match,cgl
0,^λ[υ|ύ]π.+,λύπ*,λυπέουσα,1
1,^λ[υ|ύ]π.+,λύπ*,λυπέον,8
2,^λ[υ|ύ]π.+,λύπ*,λυποδίαν,1
3,^λ[υ|ύ]π.+,λύπ*,λυπέωσι,1
4,^λ[υ|ύ]π.+,λύπ*,λυπούμενα,2
...,...,...,...,...
225,^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+,πόνο*,πονέω,301
226,^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+,πόνο*,πονέεις,1
227,^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+,πόνο*,πονοῦσʼ,1
228,^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+,πόνο*,πονοῦντος,2


In [27]:
# set_with_dataframe(PIPA_data.add_worksheet("matches_20220517", 1,1), matches_df)

# Make the actual replacement within the texts

In [28]:
# use these regular expressions to make replacements in the list of lemmata
def replacer_word_list(pattern, product, word_list):
    return [re.sub(pattern, product, word) for word in word_list]

def replace_keywords(list_of_words, list_of_tuples):
    for pattern in list_of_tuples:
        list_of_words = replacer_word_list(pattern[0], pattern[1], list_of_words)
    return list_of_words

In [29]:
### test (includes artificial words):
word_list_test = ['βοοκ', 'πᾶς', 'μέλυπρᾷ', "ἄλγτέχνη",'τέχνη' ,'πᾶς', 'μέθοδος', 'ὅμοιος', "λύπη",'πρᾶξίς', 'προαίρεσις', 'ἀγαθός', 'ἐφίημι']
replace_keywords(word_list_test, keyterm_patterns)

['βοοκ',
 'πᾶς',
 'μέλυπρᾷ',
 'ἄλγ*',
 'τέχνη',
 'πᾶς',
 'μέθοδος',
 'ὅμοιος',
 'λύπ*',
 'πρᾶξίς',
 'προαίρεσις',
 'ἀγαθός',
 'ἐφίημι']

In [30]:
# apply the replacement on the level of individual words
cgl["lemmata_repl"] = cgl["lemmata"].apply(lambda x: replace_keywords(x, keyterm_patterns))

In [31]:
# apply the replacement on the sentences
def replace_in_sentences(list_of_sentences):
    return [replace_keywords(sentence, keyterm_patterns) for sentence in list_of_sentences]

cgl["lemmatized_sentences_repl"] = cgl["lemmatized_sentences"].apply(replace_in_sentences)

# Pain words overview

In [33]:
keywords = ['πόνο*', 'ὀδύν*', 'ἄλγ*', 'λύπ*']
for keyword in keywords:
    cgl["count_" + keyword] = cgl["lemmata_repl"].apply(lambda x: x.count(keyword))

In [34]:
authors_overview = cgl.groupby("author").sum().drop(["date_avr", "date_manual"], axis=1)
authors_overview

Unnamed: 0_level_0,wordcount,n_sentences,lemmata_wordcount,count_πόνο*,count_ὀδύν*,count_ἄλγ*,count_λύπ*
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aeneas Tacticus,13993,745,6902,6,0,0,0
Aeschines,47136,3281,23031,2,0,1,5
Aeschines [Sp.],4273,188,2055,0,0,2,0
Aeschylus,40335,4020,25848,70,3,34,6
Alcidamas,5983,263,3016,5,0,0,1
Andocides,18987,1958,8902,1,0,0,2
Antiphon,22136,2405,9936,1,0,0,1
Antisthenes,924,61,417,0,0,0,0
Aristophanes,94897,11585,50255,21,4,13,18
Aristoteles,12571,792,5196,2,0,0,3


In [None]:
#set_with_dataframe(PIPA_data.add_worksheet("authors_overview",1,1), authors_overview, include_index=True)

In [35]:
subcorpora_overview_simple = cgl.groupby("subcorpus").sum().drop(["date_avr", "date_manual"], axis=1)
subcorpora_overview_simple

Unnamed: 0_level_0,wordcount,n_sentences,lemmata_wordcount,count_πόνο*,count_ὀδύν*,count_ἄλγ*,count_λύπ*
subcorpus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c_aristotelicum,785703,56737,343165,103,3,34,406
c_hippocraticum,333443,24456,171332,657,645,315,58
c_platonicum,574294,52384,272931,121,14,67,316


In [None]:
#set_with_dataframe(PIPA_data.add_worksheet("subcorpora_overview_simple", 1,1), subcorpora_overview_simple, include_index=True)

In [None]:
subcorpora_overview = cgl[cgl["subcorpus"].notnull()][['subcorpus', 'title', 'wordcount', 'n_sentences', 'lemmata_wordcount', 'count_πόνο*','count_ὀδύν*', 'count_ἄλγ*', 'count_λύπ*']]
subcorpora_overview

In [None]:
#set_with_dataframe(PIPA_data.add_worksheet("subcorpora_overview", 1,1), subcorpora_overview)

# Extract concordances with lypé

In [37]:
cgl.head(5)

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,...,lemmatized_sentences,lemmata,lemmata_wordcount,subcorpus,lemmata_repl,lemmatized_sentences_repl,count_πόνο*,count_ὀδύν*,count_ἄλγ*,count_λύπ*
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,150118,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ...","[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελο...",71863,,"[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελο...","[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ...",31,0,6,25
6,tlg0006.tlg001.perseus-grc2.xml,Euripides,Cyclops,4141,tlg0006,tlg0006.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[[Βρόμιος, ἔχω, πόνος, χὥτʼ, ἥβη, ἐμός, εὐσθεν...","[Βρόμιος, ἔχω, πόνος, χὥτʼ, ἥβη, ἐμός, εὐσθενέ...",2535,,"[Βρόμιος, ἔχω, πόνο*, χὥτʼ, ἥβη, ἐμός, εὐσθενέ...","[[Βρόμιος, ἔχω, πόνο*, χὥτʼ, ἥβη, ἐμός, εὐσθεν...",7,0,0,1
7,tlg0006.tlg004.perseus-grc2.xml,Euripides,Ἡρακλεῖδαι,6272,tlg0006,tlg0006.tlg004,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[[ποτός, εἰμί, οὗτος, δεδογμένον], [δίκαιος, φ...","[ποτός, εἰμί, οὗτος, δεδογμένον, δίκαιος, φύω,...",3545,,"[ποτός, εἰμί, οὗτος, δεδογμένον, δίκαιος, φύω,...","[[ποτός, εἰμί, οὗτος, δεδογμένον], [δίκαιος, φ...",11,0,1,2
8,tlg0006.tlg005.perseus-grc2.xml,Euripides,Ἱππόλυτος,8257,tlg0006,tlg0006.tlg005,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις,...","[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις, ...",4898,,"[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις, ...","[[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις,...",8,3,8,8
9,tlg0006.tlg006.perseus-grc2.xml,Euripides,Ἀνδρομάχη,7397,tlg0006,tlg0006.tlg006,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πο...","[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πολ...",4420,,"[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πολ...","[[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πο...",5,0,7,4


In [50]:
# how to get index for searched elements:
wordlist = ["a", "b", "c", "d", "e", "f", "g", "a", "b", "c", "d", "e", "f", "g", "h"]
indeces = [el[0] for el in enumerate(wordlist) if el[1]=="d"]
indeces

[3, 10]

In [67]:
def get_concordances(wordlist, keyword, window):
    half = int(window / 2)
    keyword_indices = [el[0] for el in enumerate(wordlist) if el[1]==keyword]
    concordances = [wordlist[i-half:i+half+1] for i in keyword_indices]
    concordances = [c for c in concordances if len(c)==window]
    return concordances

In [68]:
get_concordances(wordlist, "b", 5)

[['g', 'a', 'b', 'c', 'd']]

In [73]:
cgl["conc_lype"] = cgl["lemmata_repl"].apply(lambda x: get_concordances(x, "λύπ*", 31))

In [77]:
cgl["conc_lype"].tolist()[0]

[['μέγας',
  'κινδυνεύοντας',
  'δέχομαι',
  'ἀείμνηστος',
  'μαρτύριον',
  'χάρις',
  'κατατίθημι',
  'ναυτικός',
  'κτέομαι',
  'πολύς',
  'σκέπτομαι',
  'εὐπραξία',
  'σπάνιος',
  'τὶς',
  'πολέμιος',
  'λύπ*',
  'πολύς',
  'χρῆμα',
  'χάρις',
  'τιμάω',
  'δύναμις',
  'προσγίγνομαι',
  'οὗτος',
  'πάρειμι',
  'αὐτεπάγγελτος',
  'κίνδυνος',
  'δαπάνη',
  'διδοῦσα',
  'φέρω',
  'πολύς',
  'ἀρετή'],
 ['Λακεδαιμόνιος',
  'διαμέλλω',
  'οἴομαι',
  'ἡσυχία',
  'ἄνθρωπος',
  'πολύς',
  'ἀρκέω',
  'παρασκευή',
  'δίκαιος',
  'πράσσω',
  'γνώμη',
  'ἀδικέω',
  'δῆλος',
  'εἰμί',
  'ἐπιτρέψοντες',
  'λύπ*',
  'ἄλλος',
  'αὐτός',
  'ἀμύνω',
  'βλάπτω',
  'ἴσος',
  'νέμω',
  'πόλις',
  'ὅμοιος',
  'παροικοῦντες',
  'τυγχάνω',
  'οὗτος',
  'δηλόω',
  'ἀρχαιότροπος',
  'ἐπιτήδευμα',
  'εἰμί'],
 ['μέγας',
  'κίνδυνος',
  'τίθημι',
  'Λακεδαιμόνιος',
  'Πελοπόννησος',
  'πόλις',
  'ὠφέλιμος',
  'καθίστημι',
  'ἐξηγέομαι',
  'ὑπομένω',
  'πᾶς',
  'ἀπέχθομαι',
  'ἡγεμονία',
  'οἶδα',
  'ἥσσων',
  'λ

# Export the subcorpus for future usage

In [78]:
cgl.to_json("../data/large_data/cgl.json")