In [108]:
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup

In [2]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

s = sddk.cloudSession("sciencedata.dk")
# establish connection with gogglesheets...
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
credentials = service_account.Credentials.from_service_account_info(file_data)
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
PIPA_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1rV4t0_UV_wcx--UAHVwkqB8Wa_5n9mnpV05yGG1OHqk/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


# Load the main dataset of ancient Greek texts

In [4]:
# find local version of the input dataset
local_paths = !find ~/Projects -name "LAGT_v1-0.json"
print(local_paths)

['/Users/kasev/Projects/ECCE_DIK/data/large_data/LAGT_v1-0.json']


In [6]:
# LAGT (v1.0.1) dataset directly from Zenodo
try:
    LAGT = pd.read_json(local_paths[0])
except:
    resp = requests.get("https://zenodo.org/record/4971946/files/LAGT_v1-0.json?download=1")
    # save it for next time
    LAGT = pd.DataFrame(resp.json())
    LAGT.to_json("../data/large_files/LIRE_v1-0.json")

In [7]:
# get some overview of the shape of the data
LAGT.head(20)

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences
0,tlg0001.tlg001.perseus-grc2.xml,Apollonius Rhodius,Argonautica,38822,tlg0001,tlg0001.tlg001,3 B.C.,-2.5,{'-2.5': 1},-2.5,pagan,Epici/-ae,"ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μ...",3252,"[[ἄρχω, Φοῖβος, παλαιγενής, κλέος, φώς, μιμνήσ..."
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,150118,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,Historici/-ae,Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν...,6068,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ..."
2,tlg0004.tlg001.perseus-grc1.xml,Diogenes Laertius,Lives of Eminent Philosophers,110763,tlg0004,tlg0004.tlg001,A.D. 3,2.5,{'2.5': 1},,,Biographi,Τὸ τῆς φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ βαρβάρ...,10245,"[[φιλοσοφία, ἔργον, ἔνιοι, φημί, βάρβαρος, ἄρχ..."
3,tlg0005.tlg001.perseus-grc1.xml,Theocritus,Idylls,19200,tlg0005,tlg0005.tlg001,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Bucolici,"̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ πίτυς αἰπόλε τήνα,...",1982,"[[αδύ, ψιθύρισμα, πίτυς, αἰπόλος, τῆνος, πηγή,..."
4,tlg0005.tlg002.perseus-grc1.xml,Theocritus,Epigrams,1734,tlg0005,tlg0005.tlg002,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Bucolici,τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυκνος ἐκείνα ἕ...,152,"[[ῥόδον, δροσόεντα, κατάπυκνος, ἐκεῖνος, ἕρπυλ..."
5,tlg0005.tlg003.1st1K-grc1.xml,Theocritus,Syrinx,77,tlg0005,tlg0005.tlg003,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Bucolici,Οὐδενὸς εὐνάτειρα Μακροπτολέμοιο δὲ μάτηρ μαία...,3,"[[εὐνητήρ, μακροπτολέμοιο, μήτηρ, Μαιάς, ἀντιπ..."
6,tlg0006.tlg001.perseus-grc2.xml,Euripides,Cyclops,4141,tlg0006,tlg0006.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,Tragici,"Ὦ Βρόμιε, διὰ σὲ μυρίους ἔχω πόνους νῦν χὥτʼ ἐ...",529,"[[Βρόμιος, ἔχω, πόνος, χὥτʼ, ἥβη, ἐμός, εὐσθεν..."
7,tlg0006.tlg004.perseus-grc2.xml,Euripides,Ἡρακλεῖδαι,6272,tlg0006,tlg0006.tlg004,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,Tragici,Πάλαι ποτʼ ἐστὶ τοῦτʼ ἐμοὶ δεδογμένον· ὁ μὲν ...,662,"[[ποτός, εἰμί, οὗτος, δεδογμένον], [δίκαιος, φ..."
8,tlg0006.tlg005.perseus-grc2.xml,Euripides,Ἱππόλυτος,8257,tlg0006,tlg0006.tlg005,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,Tragici,"Πολλὴ μὲν ἐν βροτοῖσι κοὐκ ἀνώνυμος, θεὰ κέκλ...",918,"[[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις,..."
9,tlg0006.tlg006.perseus-grc2.xml,Euripides,Ἀνδρομάχη,7397,tlg0006,tlg0006.tlg006,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,Tragici,"Ἀσιάτιδος γῆς σχῆμα, Θηβαία πόλι, ὅθεν ποθʼ ἕ...",712,"[[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πο..."


In [8]:
def flat_lemmata(sentences):
    lemmata = [word for sent in sentences for word in sent]
    return lemmata
LAGT["lemmata"] = LAGT["lemmatized_sentences"].apply(flat_lemmata)

LAGT["lemmata_wordcount"] = LAGT["lemmata"].apply(lambda x: len(x))
LAGT["lemmata_wordcount"].sum() # previously we had 13925726, then 13713183, finally 14756899

14383627

# Extract subcorpus



In [9]:
### how many documents we have
len(LAGT)

1457

In [10]:
LAGT[(LAGT["date_avr"]==-5)]

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences,lemmata,lemmata_wordcount
208,tlg0013.tlg004.perseus-grc2.xml,Homeric hymn,Hymn 4 To Hermes,4031,tlg0013herm,tlg0013.tlg004,8-6 B.C.,-5.0,"{'-7.5': 0.33330000000000004, '-6.5': 0.333300...",-7.0,pagan,[],"Ἑρμῆν ὕμνει, Μοῦσα, Διὸς καὶ Μαιάδος υἱόν, Κυλ...",332,"[[Ἑρμῆς, ὑμνέω, Μοῦσα, Ζεύς, Μαιάς, υἱός, Κυλλ...","[Ἑρμῆς, ὑμνέω, Μοῦσα, Ζεύς, Μαιάς, υἱός, Κυλλή...",2562
211,tlg0013.tlg007.perseus-grc2.xml,Homeric hymn,Hymn 7 To Dionysus,425,tlg0013dyo2,tlg0013.tlg007,8-6 B.C.,-5.0,"{'-7.5': 0.33330000000000004, '-6.5': 0.333300...",-7.0,pagan,[],"ἀμφὶ Διώνυσον, Σεμέλης ἐρικυδέος υἱόν, μνήσομα...",42,"[[Διόνυσος, σεμέλης, ἐρικυδής, υἱός, μιμνήσκω,...","[Διόνυσος, σεμέλης, ἐρικυδής, υἱός, μιμνήσκω, ...",262
223,tlg0013.tlg019.perseus-grc2.xml,Homeric hymn,Hymn 19 to Pan,336,tlg0013pan,tlg0013.tlg019,8-6 B.C.,-5.0,"{'-7.5': 0.33330000000000004, '-6.5': 0.333300...",-7.0,pagan,[],"ἀμφί μοι Ἑρμείαο φίλον γόνον ἔννεπε, Μοῦσα, αἰ...",19,"[[Ἑρμῆς, φίλος, γόνος, ἐνέπω, Μοῦσα, αἰγιπόδης...","[Ἑρμῆς, φίλος, γόνος, ἐνέπω, Μοῦσα, αἰγιπόδης,...",220
426,tlg0033.tlg001.perseus-grc2.xml,Pindar,Odes (Greek). Machine readable text - O.,5875,tlg0033,tlg0033.tlg001,6-5 B.C.,-5.0,"{'-5.5': 0.5, '-4.5': 0.5}",-4.5,pagan,Lyrici/-ae,"ἄριστον μὲν ὕδωρ, ὁ δὲ χρυσὸς αἰθόμενον πῦρ ἅ...",427,"[[ἀγαθός, ὕδωρ, χρυσός, αἴθω, πῦρ, διαπρέπω, ν...","[ἀγαθός, ὕδωρ, χρυσός, αἴθω, πῦρ, διαπρέπω, νύ...",4025
427,tlg0033.tlg002.perseus-grc2.xml,Pindar,Odes (Greek). Machine readable text - P.,7427,tlg0033,tlg0033.tlg002,6-5 B.C.,-5.0,"{'-5.5': 0.5, '-4.5': 0.5}",-4.5,pagan,Lyrici/-ae,"χρυσέα φόρμιγξ, Ἀπόλλωνος καὶ ἰοπλοκάμων σύνδ...",595,"[[χρύσεος, φόρμιγξ, Ἀπόλλων, ἰοπλοκάμων, σύνδι...","[χρύσεος, φόρμιγξ, Ἀπόλλων, ἰοπλοκάμων, σύνδικ...",5152
428,tlg0033.tlg003.perseus-grc2.xml,Pindar,Odes (Greek). Machine readable text - N.,4944,tlg0033,tlg0033.tlg003,6-5 B.C.,-5.0,"{'-5.5': 0.5, '-4.5': 0.5}",-4.5,pagan,Lyrici/-ae,"ἄμπνευμα σεμνὸν Ἀλφεοῦ, κλεινᾶν Συρακοσσᾶν θά...",402,"[[ἀνάπνευμα, σεμνός, ἀλφεοῦ, κλεινός, συρακοσσ...","[ἀνάπνευμα, σεμνός, ἀλφεοῦ, κλεινός, συρακοσσᾶ...",3393
429,tlg0033.tlg004.perseus-grc2.xml,Pindar,Odes (Greek). Machine readable text - I.,2899,tlg0033,tlg0033.tlg004,6-5 B.C.,-5.0,"{'-5.5': 0.5, '-4.5': 0.5}",-4.5,pagan,Lyrici/-ae,"μᾶτερ ἐμά, τὸ τεόν, χρύσασπι Θήβα, πρᾶγμα καὶ...",222,"[[μήτηρ, ἐμός, τεός, χρύσασπις, θήβα, πρᾶγμα, ...","[μήτηρ, ἐμός, τεός, χρύσασπις, θήβα, πρᾶγμα, ἀ...",1989
678,tlg0085.tlg001.perseus-grc2.xml,Aeschylus,Suppliant Maidens,4977,tlg0085,tlg0085.tlg001,6-5 B.C.,-5.0,"{'-5.5': 0.5, '-4.5': 0.5}",-4.5,pagan,Tragici,Ζεὺς μὲν ἀφίκτωρ ἐπίδοι προφρόνως στόλον ἡμέτε...,527,"[[Ζεύς, ἀφίκτωρ, ἐπεῖδον, στόλος, ἡμέτερος, νή...","[Ζεύς, ἀφίκτωρ, ἐπεῖδον, στόλος, ἡμέτερος, νήι...",3276
679,tlg0085.tlg002.perseus-grc2.xml,Aeschylus,Persians,5221,tlg0085,tlg0085.tlg002,6-5 B.C.,-5.0,"{'-5.5': 0.5, '-4.5': 0.5}",-4.5,pagan,Tragici,Τάδε μὲν Περσῶν τῶν οἰχομένων Ἑλλάδʼ ἐς αἶαν π...,481,"[[Πέρσης, οἴχομαι, Ἑλλάς, αἶα, πιστός, καλέω, ...","[Πέρσης, οἴχομαι, Ἑλλάς, αἶα, πιστός, καλέω, ἀ...",3482
680,tlg0085.tlg003.perseus-grc2.xml,Aeschylus,Prometheus Bound,5943,tlg0085,tlg0085.tlg003,6-5 B.C.,-5.0,"{'-5.5': 0.5, '-4.5': 0.5}",-4.5,pagan,Tragici,"Χθονὸς μὲν ἐς τηλουρὸν ἥκομεν πέδον, Σκύθην ἐς...",590,"[[χθών, τηλουρός, ἥκω, πέδον, Σκύθης, οἷμον, ἄ...","[χθών, τηλουρός, ἥκω, πέδον, Σκύθης, οἷμον, ἄβ...",3586


In [12]:
LAGT[LAGT["date_avr"]==-3]

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences,lemmata,lemmata_wordcount
3,tlg0005.tlg001.perseus-grc1.xml,Theocritus,Idylls,19200,tlg0005,tlg0005.tlg001,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Bucolici,"̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ πίτυς αἰπόλε τήνα,...",1982,"[[αδύ, ψιθύρισμα, πίτυς, αἰπόλος, τῆνος, πηγή,...","[αδύ, ψιθύρισμα, πίτυς, αἰπόλος, τῆνος, πηγή, ...",11879
4,tlg0005.tlg002.perseus-grc1.xml,Theocritus,Epigrams,1734,tlg0005,tlg0005.tlg002,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Bucolici,τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυκνος ἐκείνα ἕ...,152,"[[ῥόδον, δροσόεντα, κατάπυκνος, ἐκεῖνος, ἕρπυλ...","[ῥόδον, δροσόεντα, κατάπυκνος, ἐκεῖνος, ἕρπυλλ...",1077
5,tlg0005.tlg003.1st1K-grc1.xml,Theocritus,Syrinx,77,tlg0005,tlg0005.tlg003,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Bucolici,Οὐδενὸς εὐνάτειρα Μακροπτολέμοιο δὲ μάτηρ μαία...,3,"[[εὐνητήρ, μακροπτολέμοιο, μήτηρ, Μαιάς, ἀντιπ...","[εὐνητήρ, μακροπτολέμοιο, μήτηρ, Μαιάς, ἀντιπέ...",60
376,tlg0029.tlg004.perseus-grc1.xml,Dinarchus,Against Demosthenes,8434,tlg0029,tlg0029.tlg004,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Oratores,"ὁ μὲν δημαγωγὸς ὑμῖν, ὦ Ἀθηναῖοι, καὶ θανάτου ...",883,"[[δημαγωγός, Ἀθηναῖος, θάνατος, τετιμημένος, ἐ...","[δημαγωγός, Ἀθηναῖος, θάνατος, τετιμημένος, ἐξ...",4152
377,tlg0029.tlg005.perseus-grc1.xml,Dinarchus,Against Aristogiton,2035,tlg0029,tlg0029.tlg005,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Oratores,"πάνθʼ, ὡς ἔοικεν, ὦ Ἀθηναῖοι, προσδοκητέα ἐστὶ...",216,"[[πᾶς, ἔοικα, Ἀθηναῖος, προσδοκητέα, εἰμί, ἀκο...","[πᾶς, ἔοικα, Ἀθηναῖος, προσδοκητέα, εἰμί, ἀκού...",974
378,tlg0029.tlg006.perseus-grc1.xml,Dinarchus,Against Philocles,1632,tlg0029,tlg0029.tlg006,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,Oratores,τί χρὴ λέγειν πρὸς τῶν θεῶν περὶ τοιούτων ἀνθρ...,186,"[[χρή, λέγω, θεός, τοιοῦτος, ἄνθρωπος, τοιοῦτο...","[χρή, λέγω, θεός, τοιοῦτος, ἄνθρωπος, τοιοῦτος...",776
754,tlg0093.ogl001.1st1K-grc1.xml,Theophrastus,Concerning Odours,5880,tlg0093,tlg0093.ogl001,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",-3.5,pagan,Philosophici/-ae,Ι. Αἱ ὀσμαὶ τὸ μὲν ὅλον ἐκ μίξεώς εἰσι καθάπερ...,704,"[[ι], [ὀσμή, ὅλος, μίξεώς, χυμός], [ἄμικτος, ἅ...","[ι, ὀσμή, ὅλος, μίξεώς, χυμός, ἄμικτος, ἅπας, ...",2477
755,tlg0093.ogl002.1st1K-grc1.xml,Theophrastus,Enquiry into Plants,4032,tlg0093,tlg0093.ogl002,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",-3.5,pagan,Philosophici/-ae,. Σημεῖα ὑδάτων καὶ πνευμάτων καὶ χειμώνων καὶ...,558,"[[], [σημεῖον, ὕδωρ, πνεῦμα, χειμών, εὔδιος, γ...","[σημεῖον, ὕδωρ, πνεῦμα, χειμών, εὔδιος, γράφω,...",2113
756,tlg0093.tlg001.1st1K-grc1.xml,Theophrastus,Enquiry into Plants,81451,tlg0093,tlg0093.tlg001,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",-3.5,pagan,Philosophici/-ae,Ι. Τῶν φυτῶν τὰς διαφορὰς καὶ τὴν ἄλλην φύσιν ...,12309,"[[ι], [φυτόν, διαφορά, ἄλλος, φύσις, ληπτέος, ...","[ι, φυτόν, διαφορά, ἄλλος, φύσις, ληπτέος, μέρ...",35095
757,tlg0093.tlg003.1st1K-grc1.xml,Theophrastus,De sensu et sensibilibus,9044,tlg0093,tlg0093.tlg003,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",-3.5,pagan,Philosophici/-ae,. . Περὶ δʼ αἰσθήσεως αἱ μὲν πολλαὶ καὶθόλου δ...,662,"[[], [], [αἴσθησις, πολύς, καὶθόλου, δοκέω, εἰ...","[αἴσθησις, πολύς, καὶθόλου, δοκέω, εἰμί, ὅμοιο...",3968


In [16]:
border_authors = "tlg0093", "tlg0085", "tlg0093", "tlg0341", "tlg0751"

In [17]:
cgl = LAGT[((LAGT["date_avr"]>-5) & (LAGT["date_avr"]<-3)) | (LAGT["author_id"].isin(border_authors))].copy()
len(cgl)

371

In [18]:
cgl["lemmata_wordcount"].sum()

1729265

# Subcorpora

In [22]:
# However in the case of Aristotle, we are interested only in a subselection of works associated with his name
# we coded them manually in the "include" column
c_aristotelicum_coded = pd.read_csv("../data/c_aristotelicum_OVERVIEW.csv")
c_aristotelicum_coded

Unnamed: 0,filename,include?,author,title,author_id,doc_id,wordcount,lemmata_wordcount,num_of_sents
0,tlg0086.tlg001.1st1K-grc2.xml,y,Aristotle,Aristotelis Analytica Priora et Posteriora; Ar...,tlg0086,tlg0086.tlg001,59772,12287,3384
1,tlg0086.tlg002.1st1K-grc2.xml,y,Aristotle,De anima; Aritoteles De anima,tlg0086,tlg0086.tlg002,20988,5579,1250
2,tlg0086.tlg003.perseus-grc1.xml,,Aristotle,Athenian Constitution; Machine readable text; ...,tlg0086,tlg0086.tlg003,16536,4243,817
3,tlg0086.tlg005.1st1K-grc1.xml,y,Aristotle,De caelo; Aristoteles De coelo et De generatio...,tlg0086,tlg0086.tlg005,31395,8370,1856
4,tlg0086.tlg006.1st1K-grc1.xml,y,Aristotle,Categoriae; Aristotelis Opera,tlg0086,tlg0086.tlg006,10317,2865,646
5,tlg0086.tlg008.1st1K-grc1.xml,y,Aristotle,De divinatione per somnum; Aristotelis Opera,tlg0086,tlg0086.tlg008,1207,305,66
6,tlg0086.tlg009.perseus-grc1.xml,y,Aristotle,Eudemian Ethics (Greek). Machine readable text...,tlg0086,tlg0086.tlg009,26361,7183,1604
7,tlg0086.tlg010.perseus-grc1.xml,y,Aristotle,Nicomachean Ethics; Machine readable text; ed....,tlg0086,tlg0086.tlg010,56638,14995,3715
8,tlg0086.tlg013.1st1K-grc2.xml,,Aristotle,De generatione et corruptione; Aristoteles De ...,tlg0086,tlg0086.tlg013,17173,4452,1025
9,tlg0086.tlg014.1st1K-grc1.xml,y,Aristotle,Historia animalium; Aristotelis Opera,tlg0086,tlg0086.tlg014,94068,26141,6094


In [23]:
c_arist_doc_ids = c_aristotelicum_coded[c_aristotelicum_coded["include?"] == "y"]["doc_id"].tolist()
c_arist_doc_ids[:10]

['tlg0086.tlg001',
 'tlg0086.tlg002',
 'tlg0086.tlg005',
 'tlg0086.tlg006',
 'tlg0086.tlg008',
 'tlg0086.tlg009',
 'tlg0086.tlg010',
 'tlg0086.tlg014',
 'tlg0086.tlg016',
 'tlg0086.tlg017']

In [24]:
def assign_subcorpus(row):
    subcorp = None
    if row["author_id"] == "tlg0086":
        if row["doc_id"] in c_arist_doc_ids:
            subcorp = "c_aristotelicum"
    if row["author_id"] == "tlg0627":
        subcorp = "c_hippocraticum"
    if row["author_id"] == "tlg0059":
        subcorp = "c_platonicum"
    return subcorp

In [25]:
cgl["subcorpus"] = cgl.apply(assign_subcorpus, axis=1)

In [26]:
# test
cgl[cgl["subcorpus"]=="c_aristotelicum"]


Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences,lemmata,lemmata_wordcount,subcorpus
685,tlg0086.tlg001.1st1K-grc2.xml,Aristotle,Aristotelis Analytica Priora et Posteriora,59614,tlg0086,tlg0086.tlg001,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,Πρῶτον εἰπεῖν περὶ τί καὶ τίνος ἐστὶν ἡ σκέψις...,3710,"[[πρῶτος, λέγω, εἰμί, σκέψις, ἀπόδειξις, ἐπιστ...","[πρῶτος, λέγω, εἰμί, σκέψις, ἀπόδειξις, ἐπιστή...",24680,c_aristotelicum
686,tlg0086.tlg002.1st1K-grc2.xml,Aristotle,De anima,20912,tlg0086,tlg0086.tlg002,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,Τῶν καλῶν καὶ τιμίων τὴν εἴδησιν ὑπολαμβάνοντε...,1317,"[[καλός, τίμιος, εἶδος, ὑπολαμβάνω, ἕτερος, ἕτ...","[καλός, τίμιος, εἶδος, ὑπολαμβάνω, ἕτερος, ἕτε...",9395,c_aristotelicum
688,tlg0086.tlg005.1st1K-grc1.xml,Aristotle,De caelo,30794,tlg0086,tlg0086.tlg005,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,Ἡ περὶ φύσεως ἐπιστήμη σχεδὸν ἡ πλείστη φαίνετ...,2276,"[[φύσις, ἐπιστήμη, πλεῖστος, φαίνω, σῶμα, μέγε...","[φύσις, ἐπιστήμη, πλεῖστος, φαίνω, σῶμα, μέγεθ...",13437,c_aristotelicum
689,tlg0086.tlg006.1st1K-grc1.xml,Aristotle,Categoriae,10316,tlg0086,tlg0086.tlg006,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,"ὉΜΩΝΥΜΑ λέγεται ὧν ὄνομα μόνον κοινόν, ὁ δὲ κα...",660,"[[ὁμώνυμος, λέγω, ὄνομα, μόνος, κοινός, ὄνομα,...","[ὁμώνυμος, λέγω, ὄνομα, μόνος, κοινός, ὄνομα, ...",4678,c_aristotelicum
690,tlg0086.tlg008.1st1K-grc1.xml,Aristotle,De divinatione per somnum,1194,tlg0086,tlg0086.tlg008,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,Περὶ δὲ τῆς μαντικῆς τῆς ἐν τοῖς ὕπνοις γινομέ...,68,"[[μαντικός, ὕπνος, γίγνομαι, λέγω, συμβαίνω, ἐ...","[μαντικός, ὕπνος, γίγνομαι, λέγω, συμβαίνω, ἐν...",549,c_aristotelicum
691,tlg0086.tlg009.perseus-grc1.xml,Aristotle,Eudemian Ethics (Greek). Machine readable text,26345,tlg0086,tlg0086.tlg009,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,ὁ μὲν ἐν Δήλῳ παρὰ τῷ θεῷ τὴν αὑτοῦ γνώμην ἀπ...,1683,"[[δῆλος, θεός, ἑαυτοῦ, γνώμη, ἀποφηνάμενος, συ...","[δῆλος, θεός, ἑαυτοῦ, γνώμη, ἀποφηνάμενος, συγ...",11564,c_aristotelicum
692,tlg0086.tlg010.perseus-grc1.xml,Aristotle,Nicomachean Ethics,56620,tlg0086,tlg0086.tlg010,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,"πᾶσα τέχνη καὶ πᾶσα μέθοδος, ὁμοίως δὲ πρᾶξίς ...",3920,"[[τέχνη, πᾶς, μέθοδος, πρᾶξις, προαίρεσις, ἀγα...","[τέχνη, πᾶς, μέθοδος, πρᾶξις, προαίρεσις, ἀγαθ...",25082,c_aristotelicum
694,tlg0086.tlg014.1st1K-grc1.xml,Aristotle,Historia animalium,93923,tlg0086,tlg0086.tlg014,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,ΤΩΝ ἐν τοῖς ζῴοις μορίων τὰ μέν ἐστιν ἀσύνθετα...,6337,"[[ζῷον, μόριον, εἰμί, ἀσύνθετος, ὅσος, διαιρέω...","[ζῷον, μόριον, εἰμί, ἀσύνθετος, ὅσος, διαιρέω,...",43967,c_aristotelicum
696,tlg0086.tlg016.1st1K-grc1.xml,Aristotle,De insomniis,2436,tlg0086,tlg0086.tlg016,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,"Μετὰ δὲ ταῦτα περὶ ἐνυπνίου ζητητέον, καὶ πρῶτ...",129,"[[ἐνύπνιον, ζητητέος, πρῶτος, ψυχή, φαίνω, νοη...","[ἐνύπνιον, ζητητέος, πρῶτος, ψυχή, φαίνω, νοητ...",1095,c_aristotelicum
697,tlg0086.tlg017.1st1K-grc1.xml,Aristotle,De interpretatione,6271,tlg0086,tlg0086.tlg017,4 B.C.,-3.5,{'-3.5': 1},-3.5,pagan,Philosophici/-ae,"ΠΡΩΤΟΝ δεῖ θέσθαι τί ὄνομα καὶ τί ῥῆμα, ἔπειτα...",396,"[[πρῶτος, δεῖ, τίθημι, ὄνομα, ῥῆμα, ἀπόφασις, ...","[πρῶτος, δεῖ, τίθημι, ὄνομα, ῥῆμα, ἀπόφασις, κ...",2632,c_aristotelicum


# GLAUx

In [83]:
import os
import re

In [100]:
path = "../../glaux-trees/public/xml"
filenames = os.listdir(path)
filenames[:10]

['0057-094.xml',
 '0010-009.xml',
 '1342-002.xml',
 '0057-057.xml',
 '0010-021.xml',
 '0057-043.xml',
 '0007-049.xml',
 '0007-075.xml',
 '0007-061.xml',
 '0555-001.xml']

In [101]:
len(filenames)

936

In [102]:
def from_filename_to_docid(filename):
    parts = re.split("\-|\.", filename)[:2]
    docid = "tlg"+parts[0] + ".tlg" + parts[1]
    return docid
glaux_docids = [from_filename_to_docid(filename) for filename in filenames]
glaux_docids

['tlg0057.tlg094',
 'tlg0010.tlg009',
 'tlg1342.tlg002',
 'tlg0057.tlg057',
 'tlg0010.tlg021',
 'tlg0057.tlg043',
 'tlg0007.tlg049',
 'tlg0007.tlg075',
 'tlg0007.tlg061',
 'tlg0555.tlg001',
 'tlg0545.tlg001',
 'tlg0540.tlg030',
 'tlg0014.tlg029',
 'tlg0641.tlg001',
 'tlg0032.tlg003',
 'tlg0540.tlg024',
 'tlg0284.tlg012',
 'tlg0014.tlg015',
 'tlg0014.tlg001',
 'tlg0284.tlg006',
 'tlg0540.tlg018',
 'tlg0062.tlg035',
 'tlg0086.tlg020',
 'tlg0059.tlg028',
 'tlg0627.tlg009',
 'tlg0086.tlg034',
 'tlg0062.tlg021',
 'tlg0526.tlg004',
 'tlg0062.tlg009',
 'tlg0059.tlg014',
 'tlg0627.tlg021',
 'tlg0086.tlg008',
 'tlg0627.tlg035',
 'tlg0007.tlg082a',
 'tlg0551.tlg009',
 'tlg0005.tlg005',
 'tlg0261.tlg003',
 'tlg0551.tlg008',
 'tlg0059.tlg001',
 'tlg0086.tlg009',
 'tlg0062.tlg008',
 'tlg0627.tlg020',
 'tlg0059.tlg015',
 'tlg0131.tlg001',
 'tlg0086.tlg035',
 'tlg0627.tlg008',
 'tlg0062.tlg020',
 'tlg0062.tlg034',
 'tlg0059.tlg029',
 'tlg0086.tlg021',
 'tlg0284.tlg007',
 'tlg0540.tlg019',
 'tlg0198.t

In [104]:
len(cgl)

371

In [106]:
len(set(cgl["doc_id"]) & set(glaux_docids))

340

In [110]:
f = open(path + "/0086-010.xml", "r")

In [111]:
soup = BeautifulSoup(f)

In [124]:
lemmata_sentences = []
for sent in soup.find_all("sentence"):
    sentence_lemmata = []
    for w in sent.find_all("word"):
        if w["postag"][0] in ["n", "v", "a"]:
            sentence_lemmata.append(w["lemma"])
    lemmata_sentences.append(sentence_lemmata)

In [126]:
lemmata_sentences[:30]

[['πᾶς',
  'τέχνη',
  'πᾶς',
  'μέθοδος',
  'πρᾶξις',
  'προαίρεσις',
  'ἀγαθός',
  'τις',
  'ἐφίημι',
  'δοκέω'],
 ['ἀποφαίνω', 'ἀγαθός', 'πᾶς', 'ἐφίημι'],
 ['διαφορά', 'τις', 'φαίνω', 'τέλος'],
 ['εἰμί', 'ἐνέργεια', 'αὐτός', 'ἔργον', 'τις'],
 ['εἰμί',
  'τέλος',
  'τις',
  'πρᾶξις',
  'οὗτος',
  'ἀγαθός',
  'φύω',
  'ἐνέργεια',
  'ἔργον'],
 ['πολύς',
  'πρᾶξις',
  'εἰμί',
  'τέχνη',
  'ἐπιστήμη',
  'πολύς',
  'γίγνομαι',
  'τέλος'],
 ['ἰατρικός',
  'ὑγίεια',
  'ναυπηγικός',
  'πλοῖον',
  'στρατηγικός',
  'νίκη',
  'οἰκονομικός',
  'πλοῦτος'],
 ['εἰμί',
  'τοιοῦτος',
  'εἷς',
  'τις',
  'δύναμις',
  'ἱππικός',
  'χαλινοποιική',
  'ἄλλος',
  'ἱππικός',
  'ὄργανον',
  'εἰμί',
  'οὗτος',
  'πᾶς',
  'πολεμικός',
  'πρᾶξις',
  'στρατηγικός',
  'αὐτός',
  'τρόπος',
  'ἄλλος',
  'ἕτερος'],
 ['ἅπας', 'ἀρχιτεκτονικός', 'τέλος', 'πᾶς', 'εἰμί', 'αἱρετός', 'αὐτός'],
 ['οὗτος', 'χάρις', 'ἐκεῖνος', 'διώκω'],
 ['διαφέρω',
  'οὐδείς',
  'ἐνέργεια',
  'αὐτός',
  'εἰμί',
  'τέλος',
  'πρᾶξις',
  'οὗτος

# Testing replacements

In [27]:
cgl_list = []
for list_element in cgl["lemmata"].tolist():
    cgl_list.extend(list_element)

In [28]:
keyterm_patterns = [("^λ[υ|ύ]π.+", "λύπ*"), ("[α|ά|ἀ|ἄ]λγ.+", "ἄλγ*"), ("^[ὀ|ὠ]δ[ύ|υ]ν.", "ὀδύν*"), ("^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+", "πόνο*")]

In [29]:
# unique word forms
matches = []
for pattern_tuple in keyterm_patterns:
    r = re.compile(pattern_tuple[0])
    matches.extend([(pattern_tuple[0], pattern_tuple[1], match, cgl_list.count(match)) for match in list(filter(r.search, list(set(cgl_list))))])
matches_df = pd.DataFrame(matches, columns=["pattern", "replacement", "match", "cgl",]) #, "translation"])
matches_df

Unnamed: 0,pattern,replacement,match,cgl
0,^λ[υ|ύ]π.+,λύπ*,λυπηθῆ,1
1,^λ[υ|ύ]π.+,λύπ*,λύπη,560
2,^λ[υ|ύ]π.+,λύπ*,λυπήσαντα,1
3,^λ[υ|ύ]π.+,λύπ*,λυπέω,365
4,^λ[υ|ύ]π.+,λύπ*,λυπηρός,152
...,...,...,...,...
227,^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+,πόνο*,πονοῦντιτοῦτό,1
228,^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+,πόνο*,πονέσῃ,8
229,^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+,πόνο*,πονήσασιν,2
230,^π[ό|ο]ν[ο|ό|έ|ε|η|ή|ῆ](?!ρ).+,πόνο*,πονεούσης,1


In [27]:
# set_with_dataframe(PIPA_data.add_worksheet("matches_20220517", 1,1), matches_df)

# Make the actual replacement within the texts

In [30]:
# use these regular expressions to make replacements in the list of lemmata
def replacer_word_list(pattern, product, word_list):
    return [re.sub(pattern, product, word) for word in word_list]

def replace_keywords(list_of_words, list_of_tuples):
    for pattern in list_of_tuples:
        list_of_words = replacer_word_list(pattern[0], pattern[1], list_of_words)
    return list_of_words

In [31]:
### test (includes artificial words):
word_list_test = ['βοοκ', 'πᾶς', 'μέλυπρᾷ', "ἄλγτέχνη",'τέχνη' ,'πᾶς', 'μέθοδος', 'ὅμοιος', "λύπη",'πρᾶξίς', 'προαίρεσις', 'ἀγαθός', 'ἐφίημι']
replace_keywords(word_list_test, keyterm_patterns)

['βοοκ',
 'πᾶς',
 'μέλυπρᾷ',
 'ἄλγ*',
 'τέχνη',
 'πᾶς',
 'μέθοδος',
 'ὅμοιος',
 'λύπ*',
 'πρᾶξίς',
 'προαίρεσις',
 'ἀγαθός',
 'ἐφίημι']

In [32]:
# apply the replacement on the level of individual words
cgl["lemmata_repl"] = cgl["lemmata"].apply(lambda x: replace_keywords(x, keyterm_patterns))

In [33]:
# apply the replacement on the sentences
def replace_in_sentences(list_of_sentences):
    return [replace_keywords(sentence, keyterm_patterns) for sentence in list_of_sentences]

cgl["lemmatized_sentences_repl"] = cgl["lemmatized_sentences"].apply(replace_in_sentences)

# Pain words overview

In [34]:
keywords = ['πόνο*', 'ὀδύν*', 'ἄλγ*', 'λύπ*']
for keyword in keywords:
    cgl["count_" + keyword] = cgl["lemmata_repl"].apply(lambda x: x.count(keyword))

In [36]:
authors_overview = cgl.groupby("author_id").sum().drop(["date_avr", "date_manual"], axis=1)
authors_overview

Unnamed: 0_level_0,wordcount,n_sentences,lemmata_wordcount,count_πόνο*,count_ὀδύν*,count_ἄλγ*,count_λύπ*
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tlg0003,150118,6068,71863,31,0,6,25
tlg0006,134129,14482,81439,179,11,74,74
tlg0010,119155,4681,56497,45,0,11,47
tlg0011,61793,6197,34437,68,7,60,35
tlg0014,294362,14564,143390,21,0,7,40
tlg0016,184947,10149,94799,27,1,10,12
tlg0017,34075,2986,16836,0,0,0,1
tlg0019,94897,11585,50255,21,4,13,18
tlg0026,51409,3469,25086,2,0,3,5
tlg0027,18987,1958,8902,1,0,0,2


In [44]:
author_id_dict = dict(zip(cgl["author_id"].tolist(), cgl["author"].tolist()))

In [57]:
authors = authors_overview.reset_index()["author_id"].apply(lambda x: author_id_dict[x])
authors = authors.tolist()

In [58]:
authors_overview["author"] = authors

In [59]:
authors_overview

Unnamed: 0_level_0,wordcount,n_sentences,lemmata_wordcount,count_πόνο*,count_ὀδύν*,count_ἄλγ*,count_λύπ*,author
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tlg0003,150118,6068,71863,31,0,6,25,Thucydides
tlg0006,134129,14482,81439,179,11,74,74,Euripides
tlg0010,119155,4681,56497,45,0,11,47,Isocrates
tlg0011,61793,6197,34437,68,7,60,35,Sophocles
tlg0014,294362,14564,143390,21,0,7,40,Demosthenes
tlg0016,184947,10149,94799,27,1,10,12,Herodotus
tlg0017,34075,2986,16836,0,0,0,1,Isaeus
tlg0019,94897,11585,50255,21,4,13,18,Aristophanes
tlg0026,51409,3469,25086,2,0,3,5,Aeschines [Sp.]
tlg0027,18987,1958,8902,1,0,0,2,Andocides


In [66]:
authors_overview.columns

Index(['wordcount', 'n_sentences', 'lemmata_wordcount', 'count_πόνο*',
       'count_ὀδύν*', 'count_ἄλγ*', 'count_λύπ*', 'author'],
      dtype='object')

In [67]:
num_cols = ['wordcount', 'n_sentences', 'lemmata_wordcount', 'count_πόνο*',
'count_ὀδύν*', 'count_ἄλγ*', 'count_λύπ*']

In [71]:
authors_overview.sum()[num_cols]

wordcount            3579690
n_sentences           271932
lemmata_wordcount    1729265
count_πόνο*             1555
count_ὀδύν*              696
count_ἄλγ*               658
count_λύπ*              1191
dtype: object

In [68]:
authors_overview.loc["tlg0627"][num_cols] / authors_overview.sum()[num_cols]

wordcount            0.093149
n_sentences          0.089934
lemmata_wordcount    0.099078
count_πόνο*          0.422508
count_ὀδύν*          0.926724
count_ἄλγ*           0.478723
count_λύπ*           0.048699
dtype: object

In [70]:
authors_overview.loc["tlg0086"][num_cols] / authors_overview.sum()[num_cols]


wordcount            0.234733
n_sentences           0.22168
lemmata_wordcount    0.212719
count_πόνο*          0.066881
count_ὀδύν*          0.007184
count_ἄλγ*           0.054711
count_λύπ*           0.347607
dtype: object

In [61]:
authors_overview.sum()

wordcount                                                      3579690
n_sentences                                                     271932
lemmata_wordcount                                              1729265
count_πόνο*                                                       1555
count_ὀδύν*                                                        696
count_ἄλγ*                                                         658
count_λύπ*                                                        1191
author               ThucydidesEuripidesIsocratesSophoclesDemosthen...
dtype: object

In [72]:
num_cols

['wordcount',
 'n_sentences',
 'lemmata_wordcount',
 'count_πόνο*',
 'count_ὀδύν*',
 'count_ἄλγ*',
 'count_λύπ*']

In [73]:
authors_users = {}
for col in ['count_πόνο*','count_ὀδύν*','count_ἄλγ*', 'count_λύπ*']:
    authors_users[col.rpartition("_")[2] + "_authors"] = authors_overview.sort_values(col, ascending=False)["author"].tolist()[:10]

In [74]:
pd.DataFrame(authors_users)

Unnamed: 0,πόνο*_authors,ὀδύν*_authors,ἄλγ*_authors,λύπ*_authors
0,Hippocrates,Hippocrates,Hippocrates,Aristotle
1,Euripides,Plato,Euripides,Plato
2,Xenophon,Euripides,Plato,Xenophon
3,Plato,Sophocles,Sophocles,Euripides
4,Aristotle,Aristotle,Aristotle,Hippocrates
5,Aeschylus,Aristophanes,Aeschylus,Isocrates
6,Sophocles,Aeschylus,Aristophanes,Demosthenes
7,Isocrates,Xenophon,Xenophon,Theophrastus
8,Thucydides,Theophrastus,Isocrates,Sophocles
9,Herodotus,Herodotus,Herodotus,Thucydides


In [None]:
#set_with_dataframe(PIPA_data.add_worksheet("authors_overview",1,1), authors_overview, include_index=True)

In [35]:
subcorpora_overview_simple = cgl.groupby("subcorpus").sum().drop(["date_avr", "date_manual"], axis=1)
subcorpora_overview_simple

Unnamed: 0_level_0,wordcount,n_sentences,lemmata_wordcount,count_πόνο*,count_ὀδύν*,count_ἄλγ*,count_λύπ*
subcorpus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c_aristotelicum,785703,56737,343165,103,3,34,406
c_hippocraticum,333443,24456,171332,657,645,315,58
c_platonicum,574294,52384,272931,121,14,67,316


In [None]:
#set_with_dataframe(PIPA_data.add_worksheet("subcorpora_overview_simple", 1,1), subcorpora_overview_simple, include_index=True)

In [None]:
subcorpora_overview = cgl[cgl["subcorpus"].notnull()][['subcorpus', 'title', 'wordcount', 'n_sentences', 'lemmata_wordcount', 'count_πόνο*','count_ὀδύν*', 'count_ἄλγ*', 'count_λύπ*']]
subcorpora_overview

In [None]:
#set_with_dataframe(PIPA_data.add_worksheet("subcorpora_overview", 1,1), subcorpora_overview)

# Extract concordances with lypé

In [37]:
cgl.head(5)

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,...,lemmatized_sentences,lemmata,lemmata_wordcount,subcorpus,lemmata_repl,lemmatized_sentences_repl,count_πόνο*,count_ὀδύν*,count_ἄλγ*,count_λύπ*
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,150118,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ...","[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελο...",71863,,"[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελο...","[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ...",31,0,6,25
6,tlg0006.tlg001.perseus-grc2.xml,Euripides,Cyclops,4141,tlg0006,tlg0006.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[[Βρόμιος, ἔχω, πόνος, χὥτʼ, ἥβη, ἐμός, εὐσθεν...","[Βρόμιος, ἔχω, πόνος, χὥτʼ, ἥβη, ἐμός, εὐσθενέ...",2535,,"[Βρόμιος, ἔχω, πόνο*, χὥτʼ, ἥβη, ἐμός, εὐσθενέ...","[[Βρόμιος, ἔχω, πόνο*, χὥτʼ, ἥβη, ἐμός, εὐσθεν...",7,0,0,1
7,tlg0006.tlg004.perseus-grc2.xml,Euripides,Ἡρακλεῖδαι,6272,tlg0006,tlg0006.tlg004,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[[ποτός, εἰμί, οὗτος, δεδογμένον], [δίκαιος, φ...","[ποτός, εἰμί, οὗτος, δεδογμένον, δίκαιος, φύω,...",3545,,"[ποτός, εἰμί, οὗτος, δεδογμένον, δίκαιος, φύω,...","[[ποτός, εἰμί, οὗτος, δεδογμένον], [δίκαιος, φ...",11,0,1,2
8,tlg0006.tlg005.perseus-grc2.xml,Euripides,Ἱππόλυτος,8257,tlg0006,tlg0006.tlg005,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις,...","[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις, ...",4898,,"[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις, ...","[[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις,...",8,3,8,8
9,tlg0006.tlg006.perseus-grc2.xml,Euripides,Ἀνδρομάχη,7397,tlg0006,tlg0006.tlg006,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πο...","[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πολ...",4420,,"[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πολ...","[[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πο...",5,0,7,4


In [50]:
# how to get index for searched elements:
wordlist = ["a", "b", "c", "d", "e", "f", "g", "a", "b", "c", "d", "e", "f", "g", "h"]
indeces = [el[0] for el in enumerate(wordlist) if el[1]=="d"]
indeces

[3, 10]

In [67]:
def get_concordances(wordlist, keyword, window):
    half = int(window / 2)
    keyword_indices = [el[0] for el in enumerate(wordlist) if el[1]==keyword]
    concordances = [wordlist[i-half:i+half+1] for i in keyword_indices]
    concordances = [c for c in concordances if len(c)==window]
    return concordances

In [68]:
get_concordances(wordlist, "b", 5)

[['g', 'a', 'b', 'c', 'd']]

In [73]:
cgl["conc_lype"] = cgl["lemmata_repl"].apply(lambda x: get_concordances(x, "λύπ*", 31))

In [77]:
cgl["conc_lype"].tolist()[0]

[['μέγας',
  'κινδυνεύοντας',
  'δέχομαι',
  'ἀείμνηστος',
  'μαρτύριον',
  'χάρις',
  'κατατίθημι',
  'ναυτικός',
  'κτέομαι',
  'πολύς',
  'σκέπτομαι',
  'εὐπραξία',
  'σπάνιος',
  'τὶς',
  'πολέμιος',
  'λύπ*',
  'πολύς',
  'χρῆμα',
  'χάρις',
  'τιμάω',
  'δύναμις',
  'προσγίγνομαι',
  'οὗτος',
  'πάρειμι',
  'αὐτεπάγγελτος',
  'κίνδυνος',
  'δαπάνη',
  'διδοῦσα',
  'φέρω',
  'πολύς',
  'ἀρετή'],
 ['Λακεδαιμόνιος',
  'διαμέλλω',
  'οἴομαι',
  'ἡσυχία',
  'ἄνθρωπος',
  'πολύς',
  'ἀρκέω',
  'παρασκευή',
  'δίκαιος',
  'πράσσω',
  'γνώμη',
  'ἀδικέω',
  'δῆλος',
  'εἰμί',
  'ἐπιτρέψοντες',
  'λύπ*',
  'ἄλλος',
  'αὐτός',
  'ἀμύνω',
  'βλάπτω',
  'ἴσος',
  'νέμω',
  'πόλις',
  'ὅμοιος',
  'παροικοῦντες',
  'τυγχάνω',
  'οὗτος',
  'δηλόω',
  'ἀρχαιότροπος',
  'ἐπιτήδευμα',
  'εἰμί'],
 ['μέγας',
  'κίνδυνος',
  'τίθημι',
  'Λακεδαιμόνιος',
  'Πελοπόννησος',
  'πόλις',
  'ὠφέλιμος',
  'καθίστημι',
  'ἐξηγέομαι',
  'ὑπομένω',
  'πᾶς',
  'ἀπέχθομαι',
  'ἡγεμονία',
  'οἶδα',
  'ἥσσων',
  'λ

# Export the subcorpus for future usage

In [78]:
cgl.to_json("../data/large_data/cgl.json")