In [167]:
import numpy as np
import pandas as pd
import pickle
import re
import keras
from keras.layers import Embedding

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

s = sddk.cloudSession("sciencedata.dk")
# establish connection with gogglesheets...
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
credentials = service_account.Credentials.from_service_account_info(file_data)
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
PIPA_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1rV4t0_UV_wcx--UAHVwkqB8Wa_5n9mnpV05yGG1OHqk/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


In [3]:
cgl = pd.read_json("../data/large_data/cgl.json")
cgl.head()

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,...,lemmata,lemmata_wordcount,subcorpus,lemmata_repl,lemmatized_sentences_repl,count_πόνο*,count_ὀδύν*,count_ἄλγ*,count_λύπ*,conc_lype
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,150118,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελο...",71863,,"[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελο...","[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ...",31,0,6,25,"[[μέγας, κινδυνεύοντας, δέχομαι, ἀείμνηστος, μ..."
6,tlg0006.tlg001.perseus-grc2.xml,Euripides,Cyclops,4141,tlg0006,tlg0006.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[Βρόμιος, ἔχω, πόνος, χὥτʼ, ἥβη, ἐμός, εὐσθενέ...",2535,,"[Βρόμιος, ἔχω, πόνο*, χὥτʼ, ἥβη, ἐμός, εὐσθενέ...","[[Βρόμιος, ἔχω, πόνο*, χὥτʼ, ἥβη, ἐμός, εὐσθεν...",7,0,0,1,"[[βοτόν, οὔτις, θύω, θεός, μέγας, γαστήρ, δαίμ..."
7,tlg0006.tlg004.perseus-grc2.xml,Euripides,Ἡρακλεῖδαι,6272,tlg0006,tlg0006.tlg004,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[ποτός, εἰμί, οὗτος, δεδογμένον, δίκαιος, φύω,...",3545,,"[ποτός, εἰμί, οὗτος, δεδογμένον, δίκαιος, φύω,...","[[ποτός, εἰμί, οὗτος, δεδογμένον], [δίκαιος, φ...",11,0,1,2,"[[εἰμί, πολύς, χαίρω, δυσφημέω, ἅζομαι, θέα, σ..."
8,tlg0006.tlg005.perseus-grc2.xml,Euripides,Ἱππόλυτος,8257,tlg0006,tlg0006.tlg005,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις, ...",4898,,"[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις, ...","[[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις,...",8,3,8,8,"[[κοιτάζω, λέχος, σός, ναυβάτης, τὶς, πλέω, κρ..."
9,tlg0006.tlg006.perseus-grc2.xml,Euripides,Ἀνδρομάχη,7397,tlg0006,tlg0006.tlg006,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πολ...",4420,,"[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πολ...","[[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πο...",5,0,7,4,"[[Τροία, πράσσω, μηδείς, ὅδε, αὐχέω, πράσσω, δ..."


In [4]:
cgl_embeddings = pickle.load(open("../data/large_data/cgl_embeddings.pkl", "rb"))

In [6]:
len(range(1, len(cgl_embeddings[1])+1))

5000

In [7]:
word_index = dict(zip(cgl_embeddings[1],range(1, len(cgl_embeddings[1])+1)))
list(word_index.items())[:10]

[('εἰμί', 1),
 ('οὗτος', 2),
 ('λέγω', 3),
 ('ἔχω', 4),
 ('γίγνομαι', 5),
 ('πολύς', 6),
 ('ἄλλος', 7),
 ('αὐτός', 8),
 ('πᾶς', 9),
 ('ποιέω', 10)]

In [8]:
# we will generate the concordances again, now focusing on words in the embeddings only
def get_concordances(wordlist, keyword, window, vocabulary=None):
    half = int(window / 2)
    if vocabulary != None:
        wordlist = [w for w in wordlist if w in vocabulary]
    keyword_indices = [el[0] for el in enumerate(wordlist) if el[1]==keyword]
    concordances = [wordlist[i-half:i+half+1] for i in keyword_indices]
    #concordances = [c for c in concordances if len(c)==window]
    return concordances

In [104]:
cgl["conc_lype"] = cgl["lemmata_repl"].apply(lambda x: get_concordances(x, "λύπ*", 21, vocabulary=cgl_embeddings[1]))

In [105]:
v_ud = "’"
v_agt = "ʼ"

def get_sentences(string):
    splits = re.split(r"(\.|\·|\:|\;|\?|\!)" , string)
    sentences = []
    for n in range(0, len(splits), 2):
        try:
            sentence = splits[n]+splits[n+1]
            sentence = "".join([c for c in sentence if re.match("[\w|\s|\.|\,|\·|\:|\;|\?|\!]", c)])
            sentence = re.sub("^\s*", "", sentence)
            sentence = re.sub(v_ud, v_agt, sentence)
            sentences.append(sentence)
        except:
            sentence = splits[n]
            sentence = re.sub("^\s*", "", sentence)
            if len(sentence) > 0:
                sentences.append(sentence)
            break
    return sentences
cgl["sentences"] = cgl["clean_string"].apply(get_sentences)

In [106]:
cgl["sentences"].apply(len).sum()

256084

In [107]:
cgl["lemmatized_sentences"].apply(len).sum()

256084

perfect, we have the same number of sentences as lemmatized sentences

In [108]:
cgl["lemmatized_sentences_repl"].apply(lambda x: sum([sent.count("λύπ*") for sent in x])).sum()

1147

In [109]:
cgl["conc_lype"].apply(lambda x: len(x)).sum()

1147

perfect, we have the same number of lype instances matched in concordances as in lemmatized sentences...

In [110]:
def sentences_with_lype(work_row):
    inds_counts_tups = [(i,j.count("λύπ*")) for i,j in enumerate(work_row["lemmatized_sentences_repl"]) if "λύπ*" in j]
    sentences = work_row["sentences"]
    lype_sentences = []
    for tup in inds_counts_tups:
        lype_sentences.extend([sentences[tup[0]]] * tup[1])
    return lype_sentences

In [111]:
cgl["sentences_with_lype"] = cgl.apply(sentences_with_lype, axis=1)

In [112]:
cgl["sentences_with_lype"].apply(len).sum()

1147

In [113]:
cgl.groupby("author_id").sum().sort_values("count_λύπ*", ascending=False)[["count_λύπ*"]] # [["author", "title", "doc_id", "lype_count"]]

Unnamed: 0_level_0,count_λύπ*
author_id,Unnamed: 1_level_1
tlg0086,414
tlg0059,316
tlg0032,75
tlg0006,74
tlg0627,58
tlg0010,47
tlg0014,40
tlg0011,35
tlg0003,25
tlg0019,18


In [114]:
cgl.sort_values("count_λύπ*", ascending=False)[["author", "title", "doc_id", "count_λύπ*"]]

Unnamed: 0,author,title,doc_id,count_λύπ*
692,Aristotle,Nicomachean Ethics,tlg0086.tlg010,153
544,Plato,Philebus,tlg0059.tlg010,99
691,Aristotle,Eudemian Ethics (Greek). Machine readable text,tlg0086.tlg009,76
711,Aristotle,Rhetoric,tlg0086.tlg038,64
568,Plato,Laws,tlg0059.tlg034,60
...,...,...,...,...
373,Antiphon,The Third Tetralogy: Prosecution for Murder Of...,tlg0028.tlg004,0
374,Antiphon,Περὶ τοῦ Ἡρῷδου φόνου,tlg0028.tlg005,0
375,Antiphon,On the Choreutes,tlg0028.tlg006,0
379,Hyperides,In Defence of Lycophron,tlg0030.tlg001,0


In [118]:
cgl[cgl["doc_id"].isin(biological_ids)]["conc_lype"].apply(len).sum()

26

In [119]:
cgl[cgl["doc_id"].isin(medical_ids)]["conc_lype"].apply(len).sum()


58

In [120]:
lype_author_ids = [el for ellist in cgl.apply(lambda row: [row["author_id"] for conc in row["conc_lype"]], axis=1) for el in ellist]
lype_doc_ids = [el for ellist in cgl.apply(lambda row: [row["doc_id"] for conc in row["conc_lype"]], axis=1) for el in ellist]
lype_doc_titles = [el for ellist in cgl.apply(lambda row: [row["title"] for conc in row["conc_lype"]], axis=1)  for el in ellist]
lype_concs = [el for ellist in cgl["conc_lype"] for el in ellist]
lype_sents = [el for ellist in cgl["sentences_with_lype"] for el in ellist]
data = np.array([[word_index[w] for w in conc] for conc in lype_concs])

In [121]:
len(lype_sents)

1147

In [122]:
data_df = pd.DataFrame([lype_author_ids, lype_doc_ids, lype_doc_titles, lype_concs, lype_sents, data]).T
data_df.columns = ["author_id", "doc_id", "title", "conc", "sent", "features"]
data_df.head(5)

Unnamed: 0,author_id,doc_id,title,conc,sent,features
0,tlg0003,tlg0003.tlg001,The Peloponnesian War,"[χάρις, κατατίθημι, ναυτικός, κτέομαι, πολύς, ...",τίς εὐπραξία σπανιωτέρα ἢ τίς τοῖς πολεμίοις λ...,"[193, 1264, 690, 421, 6, 452, 4380, 2096, 22, ..."
1,tlg0003,tlg0003.tlg001,The Peloponnesian War,"[ἄνθρωπος, πολύς, ἀρκέω, παρασκευή, δίκαιος, π...",ταύτης μέντοι τοιαύτης ἀντικαθεστηκυίας πόλεως...,"[24, 6, 1125, 870, 56, 57, 189, 145, 89, 1, 20..."
2,tlg0003,tlg0003.tlg001,The Peloponnesian War,"[Πελοπόννησος, πόλις, ὠφέλιμος, καθίστημι, ἐξη...",καὶ εἰ τότε ὑπομείναντες διὰ παντὸς ἀπήχθεσθε ...,"[711, 13, 812, 142, 2251, 529, 9, 1616, 31, 13..."
3,tlg0003,tlg0003.tlg001,The Peloponnesian War,"[συμβαίνω, αἰτία, ἄλλος, ἀπόστασις, μέγας, φόρ...",οἱ γὰρ Ἀθηναῖοι ἀκριβῶς ἔπρασσον καὶ λυπηροὶ ἦ...,"[85, 127, 7, 1770, 18, 2150, 98, 5, 70, 57, 20..."
4,tlg0003,tlg0003.tlg001,The Peloponnesian War,"[κοινός, πολιτεύω, ἡμέρα, ἐπιτήδευμα, ὑποψία, ...",ἐλευθέρως δὲ τά τε πρὸς τὸ κοινὸν πολιτεύομεν ...,"[121, 1182, 66, 798, 2266, 494, 144, 222, 4, 3..."


In [125]:
# we work only with complete concordances:

data_df = data_df[data_df["conc"].apply(len)==data_df["conc"].apply(len).max()].copy()

# Focus on subsets

In [126]:
# two types of Aristotle's writings
ethical_ids = ["tlg0086.tlg009", "tlg0086.tlg010", "tlg0086.tlg022", "tlg0086.tlg035", "tlg0086.tlg038",]

biological_ids = ["tlg0086.tlg002", "tlg0086.tlg004", "tlg0086.tlg007", "tlg0086.tlg008", "tlg0086.tlg012", "tlg0086.tlg014", "tlg0086.tlg015", "tlg0086.tlg016", "tlg0086.tlg018", "tlg0086.tlg020", "tlg0086.tlg021", "tlg0086.tlg24", "tlg0086.tlg030", "tlg0086.tlg032", "tlg0086.tlg037", "tlg0086.tlg042", "tlg0086.tlg043"]

historians_ids = cgl[cgl["author_id"].isin(["tlg0003", "tlg0016", "tlg0032"])]["doc_id"].tolist()
rhetorics_ids = cgl[cgl["author_id"].isin(["tlg0014", "tlg0010", "tlg0540", "tlg0030", "tlg0026"])]["doc_id"].tolist()
dramatists_ids = cgl[cgl["author_id"].isin(["tlg0006", "tlg0019", "tlg0011", "tlg0085"])]["doc_id"].tolist()
medical_ids = cgl[cgl["author_id"].isin(["tlg0627"])]["doc_id"].tolist()
plato_ids = cgl[cgl["author_id"].isin( ["tlg0059"])]["doc_id"].tolist()

In [127]:
def get_classes_and_filter(doc_id, classes=[ethical_ids, medical_ids]):
    onehot = np.zeros(len(classes))
    for n in range(len(classes)):
        if doc_id in classes[n]:
            onehot[n] = 1
    onehot = onehot.astype(int)
    return onehot

In [128]:
data_df["onehot"] = data_df["doc_id"].apply(get_classes_and_filter, classes=[ethical_ids, medical_ids])

In [129]:
data_df

Unnamed: 0,author_id,doc_id,title,conc,sent,features,onehot
0,tlg0003,tlg0003.tlg001,The Peloponnesian War,"[χάρις, κατατίθημι, ναυτικός, κτέομαι, πολύς, ...",τίς εὐπραξία σπανιωτέρα ἢ τίς τοῖς πολεμίοις λ...,"[193, 1264, 690, 421, 6, 452, 4380, 2096, 22, ...","[0, 0]"
1,tlg0003,tlg0003.tlg001,The Peloponnesian War,"[ἄνθρωπος, πολύς, ἀρκέω, παρασκευή, δίκαιος, π...",ταύτης μέντοι τοιαύτης ἀντικαθεστηκυίας πόλεως...,"[24, 6, 1125, 870, 56, 57, 189, 145, 89, 1, 20...","[0, 0]"
2,tlg0003,tlg0003.tlg001,The Peloponnesian War,"[Πελοπόννησος, πόλις, ὠφέλιμος, καθίστημι, ἐξη...",καὶ εἰ τότε ὑπομείναντες διὰ παντὸς ἀπήχθεσθε ...,"[711, 13, 812, 142, 2251, 529, 9, 1616, 31, 13...","[0, 0]"
3,tlg0003,tlg0003.tlg001,The Peloponnesian War,"[συμβαίνω, αἰτία, ἄλλος, ἀπόστασις, μέγας, φόρ...",οἱ γὰρ Ἀθηναῖοι ἀκριβῶς ἔπρασσον καὶ λυπηροὶ ἦ...,"[85, 127, 7, 1770, 18, 2150, 98, 5, 70, 57, 20...","[0, 0]"
4,tlg0003,tlg0003.tlg001,The Peloponnesian War,"[κοινός, πολιτεύω, ἡμέρα, ἐπιτήδευμα, ὑποψία, ...",ἐλευθέρως δὲ τά τε πρὸς τὸ κοινὸν πολιτεύομεν ...,"[121, 1182, 66, 798, 2266, 494, 144, 222, 4, 3...","[0, 0]"
...,...,...,...,...,...,...,...
1140,tlg0627,tlg0627.tlg049,De medico,"[πολύς, σάρξ, ἄλλος, παραπλήσιος, κύκλος, μέγα...",οὕτω γὰρ ἐκ πλείστων μερῶν εὑρήσεις ἄγουσαν ἐς...,"[6, 329, 7, 892, 281, 18, 6, 81, 158, 19, 208,...","[0, 1]"
1141,tlg0627,tlg0627.tlg049,De medico,"[ἀπέχω, τόπος, πλατύς, κύκλος, πολύς, ἄλλος, σ...",ἐπιπροσθεῖν οὖν ξυμβαίνει τὴν ἐντεῦθεν ἑλκομέν...,"[516, 129, 1374, 281, 6, 7, 329, 85, 2823, 139...","[0, 1]"
1144,tlg0627,tlg0627.tlg055,"Epistulae, Decretum, Orationes","[θνῄσκω, πᾶς, κακός, ἕκαστος, ὕλη, γέλως, ὑπόκ...","Οὐ θεομαχεῖς δὲ, εἰ δύο ἐόντων ἐν κόσμῳ, χαρᾶς...","[867, 9, 42, 28, 326, 1307, 788, 1, 657, 3750,...","[0, 1]"
1145,tlg0627,tlg0627.tlg055,"Epistulae, Decretum, Orationes","[βίος, ἄτακτος, αὐτάρκης, ὑπάρχω, σύμπας, μετα...",Οἱ δʼ ὡς ἐπʼ ἀρηρυίῃ καὶ βεβαίῃ ἐκλελησμένοι π...,"[137, 3601, 1916, 46, 446, 403, 320, 1651, 227...","[0, 1]"


In [130]:
subset = data_df[data_df["onehot"].apply(sum)>0]

In [175]:
# randomized index
indices = np.arange(len(subset))
np.random.seed(1)
np.random.shuffle(indices)

In [176]:
indices[:10]

array([162, 300, 400, 117,  80, 201, 389, 268,  29,  62])

In [177]:
# reshuffle dataframe:
subset = subset.iloc[indices]

In [178]:
cgl_embeddings[3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
εἰμί,40.921943,23.975110,6.083791,1.724597,0.787708,1.395476,10.085121,-3.512090,-2.466157,-0.479203,...,0.135525,0.116194,0.038779,0.028518,0.048852,0.063003,-0.006917,-0.285075,0.083432,0.050213
οὗτος,40.370998,23.444483,5.179671,1.611451,0.341264,1.735556,9.830402,-3.353345,-2.345492,-0.181414,...,0.139695,0.078060,0.127660,0.069392,0.060385,0.249957,0.070054,-0.194425,-0.004014,0.040241
λέγω,39.001855,22.120811,3.711950,0.918392,1.452316,0.077037,9.391213,-2.759150,-2.398183,0.214704,...,-0.031307,0.068023,0.000057,0.069156,0.111072,0.038494,0.002585,-0.019338,0.269553,-0.130885
ἔχω,39.804302,23.024422,6.199601,2.275942,0.625015,1.243543,9.147309,-3.485462,-2.581793,-0.275819,...,0.024101,0.063533,0.134114,0.179058,0.043486,0.113336,-0.036198,-0.342098,0.209940,-0.073624
γίγνομαι,39.347879,22.489298,5.427730,1.608740,-0.299176,1.453300,8.521668,-3.282159,-2.755167,-0.361946,...,0.017706,0.150147,-0.100498,0.024932,-0.241511,0.180704,-0.044182,-0.225451,-0.009623,0.082624
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
κύαθος,2.306894,-1.252014,1.967848,0.639823,-0.830738,1.400087,0.623265,-0.022984,1.364765,-1.015133,...,0.029445,0.035054,-0.064084,-0.022086,-0.026919,-0.029758,-0.059165,0.081717,0.070341,0.065586
σκυτεύς,2.426349,-1.921372,0.401636,-0.442950,-0.050278,0.079601,1.183014,-0.539222,0.169868,0.727299,...,-0.379041,0.069706,0.277146,0.071542,-0.010878,0.112946,-0.114023,0.315502,-0.062303,-0.036280
παραμίγνυμι,2.460930,-1.073599,2.207248,0.563545,-0.814028,1.938490,0.646300,-0.572682,2.623897,-1.114314,...,-0.005971,-0.170639,-0.140238,0.136425,0.311922,0.270307,-0.047148,-0.198601,-0.085457,0.311602
ὀφθαλμία,1.968229,-1.475017,1.460325,-0.047004,-0.329035,0.401820,0.145133,-0.578576,-0.902634,0.095910,...,-0.229633,-0.162759,-0.336879,-0.035314,0.135483,-0.145982,0.128570,-0.032265,-0.125454,-0.178760


In [179]:
training_samples = int(len(subset) * 0.6)
test_samples = int(len(subset) * 0.2)

train_ind = indices[:training_samples]
val_ind = indices[training_samples:-test_samples]
test_ind = indices[-test_samples:]

df_train = subset.iloc[train_ind]
df_val = subset.iloc[val_ind]
df_test = subset.iloc[test_ind]

In [180]:
# check embeddings
embedding_matrix = cgl_embeddings[3]
embedding_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
εἰμί,40.921943,23.975110,6.083791,1.724597,0.787708,1.395476,10.085121,-3.512090,-2.466157,-0.479203,...,0.135525,0.116194,0.038779,0.028518,0.048852,0.063003,-0.006917,-0.285075,0.083432,0.050213
οὗτος,40.370998,23.444483,5.179671,1.611451,0.341264,1.735556,9.830402,-3.353345,-2.345492,-0.181414,...,0.139695,0.078060,0.127660,0.069392,0.060385,0.249957,0.070054,-0.194425,-0.004014,0.040241
λέγω,39.001855,22.120811,3.711950,0.918392,1.452316,0.077037,9.391213,-2.759150,-2.398183,0.214704,...,-0.031307,0.068023,0.000057,0.069156,0.111072,0.038494,0.002585,-0.019338,0.269553,-0.130885
ἔχω,39.804302,23.024422,6.199601,2.275942,0.625015,1.243543,9.147309,-3.485462,-2.581793,-0.275819,...,0.024101,0.063533,0.134114,0.179058,0.043486,0.113336,-0.036198,-0.342098,0.209940,-0.073624
γίγνομαι,39.347879,22.489298,5.427730,1.608740,-0.299176,1.453300,8.521668,-3.282159,-2.755167,-0.361946,...,0.017706,0.150147,-0.100498,0.024932,-0.241511,0.180704,-0.044182,-0.225451,-0.009623,0.082624
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
κύαθος,2.306894,-1.252014,1.967848,0.639823,-0.830738,1.400087,0.623265,-0.022984,1.364765,-1.015133,...,0.029445,0.035054,-0.064084,-0.022086,-0.026919,-0.029758,-0.059165,0.081717,0.070341,0.065586
σκυτεύς,2.426349,-1.921372,0.401636,-0.442950,-0.050278,0.079601,1.183014,-0.539222,0.169868,0.727299,...,-0.379041,0.069706,0.277146,0.071542,-0.010878,0.112946,-0.114023,0.315502,-0.062303,-0.036280
παραμίγνυμι,2.460930,-1.073599,2.207248,0.563545,-0.814028,1.938490,0.646300,-0.572682,2.623897,-1.114314,...,-0.005971,-0.170639,-0.140238,0.136425,0.311922,0.270307,-0.047148,-0.198601,-0.085457,0.311602
ὀφθαλμία,1.968229,-1.475017,1.460325,-0.047004,-0.329035,0.401820,0.145133,-0.578576,-0.902634,0.095910,...,-0.229633,-0.162759,-0.336879,-0.035314,0.135483,-0.145982,0.128570,-0.032265,-0.125454,-0.178760


In [181]:
model = Sequential()
model.add(Embedding(5000,  150, input_length=21))
model.add(Flatten())
model.add(Dense(21*150, activation='relu'))
model.add(Dense(21*75, activation='relu'))
model.add(Dense(2, activation='sigmoid'))
model.summary()

model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=["acc"])
history = model.fit(np.array(df_train["features"].tolist()),
                    np.array(df_train["onehot"].tolist()),
                    epochs=10,
                    batch_size=16,
                    validation_data=(np.array(df_val["features"].tolist()), np.array(df_val["onehot"].tolist())
                                     ))
model.save_weights('pre_trained_glove_model.h5')

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 21, 150)           750000    
                                                                 
 flatten_8 (Flatten)         (None, 3150)              0         
                                                                 
 dense_17 (Dense)            (None, 3150)              9925650   
                                                                 
 dense_18 (Dense)            (None, 1575)              4962825   
                                                                 
 dense_19 (Dense)            (None, 2)                 3152      
                                                                 
Total params: 15,641,627
Trainable params: 15,641,627
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/1

In [182]:
test_predict = model.predict(np.array(df_test["features"].tolist())).round().astype(int)

In [183]:
test_predict

array([[1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1,

In [174]:
len(test_predict)

80

In [166]:
test_predict

array([[9.999e-01, 1.000e-04],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [9.958e-01, 3.000e-03],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [9.992e-01, 1.000e-03],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [9.994e-01, 5.000e-04],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [9.999e-01, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00],
       [

In [186]:
df_test["predict_round"] = [el for el in test_predict]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["predict_round"] = [el for el in test_predict]


In [187]:
df_test

Unnamed: 0,author_id,doc_id,title,conc,sent,features,onehot,predict_round
1094,tlg0627,tlg0627.tlg001,On Ancient Medicine,"[δριμύς, ἀφίημι, τοιοῦτος, ῥώννυμι, νόσημα, πα...",ὅταν δὲ παχύτερα καὶ πεπαίτερα γένηται καὶ πάσ...,"[1384, 243, 17, 3838, 718, 635, 3695, 5, 288, ...","[0, 1]","[1, 0]"
1139,tlg0627,tlg0627.tlg049,De medico,"[χρῆσις, εἰμί, ἀμφότεροι, εἷς, χρή, ποιέω, ταχ...","ἐπεὶ γὰρ συμβαίνει τοὺς τεμνομένους πονέειν, τ...","[1087, 1, 173, 61, 50, 10, 109, 1042, 85, 143,...","[0, 1]","[1, 0]"
952,tlg0086,tlg0086.tlg022,Magna Moralia,"[εἰμί, ἀρετή, πάθος, πάθος, λύπ*, ἡδονή, μέσος...",δῆλον οὖν ὡς καὶ ἡ ἀρετὴ μετὰ λύπης ἢ ἡδονῆς.,"[1, 97, 227, 227, 208, 144, 111, 1, 89, 97, 20...","[1, 0]","[1, 0]"
721,tlg0086,tlg0086.tlg009,Eudemian Ethics (Greek). Machine readable text,"[ἐρῶ, βελτίων, διορίζω, φοβερός, λέγω, ποιητικ...",τοιαῦτα δʼ ἐστὶν ὅσα φαίνεται ποιητικὰ λύπης φ...,"[210, 200, 1177, 888, 3, 797, 377, 17, 40, 797...","[1, 0]","[1, 0]"
784,tlg0086,tlg0086.tlg010,Nicomachean Ethics,"[ἀπάτη, ἡδονή, ἔοικα, γίγνομαι, εἰμί, ἀγαθός, ...","αἱροῦνται οὖν τὸ ἡδὺ ὡς ἀγαθόν, τὴν δὲ λύπην ὡ...","[1735, 144, 93, 5, 1, 21, 40, 149, 195, 21, 20...","[1, 0]","[1, 0]"
...,...,...,...,...,...,...,...,...
813,tlg0086,tlg0086.tlg010,Nicomachean Ethics,"[ὀργιζόμενοι, δεῖ, ἠλίθιος, δοκέω, εἰμί, δεῖ, ...","δοκεῖ γὰρ οὐκ αἰσθάνεσθαι οὐδὲ λυπεῖσθαι, μὴ ὀ...","[4599, 19, 3002, 16, 1, 19, 466, 19, 16, 283, ...","[1, 0]","[1, 0]"
860,tlg0086,tlg0086.tlg010,Nicomachean Ethics,"[πολύς, λύπ*, φύσις, πόνο*, ζώιον, μαρτύρομαι,...","ἀεὶ γὰρ πονεῖ τὸ ζῷον, ὥσπερ καὶ οἱ φυσιολόγοι...","[6, 208, 44, 143, 417, 1846, 29, 51, 468, 1, 2...","[1, 0]","[1, 0]"
685,tlg0086,tlg0086.tlg009,Eudemian Ethics (Greek). Machine readable text,"[δῆλος, οὗτος, διαίρεσις, πάθος, δύναμις, ἔχω,...",αἱ μὲν γὰρ δυνάμεις καὶ αἱ ἕξεις τῶν παθημάτων...,"[89, 2, 1042, 227, 79, 4, 79, 4, 939, 227, 208...","[1, 0]","[1, 0]"
773,tlg0086,tlg0086.tlg010,Nicomachean Ethics,"[λέγω, μέσος, ὑπερβάλλω, πᾶς, μηδείς, ἀναίσχυν...","νέμεσις δὲ μεσότης φθόνου καὶ ἐπιχαιρεκακίας, ...","[3, 111, 717, 9, 55, 3028, 111, 1624, 1091, 1,...","[1, 0]","[1, 0]"


In [194]:
accuracy_score([el[0] for el in df_test["onehot"].tolist()], [el[0] for el in df_test["predict_round"].tolist()])

0.8625

In [197]:
balanced_accuracy_score([el[0] for el in df_test["onehot"].tolist()], [el[0] for el in df_test["predict_round"].tolist()])

0.5

In [198]:
f1_score([el[0] for el in df_test["onehot"].tolist()], [el[0] for el in df_test["predict_round"].tolist()])

0.9261744966442953

In [199]:
df_test[-10:]

Unnamed: 0,author_id,doc_id,title,conc,sent,features,onehot,predict_round
999,tlg0086,tlg0086.tlg038,Rhetoric,"[κατασκευάζω, τοιοῦτος, εὔνοια, φιλία, πάθος, ...",ἔστι δὲ τὰ πάθη διʼ ὅσα μεταβάλλοντες διαφέρου...,"[630, 17, 752, 313, 227, 728, 227, 128, 520, 3...","[1, 0]","[1, 0]"
894,tlg0086,tlg0086.tlg010,Nicomachean Ethics,"[ποιέω, ἀλλότριος, λυμαίνομαι, δῆλος, πολύς, δ...",σχεδὸν γὰρ αἱ ἀλλότριαι ἡδοναὶ ποιοῦσιν ὅπερ α...,"[10, 525, 2091, 89, 6, 2034, 525, 144, 10, 223...","[1, 0]","[1, 0]"
1000,tlg0086,tlg0086.tlg038,Rhetoric,"[ἄλλος, προερέω, πρότασις, οὗτος, ποιέω, διαιρ...",ἔστω δὴ ὀργὴ ὄρεξις μετὰ λύπης τιμωρίας φαινομ...,"[7, 1644, 543, 2, 10, 502, 3, 68, 494, 1557, 2...","[1, 0]","[1, 0]"
1141,tlg0627,tlg0627.tlg049,De medico,"[ἀπέχω, τόπος, πλατύς, κύκλος, πολύς, ἄλλος, σ...",ἐπιπροσθεῖν οὖν ξυμβαίνει τὴν ἐντεῦθεν ἑλκομέν...,"[516, 129, 1374, 281, 6, 7, 329, 85, 2823, 139...","[0, 1]","[1, 0]"
795,tlg0086,tlg0086.tlg010,Nicomachean Ethics,"[μέρος, οὗτος, εἰμί, ἀρετή, μεσότης, ἡδονή, σω...",ἧττον γὰρ καὶ οὐχ ὁμοίως ἐστὶ περὶ τὰς λύπας·,"[81, 2, 1, 97, 1624, 144, 621, 3, 134, 1, 208,...","[1, 0]","[1, 0]"
813,tlg0086,tlg0086.tlg010,Nicomachean Ethics,"[ὀργιζόμενοι, δεῖ, ἠλίθιος, δοκέω, εἰμί, δεῖ, ...","δοκεῖ γὰρ οὐκ αἰσθάνεσθαι οὐδὲ λυπεῖσθαι, μὴ ὀ...","[4599, 19, 3002, 16, 1, 19, 466, 19, 16, 283, ...","[1, 0]","[1, 0]"
860,tlg0086,tlg0086.tlg010,Nicomachean Ethics,"[πολύς, λύπ*, φύσις, πόνο*, ζώιον, μαρτύρομαι,...","ἀεὶ γὰρ πονεῖ τὸ ζῷον, ὥσπερ καὶ οἱ φυσιολόγοι...","[6, 208, 44, 143, 417, 1846, 29, 51, 468, 1, 2...","[1, 0]","[1, 0]"
685,tlg0086,tlg0086.tlg009,Eudemian Ethics (Greek). Machine readable text,"[δῆλος, οὗτος, διαίρεσις, πάθος, δύναμις, ἔχω,...",αἱ μὲν γὰρ δυνάμεις καὶ αἱ ἕξεις τῶν παθημάτων...,"[89, 2, 1042, 227, 79, 4, 79, 4, 939, 227, 208...","[1, 0]","[1, 0]"
773,tlg0086,tlg0086.tlg010,Nicomachean Ethics,"[λέγω, μέσος, ὑπερβάλλω, πᾶς, μηδείς, ἀναίσχυν...","νέμεσις δὲ μεσότης φθόνου καὶ ἐπιχαιρεκακίας, ...","[3, 111, 717, 9, 55, 3028, 111, 1624, 1091, 1,...","[1, 0]","[1, 0]"
899,tlg0086,tlg0086.tlg010,Nicomachean Ethics,"[ἕτερος, εἶδος, διαφέρω, εἶδος, εὔλογος, εἰμί,...","τὰ γὰρ αὐτὰ τοὺς μὲν τέρπει τοὺς δὲ λυπεῖ, καὶ...","[25, 117, 128, 117, 1491, 1, 1948, 99, 24, 272...","[1, 0]","[1, 0]"
