In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

# plotting
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.colors as mcolors

import seaborn as sns
import os
import json
from translator import translator
import pickle

In [2]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

# establish connection with gogglesheets...
file_data = json.load(open("../../ServiceAccountsKey.json", "r"))
credentials = service_account.Credentials.from_service_account_info(file_data)
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
PIPA_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1rV4t0_UV_wcx--UAHVwkqB8Wa_5n9mnpV05yGG1OHqk/edit?usp=sharing")

In [3]:
keywords = ["ἀλγέω", "ἄλγημα", "ἄλγος", "λυπέω", "λύπη", "λυπηρός", "ὀδυνάω", "ὀδύνη", "πονέω", "πόνος"]

In [4]:
vectors_dict = {}
for filename in [f for f in os.listdir("../data/") if "keyed_vectors" in f]:
    vectors_dict[filename.partition(".")[0]] = KeyedVectors.load("../data/" + filename)

In [5]:
complete_sim_matrices = {}
for item in vectors_dict.items():
    complete_sim_matrix = cosine_similarity(item[1].vectors)
    complete_sim_matrices[item[0].replace("keyed_vectors_", "")] = complete_sim_matrix

In [6]:
complete_sim_matrices

{'full': array([[ 0.99999994,  0.28401333,  0.22628057, ..., -0.11469285,
         -0.11473358, -0.08506203],
        [ 0.28401333,  0.99999976,  0.4833894 , ..., -0.28129402,
         -0.01936252,  0.1847815 ],
        [ 0.22628057,  0.4833894 ,  0.99999994, ..., -0.29970014,
          0.19981211,  0.2743499 ],
        ...,
        [-0.11469285, -0.28129402, -0.29970014, ...,  1.0000001 ,
          0.035104  , -0.20264925],
        [-0.11473358, -0.01936252,  0.19981211, ...,  0.035104  ,
          1.        ,  0.31559372],
        [-0.08506203,  0.1847815 ,  0.2743499 , ..., -0.20264925,
          0.31559372,  0.99999976]], dtype=float32),
 'excl_arist': array([[ 1.        ,  0.10405294,  0.06687076, ...,  0.11125881,
         -0.15194258, -0.18770437],
        [ 0.10405294,  0.99999976,  0.27991435, ..., -0.00367323,
         -0.10193241,  0.01235248],
        [ 0.06687076,  0.27991435,  1.        , ..., -0.13431397,
          0.09964176,  0.11343659],
        ...,
        [ 0.11125

# Analyzing categories

In [42]:
terms_translation_categories = get_as_dataframe(PIPA_data.worksheet("translation"))[["greek", "english", "category_clean"]]
terms_translation_categories = terms_translation_categories[terms_translation_categories["greek"].notnull()]
terms_translation_categories.rename(columns={"category_clean" : "category"}, inplace=True)
terms_translation_categories.head(5)

Unnamed: 0,greek,english,category
0,χαίρω (0.52),rejoice,opossite
1,ἀγανακτέω (0.5),to be displeased,emotion
2,ἥδομαι (0.49),feel pleasure,opossite
3,ἀπολαύω (0.48),enjoy,opossite
4,διάκειμαι (0.46),to be affected,suffering


In [43]:
terms_translation_categories["greek"] = terms_translation_categories["greek"].apply(lambda x: x.rpartition(" (")[0].replace(" ", ""))

In [44]:
terms_translation_categories = terms_translation_categories.drop_duplicates(subset="greek")

In [45]:
terms_translation_categories.groupby("category").size()

category
bodily parts    20
dietetics       27
emotion         15
moral           16
opossite        17
other           13
pain            15
pathology       19
suffering        9
dtype: int64

In [46]:
categories = list(set(terms_translation_categories["category"]))

In [47]:
term_category_dict = dict(zip(terms_translation_categories["greek"], terms_translation_categories["category"]))
term_category_dict

{'χαίρω': 'opossite',
 'ἀγανακτέω': 'emotion',
 'ἥδομαι': 'opossite',
 'ἀπολαύω': 'opossite',
 'διάκειμαι': 'suffering',
 'ἀκόλαστος': 'moral',
 'κακός': 'moral',
 'ἄχθομαι': 'emotion',
 'εὐφραίνω': 'opossite',
 'λυπηρός': 'pain',
 'ἐξαμαρτάνω': 'other',
 'πλησιάζω': 'other',
 'μισέω': 'moral',
 'φοβερός': 'emotion',
 'δυσχεραίνω': 'emotion',
 'λοιδορέω': 'suffering',
 'ὀργίζω': 'suffering',
 'σύνοιδα': 'other',
 'φθονέω': 'moral',
 'ἀνάξιος': 'moral',
 'ἀλγέω': 'pain',
 'ἀνόητος': 'moral',
 'δακρύω': 'emotion',
 'ψαύω': 'other',
 'ὀδυνάω': 'pain',
 'βαρύνω': 'suffering',
 'ἀλγεινός': 'pain',
 'ἄλγημα': 'pain',
 'νείαιρα': 'bodily parts',
 'ἰξύα': 'bodily parts',
 'ὑποχόνδριος': 'bodily parts',
 'λυπέω': 'pain',
 'βάρος': 'pathology',
 'ἧπαρ': 'bodily parts',
 'σπάω': 'pathology',
 'πλευρόν': 'bodily parts',
 'παραφρονέω': 'emotion',
 'πυρεταίνω': 'pathology',
 'ἀλγηδών': 'pain',
 'πόθος': 'opossite',
 'ὀδύρομαι': 'emotion',
 'θυμόω': 'suffering',
 'ἄλγος': 'pain',
 'οἰκτρός': 'emotion

In [48]:
term_translation_dict = dict(zip(terms_translation_categories["greek"], terms_translation_categories["english"]))
term_translation_dict

{'χαίρω': 'rejoice',
 'ἀγανακτέω': 'to be displeased',
 'ἥδομαι': 'feel pleasure',
 'ἀπολαύω': 'enjoy',
 'διάκειμαι': 'to be affected',
 'ἀκόλαστος': 'incontinent',
 'κακός': 'bad',
 'ἄχθομαι': 'to be grieved',
 'εὐφραίνω': 'enjoy oneself',
 'λυπηρός': 'painful',
 'ἐξαμαρτάνω': 'fail',
 'πλησιάζω': 'approach',
 'μισέω': 'hate',
 'φοβερός': 'fearful',
 'δυσχεραίνω': 'to be displeased',
 'λοιδορέω': 'abuse',
 'ὀργίζω': 'irritate',
 'σύνοιδα': 'know',
 'φθονέω': 'be jealous',
 'ἀνάξιος': 'worthless',
 'ἀλγέω': 'be in pain',
 'ἀνόητος': 'silly',
 'δακρύω': 'weep',
 'ψαύω': 'touch',
 'ὀδυνάω': 'be in pain',
 'βαρύνω': 'oppress',
 'ἀλγεινός': 'painful',
 'ἄλγημα': 'pain',
 'νείαιρα': 'abdomen',
 'ἰξύα': 'waist',
 'ὑποχόνδριος': 'abdomen',
 'λυπέω': 'be in pain',
 'βάρος': 'heaviness',
 'ἧπαρ': 'liver',
 'σπάω': 'spasm',
 'πλευρόν': 'rib',
 'παραφρονέω': 'to be deranged',
 'πυρεταίνω': 'have fever',
 'ἀλγηδών': 'painful',
 'πόθος': 'desire',
 'ὀδύρομαι': 'lament',
 'θυμόω': 'provoke',
 'ἄλγος

In [49]:
cat_terms_dict = {}
for cat in categories:
    terms = terms_translation_categories[terms_translation_categories["category"]==cat]["greek"].tolist()
    cat_terms_dict[cat] = terms

In [50]:
cat_terms_dict

{'bodily parts': ['νείαιρα',
  'ἰξύα',
  'ὑποχόνδριος',
  'ἧπαρ',
  'πλευρόν',
  'κλείς',
  'κενεών',
  'βουβών',
  'μετάφρενον',
  'τράχηλος',
  'ὀσφῦς',
  'βλέφαρον',
  'σῶμα',
  'σωματικός',
  'ῥάχις',
  'κνήμη',
  'στῆθος',
  'σφυρόν',
  'φλέβιον',
  'χόνδρος'],
 'pain': ['λυπηρός',
  'ἀλγέω',
  'ὀδυνάω',
  'ἀλγεινός',
  'ἄλγημα',
  'λυπέω',
  'ἀλγηδών',
  'ἄλγος',
  'ὠδίς',
  'πόνος',
  'ἐπίπονος',
  'λύπη',
  'ἄχος',
  'ὀδύνη',
  'ὀδυνώδης'],
 'moral': ['ἀκόλαστος',
  'κακός',
  'μισέω',
  'φθονέω',
  'ἀνάξιος',
  'ἀνόητος',
  'βλαβερός',
  'θαρσαλέος',
  'ἀκρασία',
  'ἀκολασία',
  'ἄνοια',
  'κακία',
  'αἰδώς',
  'λήθη',
  'θάρσος',
  'κακοήθης'],
 'pathology': ['βάρος',
  'σπάω',
  'πυρεταίνω',
  'ἕλκος',
  'ἑλκόω',
  'φόνιος',
  'θανάσιμος',
  'βήξ',
  'θέρμη',
  'οἴδημα',
  'στραγγουρία',
  'διάρροια',
  'ὕφαιμος',
  'κατάψυξις',
  'διαθερμαίνω',
  'πυός',
  'στάζω',
  'φρικώδης',
  'ἀσώδης'],
 'emotion': ['ἀγανακτέω',
  'ἄχθομαι',
  'φοβερός',
  'δυσχεραίνω',
  'δακρύω',
  '

In [51]:
vectors_dict["keyed_vectors_full"].n_similarity(["πόνος"], cat_terms_dict["dietetics"])

0.641673

In [52]:
sim_tups = []
for keyword in keywords:
    for cat in cat_terms_dict.keys():
        sim_tups.append((keyword, cat, vectors_dict["keyed_vectors_full"].n_similarity([keyword], cat_terms_dict[cat])))

In [53]:
sim_tups

[('ἀλγέω', 'bodily parts', 0.3907564),
 ('ἀλγέω', 'pain', 0.61059135),
 ('ἀλγέω', 'moral', 0.31691295),
 ('ἀλγέω', 'pathology', 0.39712083),
 ('ἀλγέω', 'emotion', 0.3932681),
 ('ἀλγέω', 'other', 0.38458845),
 ('ἀλγέω', 'suffering', 0.4330012),
 ('ἀλγέω', 'opossite', 0.27486372),
 ('ἀλγέω', 'dietetics', 0.19012412),
 ('ἄλγημα', 'bodily parts', 0.7029688),
 ('ἄλγημα', 'pain', 0.66649115),
 ('ἄλγημα', 'moral', 0.08219812),
 ('ἄλγημα', 'pathology', 0.70607466),
 ('ἄλγημα', 'emotion', 0.11782563),
 ('ἄλγημα', 'other', 0.25914767),
 ('ἄλγημα', 'suffering', 0.11568883),
 ('ἄλγημα', 'opossite', 0.047449432),
 ('ἄλγημα', 'dietetics', 0.3659613),
 ('ἄλγος', 'bodily parts', 0.36313546),
 ('ἄλγος', 'pain', 0.642763),
 ('ἄλγος', 'moral', 0.28199112),
 ('ἄλγος', 'pathology', 0.4151088),
 ('ἄλγος', 'emotion', 0.63292605),
 ('ἄλγος', 'other', 0.3345761),
 ('ἄλγος', 'suffering', 0.5664587),
 ('ἄλγος', 'opossite', 0.34056836),
 ('ἄλγος', 'dietetics', 0.36896706),
 ('λυπέω', 'bodily parts', -0.094281085)

# Extending categories

In [54]:
other_words = []
for term in term_category_dict.keys():
    other_words.extend([tup[0] for tup in vectors_dict["keyed_vectors_full"].most_similar(term, topn=5)])

In [55]:
len(other_words)

755

In [56]:
other_words_unique =  set(other_words)
len(other_words_unique)

374

In [57]:
other_words_unique_new = list(other_words_unique.difference(term_category_dict.keys()))
other_words_unique_new[:10]

['αἰσχρός',
 'δειλία',
 'σύγγονος',
 'ἐγκράτεια',
 'φλέψ',
 'ἰσχύς',
 'ἀναίδεια',
 'πυκνότης',
 'ἐπιπίνω',
 'φιλέω']

In [58]:
word_cat_extension = pd.DataFrame({"greek" : other_words_unique_new, "english" : [translator(word) for word in other_words_unique_new], "category" : ["" for el in other_words_unique_new]})

In [59]:
#set_with_dataframe(PIPA_data.add_worksheet("word_cat_extension", 1,1), word_cat_extension)

# Load back the manually coded extension

In [60]:
word_freqs = pickle.load(open("../data/word_freqs_all.pickle", "rb"))
word_freqs_dict = dict(word_freqs)

In [61]:
word_freqs_dict["ἀγαθός"]

8832

In [62]:
word_cat_extension_coded = get_as_dataframe(PIPA_data.worksheet("word_cat_extension"))[["greek", "english", "category"]]
word_cat_extension_coded = word_cat_extension_coded[word_cat_extension_coded["category"].notnull()]
word_cat_extension_coded.head(5)

Unnamed: 0,greek,english,category
0,ὠφέλιμος,"helping, useful, serviceable, profitable, adva...",opossite
1,δύσκολος,hard to satisfy with food;,dietetics
2,πονηρός,"toilsome, painful, grievous",pain
3,ἀγαπάω,"to treat with affection, to caress, love, be f...",opossite
4,ὑγραίνω,"to wet, moisten",dietetics


In [63]:
len(word_cat_extension_coded)

259

In [64]:
word_cat_extension2_coded = get_as_dataframe(PIPA_data.worksheet("word_cat_extension2"))[["greek", "english", "category"]]
word_cat_extension2_coded = word_cat_extension2_coded[word_cat_extension2_coded["category"].notnull()]
word_cat_extension2_coded.head(5)

Unnamed: 0,greek,english,category
0,πάθημα,"anything that befals one, a suffering, calamit...",pathology
1,νουθετέω,"to put in mind, to admonish, warn, advise",other
2,πυρέσσω,to be ill of a fever,pathology
3,παιδικός,"of, for",other
4,πλήσσω,"to strike, smite",other


In [65]:
len(word_cat_extension2_coded)

82

In [66]:
terms_translation_categories = pd.concat([terms_translation_categories, word_cat_extension_coded, word_cat_extension2_coded])

In [67]:
terms_translation_categories = terms_translation_categories[~terms_translation_categories["category"].str.contains("other")]

In [68]:
terms_translation_categories["count"] = terms_translation_categories["greek"].apply(lambda x: word_freqs_dict[x])
terms_translation_categories.sort_values("count", ascending=False, inplace=True)
terms_translation_categories.head(5)

Unnamed: 0,greek,english,category,count
249,ἀγαθός,good,moral,8832
6,κακός,bad,moral,4780
209,σῶμα,body,bodily parts,4651
32,φίλος,"loved, beloved, dear",moral,3534
248,ἀδικέω,to do wrong,moral,2134


In [69]:
len(terms_translation_categories)

332

In [70]:
terms_translation_categories.drop_duplicates(subset="greek", inplace=True)
len(terms_translation_categories)

332

In [75]:
terms_translation_categories["category"].replace("emotion", "emotions", inplace=True)
terms_translation_categories["category"].replace("moral", "morality", inplace=True)

In [76]:
pickle.dump(terms_translation_categories, open("../data/terms_translation_categories.pickle", "wb"))

In [77]:
terms_translation_categories.to_csv("../data/terms_translation_categories.csv")

In [78]:
set_with_dataframe(PIPA_data.add_worksheet("terms_translation_categories", 1,1), terms_translation_categories)