In [None]:
# IMPORTING CORPUS
from cltk.corpus.utils.importer import CorpusImporter

In [None]:
# SEE CORPUS CONTENT
# Create a corpus importer bound to the 'greek' language.
my_greek_downloader = CorpusImporter('greek')
# Last expression of the cell: display the list of corpora the importer offers.
my_greek_downloader.list_corpora

In [None]:
# Import the 'greek_models_cltk' corpus; per the original author this is the
# first thing needed to start working with the language.
my_greek_downloader.import_corpus('greek_models_cltk')

In [None]:
# Import the 'greek_proper_names_cltk' corpus with the same importer.
my_greek_downloader.import_corpus('greek_proper_names_cltk')

In [None]:
# Import the three Perseus corpora (texts, treebank, lexica), one after the
# other and in the same order as before.
for perseus_corpus in (
    'greek_text_perseus',
    'greek_treebank_perseus',
    'greek_lexica_perseus',
):
    my_greek_downloader.import_corpus(perseus_corpus)

In [None]:
# Import the First1KGreek text corpus ('greek_text_first1kgreek').
my_greek_downloader.import_corpus('greek_text_first1kgreek')

In [None]:
# Import the TLGU software package ('greek_software_tlgu').
my_greek_downloader.import_corpus('greek_software_tlgu')

In [None]:
# Import the 'tlg' corpus; the second argument is presumably the local directory
# holding the TLG_E data.
# NOTE(review): the path starts with '~' — confirm that import_corpus expands the
# home directory itself; if it does not, wrap the path in os.path.expanduser().
my_greek_downloader.import_corpus('tlg', '~/cltk/corpora/TLG_E')

In [None]:
# CONVERTING BETA CODE TO UNICODE
# the files are typically in TEI XML format in betacode
# we need to transform them into unicode
from cltk.corpus.greek.beta_to_unicode import Replacer

In [None]:
# STOPWORDS FILTERING
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.stop.greek.stops import STOPS_LIST

In [None]:
# USING TLGU SOFTWARE
from cltk.corpus.greek.tlgu import TLGU


In [None]:
# CONVERTING TEI XML TO PLAINTEXT
# Relies on the BeautifulSoup library (pip install bs4).
from cltk.corpus.greek.tei import onekgreek_tei_xml_to_text
# Called without arguments; presumably converts the imported First1KGreek TEI XML
# files into plaintext files on disk — TODO confirm the output location.
onekgreek_tei_xml_to_text()

# Real work with real texts

In [3]:
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
import os

In [6]:
# INPUT
# Path of one plaintext file from the First1KGreek plaintext directory;
# expanduser() replaces the leading '~' with the current user's home directory.
text_file = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/tlg0645.tlg003.perseus-grc2.txt')

In [8]:
# Single-text pipeline: load -> clean -> lemmatize -> drop stopwords -> count.
# The imports are kept inside this cell so it does not depend on the earlier
# import-only cells having been executed.
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.greek.stops import STOPS_LIST
from cltk.utils.frequency import Frequency
from nltk.tokenize.punkt import PunktLanguageVars

# LOADING AND CLEANING
# Read explicitly as UTF-8 so the polytonic Greek decodes identically on every
# platform instead of depending on the locale's default encoding.
with open(text_file, encoding='utf-8') as f:
    text_raw = f.read()

# rm_punctuation=True together with rm_periods=True leaves no punctuation at all.
text_cleaned = tlg_plaintext_cleanup(text_raw, rm_punctuation=True, rm_periods=True)

# LEMMATIZATION
# The lemmatizer does not work as well as one would like.
# NOTE(review): the recorded top-10 output still contains grave-accented forms
# such as 'αὐτὸν' and 'τὰς', which suggests they are neither lemmatized nor
# matched by the stop list — confirm, and consider normalizing accents first.
lemmatizer = LemmaReplacer('greek')
text_lemmatized = lemmatizer.lemmatize(text_cleaned, return_string=True)

# STOPWORDS FILTERING
p = PunktLanguageVars()
# A frozenset gives constant-time membership tests instead of scanning the
# stop list once per token; the filtered result is unchanged.
greek_stops = frozenset(STOPS_LIST)

# STRING FORMATION
text_tokens = p.word_tokenize(text_lemmatized.lower())
text_without_stopwords = [w for w in text_tokens if w not in greek_stops]
text_str = " ".join(text_without_stopwords)

# WORD FREQUENCY DICTIONARY/COUNTER
freq = Frequency()
text_freq_counter = freq.counter_from_str(text_str)

In [9]:
# Display the ten most frequent tokens together with their counts.
text_freq_counter.most_common(10)

[('αὐτός', 967),
 ('σύ', 845),
 ('εἰμί', 830),
 ('ἐγώ', 703),
 ('οὐ', 570),
 ('αὐτὸν', 222),
 ('θεάομαι', 221),
 ('τὰς', 195),
 ('θεὸς', 175),
 ('γαῖα', 162)]

In [12]:
# Peek at the first 100 characters of the raw file (its header lines).
text_raw[:100]

'\n\n\n\nDialogus cum Tryphone\nJustinus Martyr\nGeorges Archambault\n\n\nTrustees of Tufts University\nMedford'

In [None]:
# LOADING AND CLEANING
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
import os

# Clement, Paedagogos
# Dedicated names (clem_file / clem_raw) keep Justin's `text_file` and
# `text_raw` intact, so re-running the earlier cells still processes Justin.
clem_file = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/tlg0555.tlg002.txt')
# Read explicitly as UTF-8 so the Greek text decodes the same on every platform.
with open(clem_file, encoding='utf-8') as f:
    clem_raw = f.read()

# clem_cleaned = cleaned text of Clement. The cleanup previously received the
# undefined name `socrates_raw`, which raised NameError.
# rm_punctuation=True together with rm_periods=True leaves no punctuation at all.
clem_cleaned = tlg_plaintext_cleanup(clem_raw, rm_punctuation=True, rm_periods=True)


In [None]:
# LEMMATIZATION
# The lemmatizer does not work as well as one would like.
from cltk.stem.lemma import LemmaReplacer
lemmatizer = LemmaReplacer('greek')

# Justin's cleaned text is produced by the single-text pipeline under the name
# `text_cleaned`; `just_cleaned` was never defined anywhere, so alias it here
# to keep the just_* / clem_* naming parallel.
just_cleaned = text_cleaned

just_lemmatized = lemmatizer.lemmatize(just_cleaned, return_string=True)
clem_lemmatized = lemmatizer.lemmatize(clem_cleaned, return_string=True)

In [None]:
# Length of Justin's lemmatized text; it is a string (return_string=True), so this counts characters.
len(just_lemmatized)

In [None]:
# Length of Clement's lemmatized text; it is a string (return_string=True), so this counts characters.
len(clem_lemmatized)

In [None]:
# STOPWORDS FILTERING
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.stop.greek.stops import STOPS_LIST

p = PunktLanguageVars()
# A frozenset gives constant-time membership tests instead of scanning the
# stop list once per token; the filtered result is unchanged.
greek_stops = frozenset(STOPS_LIST)

# Justin: lower-case, tokenize, drop stopwords, re-join into one string.
just_tokens = p.word_tokenize(just_lemmatized.lower())
just_without_stopwords = [w for w in just_tokens if w not in greek_stops]
just_str = " ".join(just_without_stopwords)

# Clement: identical steps.
clem_tokens = p.word_tokenize(clem_lemmatized.lower())
clem_without_stopwords = [w for w in clem_tokens if w not in greek_stops]
clem_str = " ".join(clem_without_stopwords)


In [None]:
from cltk.utils.frequency import Frequency

# Build one frequency counter per stopword-filtered text, Justin first and
# Clement second.
freq = Frequency()
just_freq_counter, clem_freq_counter = map(
    freq.counter_from_str, (just_str, clem_str)
)
# Each result is a counter collection: it offers extras such as
# freq_counter.most_common(30), but otherwise behaves like a dictionary.

In [None]:
# Number of distinct tokens counted for Justin.
len(just_freq_counter)

In [None]:
# Number of distinct tokens counted for Clement.
len(clem_freq_counter)

In [None]:
# Display Justin's ten most frequent tokens with their counts.
just_freq_counter.most_common(10)

In [None]:
# Display Clement's ten most frequent tokens with their counts.
clem_freq_counter.most_common(10)

In [None]:
# ANALYZING THE TEXTS WITH SCIKIT-LEARN AND NUMPY
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# The two stopword-filtered documents prepared in this notebook: Justin first,
# Clement second. (`sozomenus_str` / `socrates_str` were never defined here.)
texts = (just_str, clem_str)

# The documents go to fit_transform(), not to the constructor: the first
# constructor parameter is `input`, which describes how documents are read.
vectorizer = CountVectorizer()
# dtm: document-term matrix with one row per document in `texts`.
dtm = vectorizer.fit_transform(texts)

# get_feature_names() was removed in newer scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()

In [None]:
# Display the document-term matrix object returned by fit_transform().
dtm

In [None]:
# Convert the document-term matrix to a dense NumPy array. The guard keeps the
# cell idempotent: on a second run `dtm` is already an ndarray, which has no
# toarray() method and would otherwise raise AttributeError.
if hasattr(dtm, "toarray"):
    dtm = dtm.toarray()
# Store the vocabulary as a NumPy array as well.
vocab = np.array(vocab)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between the rows of dtm: one minus the similarity.
dist = 1.0 - cosine_similarity(dtm)
# Cell output: the distance matrix rounded to two decimals.
np.around(dist, decimals=2)

In [None]:
import os
import matplotlib.pyplot as plt
from sklearn.manifold import MDS

# Project the precomputed pairwise distances onto two dimensions; random_state
# is fixed so the layout is reproducible between runs.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)

xs, ys = pos[:, 0], pos[:, 1]

fig, ax = plt.subplots()
# One scatter call per document so each point gets its own colour and legend
# entry; the index is the document's row in `dist` (its position in `texts`).
for doc_index, (x, y) in enumerate(zip(xs, ys)):
    ax.scatter(x, y, label="text {}".format(doc_index))
ax.set_title("MDS of cosine distances between texts")
ax.set_xlabel("MDS dimension 1")
ax.set_ylabel("MDS dimension 2")
# Trailing semicolon suppresses the Legend repr in the notebook output.
ax.legend();

    


In [None]:
# Render any open matplotlib figures.
plt.show()