In [7]:
# IMPORTING CORPUS
from cltk.corpus.utils.importer import CorpusImporter

In [37]:
# SEE CORPUS CONTENT
# Create an importer for the 'greek' language code; every import_corpus()
# call further down goes through this same object.
my_greek_downloader = CorpusImporter('greek')
# Bare last expression, so the notebook displays it: the corpus names that
# the import_corpus() calls below choose from.
my_greek_downloader.list_corpora

['greek_software_tlgu',
 'greek_text_perseus',
 'phi7',
 'tlg',
 'greek_proper_names_cltk',
 'greek_models_cltk',
 'greek_treebank_perseus',
 'greek_lexica_perseus',
 'greek_training_set_sentence_cltk',
 'greek_word2vec_cltk',
 'greek_text_lacus_curtius',
 'greek_text_first1kgreek']

In [38]:
# First thing to fetch before working with the language: 'greek_models_cltk'.
# NOTE(review): presumably these are the model files the lemmatizer used
# later in the notebook depends on — confirm.
my_greek_downloader.import_corpus('greek_models_cltk')

In [39]:
# Fetch the 'greek_proper_names_cltk' corpus.
# NOTE(review): presumably the name list consulted by cltk.tag.ner — confirm.
my_greek_downloader.import_corpus('greek_proper_names_cltk')

In [70]:
# importing perseus: texts, treebank and lexica, fetched in this order
for perseus_corpus in ('greek_text_perseus',
                       'greek_treebank_perseus',
                       'greek_lexica_perseus'):
    my_greek_downloader.import_corpus(perseus_corpus)

In [41]:
# importing first1kgreek
# NOTE(review): the plaintext files opened later live under
# ~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/ and are presumably
# derived from this corpus by onekgreek_tei_xml_to_text() — confirm.
my_greek_downloader.import_corpus('greek_text_first1kgreek')

In [76]:
# importing TLGU (the 'greek_software_tlgu' entry of list_corpora)
# NOTE(review): presumably the converter wrapped by cltk.corpus.greek.tlgu.TLGU — confirm.
my_greek_downloader.import_corpus('greek_software_tlgu')

In [80]:
# Unlike the imports above, 'tlg' is given a second argument: a path to a
# local copy of the corpus.
# NOTE(review): the path starts with '~' and is passed as-is; confirm that
# import_corpus() expands it, otherwise wrap it in os.path.expanduser().
my_greek_downloader.import_corpus('tlg', '~/cltk/corpora/TLG_E')

In [None]:
# CONVERTING BETA CODE TO UNICODE
# the files are typically in TEI XML format in betacode
# we need to transform them into unicode
from cltk.corpus.greek.beta_to_unicode import Replacer

In [23]:
# STOPWORDS FILTERING
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.stop.greek.stops import STOPS_LIST

In [57]:
# USING TLGU SOFTWARE
from cltk.corpus.greek.tlgu import TLGU


In [64]:
# CONVERTING TEI XML TO PLAINTEXT
# based on BeautifulSoup library (pip install bs4):
from cltk.corpus.greek.tei import onekgreek_tei_xml_to_text
# Called without arguments, so input and output locations are the library's
# defaults. NOTE(review): presumably this produces the
# greek_text_first1kgreek_plaintext/*.txt files read later — confirm.
onekgreek_tei_xml_to_text()

# Real work with real texts

In [3]:
# LOADING AND CLEANING
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
import os
# Sozomenus (file tlg2048.tlg001 of the First1KGreek plaintext directory)
sozomenus_file = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/tlg2048.tlg001.1st1K-grc1.txt')
# Decode explicitly: without `encoding`, open() uses the platform's locale
# encoding, which is not guaranteed to handle polytonic Greek.
# NOTE(review): assumes the plaintext file is UTF-8 — confirm.
with open(sozomenus_file, encoding='utf-8') as f:
    sozomenus_raw = f.read()
# Clean the raw text; with rm_punctuation=True and rm_periods=True no
# punctuation is left in the result, full stops included. The result is
# lower-cased so that later token comparisons are case-insensitive.
sozomenus_cleaned = tlg_plaintext_cleanup(sozomenus_raw, rm_punctuation=True, rm_periods=True).lower()

In [4]:
# LOADING AND CLEANING
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
import os
# Socrates (file tlg2057.tlg002 of the First1KGreek plaintext directory)
# The stray double slash after '~' has been removed from the path.
socrates_file = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/tlg2057.tlg002.1st1K-grc1.txt')
# Decode explicitly: without `encoding`, open() uses the platform's locale
# encoding, which is not guaranteed to handle polytonic Greek.
# NOTE(review): assumes the plaintext file is UTF-8 — confirm.
with open(socrates_file, encoding='utf-8') as f:
    socrates_raw = f.read()
# Clean the raw text; with rm_punctuation=True and rm_periods=True no
# punctuation is left in the result, full stops included. The result is
# lower-cased so that later token comparisons are case-insensitive.
socrates_cleaned = tlg_plaintext_cleanup(socrates_raw, rm_punctuation=True, rm_periods=True).lower()

In [5]:
# LEMMATIZATION
# The results are not as good as I would like.
from cltk.stem.lemma import LemmaReplacer
lemmatizer = LemmaReplacer('greek')
# Run both cleaned texts through the same lemmatizer with identical settings;
# return_string=True requests the result as one string (it is lower-cased
# and tokenized again in the stop-word step).
sozomenus_lemmatized, socrates_lemmatized = (
    lemmatizer.lemmatize(cleaned_text, return_string=True)
    for cleaned_text in (sozomenus_cleaned, socrates_cleaned)
)

In [6]:
# Size check on the lemmatized Sozomenus text.
# NOTE(review): lemmatize() was called with return_string=True, so this is
# presumably a character count rather than a token count — confirm.
len(sozomenus_lemmatized)

681289

In [7]:
# Size check on the lemmatized Socrates text.
# NOTE(review): lemmatize() was called with return_string=True, so this is
# presumably a character count rather than a token count — confirm.
len(socrates_lemmatized)

617721

In [8]:
# STOPWORDS FILTERING
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.stop.greek.stops import STOPS_LIST
p = PunktLanguageVars()
# Testing membership against the list rescans it for every token; a frozenset
# gives the same answers with constant-time lookups.
greek_stops = frozenset(STOPS_LIST)
# NOTE(review): the recorded top-10 frequencies further down still contain
# function words such as 'τὰς' and 'αὐτὸν', so those surface forms are not in
# STOPS_LIST — check whether accent normalisation is needed before filtering.

# Sozomenus: tokenize the lemmatized text, drop stop words, re-join with spaces.
soz_tokens = p.word_tokenize(sozomenus_lemmatized.lower())
sozomenus_without_stopwords = [w for w in soz_tokens if w not in greek_stops]
sozomenus_str = " ".join(sozomenus_without_stopwords)

# Socrates: same pipeline.
soc_tokens = p.word_tokenize(socrates_lemmatized.lower())
socrates_without_stopwords = [w for w in soc_tokens if w not in greek_stops]
socrates_str = " ".join(socrates_without_stopwords)


In [9]:
from cltk.utils.frequency import Frequency
freq = Frequency()
# One frequency counter per author, built from the stop-word-filtered strings
# (Sozomenus first, then Socrates).
soz_freq_counter, soc_freq_counter = map(
    freq.counter_from_str, (sozomenus_str, socrates_str)
)
# Both results are counter collections. Counters offer a few helpers of their
# own, e.g. soz_freq_counter.most_common(30), but otherwise behave like
# dictionaries.

In [10]:
# Number of distinct keys (tokens) counted for Sozomenus.
len(soz_freq_counter)

8622

In [11]:
# Number of distinct keys (tokens) counted for Socrates.
len(soc_freq_counter)

8418

In [13]:
# Ten most frequent tokens in Sozomenus, as (token, count) pairs.
soz_freq_counter.most_common(10)

[('αὐτός', 1543),
 ('εἰμί', 1064),
 ('οὐ', 796),
 ('γίγνομαι', 613),
 ('ἐκκλησία', 606),
 ('αὐτὸν', 499),
 ('ἐπίσκοπος', 488),
 ('ἔχω', 472),
 ('πᾶς', 449),
 ('τὰς', 411)]

In [14]:
# Ten most frequent tokens in Socrates, as (token, count) pairs.
soc_freq_counter.most_common(10)

[('αὐτός', 1352),
 ('εἰμί', 982),
 ('γίγνομαι', 802),
 ('οὐ', 724),
 ('ἐπίσκοπος', 584),
 ('οὖν', 478),
 ('πᾶς', 471),
 ('ἐκκλησία', 462),
 ('βασιλίς', 449),
 ('ποιέω', 414)]

In [18]:
# Peek at the first 500 characters of the stop-word-filtered Sozomenus string.
sozomenus_str[:500]

'é λογος προς τον αυτοκρατορα θεοδοσιον και υποθεσις της εκκλησιαστικης ιστοριας φημί πάλλω αὐτοκράτωρ ἐπιμελής χρῆμα γίγνομαι φιλόκοσμος ἁλουργίς στέφανος παραπλήσιος δ̓ αὖ2 λογόω σπουδάζω μυθώδης τινὰ ποίησις σύγγραμμα θέλγω δύναμαι πόλεμος ἀσκέω βέλος εὔστοχος ἀφίημι θήρ βάλλω δόρυ ἀκοντίζω ἵππος ἅλλομαι προσαγγέλλω ἑαυτοὺς βασίλειος ἕκαστος ἐπιτηδεύω ὃ κρατέω φίλος τυγχάνω δυσπόριστος ψηφίδα προσφέρω ἕτερος λαμπρός βαφὴν ἁλουργίς ὑποτίθημι ποίημα σύγγραμμα προσφωνέω εὔζωνος ξένος ὅπλο'

In [20]:
from cltk.tag import ner
# Hand-pasted sample: the same text as the sozomenus_str[:500] preview shown
# earlier, minus its leading 'é ' token.
text_str = 'λογος προς τον αυτοκρατορα θεοδοσιον και υποθεσις της εκκλησιαστικης ιστοριας φημί πάλλω αὐτοκράτωρ ἐπιμελής χρῆμα γίγνομαι φιλόκοσμος ἁλουργίς στέφανος παραπλήσιος δ̓ αὖ2 λογόω σπουδάζω μυθώδης τινὰ ποίησις σύγγραμμα θέλγω δύναμαι πόλεμος ἀσκέω βέλος εὔστοχος ἀφίημι θήρ βάλλω δόρυ ἀκοντίζω ἵππος ἅλλομαι προσαγγέλλω ἑαυτοὺς βασίλειος ἕκαστος ἐπιτηδεύω ὃ κρατέω φίλος τυγχάνω δυσπόριστος ψηφίδα προσφέρω ἕτερος λαμπρός βαφὴν ἁλουργίς ὑποτίθημι ποίημα σύγγραμμα προσφωνέω εὔζωνος ξένος ὅπλο'
# Tag the sample and return one tuple per token (output_type=list).
# NOTE(review): in the recorded output every tuple holds only the token, so
# nothing was tagged as an entity. Presumably this is because the sample is
# lower-cased and lemmatized; consider tagging the raw text instead — confirm.
ner.tag_ner('greek', input_text=text_str, output_type=list)

[('λογος',),
 ('προς',),
 ('τον',),
 ('αυτοκρατορα',),
 ('θεοδοσιον',),
 ('και',),
 ('υποθεσις',),
 ('της',),
 ('εκκλησιαστικης',),
 ('ιστοριας',),
 ('φημί',),
 ('πάλλω',),
 ('αὐτοκράτωρ',),
 ('ἐπιμελής',),
 ('χρῆμα',),
 ('γίγνομαι',),
 ('φιλόκοσμος',),
 ('ἁλουργίς',),
 ('στέφανος',),
 ('παραπλήσιος',),
 ('δ̓',),
 ('αὖ2',),
 ('λογόω',),
 ('σπουδάζω',),
 ('μυθώδης',),
 ('τινὰ',),
 ('ποίησις',),
 ('σύγγραμμα',),
 ('θέλγω',),
 ('δύναμαι',),
 ('πόλεμος',),
 ('ἀσκέω',),
 ('βέλος',),
 ('εὔστοχος',),
 ('ἀφίημι',),
 ('θήρ',),
 ('βάλλω',),
 ('δόρυ',),
 ('ἀκοντίζω',),
 ('ἵππος',),
 ('ἅλλομαι',),
 ('προσαγγέλλω',),
 ('ἑαυτοὺς',),
 ('βασίλειος',),
 ('ἕκαστος',),
 ('ἐπιτηδεύω',),
 ('ὃ',),
 ('κρατέω',),
 ('φίλος',),
 ('τυγχάνω',),
 ('δυσπόριστος',),
 ('ψηφίδα',),
 ('προσφέρω',),
 ('ἕτερος',),
 ('λαμπρός',),
 ('βαφὴν',),
 ('ἁλουργίς',),
 ('ὑποτίθημι',),
 ('ποίημα',),
 ('σύγγραμμα',),
 ('προσφωνέω',),
 ('εὔζωνος',),
 ('ξένος',),
 ('ὅπλο',)]