Some sources:

* Gensim LDA: https://radimrehurek.com/gensim/models/ldamodel.html
* Misc clustering with Python: http://brandonrose.org/clustering
* Scikit LDA: http://scikit-learn.org/0.16/modules/generated/sklearn.lda.LDA.html
* Scikit NMF: http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
* WMD in Python: http://vene.ro/blog/word-movers-distance-in-python.html
* Original WMD paper: http://jmlr.org/proceedings/papers/v37/kusnerb15.pdf
* Scikit Affinity propagation: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html

# Greek (TLG)

In [1]:
import os
import time

In [2]:
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_index
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithets
from cltk.corpus.greek.tlg.parse_tlg_indices import select_authors_by_epithet
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from cltk.tokenize.word import nltk_tokenize_words

from greek_accentuation.characters import base

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [84]:
def stream_lemmatized_files(corpus_dir):
    # return all docs in a dir
    user_dir = os.path.expanduser('~/cltk_data/user_data/' + corpus_dir)
    files = os.listdir(user_dir)

    for file in files:
        filepath = os.path.join(user_dir, file)
        with open(filepath) as fo:
            yield fo.read()

In [None]:
from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author

def map_file_order_to_name(corpus_dir):
    # read contents of dir, order and their tlg name
    user_dir = os.path.expanduser('~/cltk_data/user_data/' + corpus_dir)
    files = os.listdir(user_dir)
    
    map_id_author = get_id_author()  # {…, '1648': 'Pyrgion Hist.', '4017': 'Syrianus Phil.', …}

    for count, file in enumerate(files):
        print(file[3:-4])
map_file_order_to_name('tlg_lemmatized_no_accents_no_stops')

In [85]:
t0 = time.time()

data_samples = []
for text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'):
    data_samples.append(text)

print('Time to collect texts: {}'.format(time.time() - t0))
print('Number of texts:', len(data_samples))

1823
Time to collect texts: 5.987490892410278
Number of texts: 1823


In [86]:
# tf-idf features
n_samples = 2000
n_features = 1000  # TODO: increase
n_topics = len(get_epithets())
n_top_words = 20

tfidf_vectorizer = TfidfVectorizer(max_df=1.0, 
                                   min_df=1,
                                   max_features=n_features,
                                   stop_words=None)
t0 = time.time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print('Time to extract tf-idf features: {} secs.'.format(time.time() - t0))

Time to extract tf-idf features: 67.61014795303345 secs.


In [87]:
print("Fitting the NMF model with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time.time()
nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print('Fit NMF model in {}'.format(time.time() - t0))

Fitting the NMF model with tf-idf features, n_samples=2000 and n_features=1000...
Fit NMF model in 6.778363943099976


In [105]:
def print_top_words(model, feature_names, n_top_words):
    for topic_id, topic in enumerate(model.components_):
        print('Topic #{}:'.format(int(topic_id)))
        print(''.join([feature_names[i] + ' ' + str(round(topic[i], 2))
              +' | ' for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

In [106]:
print("Topics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

print_top_words(nmf, tfidf_feature_names, n_top_words)

Topics in NMF model:
Topic #0:
ειμι 2.55 | ου 2.46 | τος 1.31 | αυ 1.07 | εχω 0.97 | τας 0.68 | αλλ 0.52 | δεω1 0.43 | ποιεω 0.43 | εγω 0.42 | ωσπερ 0.42 | τουτων 0.41 | εαυτου 0.39 | πα 0.37 | οσος 0.33 | καθ 0.33 | ευ 0.33 | ος 0.33 | ουδεν 0.32 | δι 0.31 | 

Topic #1:
αυ 3.52 | τος 3.44 | αυτον 0.79 | αυτους 0.32 | αυτην 0.28 | τας 0.24 | τοτε 0.17 | εγενετο 0.15 | εαυτου 0.15 | εκει 0.15 | πολιν 0.15 | υπ 0.14 | μετ 0.14 | πολεως 0.14 | τουτου 0.13 | ηως 0.13 | αδελφος 0.13 | πολυς 0.13 | ερχομαι 0.13 | γυνη 0.12 | 

Topic #2:
εγω 4.05 | εμεω 0.24 | σε 0.23 | εμοι 0.17 | ειδον 0.16 | εμε 0.15 | συ 0.14 | παρ 0.14 | νυ 0.13 | οπως 0.12 | ταυτ 0.12 | οιδα 0.11 | ερχομαι 0.1 | πως 0.09 | ουδ 0.09 | σευω 0.09 | ποθεν 0.09 | ειπε 0.09 | καγω 0.09 | χρηματα 0.08 | 

Topic #3:
φημι 3.37 | αυτον 0.42 | ειμι 0.35 | φησι 0.24 | γενεσθαι 0.23 | λεγει 0.23 | φασι 0.21 | ιστορεω 0.19 | καθα 0.19 | φησιν 0.17 | καλεω 0.17 | αθηνη 0.16 | υστερος 0.16 | ποτε 0.15 | ομηρος 0.15 | ον 0.14 | λεγεται 

In [107]:
tfidf.shape

(1823, 1000)

In [118]:
doc_topic_distrib = nmf.transform(tfidf)  # numpy.ndarray

In [119]:
doc_topic_distrib.shape

(1823, 55)

In [120]:
import pandas

In [121]:
df = pandas.DataFrame(doc_topic_distrib)

In [122]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,0.008242,0.036524,0.031082,0.007385,0.030441,0.276695,0.000000,0.035696,0.000000,0.022861,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.027332,0.000000,0.000000,0.016152,0.000000
1,0.047067,0.000000,0.046855,0.000000,0.051642,0.134257,0.054331,0.023679,0.000000,0.000000,...,0.000000,0.008999,0.003188,0.000000,0.000000,0.013205,0.000000,0.000000,0.000000,0.000000
2,0.101537,0.065956,0.011589,0.000000,0.022067,0.021074,0.001556,0.017924,0.001924,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.006843,0.000000,0.000000,0.000000,0.000000
3,0.113660,0.046509,0.007916,0.074166,0.002784,0.000000,0.000000,0.033737,0.000000,0.006540,...,0.000000,0.002134,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.054391,0.015502,0.067163,0.000000,0.027391,0.166300,0.020212,0.033731,0.000000,0.000000,...,0.000000,0.000000,0.008156,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.064044,0.000000,0.061053,0.000000,0.056361,0.095156,0.026170,0.049990,0.002007,0.000000,...,0.000000,0.000000,0.005795,0.000000,0.000000,0.010141,0.000000,0.000000,0.000000,0.000000
6,0.158096,0.029795,0.001092,0.015922,0.019112,0.014260,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.123865,0.029537,0.014366,0.080690,0.009208,0.023742,0.018648,0.020011,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.024411,0.000000,0.108845,0.001343,0.033773,0.122476,0.024436,0.024487,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.010710,0.000000,0.000000,0.000000,0.000000
9,0.129990,0.033435,0.042834,0.000000,0.014657,0.000000,0.012045,0.000000,0.001898,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.017119,0.000000,0.000000,0.000000,0.000000


In [42]:
from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author

In [58]:
rename = {}
for _id, name in get_id_author().items():
    try:
        rename[int(_id)] = name
    except ValueError:
        pass

In [59]:
rename

{4096: 'Dositheus Magister Gramm.',
 1: 'Apollonius Rhodius Epic.',
 2: 'Theognis Eleg.',
 3: 'Thucydides Hist.',
 4: 'Diogenes Laertius Biogr.',
 5: 'Theocritus Bucol.',
 6: 'Euripides Trag.',
 7: 'Plutarchus Biogr. et Phil.',
 8: 'Athenaeus Soph.',
 9: 'Sappho Lyr.',
 10: 'Isocrates Orat.',
 11: 'Sophocles Trag.',
 12: 'Homerus Epic., Homer',
 13: 'Hymni Homerici, Homeric Hymns',
 14: 'Demosthenes Orat.',
 15: 'Herodianus Hist.',
 16: 'Herodotus Hist.',
 17: 'Isaeus Orat.',
 18: 'Philo Judaeus Phil.',
 4115: 'Theophilus Scr. Eccl.',
 20: 'Hesiodus Epic.',
 4117: 'Eustathius Scr. Eccl. et Theol.',
 22: 'Nicander Epic.',
 23: 'Oppianus Epic.',
 24: 'Oppianus Epic.',
 4100: 'Aphthonius Rhet.',
 26: 'Aeschines Orat.',
 27: 'Andocides Orat.',
 4124: 'Eusebius Scr. Eccl.',
 29: 'Dinarchus Orat.',
 4126: 'Theodorus Scr. Eccl.',
 31: 'Novum Testamentum, New Testament',
 32: 'Xenophon Hist.',
 33: 'Pindarus Lyr.',
 34: 'Lycurgus Orat.',
 35: 'Moschus Bucol.',
 36: 'Bion Bucol.',
 37: 'Anachar

In [None]:
# either redo this all w/ countvectorizer, so no need to look at all docs at once
# or figure out how to preserve order or labels in/out of tfidfvectorizer

In [None]:
# http://scikit-learn.org/stable/modules/clustering.html