In [1]:
import datetime as dt
import os
import time

from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author
from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author
import pandas
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer

# Make vectorizer

In [2]:
def stream_lemmatized_files(corpus_dir):
    # return all docs in a dir
    user_dir = os.path.expanduser('~/cltk_data/user_data/' + corpus_dir)
    files = os.listdir(user_dir)

    for file in files:
        filepath = os.path.join(user_dir, file)
        with open(filepath) as fo:
            #TODO rm words less the 3 chars long
            yield file[3:-4], fo.read()

In [3]:
t0 = dt.datetime.utcnow()

map_id_author = get_id_author()

df = pandas.DataFrame(columns=['id', 'author' 'text', 'epithet'])

for _id, text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'):
    author = map_id_author[_id]
    epithet = get_epithet_of_author(_id)
    df = df.append({'id': _id, 'author': author, 'text': text, 'epithet': epithet}, ignore_index=True)

print(df.shape)
print('... finished in {}'.format(dt.datetime.utcnow() - t0))
print('Number of texts:', len(df))

(1823, 5)
... finished in 0:00:13.411223
Number of texts: 1823


In [4]:
text_list = df['text'].tolist()

# make a list of short texts to drop
# For pres, get distributions of words per doc
short_text_drop_index = [index if len(text) > 500 else None for index, text in enumerate(text_list) ]  # ~100 words

In [19]:
t0 = dt.datetime.utcnow()

# TODO: Consdier using generator to CV http://stackoverflow.com/a/21600406

# time & size counts, w/ 50 texts:
# 0:01:15 & 202M @ ngram_range=(1, 3), min_df=2, max_features=500
# 0:00:26 & 80M @ ngram_range=(1, 2), analyzer='word', min_df=2, max_features=5000
# 0:00:24 & 81M @ ngram_range=(1, 2), analyzer='word', min_df=2, max_features=50000

# time & size counts, w/ 1823 texts:
# 0:02:18 & 46MB @ ngram_range=(1, 1), analyzer='word', min_df=2, max_features=500000
# 0:2:01 & 47 @ ngram_range=(1, 1), analyzer='word', min_df=2, max_features=1000000

# max features in the lemmatized data set: 551428
max_features = 100000
ngrams = 1
vectorizer = CountVectorizer(ngram_range=(1, ngrams), analyzer='word', 
                             min_df=2, max_features=max_features)
term_document_matrix = vectorizer.fit_transform(text_list)  # input is a list of strings, 1 per document

# save matrix
vector_fp = os.path.expanduser('~/cltk_data/user_data/vectorizer_test_features{0}_ngrams{1}.pickle'.format(max_features, ngrams))
joblib.dump(term_document_matrix, vector_fp)

print('... finished in {}'.format(dt.datetime.utcnow() - t0))

... finished in 0:01:24.611081


# Transform term matrix into feature table

In [20]:
# Put BoW vectors into a new df
term_document_matrix = joblib.load(vector_fp)  # scipy.sparse.csr.csr_matrix

In [21]:
term_document_matrix.shape

(1823, 100000)

In [22]:
term_document_matrix_array = term_document_matrix.toarray() 

In [23]:
dataframe_bow = pandas.DataFrame(term_document_matrix_array, columns=vectorizer.get_feature_names())

In [24]:
ids_list = df['id'].tolist()

In [25]:
len(ids_list)

1823

In [26]:
dataframe_bow.shape

(1823, 100000)

In [29]:
dataframe_bow['id'] = ids_list

In [30]:
authors_list = df['author'].tolist()
dataframe_bow['author'] = authors_list

In [31]:
epithets_list = df['epithet'].tolist()
dataframe_bow['epithet'] = epithets_list

In [32]:
# For pres, give distribution of epithets, including None
dataframe_bow['epithet']

0                  Historici/-ae
1                        Tragici
2                        Tragici
3                         Comici
4                           None
5                           None
6                  Historici/-ae
7               Philosophici/-ae
8                      Sophistae
9                     Theologici
10                 Historici/-ae
11      Scriptores Ecclesiastici
12                     Geographi
13                    Periegetae
14                          None
15                    Lyrici/-ae
16              Philosophici/-ae
17                       Tragici
18                          None
19                     Geographi
20                          None
21                        Medici
22                     Rhetorici
23                 Historici/-ae
24                        Medici
25                    Lyrici/-ae
26                  Onirocritici
27                Paradoxographi
28      Scriptores Ecclesiastici
29                       Tragici
          

In [33]:
# removes 334
#! remove rows whose epithet = None
# note on selecting none in pandas: http://stackoverflow.com/a/24489602
dataframe_bow = dataframe_bow[dataframe_bow.epithet.notnull()]
dataframe_bow.shape

(1489, 100003)

In [35]:
t0 = dt.datetime.utcnow()

dataframe_bow.to_csv(os.path.expanduser('~/cltk_data/user_data/tlg_bow.csv'))

print('... finished in {}'.format(dt.datetime.utcnow() - t0))

... finished in 0:04:17.580780


In [36]:
#! TODO Add pandas load from csv here

In [37]:
dataframe_bow.shape

(1489, 100003)

In [38]:
dataframe_bow.head(10)

Unnamed: 0,ʹʹ,ʹγʹ,ʹδʹ,αʹ,ααα,ααπτος,ααπτους,ααρων,αασαμην,αασχετον,...,ϲωμα,ϲωματα,ϲωματι,ϲωματοϲ,ϲωματων,ϲωμαϲι,ϲωμαϲιν,id,author,epithet
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1459,Lepidus Hist.,Historici/-ae
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,825,Melito Trag.,Tragici
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,331,[Polyidus] Trag.,Tragici
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,417,Archippus Comic.,Comici
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2475,Menecrates Hist.,Historici/-ae
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4075,Marinus Phil.,Philosophici/-ae
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2127,Troilus Soph.,Sophistae
9,0,0,0,0,0,0,0,4,0,0,...,0,0,0,0,0,0,0,2074,Apollinaris Theol.,Theologici
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2173,Antileon Hist.,Historici/-ae
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1419,"Hermas Scr. Eccl., Pastor Hermae",Scriptores Ecclesiastici


In [None]:
# write dataframe_bow to disk, for fast reuse while classifying
fp_df = os.path.expanduser('~/cltk_data/user_data/tlg_bow_df.pickle')
joblib.dump(dataframe_bow, fp_df)

# Classification

In [39]:
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier



In [None]:
joblib.load(fp_df)

In [40]:
Y = dataframe_bow['epithet']

In [41]:
X = dataframe_bow.drop(['epithet', 'id', 'author'], 1)

In [42]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

In [43]:
from sklearn import clone
from sklearn import preprocessing
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

import datetime as dt

In [44]:
def scale_data(X_train, X_test, Y_train, Y_test):
    """Take Vectors, 
    """

    '''
    -PREPOCESSING 
    -Here, scaled data has zero mean and unit varience
    -We save the scaler to later use with testing/prediction data
    '''
    print('Scaling data ...')
    t0 = dt.datetime.utcnow()
    scaler = preprocessing.StandardScaler().fit(X_train)
    fp_scaler = os.path.expanduser('~/cltk_data/user_data/tlg_bow_scaler.pickle')
    joblib.dump(scaler, fp_scaler)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print('... finished in {} secs.'.format(dt.datetime.utcnow() - t0))
    print()

    return X_train_scaled, X_test_scaled, Y_train, Y_test

In [47]:
X_train_scaled, X_test_scaled, Y_train, Y_test = scale_data(X_train, X_test, Y_train, Y_test)

Scaling data ...
... finished in 0:00:12.989655 secs.



In [50]:
def run_tree(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Run decision tree with scikit.
    
    Experiment with: 'max_depth'
    """
    '''
    -This is where we define the models with pre-defined parameters
    -We can learn these parameters given our data
    '''
    print('Defining and fitting models ...')
    t0 = dt.datetime.utcnow()
    dec_tree = DecisionTreeClassifier()

    dec_tree.fit(X_train_scaled, Y_train)

    fp_model_pickle = os.path.expanduser('~/cltk_data/user_data/tlg_bow_dt.pickle')
    joblib.dump(dec_tree, fp_model_pickle)

    print('... finished in {} secs.'.format(dt.datetime.utcnow() - t0))
    print()
    

    Y_prediction_tree = dec_tree.predict(X_test_scaled)
    print('tree_predictions ', Y_prediction_tree)

    expected = Y_test
    print('actual_values   ', expected)


    print()
    print('----Tree_report--------------------------------')
    print(classification_report(expected, Y_prediction_tree))

In [51]:
run_tree(X_train_scaled, X_test_scaled, Y_train, Y_test)

Defining and fitting models ...
... finished in 0:00:47.223415 secs.

tree_predictions  ['Medici' 'Historici/-ae' 'Lyrici/-ae' 'Philosophici/-ae' 'Tragici'
 'Comici' 'Tragici' 'Scriptores Ecclesiastici' 'Mimographi' 'Grammatici'
 'Lyrici/-ae' 'Musici' 'Comici' 'Historici/-ae' 'Tragici' 'Historici/-ae'
 'Theologici' 'Tragici' 'Historici/-ae' 'Philosophici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Rhetorici' 'Bucolici' 'Historici/-ae' 'Tragici'
 'Elegiaci' 'Grammatici' 'Comici' 'Tragici' 'Historici/-ae' 'Historici/-ae'
 'Tragici' 'Historici/-ae' 'Philosophici/-ae' 'Historici/-ae' 'Medici'
 'Comici' 'Philosophici/-ae' 'Historici/-ae' 'Epici/-ae'
 'Scriptores Ecclesiastici' 'Epici/-ae' 'Historici/-ae' 'Philosophici/-ae'
 'Tragici' 'Historici/-ae' 'Tragici' 'Historici/-ae' 'Historici/-ae'
 'Epigrammatici/-ae' 'Paradoxographi' 'Elegiaci' 'Philosophici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Comici' 'Historici/-ae' 'Historici/-ae'
 'Tragici' 'Scriptores Ecclesiastici' 'Philosophici/-ae' 'Histo

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [52]:
def run_svc(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Run SVC with scikit."""
    # This is where we define the models with pre-defined parameters
    # We can learn these parameters given our data
    print('Defining and fitting SVC model ...')
    t0 = dt.datetime.utcnow()   
    scv = svm.LinearSVC(C=100.)

    scv.fit(X_train_scaled, Y_train)

    fp_model_pickle = os.path.expanduser('~/cltk_data/user_data/tlg_bow_svc.pickle')
    joblib.dump(scv, fp_model_pickle)

    print('... finished in {} secs.'.format(dt.datetime.utcnow() - t0))
    print()
    

    Y_prediction_svc = scv.predict(X_test_scaled)
    print('svc_predictions ', Y_prediction_svc)

    expected = Y_test
    print('actual_values   ', expected)


    print()
    print('----SVC_report--------------------------------')
    print(classification_report(expected, Y_prediction_svc))

In [None]:
run_svc(X_train_scaled, X_test_scaled, Y_train, Y_test)

Defining and fitting SVC model ...


In [None]:
def run_random_forest(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Scikit random forest
    
    Experiment with 'n_estimators'
    """
    
    n_estimators = 30
    
    rf_model = RandomForestClassifier(n_estimators=n_estimators)

    # Train
    clf = clone(rf_model)
    clf = rf_model.fit(X_train_scaled, Y_train)
    
    #joblib.dump(clf, 'models/random_forest.pickle')

    fp_model_pickle = os.path.expanduser('~/cltk_data/user_data/tlg_bow_fandom_forest.pickle')
    joblib.dump(clf, fp_model_pickle)
    
    scores = clf.score(X_train_scaled, Y_train)
    
    
    
    Y_prediction = clf.predict(X_test_scaled)
    print('tree_predictions ', Y_prediction)

    expected = Y_test
    print('actual_values   ', expected)


    print()
    print('----Random forest report--------------------------------')
    print(classification_report(expected, Y_prediction))

In [None]:
run_random_forest(X_train_scaled, X_test_scaled, Y_train, Y_test)

In [None]:
def run_ada_boost(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Scikit random forest.
    
    For plotting see:
    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_iris.html
    
    Experiment with 'n_estimators'
    """
    
    n_estimators = 30
    ada_classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                        n_estimators=n_estimators)

    # Train
    clf = clone(ada_classifier)
    clf = ada_classifier.fit(X_train_scaled, Y_train)
    
    fp_model_pickle = os.path.expanduser('~/cltk_data/user_data/tlg_bow_ada_boost.pickle')
    joblib.dump(clf, fp_model_pickle)
    
    scores = clf.score(X_train_scaled, Y_train)
    
    
    
    Y_prediction = clf.predict(X_test_scaled)
    print('tree_predictions ', Y_prediction)

    expected = Y_test
    print('actual_values   ', expected)


    print()
    print(classification_report(expected, Y_prediction))

In [None]:
run_ada_boost(X_train_scaled, X_test_scaled, Y_train, Y_test)