In [1]:
import os
import time

from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author
from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author
import pandas
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def stream_lemmatized_files(corpus_dir):
    # return all docs in a dir
    user_dir = os.path.expanduser('~/cltk_data/user_data/' + corpus_dir)
    files = os.listdir(user_dir)

    for file in files:
        filepath = os.path.join(user_dir, file)
        with open(filepath) as fo:
            #TODO rm words less the 3 chars long
            yield file[3:-4], fo.read()

In [3]:
t0 = time.time()

map_id_author = get_id_author()

df = pandas.DataFrame(columns=['id', 'author' 'text', 'epithet'])

for _id, text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'):
    author = map_id_author[_id]
    epithet = get_epithet_of_author(_id)
    df = df.append({'id': _id, 'author': author, 'text': text, 'epithet': epithet}, ignore_index=True)
    
print(df.shape)
print('Time to collect texts: {}'.format(time.time() - t0))
print('Number of texts:', len(df))

(1823, 5)
Time to collect texts: 16.357985019683838
Number of texts: 1823


In [4]:
text_list = df['text'].tolist()

In [5]:
t0 = time.time()

vectorizer = CountVectorizer(min_df=2)
term_document_matrix = vectorizer.fit_transform(text_list)  # input is a list of strings, 1 per document

print('Time to run vectorizer: {}'.format(time.time() - t0))

Time to run vectorizer: 138.72214102745056


In [8]:
# Put BoW vectors into a new df
dataframe_bow = pandas.DataFrame(term_document_matrix.toarray(), columns=vectorizer.get_feature_names())

In [9]:
ids_list = df['id'].tolist()

In [10]:
len(ids_list)

1823

In [11]:
dataframe_bow.shape

(1823, 551779)

In [12]:
dataframe_bow['id'] = ids_list

In [13]:
authors_list = df['author'].tolist()
dataframe_bow['author'] = authors_list

In [19]:
epithets_list = df['epithet'].tolist()
dataframe_bow['epithet'] = epithets_list

In [20]:
dataframe_bow['epithet']

0              Epici/-ae
1               Elegiaci
2          Historici/-ae
3              Biographi
4               Bucolici
5                Tragici
6       Philosophici/-ae
7              Sophistae
8             Lyrici/-ae
9               Oratores
10               Tragici
11             Epici/-ae
12                  None
13              Oratores
14         Historici/-ae
15         Historici/-ae
16              Oratores
17      Philosophici/-ae
18                Comici
19             Epici/-ae
20             Epici/-ae
21             Epici/-ae
22             Epici/-ae
23              Oratores
24              Oratores
25              Oratores
26              Oratores
27              Oratores
28                  None
29         Historici/-ae
              ...       
1793                None
1794                None
1795                None
1796                None
1797                None
1798                None
1799                None
1800                None
1801                None


In [22]:
# removes 334
#! remove rows whose epithet = None
# note on selecting none in pandas: http://stackoverflow.com/a/24489602
dataframe_bow = dataframe_bow[dataframe_bow.epithet.notnull()]
dataframe_bow.shape

(1489, 551782)

In [23]:
#! 2GB, 2 hrs to save
t0 = time.time()
dataframe_bow.to_csv(os.path.expanduser('~/cltk_data/user_data/tlg_bow.csv'))
td = (time.time() - t0) / 60
print('Time to save csv: {} mins'.format(td))

KeyboardInterrupt: 

In [16]:
dataframe_bow.shape

(1823, 551781)

In [15]:
dataframe_bow.head(10)

Unnamed: 0,ʹʹ,ʹγʹ,ʹδʹ,ʹν,ˈτων,αʹ,αʹβʹ,αʹδʹ,αʹιαʹ,αα,...,ϲωφρονουϲιν,ϲωφρονωϲ,ϲωφρων,ϲωϲ,ϲωϲειε,ϲωϲουϲιν,ϲϛ,ϲϲο,id,author
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Apollonius Rhodius Epic.
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,Theognis Eleg.
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,Thucydides Hist.
3,0,0,0,0,0,593,0,0,0,0,...,0,0,0,0,0,0,0,0,4,Diogenes Laertius Biogr.
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5,Theocritus Bucol.
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6,Euripides Trag.
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,Plutarchus Biogr. et Phil.
7,0,0,0,0,0,37,0,0,0,0,...,0,0,0,0,0,0,0,0,8,Athenaeus Soph.
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9,Sappho Lyr.
9,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,10,Isocrates Orat.


# Classification

In [24]:
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier



In [28]:
Y = dataframe_bow['epithet']

In [44]:
X = dataframe_bow.drop(['epithet', 'id', 'author'], 1)

In [45]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

In [46]:
from sklearn import clone
from sklearn import preprocessing
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

import datetime as dt

In [47]:
def scale_data(X_train, X_test, Y_train, Y_test):
    """Take Vectors, 
    """

    '''
    -PREPOCESSING 
    -Here, scaled data has zero mean and unit varience
    -We save the scaler to later use with testing/prediction data
    '''
    print('Scaling data ...')
    t0 = dt.datetime.utcnow()
    scaler = preprocessing.StandardScaler().fit(X_train)
    fp_scaler = os.path.expanduser('~/cltk_data/user_data/tlg_bow_scaler.pickle')
    joblib.dump(scaler, fp_scaler)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print('... finished in {} secs.'.format(dt.datetime.utcnow() - t0))
    print()

    return X_train_scaled, X_test_scaled, Y_train, Y_test

In [48]:
X_train_scaled, X_test_scaled, Y_train, Y_test = scale_data(X_train, X_test, Y_train, Y_test)

Scaling data ...
... finished in 0:08:49.920292 secs.



In [49]:
def run_tree(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Run decision tree with scikit.
    
    Experiment with: 'max_depth'
    """
    '''
    -This is where we define the models with pre-defined parameters
    -We can learn these parameters given our data
    '''
    print('Defining and fitting models ...')
    t0 = dt.datetime.utcnow()   
    dec_tree = DecisionTreeClassifier()

    dec_tree.fit(X_train_scaled, Y_train)

    joblib.dump(dec_tree, 'models/tree.pickle')  #! ch to cltk_data/user_data

    print('... finished in {} secs.'.format(dt.datetime.utcnow() - t0))
    print()
    

    Y_prediction_tree = dec_tree.predict(X_test_scaled)
    print('tree_predictions ', Y_prediction_tree)

    expected = Y_test
    print('actual_values   ', expected)


    print()
    print('----Tree_report--------------------------------')
    print(classification_report(expected, Y_prediction_tree))

In [50]:
run_tree(X_train_scaled, X_test_scaled, Y_train, Y_test)

Defining and fitting models ...
... finished in 0:58:13.494214 secs.

tree_predictions  ['Oratores' 'Scriptores Ecclesiastici' 'Historici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Sophistae' 'Historici/-ae' 'Scriptores Ecclesiastici'
 'Historici/-ae' 'Tragici' 'Sophistae' 'Philosophici/-ae'
 'Philosophici/-ae' 'Rhetorici' 'Lyrici/-ae' 'Apologetici' 'Comici'
 'Historici/-ae' 'Comici' 'Biographi' 'Historici/-ae' 'Comici' 'Iambici'
 'Comici' 'Tragici' 'Philosophici/-ae' 'Tragici' 'Comici'
 'Philosophici/-ae' 'Philosophici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Philosophici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Sophistae' 'Historici/-ae' 'Philosophici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Poetae' 'Philosophici/-ae' 'Grammatici'
 'Historici/-ae' 'Poetae' 'Historici/-ae' 'Apologetici' 'Epici/-ae'
 'Philosophici/-ae' 'Philosophici/-ae' 'Scriptores Ecclesiastici' 'Comici'
 'Historici/-ae' 'Grammatici' 'Philosophici/-ae' 'Philosophici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Phi

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [None]:
def run_svc(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Run SVC with scikit."""
    # This is where we define the models with pre-defined parameters
    # We can learn these parameters given our data
    print('Defining and fitting models ...')
    t0 = dt.datetime.utcnow()   
    scv = svm.LinearSVC(C=100.)

    scv.fit(X_train_scaled, Y_train)

    joblib.dump(scv, 'models/svc.pickle')

    print('... finished in {} secs.'.format(dt.datetime.utcnow() - t0))
    print()
    

    Y_prediction_svc = scv.predict(X_test_scaled)
    print('tree_predictions ', Y_prediction_svc)

    expected = Y_test
    print('actual_values   ', expected)


    print()
    print('----SVC_report--------------------------------')
    print(classification_report(expected, Y_prediction_svc))

In [None]:
run_svc(X_train_scaled, X_test_scaled, Y_train, Y_test)

In [None]:
def run_random_forest(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Scikit random forest
    
    Experiment with 'n_estimators'
    """
    
    n_estimators = 30
    
    rf_model = RandomForestClassifier(n_estimators=n_estimators)

    # Train
    clf = clone(rf_model)
    clf = rf_model.fit(X_train_scaled, Y_train)
    
    joblib.dump(clf, 'models/random_forest.pickle')
    
    scores = clf.score(X_train_scaled, Y_train)
    
    
    
    Y_prediction = clf.predict(X_test_scaled)
    print('tree_predictions ', Y_prediction)

    expected = Y_test
    print('actual_values   ', expected)


    print()
    print('----Random forest report--------------------------------')
    print(classification_report(expected, Y_prediction))

In [None]:
run_random_forest(X_train_scaled, X_test_scaled, Y_train, Y_test)

In [None]:
def run_ada_boost(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Scikit random forest.
    
    For plotting see:
    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_iris.html
    
    Experiment with 'n_estimators'
    """
    
    n_estimators = 30
    ada_classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                        n_estimators=n_estimators)

    # Train
    clf = clone(ada_classifier)
    clf = ada_classifier.fit(X_train_scaled, Y_train)
    
    joblib.dump(clf, 'models/ada_boost.pickle')
    
    scores = clf.score(X_train_scaled, Y_train)
    
    
    
    Y_prediction = clf.predict(X_test_scaled)
    print('tree_predictions ', Y_prediction)

    expected = Y_test
    print('actual_values   ', expected)


    print()
    print(classification_report(expected, Y_prediction))

In [None]:
run_ada_boost(X_train_scaled, X_test_scaled, Y_train, Y_test)