In [None]:
import pandas as pd
import numpy as np
import sys
sys.path.append('shared/engl64.05/lib')
from tsvdro import tsvdro

In [None]:
# Read the genre metadata table for the Underwood chapter 1 corpus.
metadata = pd.read_csv(
    filepath_or_buffer="shared/engl64.05/data/Underwood_ch1/allgenremeta.csv",
)

In [None]:
# Draw 150 metadata rows at random; the fixed random_state makes the
# draw repeatable across kernel restarts.
sample = metadata.sample(n=150, random_state=1)

In [None]:
# Genre distribution of the sample: number of sampled rows per genre tag.
sample.loc[:, "genretags"].value_counts()

In [None]:
# Class labels for the classifier: one genre tag per sampled row, in row order.
labels = list(sample["genretags"])

In [None]:
# DictVectorizer turns per-document {feature: value} dicts into matrix columns.
from sklearn.feature_extraction import DictVectorizer
vectorizer = DictVectorizer(sparse=True)  # sparse=True is the library default, spelled out here

In [None]:
# Stop list = NLTK's English stop words followed by a handful of punctuation marks.
from nltk.corpus import stopwords

# punctuation marks that should be dropped together with the stop words
puntuation = [
    ",", ";", ".", ":", '"', "'", "—", "(", ")",
]

stop_words = [*stopwords.words('english'), *puntuation]

In [None]:
# Load the .dro file of every sampled document and keep only the features
# that are neither in the stop list nor prefixed with "|".
# NOTE(review): the "|" prefix is said to mark features containing digits and
# some miscellaneous punctuation -- this cannot be verified from the notebook;
# confirm against the tsvdro file format.

# A set gives constant-time membership tests; scanning the stop_words list
# for every token of every document was linear in the list length.
stop_word_set = set(stop_words)

doc_data = list()
for docid in sample['docid'].tolist():
    dro = tsvdro.load("shared/engl64.05/data/Underwood_ch1/" + docid + '.dro')
    # Build a filtered copy in one pass. The previous version deleted keys
    # from the loaded dict in two independent `if` branches, so a token
    # matching both conditions was deleted twice and raised KeyError.
    doc_data.append({
        feature: value
        for feature, value in dro['data'].items()
        if feature not in stop_word_set and not feature.startswith('|')
    })

In [None]:
# Learn the vocabulary from the filtered DRO feature dicts and build the
# document-term matrix in a single step.
dtm = vectorizer.fit_transform(X=doc_data)

In [None]:
# Multinomial naive Bayes: the NB variant that exposes per-class feature
# log-probabilities (feature_log_prob_), which are inspected further below.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
# MultinomialNB.fit accepts scipy sparse input directly, so the matrix is no
# longer densified with .toarray() first.
clf.fit(dtm, labels)

# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); fall back to the old name on older installs.
# feature_names stays a plain list either way.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

In [None]:
# For every class, print the n_features terms with the highest log-probability
# under the fitted naive Bayes model, as "term (value), " on a single line.
# The value shown is the negated log-probability rounded to 3 decimals, so
# smaller numbers mean more probable terms.
n_features = 30
# Nothing is printed when the classifier was fitted on a single class.
if len(clf.classes_) > 1:
    for class_idx, class_label in enumerate(clf.classes_):
        print("Class label:",class_label)
        log_probs = clf.feature_log_prob_[class_idx].ravel()
        # One stable descending sort replaces the previous two argsorts plus a
        # Python-level sort of the whole vocabulary; ties are broken by
        # feature index so the output is deterministic.
        top_terms = np.argsort(-log_probs, kind="stable")[:n_features]
        # A distinct loop variable is used here: the old inner loop reused `i`
        # and shadowed the class index.
        for term_idx in top_terms:
            print("{0} ({1}), ".format(feature_names[term_idx],
                                       -np.round(log_probs[term_idx], 3)), end="")
        print("\n")