In [67]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [3]:
# Event identifier; it is substituted into the data file names read in the next cell.
event_name = '[TEDxNations]'

In [6]:
# Load the cleaned event data and its annotated counterpart. Both files are read as
# tab-separated UTF-8 with the first row as header; the annotated file additionally
# uses its first column as the row index.
event_data = pd.read_table(
    'data/%s_data_clean.txt' % event_name,
    sep='\t',
    header=0,
    encoding='utf-8',
)
event_data_class = pd.read_table(
    'data/%s_data_clean_annotated.txt' % event_name,
    sep='\t',
    header=0,
    encoding='utf-8',
    index_col=0,
)

In [10]:
# Attach the annotation columns to the event rows, pairing rows by index label
# (default inner join, so only labels present in both frames survive).
# NOTE(review): event_data is overwritten here, so re-running this cell merges the
# annotations a second time -- restart and run from the load cell instead.
event_data = event_data.merge(
    event_data_class,
    left_index=True,
    right_index=True,
)

In [55]:
# Training set: only the rows whose 'Class' value is present. The original index
# labels of event_data are kept on the selected rows.
train = event_data.loc[event_data['Class'].notnull()]
len(train)

263

In [34]:
# Initialize a count vectorizer; min_df=2 is a document-frequency cut-off, i.e. a term
# must occur in at least 2 documents to enter the vocabulary.
count_vect = CountVectorizer(min_df=2)

In [35]:
# Build the vocabulary from the training rows' 'text_clean_tokens' column and
# encode those same rows as a term-document count matrix in one pass.
train_matrix_cnt = count_vect.fit_transform(
    raw_documents=train['text_clean_tokens'],
)

In [36]:
# Convert the raw counts into a normalized tf-idf sparse matrix.
# fit_transform first learns the idf weights from the count matrix and then
# applies them to it; the fitted transformer stays bound to tfidf_transformer.
tfidf_transformer = TfidfTransformer()
train_matrix_tfidf = tfidf_transformer.fit_transform(train_matrix_cnt)

In [40]:
# Fit a multinomial Naive Bayes model: tf-idf features as input, the annotated
# 'Class' column as target.
classifier = MultinomialNB().fit(
    X=train_matrix_tfidf,
    y=train['Class'],
)

In [56]:
# Predict a class for every training document. These are in-sample predictions:
# the matrix passed here is the one the classifier was fitted on.
all_predictions = classifier.predict(train_matrix_tfidf)
# Number of predictions produced (one per training row).
all_predictions.shape[0]

263

In [64]:
# Spot-check a single training row by position: model output vs. annotated class.
i = 5
print('predicted:', all_predictions[i])
print('actual:', train['Class'].iloc[i])

predicted: 2.0
actual: 3.0


In [75]:
# Score the predictions against the annotated classes.
# NOTE(review): both inputs come from the training rows, so these are in-sample
# figures, not an estimate of performance on unseen data.
print(
    'accuracy',
    accuracy_score(y_true=train['Class'], y_pred=all_predictions),
)
print(
    'confusion matrix\n',
    confusion_matrix(y_true=train['Class'], y_pred=all_predictions),
)

accuracy 0.859315589354
confusion matrix
 [[  0   2   0]
 [  0 167   2]
 [  0  33  59]]


In [80]:
# Attach the predictions to event_data as a 'predicted' column.
#
# all_predictions is positional: element k belongs to train.iloc[k]. The merge
# below, however, pairs rows by index *label*. train is a filtered view of
# event_data and keeps event_data's labels, so the prediction frame must carry
# train's index; a default 0..n-1 index would pair predictions with the wrong
# rows whenever the filter dropped anything other than trailing rows.
dfpredicted = pd.DataFrame(
    all_predictions,
    index=train.index,
    columns=['predicted'],
)
# Inner join on index labels: only rows that received a prediction are kept.
# NOTE(review): event_data is overwritten, so re-running this cell merges a second
# 'predicted' column (pandas will suffix the duplicates).
event_data = pd.merge(event_data, dfpredicted, left_index=True, right_index=True)

In [82]:
# Display (rows, columns) of event_data as it stands after the merge in the previous cell.
event_data.shape

(263, 9)