In [177]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score

In [99]:
# Event tag that is substituted into the data file names read below.
event_name = "[TEDxNations]"

In [244]:
# Read the two tab-separated inputs for the selected event; both have a header
# row and are decoded as UTF-8.
# Feature table: keeps pandas' default positional row index.
event_data = pd.read_table(
    'data/%s_data_clean_features.txt' % event_name,
    sep='\t',
    header=0,
    encoding='utf-8',
)
# Annotation table: its first column becomes the index (index_col=0).
# NOTE(review): presumably that column holds the feature table's row numbers,
# since the two frames are later merged on their indexes -- confirm.
event_data_class = pd.read_table(
    'data/%s_data_clean_annotated.txt' % event_name,
    sep='\t',
    header=0,
    encoding='utf-8',
    index_col=0,
)

In [245]:
# Attach the annotation columns to the feature rows by matching index labels
# (default inner merge: only labels present in both frames are kept).
event_data = event_data.merge(
    event_data_class,
    left_index=True,
    right_index=True,
)

In [246]:
# Keep only the rows that carry an annotation, i.e. a non-null 'Class' value.
event_data = event_data.loc[event_data['Class'].notnull()]

In [247]:
# Names of the precomputed numeric columns used as classifier input.
# The grouping comments are inferred from the column-name prefixes only;
# the order of the entries is unchanged.
feature_cols = [
    # count_* columns
    'count_links', 'count_hashtags', 'count_mentions', 'count_words',
    'count_characters', 'count_non_characters', 'count_upper',
    # bool_* columns
    'bool_question', 'bool_elongation', 'bool_ellipsis',
    # t_* / tfidf_* columns
    't_distinct', 't_sum', 'tfidf_sum', 'tfidf_mean',
    # overall tag totals
    'pos_cnt', 'nes_cnt',
    # pos_cnt_<TAG> columns (one per tag, punctuation tags included)
    'pos_cnt_NN', 'pos_cnt_RP', 'pos_cnt_POS', 'pos_cnt_VB',
    'pos_cnt_(', 'pos_cnt_``', "pos_cnt_''", 'pos_cnt_WP',
    'pos_cnt_VBD', 'pos_cnt_NNPS', 'pos_cnt_NNP', 'pos_cnt_.',
    'pos_cnt_JJR', 'pos_cnt_CC', 'pos_cnt_EX', 'pos_cnt_PDT',
    'pos_cnt_DT', 'pos_cnt_WRB', 'pos_cnt_PRP$', 'pos_cnt_)',
    'pos_cnt_SYM', 'pos_cnt_RBR', 'pos_cnt_VBP', 'pos_cnt_FW',
    'pos_cnt_CD', 'pos_cnt_JJ', 'pos_cnt_$', 'pos_cnt_WDT',
    'pos_cnt_JJS', 'pos_cnt_VBN', 'pos_cnt_RBS', 'pos_cnt_IN',
    'pos_cnt_,', 'pos_cnt_UH', 'pos_cnt_PRP', 'pos_cnt_VBG',
    'pos_cnt_TO', 'pos_cnt_VBZ', 'pos_cnt_MD', 'pos_cnt_NNS',
    'pos_cnt_RB', 'pos_cnt_:',
    # ne_cnt_<TYPE> columns
    'ne_cnt_PERSON', 'ne_cnt_GSP', 'ne_cnt_ORGANIZATION', 'ne_cnt_GPE',
    'ne_cnt_LOCATION',
]

In [248]:
# Hold out 20% of the annotated rows for testing.
# random_state is fixed so that re-running the notebook reproduces the same
# partition; without it every run draws a different split and the metrics
# reported further down cannot be compared between runs.
train, test = train_test_split(event_data, test_size=.2, random_state=42)

# Report the partition sizes as a sanity check (percentages are rounded).
dat = event_data.shape[0]
tr = len(train)
te = len(test)
print('data: %s' % dat)
print('train: %s (%s%%)' % (tr, round(100*tr/dat)))
print('test: %s (%s%%)' % (te, round(100*te/dat)))

data: 263
train: 210 (80%)
test: 53 (20%)


In [249]:
# Separate each partition into its model inputs (the columns listed in
# feature_cols) and its target (the 'Class' annotation).
train_features, train_class = train[feature_cols], train['Class']
test_features, test_class = test[feature_cols], test['Class']

In [239]:
# Text-only baseline: raw text -> token counts -> tf-idf weights -> multinomial
# naive Bayes, chained in one Pipeline so it can be fitted and scored as a
# single estimator.
pipeline = Pipeline(steps=[
    ('count_vect', CountVectorizer()),
    ('X_tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])

In [283]:
# Cross-validated accuracy of the text pipeline on the training partition.
# cv is passed explicitly: the number of folds used when cv is omitted is a
# library default that differs between scikit-learn releases, and the output
# recorded for this cell has three fold scores.
scores = cross_val_score(pipeline,
                         train['text'],
                         train_class,
                         scoring='accuracy',
                         cv=3)



In [284]:
# Summarise the cross-validation result: per-fold accuracies, then their mean
# and standard deviation.
print('tfidf bow model')
for _label, _value in [('accuracy scores:', scores),
                       ('mean:', scores.mean()),
                       ('std:', scores.std())]:
    print(_label, _value)

tfidf bow model
accuracy scores: [ 0.63380282  0.64285714  0.65217391]
mean: 0.642944624267
std: 0.00750022369058


In [286]:
# Second model: multinomial naive Bayes trained on the precomputed feature
# columns rather than on the raw text.
classifier = MultinomialNB()
classifier.fit(train_features, train_class)

In [304]:
# Predict on the SAME rows the classifier was fitted on. This is not a valid
# estimate of generalisation (the original author flags it as bad practice);
# it only shows how well the model fits its own training data.
train_predictions = classifier.predict(X=train_features)

In [308]:
# Accuracy and confusion matrix on the training partition.
# Original author's note: the model is poor -- it cannot even predict its own
# training data well -- and the annotation data is named as the suspect.
print('accuracy',
      sklearn.metrics.accuracy_score(y_true=train_class, y_pred=train_predictions))
print('confusion matrix\n',
      sklearn.metrics.confusion_matrix(y_true=train_class, y_pred=train_predictions))

accuracy 0.695238095238
confusion matrix
 [[  0   1   0]
 [  0 112  23]
 [  0  40  34]]


In [311]:
# Per-class precision / recall / F1 on the training partition.
print(sklearn.metrics.classification_report(y_true=train_class,
                                            y_pred=train_predictions))

             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00         1
        2.0       0.73      0.83      0.78       135
        3.0       0.60      0.46      0.52        74

avg / total       0.68      0.70      0.68       210



  'precision', 'predicted', average, warn_for)


In [287]:
# Predict the class of every row in the held-out test partition.
test_predictions = classifier.predict(X=test_features)

In [309]:
# Accuracy and confusion matrix on the held-out test partition.
print('accuracy',
      sklearn.metrics.accuracy_score(y_true=test_class, y_pred=test_predictions))
print('confusion matrix\n',
      sklearn.metrics.confusion_matrix(y_true=test_class, y_pred=test_predictions))

accuracy 0.584905660377
confusion matrix
 [[ 0  1  0]
 [ 0 26  8]
 [ 0 13  5]]


In [299]:
# Spot-check a single held-out document. The position is hard-coded, not
# randomly drawn; change i to look at a different row.
i = 20
print('doc:', test['text'].iloc[i])
print()
print('predicted:', test_predictions[i])
print('actual:', test_class.iloc[i])  # equivalently test.iloc[i]['Class']

doc: RT unfoundation: "Smoke inhalation from cooking results in 4 millions deaths per year" -RanyeeCleanCook | #TEDxNations #cleancooking

predicted: 2.0
actual: 3.0


In [312]:
# Per-class precision / recall / F1 on the held-out test partition.
print(sklearn.metrics.classification_report(y_true=test_class,
                                            y_pred=test_predictions))

             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00         1
        2.0       0.65      0.76      0.70        34
        3.0       0.38      0.28      0.32        18

avg / total       0.55      0.58      0.56        53



  'precision', 'predicted', average, warn_for)
