In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score

In [1]:
event_name = '[egyptair]'

In [3]:
event_data = pd.read_table('data/%s_data_clean_features.txt' % event_name, sep='\t', header=0, encoding='utf-8')
event_data_class = pd.read_table('data/%s_data_clean_annotated.txt' % event_name, sep='\t', header=0, encoding='utf-8', index_col=0)

In [4]:
event_data = pd.merge(event_data, event_data_class, left_index=True, right_index=True)

In [6]:
#remove data without annotation
event_data = event_data[event_data['class'].notnull()]

In [7]:
feature_cols = ['count_links',
                 'count_hashtags',
                 'count_mentions',
                 'count_words',
                 'count_characters',
                 'count_non_characters',
                 'count_upper',
                 'bool_question',
                 'bool_elongation',
                 'bool_ellipsis',
                 't_distinct',
                 't_sum',
                 'tfidf_sum',
                 'tfidf_mean',
                 'pos_cnt',
                 'nes_cnt',
                 'pos_cnt_NN',
                 'pos_cnt_RP',
                 'pos_cnt_POS',
                 'pos_cnt_VB',
                 'pos_cnt_(',
                 'pos_cnt_``',
                 "pos_cnt_''",
                 'pos_cnt_WP',
                 'pos_cnt_VBD',
                 'pos_cnt_NNPS',
                 'pos_cnt_NNP',
                 'pos_cnt_.',
                 'pos_cnt_JJR',
                 'pos_cnt_CC',
                 'pos_cnt_EX',
                 'pos_cnt_PDT',
                 'pos_cnt_DT',
                 'pos_cnt_WRB',
                 'pos_cnt_PRP$',
                 'pos_cnt_)',
                 'pos_cnt_SYM',
                 'pos_cnt_RBR',
                 'pos_cnt_VBP',
                 'pos_cnt_FW',
                 'pos_cnt_CD',
                 'pos_cnt_JJ',
                 'pos_cnt_$',
                 'pos_cnt_WDT',
                 'pos_cnt_JJS',
                 'pos_cnt_VBN',
                 'pos_cnt_RBS',
                 'pos_cnt_IN',
                 'pos_cnt_,',
                 'pos_cnt_UH',
                 'pos_cnt_PRP',
                 'pos_cnt_VBG',
                 'pos_cnt_TO',
                 'pos_cnt_VBZ',
                 'pos_cnt_MD',
                 'pos_cnt_NNS',
                 'pos_cnt_RB',
                 'pos_cnt_:',
                 'ne_cnt_PERSON',
                 'ne_cnt_GSP',
                 'ne_cnt_ORGANIZATION',
                 'ne_cnt_GPE',
                 'ne_cnt_LOCATION']

In [8]:
train, test = train_test_split(event_data, test_size=.2)

dat = event_data.shape[0]
tr = len(train)
te = len(test)
print('data: %s' % dat)
print('train: %s (%s%%)' % (tr, round(100*tr/dat)))
print('test: %s (%s%%)' % (te, round(100*te/dat)))

data: 186
train: 148 (80%)
test: 38 (20%)


In [10]:
train_class = train['class']
train_features = train[feature_cols]

test_class = test['class']
test_features = test[feature_cols]

In [11]:
#build pipeline for easy classifying using tfidf bag of words
pipeline = Pipeline([('count_vect', CountVectorizer()), 
                     ('X_tfidf', TfidfTransformer()), 
                     ('classifier', MultinomialNB()),
                    ])

In [12]:
scores = cross_val_score(pipeline,
                         train['text'],
                         train_class,
                         scoring='accuracy')



In [14]:
print('tfidf bow model')
print('accuracy scores:', scores)
print('mean:', scores.mean())
print('std:', scores.std())

tfidf bow model
accuracy scores: [ 0.80769231  0.77083333  0.72916667]
mean: 0.769230769231
std: 0.0320779803333


In [15]:
#create using original features
classifier = MultinomialNB().fit(train_features, train_class)

In [16]:
#test on train !!!BAD!!!
train_predictions = classifier.predict(train_features)

In [17]:
#GARBAGE Model i.e. annotation data (can't predict good on training data)
print('accuracy', sklearn.metrics.accuracy_score(train_class, train_predictions))
print('confusion matrix\n', sklearn.metrics.confusion_matrix(train_class, train_predictions))

accuracy 0.736486486486
confusion matrix
 [[ 2  0  0  0  2]
 [ 0 64  1  0 11]
 [ 0  2 10  0  6]
 [ 0  0  1  0  0]
 [ 0  8  8  0 33]]


In [18]:
print(sklearn.metrics.classification_report(train_class, train_predictions))

             precision    recall  f1-score   support

        0.0       1.00      0.50      0.67         4
        1.0       0.86      0.84      0.85        76
        2.0       0.50      0.56      0.53        18
        3.0       0.00      0.00      0.00         1
        4.0       0.63      0.67      0.65        49

avg / total       0.74      0.74      0.74       148



  'precision', 'predicted', average, warn_for)


In [19]:
#test
test_predictions = classifier.predict(test_features)

In [20]:
print('accuracy', sklearn.metrics.accuracy_score(test_class, test_predictions))
print('confusion matrix\n', sklearn.metrics.confusion_matrix(test_class, test_predictions))

accuracy 0.657894736842
confusion matrix
 [[ 1  0  0  0  1]
 [ 1 13  1  0  3]
 [ 0  1  2  0  3]
 [ 0  0  1  0  0]
 [ 0  0  2  0  9]]


In [21]:
#check random test doc index
i = 20
print('doc:', test.iloc[i]['text'])
print()
print('predicted:', test_predictions[i])
print('actual:', test_class.iloc[i]) # or test.iloc[i]['Class']

doc: This man will surely have a story to tell his grand children #EgyptAir

predicted: 1.0
actual: 1.0


In [22]:
print(sklearn.metrics.classification_report(test_class, test_predictions))

             precision    recall  f1-score   support

        0.0       0.50      0.50      0.50         2
        1.0       0.93      0.72      0.81        18
        2.0       0.33      0.33      0.33         6
        3.0       0.00      0.00      0.00         1
        4.0       0.56      0.82      0.67        11

avg / total       0.68      0.66      0.66        38



  'precision', 'predicted', average, warn_for)
