In [19]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score

In [2]:
features = [
 'favorite_count',
 'retweet_count',
 'entities_count_hashtags',
 'entities_count_symbols',
 'entities_count_urls',
 'entities_count_user_mentions',
 'user_id_verified',
 'user_id_statuses_count',
 'user_id_favourites_count',
 'user_id_followers_count',
 'user_id_friends_count',
 'user_id_listed_count',
 'count_words',
 'count_stops',
 'count_characters',
 'count_non_characters',
 'count_upper',
 'bool_question',
 'bool_elongation',
 'bool_ellipsis',
 'lexical_diversity',
 'query_grams_coverage',
 'topk_terms_coverage',
 'tfidf_sum',
 'tfidf_mean',
 'event_centroid_distance']

In [3]:
#load data
df = pd.read_table('data/final/event_panama_papers_data.txt', sep='\t', encoding='utf-8', header=0)
df.shape

  interactivity=interactivity, compiler=compiler, result=result)


(2295892, 37)

In [4]:
df['user_id_verified'] = df['user_id_verified'].astype(object).replace(np.nan, 0)

In [5]:
#load annotated data
df_ann = pd.read_table('data/final/cf_report_pp_test100.csv', sep=',', encoding='utf-8', header=0)
df_ann.shape

(100, 12)

In [6]:
#remove bad data
#-1 was loading error in annotation
df_ann = df_ann[(df_ann['1_is_of_high_quality'] != -1) & (df_ann['2_is_informative'] != -1) & (df_ann['3_relevant_to_event'] != -1)]
df_ann.shape

(99, 12)

In [7]:
#user class names for reporting
target_names=['No', 'Maybe', 'Yes']

In [8]:
#get classes on master id
df_y = df_ann[['master_id', '1_is_of_high_quality', '2_is_informative', '3_relevant_to_event']]

In [9]:
#add anotations to data
df_ann = pd.merge(df, df_y, on='master_id')
df_ann.shape

(99, 40)

In [23]:
#split
#train, test = train_test_split(df_ann, test_size=.2)
#use only extreme classes "No"=0 or "Yes"=3
#separation is larger, model accuracy is higher
#train, test = train_test_split(df_ann[df_ann['1_is_of_high_quality'].isin([0,2])], test_size=.2)
#train, test = train_test_split(df_ann[df_ann['2_is_informative'].isin([0,2])], test_size=.2)
train, test = train_test_split(df_ann[df_ann['3_relevant_to_event'].isin([0,2])], test_size=.2)

dat = df_ann.shape[0]
tr = len(train)
te = len(test)
print('data: %s' % dat)
print('train: %s (%s%%)' % (tr, round(100*tr/dat)))
print('test: %s (%s%%)' % (te, round(100*te/dat)))

data: 99
train: 59 (60%)
test: 15 (15%)


In [41]:
#train_quality = train['1_is_of_high_quality']
#train_informative = train['2_is_informative']
train_relevant = train['3_relevant_to_event']
train_features = train[features]

#test_quality = test['1_is_of_high_quality']
#test_informative = test['2_is_informative']
test_relevant = test['3_relevant_to_event']
test_features = test[features]

In [42]:
train_features.shape

(59, 26)

In [43]:
train_relevant.shape

(59,)

In [None]:
#Execute Models

In [None]:
#Multinomial Naive Bayes

In [None]:
#bag of words features using tf-idf

In [44]:
#build pipeline for easy classifying using tfidf bag of words
clf_pipe = Pipeline([('count_vect', CountVectorizer()), 
                     ('X_tfidf', TfidfTransformer()), 
                     ('classifier', MultinomialNB()),
                    ])

In [58]:
text_clf = clf_pipe.fit(train['text'], train_relevant)

In [61]:
test_predictions = text_clf.predict(test['text'])

In [62]:
print(classification_report(test_relevant, test_predictions))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         3
          2       0.80      1.00      0.89        12

avg / total       0.64      0.80      0.71        15



  'precision', 'predicted', average, warn_for)


In [63]:
scores = cross_val_score(text_clf,
                         train['text'],
                         train_relevant,
                         scoring='accuracy')

In [68]:
print('MNB BOW TF-IDF Model')
print('accuracy scores:', scores)
print('mean:', scores.mean())
print('std:', scores.std())

MNB BOW TF-IDF Model
accuracy scores: [ 0.47619048  0.42105263  0.68421053]
mean: 0.527151211362
std: 0.113315979419


In [29]:
#create using custom twitter features
MNB_classifier = MultinomialNB().fit(train_features, train_relevant)

In [30]:
#predict test data
test_predictions = MNB_classifier.predict(test_features)
#results
print('accuracy', sklearn.metrics.accuracy_score(test_relevant, test_predictions))
print('confusion matrix\n', sklearn.metrics.confusion_matrix(test_relevant, test_predictions))

accuracy 0.4
confusion matrix
 [[1 2]
 [7 5]]


In [31]:
print(classification_report(test_relevant, test_predictions))

             precision    recall  f1-score   support

          0       0.12      0.33      0.18         3
          2       0.71      0.42      0.53        12

avg / total       0.60      0.40      0.46        15



In [65]:
scores = cross_val_score(MNB_classifier,
                         train_features,
                         train_relevant,
                         scoring='accuracy')

In [67]:
print('MNB Custom Model')
print('accuracy scores:', scores)
print('mean:', scores.mean())
print('std:', scores.std())

MNB Custom Model
accuracy scores: [ 0.47619048  0.42105263  0.68421053]
mean: 0.527151211362
std: 0.113315979419
