In [1]:
import json
import csv
import numpy as np
import pandas as pd

In [2]:
def toLabel(label):
    """Map a trending-topic label string to its integer class id.

    Returns 0 for 'ongoing-event', 1 for 'news', 2 for 'meme' and
    3 for 'commemorative'. Any other value yields None.
    """
    known_labels = (
        ('ongoing-event', 0),
        ('news', 1),
        ('meme', 2),
        ('commemorative', 3),
    )
    # Linear scan keeps plain equality semantics for every kind of input.
    for name, class_id in known_labels:
        if label == name:
            return class_id
    return None

In [3]:
# Build the raw (label, tweet) table from the annotated trending topics.
MAX_TOPICS_PER_LABEL = 5  # at most this many topics are consumed per label
dataset = []
count = [0, 0, 0, 0]  # topics consumed so far, indexed by the toLabel() id
# Each CSV row describes one trending topic: field 0 names its feature JSON
# file and field 3 holds its annotated label.
with open('../../data/TT-annotations.csv', newline='', encoding="utf8") as csvfile:
    trendingTopicArr = csv.reader(csvfile, delimiter=';')
    for trendingTopic in trendingTopicArr:
        curLabel = toLabel(trendingTopic[3])
        if curLabel is None:
            # Fail with a readable message instead of the opaque
            # "list indices must be integers" raised by count[None].
            raise ValueError(
                "Unknown label {!r} for trending topic {!r}".format(
                    trendingTopic[3], trendingTopic[0]))
        if count[curLabel] >= MAX_TOPICS_PER_LABEL:
            # Quota for this label is full: skip without parsing the JSON.
            continue
        count[curLabel] += 1
        path = '../../../features/' + trendingTopic[0] + '.json'
        # Context manager closes the handle; utf8 matches the CSV encoding
        # instead of depending on the platform default.
        with open(path, encoding="utf8") as jsonfile:
            data = json.load(jsonfile)
        # `data` is indexed by its own keys, so it is treated as a mapping
        # whose values are per-tweet records carrying a 'tweet' text field.
        for tweetJson in data.values():
            dataset.append([trendingTopic[3], tweetJson['tweet'].replace("RT ", "")])
# Build the frame straight from the records so an empty result still yields
# a well-formed two-column DataFrame.
df = pd.DataFrame(dataset, columns=['label', 'tweet'])
dataset = np.array(dataset)

In [4]:
from io import StringIO
# Normalise the raw frame to the columns ['label', 'tweets', 'label_id'] and
# build the label <-> integer id lookup tables.
# The cell is safe to re-run: the rename only happens while the raw 'tweet'
# column is still present.
col = ['label', 'tweet']
if 'tweet' in df.columns:
    df = df[col].rename(columns={'tweet': 'tweets'})
# .copy() makes the filtered frame independent, so adding a column below
# cannot trigger SettingWithCopyWarning.
df = df.loc[pd.notnull(df['tweets']), ['label', 'tweets']].copy()
# factorize() assigns ids in order of first appearance of each label.
df['label_id'] = df['label'].factorize()[0]
label_id_df = df[['label', 'label_id']].drop_duplicates().sort_values('label_id')
label_to_id = dict(label_id_df.values)
id_to_category = dict(label_id_df[['label_id', 'label']].values)

import matplotlib.pyplot as plt
# Class balance: number of tweets per label, printed and drawn as a bar chart.
tweets_per_label = df.groupby('label').tweets.count()
fig = plt.figure(figsize=(8,6))
print(tweets_per_label)
tweets_per_label.plot.bar(ylim=0)
plt.show()

label
commemorative    2353
meme             1440
news             1990
ongoing-event    3388
Name: tweets, dtype: int64


<Figure size 800x600 with 1 Axes>

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Uni- and bi-gram TF-IDF features over the tweet text, as a dense array.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)
tfidf_sparse = tfidf.fit_transform(df.tweets)
features = tfidf_sparse.toarray()
labels = df.label_id
features.shape

from sklearn.feature_selection import chi2
import numpy as np
# For every label, print the N unigrams with the highest chi-squared score
# against the one-vs-rest indicator of that label.
N = 10
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old accessor on versions that lack it.
_feature_name_getter = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
# The vocabulary does not change between labels, so build the array once.
all_feature_names = np.asarray(_feature_name_getter())
for label, label_id in sorted(label_to_id.items()):
  features_chi2 = chi2(features, labels == label_id)
  # argsort is ascending, so the strongest features end up last.
  indices = np.argsort(features_chi2[0])
  feature_names = all_feature_names[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
#      bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("# '{}':".format(label))
  print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
#      print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# 'commemorative':
  . Most correlated unigrams:
. vanifacts
. jesus
. christ
. bon
. jovi
. jon
. mamonas
. helau
. welsh
. killer
# 'meme':
  . Most correlated unigrams:
. sad
. applications
. update
. building
. rachel
. charliesheen
. valley
. sober
. lodge
. dealwithit
# 'news':
  . Most correlated unigrams:
. director
. corp
. lse
. davies
. howard
. sky
. news
. rip
. russell
. jane
# 'ongoing-event':
  . Most correlated unigrams:
. goal
. block
. vélez
. year
. leap
. sergeant
. morkel
. pepper
. velez
. mileyonsnl


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
#X_train, X_test, y_train, y_test = train_test_split(df['tweets'], df['label'], random_state = 0)
# Design matrix: bag-of-words counts re-weighted with TF-IDF (scipy sparse).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df['tweets'])
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X_train_counts)
# The target is the class label. Using the tweet text here would turn every
# distinct tweet into its own class and make the accuracies meaningless.
y = df['label']
#    clf = MultinomialNB().fit(X_train_tfidf, y_train)
#    print('-------------',clf.predict(count_vect.transform(["News to day: The world will destroy"])))

# from sklearn.svm import SVC
# classifier = SVC(kernel = 'linear', probability = True)
# classifier.fit(X, df['label'])
# import pickle
# with open('SVM_BagofWords.pkl', 'wb') as fout:
#     pickle.dump(classifier, fout)

In [None]:
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import cross_val_score
from sklearn.gaussian_process.kernels import RBF
# 5-fold cross-validated accuracy for each candidate model, shown as a box
# plot with the individual fold scores overlaid.
models = [
#    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    SVC(kernel = 'linear'),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
]
CV = 5
# Estimators that reject scipy sparse input; they get a dense copy of X.
DENSE_ONLY = (GaussianProcessClassifier,)
X_dense = None  # built lazily, only if a dense-only model is present
entries = []
for model in models:
    model_name = model.__class__.__name__
    if isinstance(model, DENSE_ONLY) and hasattr(X, 'toarray'):
        if X_dense is None:
            X_dense = X.toarray()
        X_model = X_dense
    else:
        X_model = X
    accuracies = cross_val_score(model, X_model, y, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
import seaborn as sns
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=3, jitter=True, edgecolor="gray", linewidth=1)
plt.xticks(rotation=90)
plt.show()

print(cv_df.groupby('model_name').accuracy.mean())



# Compare a wider range of classifiers using the same 5-fold cross-validation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# 5-fold cross-validated accuracy for a broad set of classifiers, shown as a
# box plot with the individual fold scores overlaid.
models = [
#    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    SGDClassifier(loss="hinge", penalty="l2"),
#    MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1),
    KNeighborsClassifier(3),
    SVC(kernel="linear"),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]
CV = 5

# Give every model a unique display name. Estimators whose class appears more
# than once get a '#k' suffix in list order (here SVC#1 is the linear-kernel
# SVC and SVC#2 the gamma=2 one); otherwise their fold scores would be merged
# under one name in the plot and in the mean-accuracy table.
class_names = [type(m).__name__ for m in models]
model_names = []
for position, base_name in enumerate(class_names):
    if class_names.count(base_name) > 1:
        ordinal = class_names[:position + 1].count(base_name)
        model_names.append('{}#{}'.format(base_name, ordinal))
    else:
        model_names.append(base_name)

# Estimators that reject scipy sparse input; they get a dense copy of X.
DENSE_ONLY = (GaussianProcessClassifier, GaussianNB, QuadraticDiscriminantAnalysis)
X_dense = None  # built lazily, only when the first dense-only model is reached
entries = []
for model_name, model in zip(model_names, models):
    if isinstance(model, DENSE_ONLY) and hasattr(X, 'toarray'):
        if X_dense is None:
            X_dense = X.toarray()
        X_model = X_dense
    else:
        X_model = X
    accuracies = cross_val_score(model, X_model, y, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
import seaborn as sns
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=3, jitter=True, edgecolor="gray", linewidth=1)
plt.xticks(rotation=90)
plt.show()

print(cv_df.groupby('model_name').accuracy.mean())