In [1]:
# Emotion class labels used in this notebook:
# 1 - Anger
# 2 - Fear
# 3 - Sadness
# 4 - Joy

import numpy as np
import pandas as p
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_score, classification_report, recall_score, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import warnings
# Silence every warning for the rest of the session; note that this also hides
# library deprecation notices.
warnings.filterwarnings('ignore')

In [2]:
# Read the four per-emotion training files (tab-separated), one DataFrame per emotion.
trainanger, trainfear, trainsad, trainjoy = [
    p.read_csv(path, delimiter='\t')
    for path in (
        "EI-reg-En-anger-train.txt",
        "EI-reg-En-fear-train.txt",
        "EI-reg-En-sadness-train.txt",
        "EI-reg-En-joy-train.txt",
    )
]

In [3]:
# Stack the per-emotion frames row-wise in the order anger, fear, sadness, joy.
# Each frame keeps its own row index, so index values repeat across emotions.
train = p.concat(
    objs=[trainanger, trainfear, trainsad, trainjoy],
    axis=0,
)

In [4]:
# Preview the combined training frame (pandas elides the middle rows when rendering).
train

Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Score
0,2017-En-10264,@xandraaa5 @amayaallyn6 shut up hashtags are c...,anger,0.562
1,2017-En-10072,it makes me so fucking irate jesus. nobody is ...,anger,0.750
2,2017-En-11383,Lol Adam the Bull with his fake outrage...,anger,0.417
3,2017-En-11102,@THATSSHAWTYLO passed away early this morning ...,anger,0.354
4,2017-En-11506,@Kristiann1125 lol wow i was gonna say really?...,anger,0.438
...,...,...,...,...
1611,2017-En-30316,Watch this amazing live.ly broadcast by @kana_...,joy,0.558
1612,2017-En-31092,Watching @melissamccarthy in #Spy she's one of...,joy,0.780
1613,2017-En-31037,Could not be happier!!,joy,0.885
1614,2017-En-31270,@strictlysimilak something about English spark...,joy,0.360


In [5]:
# Integer class codes for the four emotions (1=anger, 2=fear, 3=sadness, 4=joy).
EMOTION_TO_LABEL = dict(anger=1, fear=2, sadness=3, joy=4)

# Encode the label column in place, but only while it still holds emotion names.
# Mapping an already-encoded column would turn every label into NaN, so re-running
# this cell must leave an encoded column untouched.
if not p.api.types.is_numeric_dtype(train['Affect Dimension']):
    encoded = train['Affect Dimension'].map(EMOTION_TO_LABEL)
    if encoded.isna().any():
        # Fail here with a clear message instead of passing NaN labels to the models.
        raise ValueError("unexpected value in 'Affect Dimension'; expected anger, fear, sadness or joy")
    train['Affect Dimension'] = encoded

# Plain NumPy arrays: integer labels and raw tweet text, row-aligned.
y = train['Affect Dimension'].values
X = train['Tweet'].values

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the tweets for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Show the size of each piece of the split on one line.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(4971,) (4971,) (2131,) (2131,)


In [7]:
# Class balance of the held-out split: distinct labels on the first output line,
# the number of test tweets per label on the second.
unique, counts = np.unique(y_test, return_counts=True)
print(unique, counts, sep='\n')

[1 2 3 4]
[502 685 469 475]


#### Multinomial Naive Bayes

In [34]:
# Unigram term counts. min_df=1 keeps every term seen in training (no document-frequency
# cut-off), English stop words are dropped, and a token is any run of three or more
# lowercase letters.
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=1,
                                           stop_words='english', token_pattern='[a-z][a-z][a-z]+')
# Learn the vocabulary from the training split only, then reuse it for the test split.
X_train_vec = unigram_count_vectorizer.fit_transform(X_train)
X_test_vec = unigram_count_vectorizer.transform(X_test)

nb_clf = MultinomialNB()
nb_clf.fit(X_train_vec, y_train)
y_pred = nb_clf.predict(X_test_vec)

# Rows are true labels, columns are predicted labels, both in the order 1, 2, 3, 4.
cm = confusion_matrix(y_test, y_pred, labels=[1, 2, 3, 4])
print(cm)

# Pin the report to the same label order as the confusion matrix so each display name
# is always attached to the intended class.
target_names = ['1', '2', '3', '4']
print(classification_report(y_test, y_pred, labels=[1, 2, 3, 4],
                            target_names=target_names, digits=3))

[[432  27  33  10]
 [ 30 604  39  12]
 [ 44  69 344  12]
 [ 22  26  14 413]]
              precision    recall  f1-score   support

           1      0.818     0.861     0.839       502
           2      0.832     0.882     0.856       685
           3      0.800     0.733     0.765       469
           4      0.924     0.869     0.896       475

    accuracy                          0.841      2131
   macro avg      0.844     0.836     0.839      2131
weighted avg      0.842     0.841     0.841      2131



In [9]:
# Unigram + bigram term counts. min_df=1 keeps every n-gram seen in training
# (no document-frequency cut-off is applied).
bigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=1, ngram_range = (1,2),
                                           stop_words='english', token_pattern='[a-z][a-z][a-z]+')
# Learn the vocabulary from the training split only, then reuse it for the test split.
X_train_vec = bigram_count_vectorizer.fit_transform(X_train)
X_test_vec = bigram_count_vectorizer.transform(X_test)

nb_clf = MultinomialNB()
nb_clf.fit(X_train_vec, y_train)
y_pred = nb_clf.predict(X_test_vec)

# Rows are true labels, columns are predicted labels, both in the order 1, 2, 3, 4.
cm = confusion_matrix(y_test, y_pred, labels=[1, 2, 3, 4])
print(cm)

# Pin the report to the same label order as the confusion matrix so each display name
# is always attached to the intended class.
target_names = ['1', '2', '3', '4']
print(classification_report(y_test, y_pred, labels=[1, 2, 3, 4],
                            target_names=target_names, digits=3))

[[428  34  28  12]
 [ 24 619  35   7]
 [ 48  81 330  10]
 [ 21  30  15 409]]
              precision    recall  f1-score   support

           1      0.821     0.853     0.837       502
           2      0.810     0.904     0.854       685
           3      0.809     0.704     0.753       469
           4      0.934     0.861     0.896       475

    accuracy                          0.838      2131
   macro avg      0.844     0.830     0.835      2131
weighted avg      0.840     0.838     0.837      2131



In [10]:
# Unigram through trigram term counts. min_df=1 keeps every n-gram seen in training
# (no document-frequency cut-off is applied).
trigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=1, ngram_range = (1,3),
                                           stop_words='english', token_pattern='[a-z][a-z][a-z]+')
# Learn the vocabulary from the training split only, then reuse it for the test split.
X_train_vec = trigram_count_vectorizer.fit_transform(X_train)
X_test_vec = trigram_count_vectorizer.transform(X_test)

nb_clf = MultinomialNB()
nb_clf.fit(X_train_vec, y_train)
y_pred = nb_clf.predict(X_test_vec)

# Rows are true labels, columns are predicted labels, both in the order 1, 2, 3, 4.
cm = confusion_matrix(y_test, y_pred, labels=[1, 2, 3, 4])
print(cm)

# Pin the report to the same label order as the confusion matrix so each display name
# is always attached to the intended class.
target_names = ['1', '2', '3', '4']
print(classification_report(y_test, y_pred, labels=[1, 2, 3, 4],
                            target_names=target_names, digits=3))

[[425  38  28  11]
 [ 23 620  35   7]
 [ 48  83 327  11]
 [ 22  30  15 408]]
              precision    recall  f1-score   support

           1      0.820     0.847     0.833       502
           2      0.804     0.905     0.852       685
           3      0.807     0.697     0.748       469
           4      0.934     0.859     0.895       475

    accuracy                          0.835      2131
   macro avg      0.841     0.827     0.832      2131
weighted avg      0.838     0.835     0.834      2131



#### Linear SVC

In [11]:
# Unigram term counts. min_df=1 keeps every term seen in training
# (no document-frequency cut-off is applied).
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=1,
                                            stop_words='english',token_pattern='[a-z][a-z][a-z]+')
# Learn the vocabulary from the training split only, then reuse it for the test split.
X_train_vec = unigram_count_vectorizer.fit_transform(X_train)
X_test_vec = unigram_count_vectorizer.transform(X_test)

svm_clf = LinearSVC(C=0.1)
svm_clf.fit(X_train_vec, y_train)
y_pred = svm_clf.predict(X_test_vec)

# Rows are true labels, columns are predicted labels, both in the order 1, 2, 3, 4.
cm = confusion_matrix(y_test, y_pred, labels=[1, 2, 3, 4])
print(cm)

# Pin the report to the same label order as the confusion matrix so each display name
# is always attached to the intended class.
target_names = ['1', '2', '3', '4']
print(classification_report(y_test, y_pred, labels=[1, 2, 3, 4],
                            target_names=target_names, digits=3))

[[443  30  25   4]
 [ 16 627  38   4]
 [ 34  46 380   9]
 [  6  27   8 434]]
              precision    recall  f1-score   support

           1      0.888     0.882     0.885       502
           2      0.859     0.915     0.886       685
           3      0.843     0.810     0.826       469
           4      0.962     0.914     0.937       475

    accuracy                          0.884      2131
   macro avg      0.888     0.880     0.884      2131
weighted avg      0.885     0.884     0.884      2131



In [38]:
# Unigram + bigram term counts. min_df=1 keeps every n-gram seen in training
# (no document-frequency cut-off is applied).
bigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=1, ngram_range = (1,2),
                                           stop_words='english', token_pattern='[a-z][a-z][a-z]+')
# Learn the vocabulary from the training split only, then reuse it for the test split.
X_train_vec = bigram_count_vectorizer.fit_transform(X_train)
X_test_vec = bigram_count_vectorizer.transform(X_test)

svm_clf = LinearSVC(C=0.1)
svm_clf.fit(X_train_vec, y_train)
y_pred = svm_clf.predict(X_test_vec)

# Rows are true labels, columns are predicted labels, both in the order 1, 2, 3, 4.
cm = confusion_matrix(y_test, y_pred, labels=[1, 2, 3, 4])
print(cm)

# Pin the report to the same label order as the confusion matrix so each display name
# is always attached to the intended class.
target_names = ['1', '2', '3', '4']
print(classification_report(y_test, y_pred, labels=[1, 2, 3, 4],
                            target_names=target_names, digits=3))

[[451  20  26   5]
 [ 12 636  33   4]
 [ 35  46 380   8]
 [  6  19  10 440]]
              precision    recall  f1-score   support

           1      0.895     0.898     0.897       502
           2      0.882     0.928     0.905       685
           3      0.846     0.810     0.828       469
           4      0.963     0.926     0.944       475

    accuracy                          0.895      2131
   macro avg      0.897     0.891     0.893      2131
weighted avg      0.895     0.895     0.895      2131



In [13]:
# Unigram through trigram term counts. min_df=1 keeps every n-gram seen in training
# (no document-frequency cut-off is applied).
trigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=1, ngram_range = (1,3),
                                           stop_words='english', token_pattern='[a-z][a-z][a-z]+')
# Learn the vocabulary from the training split only, then reuse it for the test split.
X_train_vec = trigram_count_vectorizer.fit_transform(X_train)
X_test_vec = trigram_count_vectorizer.transform(X_test)

# NOTE(review): this cell uses C=0.2 while the unigram and bigram SVC cells use C=0.1 —
# confirm the difference is intentional.
svm_clf = LinearSVC(C=0.2)
svm_clf.fit(X_train_vec, y_train)
y_pred = svm_clf.predict(X_test_vec)

# Rows are true labels, columns are predicted labels, both in the order 1, 2, 3, 4.
cm = confusion_matrix(y_test, y_pred, labels=[1, 2, 3, 4])
print(cm)

# Pin the report to the same label order as the confusion matrix so each display name
# is always attached to the intended class.
target_names = ['1', '2', '3', '4']
print(classification_report(y_test, y_pred, labels=[1, 2, 3, 4],
                            target_names=target_names, digits=3))

[[451  21  25   5]
 [ 11 634  34   6]
 [ 34  50 376   9]
 [  6  19   9 441]]
              precision    recall  f1-score   support

           1      0.898     0.898     0.898       502
           2      0.876     0.926     0.900       685
           3      0.847     0.802     0.824       469
           4      0.957     0.928     0.942       475

    accuracy                          0.893      2131
   macro avg      0.894     0.889     0.891      2131
weighted avg      0.893     0.893     0.892      2131



*The most anger-indicative and joy-indicative words for MNB*

In [35]:
# Rank unigrams by how strongly MultinomialNB associates them with joy versus anger.
#
# The vectorizer and classifier are fitted here under their own names, so the feature
# names and the log-probability columns are guaranteed to come from the same vocabulary.
# The shared names nb_clf / unigram_count_vectorizer are reassigned by several model
# cells, and pairing whichever versions happen to be in memory silently misaligns
# words and scores.
mnb_word_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=1,
                                      stop_words='english', token_pattern='[a-z][a-z][a-z]+')
mnb_word_clf = MultinomialNB()
mnb_word_clf.fit(mnb_word_vectorizer.fit_transform(X_train), y_train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.
if hasattr(mnb_word_vectorizer, 'get_feature_names_out'):
    features = list(mnb_word_vectorizer.get_feature_names_out())
else:
    features = mnb_word_vectorizer.get_feature_names()

# Look the class rows up by label (1 = anger, 4 = joy) instead of assuming their position.
mnb_class_labels = list(mnb_word_clf.classes_)
vneg_cond_prob = mnb_word_clf.feature_log_prob_[mnb_class_labels.index(1)]  # log P(word | anger)
vpos_cond_prob = mnb_word_clf.feature_log_prob_[mnb_class_labels.index(4)]  # log P(word | joy)

# log P(word | joy) - log P(word | anger): strongly negative leans anger, strongly
# positive leans joy. tolist() yields plain Python floats for a clean printout.
log_ratios = (vpos_cond_prob - vneg_cond_prob).tolist()

# Ascending sort: anger-leaning words come first, joy-leaning words last.
exercise_C_ranks = sorted(zip(log_ratios, features))
print('Top 10 anger words:\n',exercise_C_ranks[:10])
print('\nTop 10 joyous words:\n',exercise_C_ranks[-10:])

Top 10 anger words:
 [(-4.121765710437097, 'anger'), (-4.039073994591983, 'rage'), (-3.91045661676989, 'bitter'), (-3.8498319949534547, 'fuming'), (-3.8498319949534547, 'revenge'), (-3.7628206179638255, 'offended'), (-3.6162171437719497, 'outrage'), (-3.4126181885307103, 'madden'), (-3.4126181885307103, 'offend'), (-3.4126181885307103, 'sting')]

Top 10 joyous words:
 [(3.35357352612964, 'breezy'), (3.3886648459409106, 'joyful'), (3.3886648459409106, 'rejoice'), (3.422566397616592, 'pleasing'), (3.4553562204395822, 'cheer'), (3.487104918754163, 'smiling'), (3.734941082658744, 'musically'), (3.759038634237804, 'glee'), (3.759038634237804, 'lively'), (4.0103530625187105, 'optimism')]


In [37]:
# Show the n-grams that the linear SVM weights most heavily for anger and for joy.
#
# The bigram vectorizer and the classifier are fitted here under their own names, so the
# coefficient columns and the feature names always come from the same vocabulary. The
# shared names svm_clf / bigram_count_vectorizer are reassigned by several model cells;
# zipping mismatched versions would silently pair weights with the wrong n-grams.
svm_word_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=1, ngram_range = (1,2),
                                      stop_words='english', token_pattern='[a-z][a-z][a-z]+')
svm_word_clf = LinearSVC(C=0.1)  # same settings as the bigram LinearSVC cell
svm_word_clf.fit(svm_word_vectorizer.fit_transform(X_train), y_train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.
if hasattr(svm_word_vectorizer, 'get_feature_names_out'):
    svm_feature_names = list(svm_word_vectorizer.get_feature_names_out())
else:
    svm_feature_names = svm_word_vectorizer.get_feature_names()

# coef_ holds one weight row per class; find the rows by label (1 = anger, 4 = joy).
svm_class_labels = list(svm_word_clf.classes_)
anger_weights = svm_word_clf.coef_[svm_class_labels.index(1)].tolist()
joy_weights = svm_word_clf.coef_[svm_class_labels.index(4)].tolist()

# Ascending sort, so the ten largest weights are the last ten entries.
feature_ranks = sorted(zip(anger_weights, svm_feature_names))

very_negative_10 = feature_ranks[-10:]
print("Top 10 anger words")
print(very_negative_10)
print()

feature_ranks = sorted(zip(joy_weights, svm_feature_names))

very_positive_10 = feature_ranks[-10:]
print("Top 10 joyous words")
print(very_positive_10)

Top 10 anger words
[(0.8968690446349229, 'angry'), (0.9062321576616679, 'offend'), (0.9243026107808726, 'offended'), (0.9268462738973147, 'furious'), (0.9601926917265275, 'snap'), (0.9702555501086053, 'madden'), (0.9715170605001081, 'rage'), (1.0472395348944064, 'bitter'), (1.0545588952246616, 'revenge'), (1.065822254594867, 'fuming')]

Top 10 joyous words
[(0.8695607449629719, 'smiling'), (0.8814337434027376, 'breezy'), (0.88379940342064, 'elated'), (0.8885026481329541, 'lively'), (0.9019140733061577, 'rejoicing'), (0.9181140131095553, 'cheer'), (0.9207691010252558, 'cheery'), (0.9661417167794855, 'glee'), (1.0251091529761611, 'optimism'), (1.0301524191390283, 'hilarious')]


*Error Analysis*

In [41]:
# List the test tweets whose true label is joy (4) but whose prediction is anger (1).
# y_pred is whatever the most recently executed model cell assigned.
joy_as_anger = [
    tweet
    for tweet, true_label, predicted in zip(X_test, y_test, y_pred)
    if true_label == 4 and predicted == 1
]
for tweet in joy_as_anger:
    print(tweet)

err_cnt = len(joy_as_anger)
print()
print("errors:", err_cnt)

#ukedchat A4 Just go outside (or to the gym hall) and play! \n #education  #learning
@walterdonovanSS @mrb_rides_again evil, rich white men and their fucking cronies in intelligence/gov/academia using blithe blacks as fodder
Just watched Django Unchained, Other people may frown, but I titter in delight! 2/5
second day on the job and i already got a 45 dollar tip from a dude whose was constantly twitching his eye LOLOLOL 
@Casper10666 I assure you there is no laughter, but increasing anger at the costs, and arrogance of Westminster.
Never make a #decision when you're #angry and never make a #promise when you're #happy. #wisewords

errors: 6


In [17]:
# 6-fold cross-validation of a unigram-count + MultinomialNB pipeline over all tweets.
# The vectorizer sits inside the pipeline, so it is re-fitted on each fold's training part.
nbTF_clf_pipe = Pipeline(steps=[
    ('vect', CountVectorizer(encoding='latin-1', binary=False, min_df=1, stop_words='english',
                             token_pattern='[a-z][a-z][a-z]+')),
    ('nbTF', MultinomialNB()),
])
scores = cross_val_score(nbTF_clf_pipe, X, y, cv=6)

# Mean of the per-fold scores.
avg = sum(scores) / len(scores)
print('MultinomialNB with TF vectors Score:', avg)

MultinomialNB with TF vectors Score: 0.8514472926457929


In [18]:
# 6-fold cross-validation of a unigram+bigram count + LinearSVC(C=0.2) pipeline over all tweets.
# The vectorizer sits inside the pipeline, so it is re-fitted on each fold's training part.
svcTF_clf_pipe = Pipeline(steps=[
    ('vect', CountVectorizer(encoding='latin-1', binary=False, min_df=1, stop_words='english',
                             ngram_range = (1,2), token_pattern='[a-z][a-z][a-z]+')),
    ('svcTF', LinearSVC(C=0.2)),
])
scores = cross_val_score(svcTF_clf_pipe, X, y, cv=6)

# Mean of the per-fold scores.
avg = sum(scores) / len(scores)
print('SVM with TF vectors Score:', avg)

SVM with TF vectors Score: 0.8997421948039239


In [19]:
# Sweep LinearSVC's regularisation strength C on unigram+bigram counts and record the
# accuracy obtained for each value.
# NOTE(review): accuracy is measured on the test split, so choosing C from this table
# tunes on the same data used for the reported scores — consider cross-validating on
# the training split instead.
bigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=1, ngram_range = (1,2),
                                           stop_words='english', token_pattern='[a-z][a-z][a-z]+')
X_train_vec = bigram_count_vectorizer.fit_transform(X_train)
X_test_vec = bigram_count_vectorizer.transform(X_test)

# Maps each C value to its test-split accuracy.
dic = {}

for c_value in (0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45):
    # fit() returns the estimator itself, so the fitted model is what gets bound here.
    svm_clf = LinearSVC(C=c_value).fit(X_train_vec, y_train)
    y_pred = svm_clf.predict(X_test_vec)
    dic[c_value] = accuracy_score(y_test, y_pred)

dic

{0.05: 0.8897231346785547,
 0.1: 0.8948850305021117,
 0.15: 0.8948850305021117,
 0.2: 0.8939465039887377,
 0.25: 0.8939465039887377,
 0.3: 0.8934772407320507,
 0.35: 0.8934772407320507,
 0.4: 0.8944157672454247,
 0.45: 0.8939465039887377}

In [32]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Term counts for topic modelling, fitted on every tweet: unigrams and bigrams that occur
# in at least 2 tweets and in at most 95% of them, capped at the 1000 most frequent terms.
count_vectorizer = CountVectorizer(binary=False, stop_words='english', min_df=2, 
                                   token_pattern='[a-z][a-z][a-z]+', max_features = 1000, max_df = 0.95, ngram_range = (1,2))
vecs = count_vectorizer.fit_transform(X)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(count_vectorizer.get_feature_names_out())
else:
    tf_feature_names = count_vectorizer.get_feature_names()

no_topics = 10

# Online variational LDA; the fixed random_state keeps the topics repeatable.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,
                                random_state=0)
# lda_z: one row per tweet, one column per topic.
lda_z = lda.fit_transform(vecs)

print("Log Likelihood: ", lda.score(vecs))
print("Perplexity: ", lda.perplexity(vecs))
print()

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted topic model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, iterated one row per topic, where each
        row is a NumPy array holding one weight per term.
    feature_names : sequence of str
        Term for each column position of ``components_``.
    no_top_words : int
        How many terms to list per topic.

    For each topic this prints a ``Topic <index>:`` header followed by one line with
    the terms in descending weight order, each wrapped in its own parentheses.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort is ascending; walk it backwards to take the heaviest terms first.
        top_term_ids = topic.argsort()[:-no_top_words - 1:-1]
        # Wrap each term individually so the parentheses are always balanced.
        print(" ".join("(%s)" % feature_names[i] for i in top_term_ids))

# List the 20 heaviest terms of each LDA topic.
display_topics(model=lda, feature_names=tf_feature_names, no_top_words=20)

Log Likelihood:  -224792.77766766364
Perplexity:  993.465857054348

Topic 0:
depression) (good) (anxiety) (life) (really) (sober) (having) (nightmare) (tonight) (morning) (feeling) (funny) (going) (panic) (pleasing) (years) (animated) (guys) (nervous) (future
Topic 1:
smile) (optimism) (start) (feel) (way) (like) (horror) (hilarious) (good) (face) (rejoice) (wanna) (delight) (heart) (hilarity) (little) (bright) (frown) (fuck) (beautiful
Topic 2:
want) (lost) (just) (sadness) (week) (awful) (afraid) (thing) (dreadful) (away) (pout) (gbbo) (dark) (terrorism) (don) (pakistan) (time) (hope) (friend) (shake
Topic 3:
amp) (know) (don) (bad) (man) (fucking) (old) (better) (like) (makes) (sparkling) (don know) (guy) (joy) (white) (place) (cause) (playful) (life) (took
Topic 4:
glee) (smiling) (hate) (cheer) (doesn) (just) (need) (unhappy) (bully) (news) (breezy) (stop) (exhilarating) (fuming) (hearty) (offended) (mean) (talk) (nice) (shy
Topic 5:
people) (just) (love) (make) (think) (don) (sad

In [28]:
# Dimensions of the LDA count matrix: (number of tweets, number of vocabulary terms).
vecs.shape

(7102, 1000)

In [33]:
# Order all tweets by their weight on LDA topic 2, heaviest first, and print the
# 20 strongest (each truncated to its first 300 characters).
top = lda_z[:, 2].argsort()[::-1]
for rank, doc_idx in enumerate(top[:20], start=1):
    print('top #%d: ' % rank)
    print(X[doc_idx][:300], '…')
    print('\n')

top #1: 
Someone needs to tell Candice she'll be stuck with that shitty pout if the wind changes. #GBBO …


top #2: 
Someone needs to tell Candice she'll be stuck with that shitty pout if the wind changes. #GBBO …


top #3: 
Some moving clips on youtube tonight of the vigil held at Tulsa Metropolitan Baptist church for #TerenceCruther #justice  #sadness …


top #4: 
Pakistan is the biggest victim of terrorism - Nawaz Sharif \nReally? It should have been biggest creator of terrorism. #UNGA …


top #5: 
@mandyjohnson I'll be honest.. I hope that annoying Southern bint with the 'look at me' pout goes out this week! Selasi #FTW …


top #6: 
@mandyjohnson I'll be honest.. I hope that annoying Southern bint with the 'look at me' pout goes out this week! Selasi #FTW …


top #7: 
penny dreadful just cleaved off a fraction of my heart …


top #8: 
penny dreadful just cleaved off a fraction of my heart …


top #9: 
Saga: When all of your devices and teles fail just in time for bake off #panic #g