In [3]:
from __future__ import print_function
from __future__ import division
import collections
import nltk
import numpy as np
from nltk.tokenize.casual import TweetTokenizer
import sys;
sys.path.insert(0, '../code')
from w266_common import utils, vocabulary
import re
np.random.seed(266)

In [5]:
import csv
tokenizer = TweetTokenizer()
x_data = []
labels = []
sentences = []
     
  
with open('../data/merged_data_v3.csv', 'r') as csvfile:
    linereader = csv.reader(csvfile, delimiter = '|')
    for i, row in enumerate(linereader):
        if i == 0:
            continue
        sentence, _, sarcasm = row
        sentence = re.sub("RT @[^\s]+:", "retweet", sentence)
        #sentences.append(sentence)
        tokenized_words = tokenizer.tokenize(sentence)
        x_tokens = utils.canonicalize_words(tokenized_words, hashtags =True)
        index = np.argwhere(x_tokens=="sarcasm")
        x_tokens = np.delete(x_tokens, index)
        index = np.argwhere(x_tokens=="sarcastic")
        x_tokens = np.delete(x_tokens, index)
        x_data.append(x_tokens)
        labels.append(int(sarcasm))


shuffle_indices = np.random.permutation(np.arange(len(labels)))
train_split_idx = int(0.7 * len(labels))
test_split_idx  = int(0.9 * len(labels))

train_indices = shuffle_indices[:train_split_idx]
validation_indices = shuffle_indices[train_split_idx:test_split_idx]
test_indices = shuffle_indices[test_split_idx:]

x_data = np.array(x_data)
labels = np.array(labels)
train_sentences = x_data[train_indices]
train_labels= labels[train_indices] 
validation_sentences = x_data[validation_indices]
validation_labels = labels[validation_indices]
test_sentences = x_data[test_indices]  
test_labels = labels[test_indices]  



In [6]:
vocab = vocabulary.Vocabulary(utils.flatten(train_sentences),5000)
x_ids = vocab.words_to_ids(train_sentences[0])
print("x_ids =", vocab.word_to_id)
print(x_ids)

[10, 71, 67, 8, 12, 74, 54, 24, 3, 3, 75, 22, 3, 77, 70, 3, 16, 76, 72, 49, 13]


In [7]:
list_of_train_ids = list(map(vocab.words_to_ids, train_sentences))
print(list_of_train_ids[0:20])


list_of_validation_ids = list(map(vocab.words_to_ids, validation_sentences))

list_of_test_ids = list(map(vocab.words_to_ids, test_sentences))


[[10, 71, 67, 8, 12, 74, 54, 24, 3, 3, 75, 22, 3, 77, 70, 3, 16, 76, 72, 49, 13], [10, 3590, 15, 2, 902, 329, 156, 311, 529, 3591, 2760, 5, 3, 3, 3, 80, 588, 13], [10, 3592, 306, 7, 2252, 732, 9, 29, 26, 48, 117, 21, 2, 396, 396, 396, 366, 47, 3, 3, 3, 3, 80, 5], [10, 903, 169, 2253, 194, 19, 2254, 53, 1910, 21, 7, 125, 6, 837, 342, 8, 330, 703, 11, 377, 165, 3593, 11, 3], [2761, 2255, 628, 3, 335, 230, 1317, 12, 784, 11, 5, 3, 3, 3, 66, 8, 733, 12, 107, 2762, 356, 21, 978, 38, 529, 3], [979, 14, 7, 125, 2763, 2764, 3, 3, 3, 3, 3, 3, 5], [20, 557, 91, 7, 238, 785, 36, 170, 8, 32, 19, 3, 3], [10, 3, 3594, 343, 3, 63, 3595, 3, 5, 5], [10, 665, 3, 2256, 38, 629, 3596, 2, 6, 3596, 16, 12, 2765, 3597, 15, 3, 2, 18, 3, 9, 629, 14, 3, 13], [10, 2, 8, 56, 7, 3, 16, 980, 9, 213, 630, 357, 3598, 83, 46, 1318, 289, 1641, 25, 256, 15, 122, 2, 7, 2, 13], [5, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 350, 631, 12, 104, 45, 838, 42, 786, 4, 14, 1191, 3, 3, 3,

In [8]:
collections.Counter(list_of_train_ids[0])

Counter({10: 1,
         71: 1,
         67: 1,
         8: 1,
         12: 1,
         74: 1,
         54: 1,
         24: 1,
         3: 4,
         75: 1,
         22: 1,
         77: 1,
         70: 1,
         16: 1,
         76: 1,
         72: 1,
         49: 1,
         13: 1})

In [9]:
from scipy.sparse import csr_matrix 
def sparsify_data(list_of_ids):
    row_indices = []
    col_indices = []
    values = []
    rows = len(list_of_ids)

    for row, x_ids in enumerate(list_of_ids):
        x_fdict = collections.Counter(x_ids)
        for wordid, count in x_fdict.items():
            row_indices.append(row)       
            col_indices.append(wordid)  # column is word id
            values.append(count)        # value is count
    x_sparse = csr_matrix((values, (row_indices, col_indices)),
                              shape=[rows, vocab.size])
    return x_sparse

In [10]:
x_train_sparse = sparsify_data(list_of_train_ids)
x_validation_sparse = sparsify_data(list_of_validation_ids)
x_test_sparse = sparsify_data(list_of_test_ids)


In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

nb = MultinomialNB(alpha =  1)
nb.fit(x_train_sparse, train_labels)
y_pred = nb.predict(x_test_sparse)





f1 = f1_score(test_labels, y_pred)
acc = accuracy_score(test_labels, y_pred)
recall = recall_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)
print(" acc: {:.02%}, recall: {:.02%}, precision: {:.02%}, f1: {:.02%},".format(acc, recall, precision, f1))


 acc: 86.03%, recall: 87.76%, precision: 84.63%, f1: 86.17%,


In [12]:
linear_weights = nb.feature_log_prob_[1,:] - nb.feature_log_prob_[0,:]  # populate this with actual values

top_negative_features = np.argsort(linear_weights)[0:40]
top_positive_features = np.argsort(-linear_weights)[0:40]


In [13]:
print("Most negative features:")
for idx in top_negative_features:
    print("  {:s} ({:.02f})".format(vocab.id_to_word[idx], 
                                    linear_weights[idx]))
print("")
print("Most positive features:")
for idx in top_positive_features:
    print("  {:s} ({:.02f})".format(vocab.id_to_word[idx], 
                                    linear_weights[idx]))
    


Most negative features:
  üî• (-5.02)
  ‚Ü¥ (-4.66)
  ‚≠ê (-4.46)
  üï∞ (-4.15)
  üéÆ (-4.10)
  üíé (-4.07)
  üñ• (-4.05)
  registration (-4.01)
  cams (-3.98)
  tokkens (-3.98)
  giveaway (-3.94)
  token (-3.92)
  airdrop (-3.87)
  blockchain (-3.81)
  crypto (-3.71)
  rt (-3.68)
  enter (-3.65)
  telegram (-3.65)
  es (-3.65)
  summer (-3.51)
  ico (-3.48)
  latest (-3.46)
  üì¢ (-3.41)
  bitcoin (-3.30)
  camp (-3.26)
  ‚úî (-3.18)
  digital (-3.13)
  easter (-3.13)
  international (-3.13)
  üöÄ (-3.13)
  coin (-3.09)
  recommend (-2.99)
  ‚û° (-2.99)
  DGDGDGDGDG (-2.95)
  üí∞ (-2.93)
  tokens (-2.88)
  technology (-2.88)
  usd (-2.82)
  fee (-2.82)
  DGDG:DGDG (-2.82)

Most positive features:
  straight (5.47)
  nicer (5.46)
  ex (5.46)
  couldnt (4.77)
  walking (4.76)
  agent (4.76)
  believes (4.08)
  sarcasm (3.95)
  happen (3.89)
  surprise (3.87)
  marr (3.72)
  previ (3.69)
  oh (3.57)
  guy (3.53)
  form (3.39)
  lol (3.15)
  ü§£ (3.13)
  wonder (3.00)
  quotes (3

In [15]:
predictions = nb.predict(x_test_sparse)
wrong = predictions != test_labels
logits = nb.predict_log_proba(x_test_sparse)

def incorrect_confidence(wrong, logits, predictions):
    indeces = np.where(wrong)
    wrong_predictions = predictions[indeces]
    wrong_logits = logits[indeces]
    
    return [[wrong_logits[i][value] - wrong_logits[i][1-value], indeces[0][i]] for i, value in enumerate(wrong_predictions)]

sorted(incorrect_confidence(wrong, logits, predictions), key = lambda logit: -logit[0])



#[a] -  nb.predict_log_proba(x_test_sparse)[wrong][1-a]

[[22.70006569341828, 428],
 [19.564028840343752, 122],
 [18.048814645709825, 21],
 [17.510263915035495, 499],
 [16.900739704183337, 526],
 [15.70761812813572, 342],
 [15.584446969080659, 500],
 [13.511992504467003, 847],
 [12.956582996787688, 775],
 [12.724207000969841, 409],
 [12.597961983997862, 270],
 [12.597961983997862, 290],
 [10.731889664929355, 621],
 [10.543382212552615, 208],
 [10.222899746232514, 540],
 [9.89247035344846, 410],
 [9.693345075147874, 740],
 [9.130467889193596, 84],
 [9.121308255760539, 406],
 [9.01526761908002, 115],
 [8.82174158082097, 493],
 [8.649694846695468, 858],
 [8.52122600786521, 314],
 [8.025289090692695, 585],
 [7.844676863377003, 271],
 [7.64884239726112, 395],
 [7.204270810442068, 53],
 [7.168279033758523, 466],
 [7.008576351425376, 19],
 [6.715659779246522, 450],
 [6.697963007788815, 281],
 [6.227488044758104, 640],
 [5.893576233481241, 855],
 [5.867050593290806, 473],
 [5.24556521507472, 678],
 [5.211127145052444, 176],
 [5.156631631854452, 413]

In [16]:
index = 428
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['hangover' '!' 'üç∫' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'üíã' 'LINK']
1
0


In [17]:
index = 122
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['this' 'one' 'enter' 'gan' '!' '!' '!' '-' '-' '-' 'follow' 'ACCOUNT' '-'
 '-' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'LINK']
1
0


In [18]:
index =21
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['LINK' '‚Ä¶' '‚Ä¶' '‚Ä¶' '‚Ä¶' '‚Ä¶' '‚Ä¶' '‚Ä¶' '‚Ä¶' '‚Ä¶' '‚Ä¶' '‚Ä¶' '‚Ä¶' '‚Ä¶' '‚Ä¶' '‚Ä¶' '‚Ä¶'
 '‚Ä¶' '‚Ä¶' 'once' 'upon' 'a' 'time' '(' 'spin' ')' 'featuring' 'ACCOUNT'
 'of' 'd12' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' '#' 'hiphop' 'HASHTAG']
1
0


In [19]:
index = 270
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['meme' 'war' '!' 'a' 'massive' 'collection' '.' 'LINK' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG']
1
0


In [20]:
index = 208
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['wide' 'awake' 'for' 'once' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG' 'HASHTAG'
 'HASHTAG' 'HASHTAG' ':/' '/' 't.co/fuubiehpxu']
1
0


In [None]:
# No hashtag model has a big problem when a message is made up of many hashtags