In [2]:
from __future__ import print_function
from __future__ import division
import collections
import nltk
import numpy as np
from nltk.tokenize.casual import TweetTokenizer
import sys;
sys.path.insert(0, '../code')
from w266_common import utils, vocabulary
import re
np.random.seed(266)

In [3]:
import csv
tokenizer = TweetTokenizer()
x_data = []
labels = []
sentences = []
     
  
with open('../data/merged_data_v3.csv', 'r') as csvfile:
    linereader = csv.reader(csvfile, delimiter = '|')
    for i, row in enumerate(linereader):
        if i == 0:
            continue
        sentence, _, sarcasm = row
        sentence = re.sub("RT @[^\s]+:", "retweet", sentence)
        #sentences.append(sentence)
        tokenized_words = tokenizer.tokenize(sentence)
        x_tokens = utils.canonicalize_words(tokenized_words, hashtags =False)
        index = np.argwhere(x_tokens=="sarcasm")
        x_tokens = np.delete(x_tokens, index)
        index = np.argwhere(x_tokens=="sarcastic")
        x_tokens = np.delete(x_tokens, index)
        x_data.append(x_tokens)
        labels.append(int(sarcasm))


shuffle_indices = np.random.permutation(np.arange(len(labels)))
train_split_idx = int(0.7 * len(labels))
test_split_idx  = int(0.9 * len(labels))

train_indices = shuffle_indices[:train_split_idx]
validation_indices = shuffle_indices[train_split_idx:test_split_idx]
test_indices = shuffle_indices[test_split_idx:]

x_data = np.array(x_data)
labels = np.array(labels)
train_sentences = x_data[train_indices]
train_labels= labels[train_indices] 
validation_sentences = x_data[validation_indices]
validation_labels = labels[validation_indices]
test_sentences = x_data[test_indices]  
test_labels = labels[test_indices]  



In [4]:
vocab = vocabulary.Vocabulary(utils.flatten(train_sentences),5000)
x_ids = vocab.words_to_ids(train_sentences[0])
print("x_ids =", vocab.word_to_id)
print(x_ids)

[9, 71, 67, 7, 11, 74, 53, 23, 56, 78, 75, 21, 76, 79, 70, 80, 15, 77, 72, 48, 12]


In [5]:
list_of_train_ids = list(map(vocab.words_to_ids, train_sentences))
print(list_of_train_ids[0:20])


list_of_validation_ids = list(map(vocab.words_to_ids, validation_sentences))

list_of_test_ids = list(map(vocab.words_to_ids, test_sentences))


[[9, 71, 67, 7, 11, 74, 53, 23, 56, 78, 75, 21, 76, 79, 70, 80, 15, 77, 72, 48, 12], [9, 4585, 14, 2, 1066, 370, 167, 349, 612, 4586, 3477, 4, 2, 4587, 4588, 83, 684, 12], [9, 4589, 343, 6, 2803, 858, 8, 28, 25, 47, 123, 20, 2, 455, 455, 455, 417, 46, 2, 2, 2, 2, 83, 4], [9, 1067, 181, 2804, 212, 18, 2805, 52, 2349, 20, 6, 132, 5, 986, 384, 7, 371, 820, 10, 429, 177, 4590, 10, 4591], [3478, 2806, 729, 344, 376, 256, 1584, 11, 921, 10, 4, 2, 2, 2, 66, 7, 859, 11, 112, 3479, 404, 20, 1161, 37, 612, 2], [1162, 13, 6, 132, 3480, 3481, 248, 1068, 470, 1996, 4592, 2, 4], [19, 647, 94, 6, 264, 922, 35, 182, 7, 31, 18, 588, 2], [9, 4593, 4594, 385, 4595, 63, 4596, 248, 4, 4], [9, 771, 4597, 2807, 37, 730, 4598, 2, 5, 4598, 15, 11, 3482, 4599, 14, 100, 2, 17, 2, 8, 730, 13, 2, 12], [9, 2, 7, 55, 6, 4600, 15, 1163, 8, 235, 731, 405, 4601, 86, 45, 1585, 321, 1997, 24, 283, 14, 129, 2, 6, 2, 12], [4, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 396, 732, 11, 

In [6]:
collections.Counter(list_of_train_ids[0])

Counter({9: 1,
         71: 1,
         67: 1,
         7: 1,
         11: 1,
         74: 1,
         53: 1,
         23: 1,
         56: 1,
         78: 1,
         75: 1,
         21: 1,
         76: 1,
         79: 1,
         70: 1,
         80: 1,
         15: 1,
         77: 1,
         72: 1,
         48: 1,
         12: 1})

In [7]:
from scipy.sparse import csr_matrix 
def sparsify_data(list_of_ids):
    row_indices = []
    col_indices = []
    values = []
    rows = len(list_of_ids)

    for row, x_ids in enumerate(list_of_ids):
        x_fdict = collections.Counter(x_ids)
        for wordid, count in x_fdict.items():
            row_indices.append(row)       
            col_indices.append(wordid)  # column is word id
            values.append(count)        # value is count
    x_sparse = csr_matrix((values, (row_indices, col_indices)),
                              shape=[rows, vocab.size])
    return x_sparse

In [8]:
x_train_sparse = sparsify_data(list_of_train_ids)
x_validation_sparse = sparsify_data(list_of_validation_ids)
x_test_sparse = sparsify_data(list_of_test_ids)


In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

nb = MultinomialNB(alpha =  0.1)
nb.fit(x_train_sparse, train_labels)
y_pred = nb.predict(x_test_sparse)





f1 = f1_score(test_labels, y_pred)
acc = accuracy_score(test_labels, y_pred)
recall = recall_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)
print(" acc: {:.02%}, recall: {:.02%}, precision: {:.02%}, f1: {:.02%},".format(acc, recall, precision, f1))


 acc: 91.29%, recall: 93.07%, precision: 89.76%, f1: 91.38%,


In [10]:
linear_weights = nb.feature_log_prob_[1,:] - nb.feature_log_prob_[0,:]  # populate this with actual values

top_negative_features = np.argsort(linear_weights)[0:100]
top_positive_features = np.argsort(-linear_weights)[0:100]


In [11]:
print("Most negative features:")
for idx in top_negative_features:
    print("  {:s} ({:.02f})".format(vocab.id_to_word[idx], 
                                    linear_weights[idx]))
print("")
print("Most positive features:")
for idx in top_positive_features:
    print("  {:s} ({:.02f})".format(vocab.id_to_word[idx], 
                                    linear_weights[idx]))
    


Most negative features:
  🔥 (-7.32)
  #win (-7.14)
  ↴ (-6.96)
  #job (-6.84)
  #giveaway (-6.82)
  ⭐ (-6.76)
  #hiring (-6.53)
  #ico (-6.45)
  🕰 (-6.45)
  #photography (-6.39)
  🎮 (-6.39)
  💎 (-6.36)
  🖥 (-6.34)
  registration (-6.30)
  cams (-6.26)
  tokkens (-6.26)
  #fortnite (-6.26)
  #education (-6.24)
  giveaway (-6.22)
  token (-6.20)
  airdrop (-6.16)
  #competition (-6.14)
  blockchain (-6.09)
  crypto (-5.99)
  es (-5.94)
  telegram (-5.94)
  #careerarc (-5.94)
  #architecture (-5.88)
  #elementary (-5.88)
  #decoration (-5.79)
  #nowplaying (-5.79)
  summer (-5.79)
  #robotics (-5.79)
  #live (-5.75)
  ico (-5.75)
  #programming (-5.75)
  #interiordesign (-5.72)
  📢 (-5.69)
  #tron (-5.69)
  #ai (-5.65)
  #rock (-5.65)
  #startup (-5.61)
  bitcoin (-5.57)
  #cybersecurity (-5.57)
  #iot (-5.57)
  camp (-5.53)
  #pop (-5.53)
  #tokensale (-5.53)
  ✔ (-5.44)
  #airdrop (-5.44)
  #fintech (-5.44)
  #eth (-5.44)
  digital (-5.40)
  easter (-5.40)
  #modeling (-5.40)
  internat

In [12]:
predictions = nb.predict(x_test_sparse)
wrong = predictions != test_labels
logits = nb.predict_log_proba(x_test_sparse)

def incorrect_confidence(wrong, logits, predictions):
    indeces = np.where(wrong)
    wrong_predictions = predictions[indeces]
    wrong_logits = logits[indeces]
    
    return [[wrong_logits[i][value] - wrong_logits[i][1-value], indeces[0][i]] for i, value in enumerate(wrong_predictions)]

sorted(incorrect_confidence(wrong, logits, predictions), key = lambda logit: -logit[0])


[[15.223133915609225, 847],
 [14.965058407186632, 342],
 [13.823467766792476, 300],
 [13.216119694079595, 775],
 [10.965648194848484, 585],
 [10.362002226302195, 409],
 [10.035153385354562, 314],
 [9.387737981543268, 271],
 [8.991363632546097, 410],
 [8.847556362336803, 395],
 [8.44153350464478, 621],
 [8.09078187367021, 855],
 [7.758889444138305, 582],
 [7.250589588382866, 540],
 [7.1846149817828575, 22],
 [7.090177483266814, 309],
 [7.048915483332337, 858],
 [6.730667201258257, 421],
 [6.585760481008663, 423],
 [6.155928645373436, 473],
 [5.940185057526435, 516],
 [5.6490432555499694, 176],
 [5.6163715347107654, 823],
 [5.388195398076903, 740],
 [5.273136999459183, 360],
 [4.9447085512005415, 652],
 [4.858775333360249, 1],
 [4.782645534285848, 219],
 [4.748313323451754, 53],
 [4.372939686036915, 115],
 [3.956363694463846, 281],
 [3.8799871119856277, 378],
 [3.686931494813898, 715],
 [3.621502856693354, 466],
 [3.593783350603104, 459],
 [3.4491213438095656, 640],
 [3.4406785635005335,

In [13]:
index = 847
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['ACCOUNT' 'ACCOUNT' 'that' 'was' 'very' 'thoughtful' '&' 'supportive'
 'of' 'mr' '.' 'kraft' '.' 'all' 'the' 'other' '#nfl' 'teams' 'should'
 'have' 'done' 'the' 'same' 'thing' '!']
0
1


In [14]:
index = 342
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['finished' 'a' 'way' 'out' 'with' 'ACCOUNT' 'yesterday' 'night' '.' 'i'
 'really' 'like' 'the' 'game' '.' 'it' '’' 's' 'a' 'story' 'drive' 'coop'
 'game' 'with' 'cool' 'protagonists' '.' 'DG-DG' 'hours' 'of' 'good' 'fun'
 ',' 'tension' 'and' 'surprises' '.' 'especially' 'during' 'the' 'final'
 'hours' ',' 'this' 'game' 'feels' 'like' 'a' 'blockbuster' 'movie' '.'
 '#awayout' '#gaming']
0
1


In [15]:
index = 300
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['so' 'happy' 'the' 'old' '#conservative' ',' 'bigoted' ',' 'xenophobic'
 ',' 'homophobic' 'gun' 'idiots' 'are' 'dying' 'off' 'and' 'the' 'smart'
 'new' 'kids' 'will' 'be' 'able' 'to' 'vote' 'this' 'november' '.' 'yup'
 '.' 'i' 'said' 'it' '.' 'good' 'riddance' 'to' 'trash' '.' '#gop' '#nra'
 '#bluewave' '#bluewave2018' '#racist' '#trumpsamerikkka' '#nazis' '#maga'
 '#marchforourlives']
0
1


In [16]:
index = 775
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['always' 'knew' 'this' 'was' 'false.but' 'heres' 'the' 'question' ','
 'what' 'should' 'a' 'person' 'do' 'if' 'taken' 'to' 'an' 'atm' 'under'
 'duress' '?' 'any' 'help' '?' 'LINK']
0
1


In [17]:
index = 715
print(test_sentences[index])
print(test_labels[index])
print(predictions[index])

['#epl' 'goalkeepers' 'lookin' 'like' 'studs' 'today' '.' '#france'
 '#colombia' '#lloris' '#ospina' 'LINK']
1
0


In [18]:
top_emojis = ["😒","🤦","🤷","🙃","😱"]
print("")
print("Most positive emoji features:")
for idx in top_positive_features:
    word = vocab.id_to_word[idx]
    if word in top_emojis:
        print("  {:s} ({:.02f})".format(vocab.id_to_word[idx], 
                                    linear_weights[idx]))



Most positive emoji features:
  😒 (5.21)
  🤦 (4.83)
  😱 (4.83)
  🙃 (4.66)
  🤷 (4.66)
