# SVM on GloVE

In [20]:
# Build and analyze baseline algorithms

from google.colab import drive
drive.mount('/content/drive')

import numpy as np

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
def process_data(file_name):
  x, y = [], []
  f = open('/content/drive/MyDrive/data/' + file_name, encoding='utf-8')
  for line in f:
    _, label, sentence = line.split(" ", 2)
    x.append(sentence.rstrip('\n'))
    y.append(label)
  f.close()
  x = np.asarray(x)
  y = np.asarray(y)
  return x, y

In [22]:
# load sarcasm data
x_tr, y_tr = process_data('train_main.txt')

In [23]:
x_te, y_te = process_data('test_main.txt')

In [None]:
print(x_tr.shape)
print(y_tr.shape)
print(x_te.shape)
print(y_te.shape)

(257082,)
(257082,)
(64666,)
(64666,)


In [24]:
# step 1: create embeddings dictionary w/ glove.6B.300d.txt

embeddings_index = {}
f = open('/content/drive/My Drive/data/glove.6B.300d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    embeddings_index[word] = np.asarray(values[1:], dtype='float32')
f.close()

print('Found %s word vectors.' % len(embeddings_index))
print('length of word vector: ', len(embeddings_index["from"]))

vocab = embeddings_index.keys()

Found 400000 word vectors.
length of word vector:  300


In [25]:
# 2. for each token in the sentence, get the corresponding word embedding -- something like embedding[token]
# tokenize each sentence

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def composite_sentence_embedding(vocab, sent):
  arr = np.asarray([embeddings_index[w] for w in sent if w in vocab])
  if arr.size == 0: # empty -- none of the words in the sentence exist in the vocab
    return np.zeros(shape=(300,))
  return np.sum(arr, axis=0)

def transform(X, Y):
  """ Transforms sentences into its word embedding form, while eliminating
        sentences where all words in the sentence do not exist in the vocabulary
    Arguments:
        X  (numpy ndarray, shape = (samples,)):
            sentences to transform
        Y  (numpy ndarray, shape = (samples,)):
            sarcasm label 0 or 1

    Returns:
        x  (numpy ndarray, shape = (N,300)):
            remaining sentence embeddings
        y  (numpy ndarray, shape = (N,)):
            remaining labels
        s  (list, shape = (N,)):
            remaining sentences
  """
  composite = []
  mask = []
  sentences = []
  for i in range(X.shape[0]):
    sentence = X[i]
    tokenized = word_tokenize(sentence.lower().strip()) # tokenized sentence
    comp = composite_sentence_embedding(vocab, tokenized) # composed sentence embedding
    if np.sum(comp) != 0:
      composite.append(comp)
      mask.append(i)
      sentences.append(sentence)
  composite = np.stack(composite)
  return composite, Y[mask], sentences

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [33]:
X_tr, Y_tr, _ = transform(x_tr, y_tr) 

In [28]:
X_te, Y_te, te_sent = transform(x_te, y_te)

In [34]:
# 80-10-10 split of train/validation/test  

# split test into validation and test sets -- should only run this cell once
n = round(X_te.shape[0]/2)
X_val, Y_val, val_sent = X_te[:n], Y_te[:n], te_sent[:n]
X_te, Y_te, te_sent = X_te[n:], Y_te[n:], te_sent[n:]
print(X_te.shape, Y_te.shape, len(te_sent))
print(X_val.shape, Y_val.shape, len(val_sent))

(32112, 300) (32112,) 32112
(32112, 300) (32112,) 32112


In [35]:
# 3. fit the linear svc on training data, where x contains word embeddings for each sentence, and y is 0/1 for sarcasm
# also hyperparameter tuning

from sklearn.svm import LinearSVC

# roughly 2 minutes to train

svc = LinearSVC(C=0.05, dual=False) # dual=False when n_samples > n_features
svc.fit(X_tr, Y_tr)
print(svc.score(X_val,Y_val)) # 0.6150660189337319

0.6150660189337319


In [37]:
# retrain model on X_tr + X_val
X_comb = np.concatenate((X_tr, X_val))
Y_comb = np.concatenate((Y_tr, Y_val))
svc.fit(X_comb,Y_comb)

# get accuracy on test set
print(svc.score(X_te,Y_te)) # 0.6153774289985052

0.6153774289985052


In [38]:
# need to include precision, recall, F1 scores

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

y_pred = svc.predict(X_te)
print('precision score: ', precision_score(Y_te, y_pred, pos_label='1'))
print('recall score: ', recall_score(Y_te, y_pred, pos_label='1'))
print('f1 score: ', f1_score(Y_te, y_pred, pos_label='1')) # score for positive label
print()

from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(Y_te, y_pred, labels=['0','1']) # scores for neg/pos label
# number of negative labels: 16005
# number of positive labels: 16107

precision score:  0.6231152484594205
recall score:  0.5901160985906748
f1 score:  0.6061668951882911



(array([0.60837585, 0.62311525]),
 array([0.64079975, 0.5901161 ]),
 array([0.624167 , 0.6061669]),
 array([16005, 16107]))

In [51]:
# error analysis -- what examples does the model fail at? take 100-200 validation 
# set examples that your model incorrectly predicted and categorize them
# model cannot predict sentences where word embeddings don't exist

predictions = svc.predict(X_val)
mask = Y_val != predictions
misclassified = np.asarray(val_sent)[mask][:100]
print(misclassified)

# np.save('/content/drive/MyDrive/data/misclassified_sentences.npy', misclassified)
# d = np.load('/content/drive/MyDrive/data/misclassified_sentences.npy')
# print(d)

# it's a useful debugging tool -- predict 0; could be interpreted as non-sarcastic, really depends on more context
# public interface someservice { void provideservice ( ) throws checkedexceptionofeverypossibleimplementationtype } neat abstraction -- makes no sense
# 'hmm ... cdj-2000 for $ 900 ... seems legit -- probably hmm was not in vocab -- couldn't understand shift in tone


['chalk it up to the ever-increasing cost of freedom .'
 "we 're about to finally get affordable housing , and now the politicians are doing everything they can to keep prices * high * ."
 "it 's a useful debugging tool"
 'including my peers in 5th grade , who did their series report on this topic !'
 'well that makes me want to vote for them .'
 "why is it that the minority parties seem incapable of producing any publicity material that is n't , y'know , shit ?"
 "this 'll end well ."
 "i only read the headline but , i do n't think a survey is too accurate unless you talk to more than 30 travelers ."
 'should be labeled nsfw , and might be illegal in some countries with some child nudity there .'
 'public interface someservice { void provideservice ( ) throws checkedexceptionofeverypossibleimplementationtype } neat abstraction'
 'clearly engadget is using the coercive power of the state to attack their rivals over at gizmodo .'
 "congratulations- you 'll be homeless before you get a j

In [42]:
# ensure that there is no data w/ all zeros
print(np.all(np.sum(X_tr, axis=1)))
print(np.all(np.sum(X_te, axis=1)))
print(np.all(np.sum(X_val, axis=1)))

True
True
True
