In [47]:
from __future__ import print_function
from __future__ import division
import tensorflow as tf
import collections
import nltk
import numpy as np
from nltk.tokenize.casual import TweetTokenizer
import sys;
sys.path.insert(0, '../code')
from w266_common import utils, vocabulary
import time
import datetime
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import re
np.random.seed(266)

In [48]:
import csv
tokenizer = TweetTokenizer()
x_data = []
x_contexts = []
labels = []
sentences  = []
contexts = []
with open('../data/merged_data_v3.csv', 'r') as csvfile:
    linereader = csv.reader(csvfile, delimiter = '|')
    for i, row in enumerate(linereader):
        if i == 0:
            continue
        sentence, context, sarcasm = row
        sentence = re.sub("RT @[^\s]+:", "retweet", sentence)
        sentences.append(sentence)
        contexts.append(context)
        x_tokens = utils.canonicalize_words(tokenizer.tokenize(sentence), hashtags = True)
        context_tokens = utils.canonicalize_words(tokenizer.tokenize(context), hashtags = True)
        x_data.append(x_tokens)
        x_contexts.append(context_tokens)
        labels.append(int(sarcasm))



shuffle_indices = np.random.permutation(np.arange(len(labels)))
train_split_idx = int(0.7 * len(labels))
test_split_idx  = int(0.9 * len(labels))

train_indices = shuffle_indices[:train_split_idx]
validation_indices = shuffle_indices[train_split_idx:test_split_idx]
test_indices = shuffle_indices[test_split_idx:]


train_sentences = np.array(x_data, dtype="object")[train_indices]
train_contexts = np.array(x_contexts, dtype="object")[train_indices]
train_labels= np.array(labels, dtype="object")[train_indices] 
validation_sentences = np.array(x_data, dtype="object")[validation_indices]
validation_labels = np.array(labels, dtype="object")[validation_indices]
validation_contexts = np.array(x_contexts, dtype="object")[validation_indices]
test_sentences = np.array(x_data, dtype="object")[test_indices]  
test_contexts =  np.array(x_contexts, dtype="object")[test_indices]
test_labels = np.array(labels, dtype="object")[test_indices]  

In [49]:
a = [2]*4
a[2] = 6
a

[2, 2, 6, 2]

In [50]:
def transform_labels(raw_label_set, size):
    label_set = []
    for label in raw_label_set:
        labels = [0] * size
        labels[label] = 1
        label_set.append(labels)
    return np.array(label_set, dtype="object")

expanded_train_labels = transform_labels(train_labels, 2)
expanded_validation_labels = transform_labels(validation_labels,2)
expanded_test_labels = transform_labels(test_labels,2)

In [51]:
a = [1,3,4,5,6,7,8,8,1,5,6,7]
a + ["<PADDING>"]*5

[1,
 3,
 4,
 5,
 6,
 7,
 8,
 8,
 1,
 5,
 6,
 7,
 '<PADDING>',
 '<PADDING>',
 '<PADDING>',
 '<PADDING>',
 '<PADDING>']

In [52]:
class PaddingAndTruncating:
    def __init__(self, max_len):
        self.max_len = max_len
    
    def pad_or_truncate(self, sentence):
        sen_len = len(sentence)
        paddings = self.max_len - sen_len
        if paddings >=0:
            return list(sentence) + ["<PADDING>"] * paddings
        return sentence[0:paddings]
        
PadAndTrunc = PaddingAndTruncating(10)
        
        
PadAndTrunc.pad_or_truncate(["the","angry","man","is","angry"])

['the',
 'angry',
 'man',
 'is',
 'angry',
 '<PADDING>',
 '<PADDING>',
 '<PADDING>',
 '<PADDING>',
 '<PADDING>']

In [53]:
from tensorflow.contrib import learn
vocab_size = 5000

#max_len = max([len(sent) for sent  in train_sentences])
PadAndTrunc =PaddingAndTruncating(40)
train_sentences_padded = list(map(PadAndTrunc.pad_or_truncate, train_sentences))
train_context_padded = list(map(PadAndTrunc.pad_or_truncate, train_contexts))
validation_sentences_padded = list(map(PadAndTrunc.pad_or_truncate, validation_sentences))
validation_context_padded = list(map(PadAndTrunc.pad_or_truncate, validation_contexts))
test_sentences_padded = list(map(PadAndTrunc.pad_or_truncate, test_sentences))
test_context_padded = list(map(PadAndTrunc.pad_or_truncate, test_contexts))

vocab = vocabulary.Vocabulary(utils.flatten(list(train_sentences_padded) + list(train_context_padded)), vocab_size)
train_s = np.array(list(map(vocab.words_to_ids, train_sentences_padded)), dtype="object")
train_c = np.array(list(map(vocab.words_to_ids, train_context_padded)), dtype="object")
validation_s = np.array(list(map(vocab.words_to_ids, validation_sentences_padded)), dtype="object")
validation_c = np.array(list(map(vocab.words_to_ids, validation_context_padded)), dtype="object")
test_s = np.array(list(map(vocab.words_to_ids, test_sentences_padded)), dtype="object")
test_c = np.array(list(map(vocab.words_to_ids, test_context_padded)), dtype="object")
train_s

array([[13, 82, 75, ..., 3, 3, 3],
       [13, 4227, 17, ..., 3, 3, 3],
       [13, 4229, 296, ..., 3, 3, 3],
       ...,
       [33, 140, 19, ..., 3, 3, 3],
       [4, 9, 606, ..., 3, 3, 3],
       [13, 256, 11, ..., 3, 3, 3]], dtype=object)

In [54]:
i = 3
("conv-maxpool-%s" % i)

'conv-maxpool-3'

In [55]:
class TextCNN(object):

    def __init__(
      self, sequence_length, num_classes, vocab_size, 
      embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):

        # Placeholders for input, output and dropout
        self.input_x1 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x1")
        self.input_x2 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x2")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self.W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            self.embedded_chars1 = tf.nn.embedding_lookup(self.W, self.input_x1)
            self.embedded_chars_expanded1 = tf.expand_dims(self.embedded_chars1, -1)
            self.embedded_chars2 = tf.nn.embedding_lookup(self.W, self.input_x2)
            self.embedded_chars_expanded2 = tf.expand_dims(self.embedded_chars2, -1)

        # Create a convolution + avgpool layer for each filter size
        pooled_outputs = []

        for i, filter_size in enumerate(filter_sizes):
            with tf.variable_scope("conv-avgpool-%s" % i) as scope:
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]

                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
              
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded1,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h1 = tf.nn.tanh(tf.nn.bias_add(conv, b), name="tanh1")
               
                # Maxpooling over the outputs
                pooled = tf.nn.avg_pool(
                    h1,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                
                pooled_outputs.append(pooled)
                #pooled_outputs.append(pooled)
                scope.reuse_variables()
                 
                
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded2,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h2 = tf.nn.tanh(tf.nn.bias_add(conv, b), name="tanh2")
                # Maxpooling over the outputs
                pooled2 = tf.nn.avg_pool(
                    h2,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled2)

        # Combine all the pooled features

        num_filters_total = num_filters * len(filter_sizes) * 2

        self.h_pool = tf.concat(pooled_outputs, 1)

        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])


        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
        
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
           
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            self.labels = tf.argmax(self.input_y, 1)
            TP = tf.count_nonzero(self.predictions * self.labels)
            TN = tf.count_nonzero((self.predictions - 1) * (self.labels - 1))
            FP = tf.count_nonzero(self.predictions * (self.labels - 1))
            FN = tf.count_nonzero((self.predictions - 1) * self.labels)
            self.correct_predictions = tf.equal(self.predictions, self.labels)
            self.precision = TP / (TP + FP)
            self.recall = TP / (TP + FN)
            self.f1_score = 2 * self.precision * self.recall / (self.precision + self.recall)
            self.accuracy = tf.reduce_mean(tf.cast(self.correct_predictions, "float"), name="accuracy")

In [56]:
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data, dtype="object")
    data_size = len(data)
    num_batches_per_epoch = int((len(data)-1)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [59]:
learning_rate = 0.1

for name in list(tf.flags.FLAGS):
    delattr(tf.flags.FLAGS,name)

tf.flags.DEFINE_float("dev_sample_percentage", 0.75, "Percentage of the training data to use for validation")
tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the negative data.")

# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 40, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "1", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 40, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.4, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularization lambda (default: 0.0)")

# Training parameters
batch_size = 200
tf.flags.DEFINE_integer("batch_size", batch_size, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 10, "Number of training epochs (default: 200)")
tf.flags.DEFINE_integer("evaluate_every", 20, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
tf.app.flags.DEFINE_string('f', '', 'kernel')
# FLAGS._parse_flags()
FLAGS.flag_values_dict()
# FLAGS(sys.argv)


print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================


with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():

        cnn = TextCNN(
            sequence_length=train_s.shape[1],
            num_classes=2,
            vocab_size=vocab_size,
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            l2_reg_lambda=FLAGS.l2_reg_lambda)
        
         # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        
        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)
        # Write vocabulary
        #vocab_processor.save(os.path.join(out_dir, "vocab"))
        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        def train_step(x_batch1, x_batch2, y_batch):
            """
            A single training step
            """
            feed_dict = {
              cnn.input_x1: x_batch1,
              cnn.input_x2: x_batch2,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
            }     
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            print("step {}, loss {:g}, acc {:g}".format(step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch1, x_batch2, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
              cnn.input_x1: x_batch1,
              cnn.input_x2: x_batch2,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            #r 
            step, summaries, loss, accuracy, recall, precision, f1 = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy,cnn.recall, cnn.precision, cnn.f1_score],
                feed_dict)
            print("step {}, loss {:g}, acc {:g}, rec {:g}, pre {:g}, f1 {:g}".format(step, loss, accuracy, recall, precision, f1))# recall, precision, f1))
            if writer:
                writer.add_summary(summaries, step)
        
        def error_analysis(x_batch1, x_batch2, y_batch, writer=None):
            feed_dict = {
              cnn.input_x1: x_batch1,
              cnn.input_x2: x_batch2,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            #r 
            step, summaries, loss, accuracy, recall, precision, f1, correct, scores,predictions  = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy,cnn.recall, cnn.precision, cnn.f1_score, cnn.correct_predictions, cnn.scores, cnn.predictions],
                feed_dict)
            print("step {}, loss {:g}, acc {:g}, rec {:g}, pre {:g}, f1 {:g}".format(step, loss, accuracy, recall, precision, f1))# recall, precision, f1))
            if writer:
                writer.add_summary(summaries, step)
            return correct, scores, predictions

        # Generate batches
        batches = batch_iter(
            list(zip(train_s, train_c,  expanded_train_labels)), FLAGS.batch_size, FLAGS.num_epochs)
        # Training loop. For each batch...
        for batch in batches:
            x_batch1, x_batch2, y_batch = zip(*batch)
            train_step(x_batch1, x_batch1, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(validation_s, validation_c, expanded_validation_labels, writer=dev_summary_writer)
                print("")
            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))
print("\nTest Set:")
correct, logits, predictions = error_analysis(test_s, test_c, expanded_test_labels, writer=dev_summary_writer)
print("")



Parameters:
ALLOW_SOFT_PLACEMENT=<absl.flags._flag.BooleanFlag object at 0x7f87e0a8b490>
BATCH_SIZE=<absl.flags._flag.Flag object at 0x7f88132e0050>
CHECKPOINT_EVERY=<absl.flags._flag.Flag object at 0x7f87e0a8ba10>
DEV_SAMPLE_PERCENTAGE=<absl.flags._flag.Flag object at 0x7f8810ff9110>
DROPOUT_KEEP_PROB=<absl.flags._flag.Flag object at 0x7f87e0a7b410>
EMBEDDING_DIM=<absl.flags._flag.Flag object at 0x7f88132dca50>
EVALUATE_EVERY=<absl.flags._flag.Flag object at 0x7f87e0a99110>
F=<absl.flags._flag.Flag object at 0x7f884072c8d0>
FILTER_SIZES=<absl.flags._flag.Flag object at 0x7f88132df710>
L2_REG_LAMBDA=<absl.flags._flag.Flag object at 0x7f87e0a7bc90>
LOG_DEVICE_PLACEMENT=<absl.flags._flag.BooleanFlag object at 0x7f87e0a8f790>
NEGATIVE_DATA_FILE=<absl.flags._flag.Flag object at 0x7f87d89e1fd0>
NUM_CHECKPOINTS=<absl.flags._flag.Flag object at 0x7f87e0a8f690>
NUM_EPOCHS=<absl.flags._flag.Flag object at 0x7f88132ddb90>
NUM_FILTERS=<absl.flags._flag.Flag object at 0x7f88132ddb50>
POSITIVE_DAT

step 160, loss 0.358314, acc 0.86533, rec 0.903643, pre 0.834056, f1 0.867456

step 161, loss 0.298358, acc 0.875
step 162, loss 0.224821, acc 0.9
step 163, loss 0.33862, acc 0.835
step 164, loss 0.252674, acc 0.895
step 165, loss 0.241839, acc 0.9
step 166, loss 0.310959, acc 0.85
step 167, loss 0.373855, acc 0.83
step 168, loss 0.325978, acc 0.865
step 169, loss 0.373955, acc 0.855
step 170, loss 0.249473, acc 0.895
step 171, loss 0.328127, acc 0.885
step 172, loss 0.232697, acc 0.895
step 173, loss 0.398439, acc 0.83
step 174, loss 0.327522, acc 0.875
step 175, loss 0.240252, acc 0.88
step 176, loss 0.306505, acc 0.87
step 177, loss 0.264091, acc 0.925
step 178, loss 0.332541, acc 0.87
step 179, loss 0.299228, acc 0.87
step 180, loss 0.271901, acc 0.89

Evaluation:
step 180, loss 0.345757, acc 0.856734, rec 0.768508, pre 0.925035, f1 0.839538

step 181, loss 0.2918, acc 0.895
step 182, loss 0.332334, acc 0.89
step 183, loss 0.181826, acc 0.94
step 184, loss 0.341207, acc 0.87
step 1

In [41]:
wrong = correct == False
def incorrect_confidence(wrong, logits, predictions):
    indeces = np.where(wrong)
    wrong_predictions = predictions[indeces]
    wrong_logits = logits[indeces]
    
    return [[wrong_logits[i][value] - wrong_logits[i][1-value], indeces[0][i]] for i, value in enumerate(wrong_predictions)]


sorted(incorrect_confidence(wrong, logits, predictions), key = lambda logit: -logit[0])

[[6.39266, 342],
 [5.061942, 493],
 [5.046752, 585],
 [4.3829613, 409],
 [3.927161, 847],
 [3.618387, 775],
 [3.448583, 858],
 [3.4101298, 740],
 [3.3196685, 655],
 [3.1696293, 540],
 [2.993722, 115],
 [2.9151819, 99],
 [2.9145448, 22],
 [2.8299224, 477],
 [2.7938733, 101],
 [2.7797494, 413],
 [2.6001554, 378],
 [2.5578806, 300],
 [2.5279865, 281],
 [2.4553905, 53],
 [2.3987052, 242],
 [2.3681984, 667],
 [2.3521419, 265],
 [2.337718, 271],
 [2.3268445, 727],
 [2.2695088, 706],
 [2.2588248, 776],
 [2.1821654, 280],
 [2.0302563, 638],
 [2.0271137, 331],
 [2.0019164, 486],
 [1.9776791, 347],
 [1.9058044, 562],
 [1.901355, 841],
 [1.8306621, 729],
 [1.7550875, 219],
 [1.7546061, 65],
 [1.710851, 421],
 [1.7022494, 528],
 [1.6827189, 621],
 [1.6374197, 176],
 [1.6350754, 859],
 [1.6116165, 355],
 [1.6096758, 185],
 [1.6092, 645],
 [1.5809809, 640],
 [1.5646458, 648],
 [1.5629072, 362],
 [1.5617416, 755],
 [1.5539796, 459],
 [1.544835, 119],
 [1.5102644, 741],
 [1.4786234, 589],
 [1.4213389,

In [42]:
index = 499
print(test_sentences[index])
print(test_contexts[index])
print(test_labels[index])
print(predictions[index])
#definitely sarcastic

['LINK', '…', '…', '…', '…', '…', '…', '…', '…', '…', '…', '…', '…', '…', '…', '…', '…', '…', '…', '…', 'once', 'upon', 'a', 'time', '(', 'spin', ')', 'featuring', 'ACCOUNT', 'of', 'd12', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG']
['nan']
1
1


In [43]:
index = 122
print(test_sentences[index])
print(test_contexts[index])
print(test_labels[index])
print(predictions[index])

# hard to say what's going on here. seems to be slightly sarcastic, not really sure though. Throwing me off too

['this', 'one', 'enter', 'gan', '!', '!', '!', '-', '-', '-', 'follow', 'ACCOUNT', '-', '-', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'LINK']
['nan']
1
1


In [44]:
index = 423
print(test_sentences[index])
print(test_contexts[index])
print(test_labels[index])
print(predictions[index])
# heavy use of hashtags is throwing this one off too

['march', 'for', 'our', 'lives', 'brings', 'out', 'mlks', 'granddaughter', 'and', 'demands', 'for', 'a', 'gun', 'free', 'world', '.', 'yes', 'lets', 'all', 'rally', 'behind', 'a', 'DG', 'year', 'olds', 'beliefs', '.', 'HASHTAG']
['nan']
1
0


In [45]:
index = 84
print(test_sentences[index])
print(test_contexts[index])
print(test_labels[index])
print(predictions[index])
# emoji is throwing it off

['these', 'kids', 'have', 'managed', 'to', 'score', 'the', 'kind', 'of', 'extracurricular', 'HASHTAG', 'we', '’', 've', 'been', 'eviscerating', 'for', 'decades', 'in', 'the', 'united', 'states', '.', 'these', 'kids', 'aren', '’', 't', 'prodigiously', 'gifted', '.', 'they', '’', 've', 'just', 'had', 'the', 'gift', 'of', 'the', 'kind', 'of', 'education', 'we', 'no', 'longer', 'value', '.', 'HASHTAG', 'HASHTAG']
['kasi', 'c', '.', 'on', 'twitter', ':', 'the', 'students', 'of', 'HASHTAG', 'have', 'been', 'the', 'beneficiaries', 'of', 'the', 'kind', 'of', '1950s', '-', 'style', 'public', 'HASHTAG', 'that', 'has', 'all', 'but', 'vanished', 'in', 'america', '&', 'that', 'is', 'being', 'dismantled', 'with', 'great', 'deliberation', 'as', 'funding', 'for', 'things', 'like', 'the', 'arts', ',', 'civics', ',', '&', 'enrichment', 'are', 'zeroed', 'out', '.', 'HASHTAG']
0
1


In [46]:
index = 395
print(test_sentences[index])
print(test_contexts[index])
print(test_labels[index])
print(predictions[index])

# heavy use of hashtags is throwing this one off. Majority of tweets with heavy hashtag usage (as decribed in EDA)
# are non-sarcatic

['retweet', 'ACCOUNT', 'ACCOUNT', '☜', '🍊', '☞', '.', '&', '(', '◐', '_', '◕', ')', '_', 'HASHTAG', '.', 'des', 'HASHTAG', '!', '[', '!', 'HASHTAG', '*', ']', '(', '*', 'HASHTAG', 'of', ',', 'sad', 'as', '…']
['nan']
1
1
