In [0]:
from __future__ import print_function, division
import numpy as np
import pandas as pd
import random

import torch
from torchtext import data
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch.optim as optim
import torch.nn as nn

Read in the data

In [0]:
# lines=True: the dataset file is read as one JSON record per line.
df = pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.theonion.com/thirtysomething-scien...,thirtysomething scientists unveil doomsday clo...,1
1,https://www.huffingtonpost.com/entry/donna-edw...,dem rep. totally nails why congress is falling...,0
2,https://www.huffingtonpost.com/entry/eat-your-...,eat your veggies: 9 deliciously different recipes,0
3,https://local.theonion.com/inclement-weather-p...,inclement weather prevents liar from getting t...,1
4,https://www.theonion.com/mother-comes-pretty-c...,mother comes pretty close to using word 'strea...,1


Preprocessing

In [0]:
# Only the headline text and the is_sarcastic label are used downstream, so
# drop the article URL. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df = df.drop(columns='article_link', errors='ignore')
df.head()

Unnamed: 0,headline,is_sarcastic
0,thirtysomething scientists unveil doomsday clo...,1
1,dem rep. totally nails why congress is falling...,0
2,eat your veggies: 9 deliciously different recipes,0
3,inclement weather prevents liar from getting t...,1
4,mother comes pretty close to using word 'strea...,1


In [0]:
# Sanity check: (rows, columns) of the frame at this point in the notebook.
df.shape

(28619, 2)

Save the DataFrame to a tab-separated text file (no header, no index) so it can be loaded with torchtext

In [0]:
# Write the frame as a headerless, index-free TSV for torchtext to read back.
# The file is overwritten (mode='w') rather than appended to: with mode='a'
# every re-run of this cell added another full copy of the dataset to the
# file, silently duplicating rows (and letting duplicates end up on both
# sides of the train/test split).
df.to_csv('sarcasm_headlines.txt', header=False, index=False, sep='\t', mode='w')

In [5]:
!pip install -U sacremoses

SEED = 421
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
TEXT = data.Field(tokenize='moses')
LABEL = data.LabelField(dtype = torch.float)
BATCH_SIZE = 32
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/8d/ca/c0e81fc25a09ec0d4ea96c02801a8e8b5b77744acf4ad480481fca127fc6/sacremoses-0.0.31.tar.gz (802kB)
[K     |████████████████████████████████| 808kB 43.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.31-cp36-none-any.whl size=832904 sha256=bd4eea97d8454c1639533752f6006df940051afed074b03abecd7442b462d47a
  Stored in directory: /root/.cache/pip/wheels/fa/0a/18/7c470ae4c30f82ff0f4e61ce4c0603ffcf609cbc033129e4de
Successfully built sacremoses
Installing collected packages: sacremoses
Successfully installed sacremoses-0.0.31


In [0]:
# Load the tab-separated (headline, label) file; columns map to the
# TEXT and LABEL fields in order.
pos = data.TabularDataset(
    path='sarcasm_headlines.txt', format='csv',
    csv_reader_params={'delimiter':"\t"},
    fields=[('text', TEXT),
            ('label', LABEL)])

# Dataset.split expects random_state to be a value returned by
# random.getstate(). The previous code passed random.seed(421), which is
# always None, and was only reproducible through the side effect of seeding
# the global RNG. Seed once and pass the captured state explicitly; the
# resulting splits are the same.
random.seed(421)
split_state = random.getstate()

# Split data into 90/10 training/test
trainandval, test_data = pos.split(split_ratio=0.90, random_state=split_state)

# Of the remaining training data, 80/20 train/validation
train_data, valid_data = trainandval.split(split_ratio=0.80, random_state=split_state)

# Batch examples of similar length together (sort_key = number of tokens).
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text),
    device=device)

In [0]:
class RNN(nn.Module):
    """Embedding -> (optionally bidirectional) multi-layer LSTM -> linear classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers, and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Inter-layer dropout only exists when there is more than one layer;
        # passing 0 for a single layer avoids PyTorch's warning.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)
        # The classifier input is one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for token ids x of shape (seq_len, batch)."""
        embedded = self.dropout(self.embedding(x))
        output, (hidden, cell) = self.rnn(embedded)
        if self.bidirectional:
            # Last layer's forward (hidden[-2]) and backward (hidden[-1]) final states.
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            final_hidden = hidden[-1,:,:]
        # No squeeze(0) here: it would remove the batch dimension for a
        # batch of size 1 and break callers that expect (batch, output_dim).
        return self.fc(self.dropout(final_hidden))

In [0]:
# Build the vocabularies from the training split only; the text vocabulary is
# capped via max_size and initialised with the named GloVe Twitter vectors.
TEXT.build_vocab(train_data, max_size=750, vectors="glove.twitter.27B.100d")
LABEL.build_vocab(train_data)

# Network hyperparameters
INPUT_DIM = len(TEXT.vocab)
# EMBEDDING_DIM presumably has to match the "100d" GloVe vectors chosen above,
# since they are copied into the embedding layer later.
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 100, 128, 1
N_LAYERS, BIDIRECTIONAL = 2, True
DROPOUT = 0.925

In [0]:
# Instantiate the classifier from the hyperparameters defined above.
model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

In [15]:
# Initialise the embedding table with the pretrained vectors attached to the
# text vocabulary. copy_ writes in place and returns the updated tensor, which
# is what this cell displays.
pretrained_embeddings = TEXT.vocab.vectors
embedding_weights = model.embedding.weight.data
embedding_weights.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.8537,  0.2277,  0.6947,  ..., -0.0190,  0.1615,  0.2776],
        ...,
        [ 0.6397, -0.3348, -0.2175,  ..., -0.4405, -0.0152, -0.0467],
        [ 0.6466,  0.1152,  0.4458,  ...,  0.3864,  0.1331,  0.3730],
        [ 0.1481,  0.4934, -0.2522,  ...,  0.3715,  0.2636, -0.2121]])

In [0]:
# Move the model to the target device before building the optimizer, as the
# torch.optim documentation recommends, so the optimizer is constructed over
# the parameters that are actually trained.
model = model.to(device)
# Binary classification on raw logits: the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [0]:
def binary_accuracy(preds, y):
  """Fraction of examples whose thresholded prediction equals the label.

  preds are raw logits; they are passed through a sigmoid and rounded to
  0/1 before being compared element-wise with y. Returns a scalar tensor.
  """
  predicted_labels = torch.sigmoid(preds).round()
  # Cast the boolean matches to float so the sum can be divided.
  hits = predicted_labels.eq(y).float()
  return hits.sum() / len(hits)

In [0]:
def train(model, iterator, optimizer, criterion):
  """Run one training epoch over iterator.

  For every batch: zero the gradients, compute logits, loss and accuracy,
  backpropagate, and step the optimizer. Returns the loss and accuracy
  averaged over the number of batches (each batch weighted equally).
  """
  model.train()
  total_loss, total_acc = 0, 0
  for batch in iterator:
    optimizer.zero_grad()
    # Drop the trailing output dimension so logits line up with batch.label.
    logits = model(batch.text).squeeze(1)
    batch_loss = criterion(logits, batch.label)
    batch_acc = binary_accuracy(logits, batch.label)
    batch_loss.backward()
    optimizer.step()
    total_loss += batch_loss.item()
    total_acc += batch_acc.item()
  n_batches = len(iterator)
  return total_loss / n_batches, total_acc / n_batches

In [0]:
def evaluate(model, iterator, criterion):
  """Compute loss and accuracy over iterator without updating the model.

  The model is put in eval mode and gradients are disabled. Returns the
  loss and accuracy averaged over the number of batches.
  """
  model.eval()
  total_loss, total_acc = 0, 0
  with torch.no_grad():
    for batch in iterator:
      # Drop the trailing output dimension so logits line up with batch.label.
      logits = model(batch.text).squeeze(1)
      total_loss += criterion(logits, batch.label).item()
      total_acc += binary_accuracy(logits, batch.label).item()
  n_batches = len(iterator)
  return total_loss / n_batches, total_acc / n_batches

In [21]:
N_EPOCHS = 100
# Best validation accuracy seen so far; used to decide when to checkpoint.
bestmodelvalue = 0
for epoch in range(N_EPOCHS):
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
  # Checkpoint whenever validation accuracy ties or beats the best so far,
  # so the saved weights are the latest epoch with the top validation score.
  if bestmodelvalue <= valid_acc:
    bestmodelvalue = valid_acc
    torch.save(model.state_dict(), "sarcasm_detect_model.pt")
  print(f'Epoch: {epoch+1:02} | Train Acc: {train_acc*100:.2f}% Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Train Acc: 75.59% Val. Acc: 80.88%
Epoch: 02 | Train Acc: 76.36% Val. Acc: 81.03%
Epoch: 03 | Train Acc: 76.71% Val. Acc: 80.99%
Epoch: 04 | Train Acc: 77.75% Val. Acc: 79.94%
Epoch: 05 | Train Acc: 78.34% Val. Acc: 81.40%
Epoch: 06 | Train Acc: 78.25% Val. Acc: 81.34%
Epoch: 07 | Train Acc: 78.51% Val. Acc: 81.05%
Epoch: 08 | Train Acc: 78.86% Val. Acc: 81.40%
Epoch: 09 | Train Acc: 79.35% Val. Acc: 81.50%
Epoch: 10 | Train Acc: 79.93% Val. Acc: 81.65%
Epoch: 11 | Train Acc: 79.28% Val. Acc: 81.38%
Epoch: 12 | Train Acc: 79.62% Val. Acc: 81.17%
Epoch: 13 | Train Acc: 80.41% Val. Acc: 80.84%
Epoch: 14 | Train Acc: 79.85% Val. Acc: 81.48%
Epoch: 15 | Train Acc: 79.88% Val. Acc: 82.27%
Epoch: 16 | Train Acc: 80.34% Val. Acc: 81.11%
Epoch: 17 | Train Acc: 80.56% Val. Acc: 81.52%
Epoch: 18 | Train Acc: 80.65% Val. Acc: 80.91%
Epoch: 19 | Train Acc: 81.01% Val. Acc: 80.31%
Epoch: 20 | Train Acc: 81.08% Val. Acc: 81.26%
Epoch: 21 | Train Acc: 80.95% Val. Acc: 81.69%
Epoch: 22 | T

In [22]:
# Restore the checkpoint with the best validation accuracy. map_location
# remaps the saved tensors onto the current device, so a checkpoint written
# on a GPU machine can still be loaded on a CPU-only one.
model.load_state_dict(torch.load("sarcasm_detect_model.pt", map_location=device))
model.eval()
# Final held-out evaluation on the test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print("Test Accuracy: ",test_acc)

Test Accuracy:  0.8233134925365448
