# Sarcasm Detection from News Headlines

In [1]:
# Import necessary libraries and modules
!pip install pandas scikit-learn
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import json
import pprint
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

np.random.seed(1234)
pp = pprint.PrettyPrinter(indent=4)
%matplotlib inline

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


### Make sure GPU is connected

In [2]:
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


### Reading and Preprocessing the data

In [3]:
# Read the data
def parse_data(file):
    for l in open(file,'r'):
        yield json.loads(l)

data1 = pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)
data2 = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines=True)
data = pd.concat([data1, data2])

In [4]:
# Print the first 5 pieces of data
data.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [5]:
# Remove "article_link" column
del data["article_link"]
data.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [6]:
# Check if the data has some null values
data[data.isnull()].sum()

headline        0.0
is_sarcastic    0.0
dtype: float64

In [7]:
# Show how many examples we have in our dataset
len(data)

55328

In [8]:
# Split the data
X = data["headline"].values
y = data["is_sarcastic"].values

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1234)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(49795,) (5533,) (49795,) (5533,)


In [10]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="OOV", num_words=100000, filters='!"#$%&()*+,-./:;<=>@[\]^_`{|}~ ')
tokenizer.fit_on_texts(X_train)

In [11]:
# Test the tokenizer
print(tokenizer.texts_to_sequences(["The quick brown fox jumped over the lazy dog."]))

[[4, 1681, 1266, 594, 15601, 46, 4, 3433, 211]]


In [12]:
# Pad the sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
max_seq_len = max(len(x) for x in X_train_seq)
X_train_seq = np.array([np.pad(x, (0, max_seq_len - len(x))) for x in X_train_seq])

X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_seq = np.array([np.pad(x, (0, max_seq_len - len(x))) for x in X_test_seq])

### Model Configuration

In [13]:
vocab_size = len(tokenizer.word_index) + 1
embed_size = 2

model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embed_size, input_shape=[None]),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embed_size)),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 2)           62410     
_________________________________________________________________
bidirectional (Bidirectional (None, 4)                 80        
_________________________________________________________________
dense (Dense)                (None, 32)                160       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 62,683
Trainable params: 62,683
Non-trainable params: 0
_________________________________________________________________


### Model training

In [14]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train_seq, y_train, epochs=30, validation_data=(X_test_seq, y_test), verbose=2)

Epoch 1/30
1557/1557 - 24s - loss: 0.3452 - accuracy: 0.8382 - val_loss: 0.2214 - val_accuracy: 0.9093
Epoch 2/30
1557/1557 - 20s - loss: 0.1270 - accuracy: 0.9560 - val_loss: 0.1421 - val_accuracy: 0.9516
Epoch 3/30
1557/1557 - 20s - loss: 0.0602 - accuracy: 0.9811 - val_loss: 0.1047 - val_accuracy: 0.9691
Epoch 4/30
1557/1557 - 20s - loss: 0.0323 - accuracy: 0.9899 - val_loss: 0.0938 - val_accuracy: 0.9763
Epoch 5/30
1557/1557 - 20s - loss: 0.0183 - accuracy: 0.9950 - val_loss: 0.0979 - val_accuracy: 0.9783
Epoch 6/30
1557/1557 - 20s - loss: 0.0106 - accuracy: 0.9970 - val_loss: 0.0944 - val_accuracy: 0.9819
Epoch 7/30
1557/1557 - 21s - loss: 0.0073 - accuracy: 0.9981 - val_loss: 0.1055 - val_accuracy: 0.9816
Epoch 8/30
1557/1557 - 20s - loss: 0.0053 - accuracy: 0.9984 - val_loss: 0.1315 - val_accuracy: 0.9816
Epoch 9/30
1557/1557 - 20s - loss: 0.0046 - accuracy: 0.9985 - val_loss: 0.1235 - val_accuracy: 0.9841
Epoch 10/30
1557/1557 - 20s - loss: 0.0036 - accuracy: 0.9990 - val_loss:

<tensorflow.python.keras.callbacks.History at 0x7f1f60865518>

### Model Evaluation

In [15]:
# Predict the testing data to find accuracy
pred = model.predict(X_test_seq).round()
print(accuracy_score(y_test, pred))

0.9821073558648111
