In [1]:
import pandas as pd
import json
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords

In [2]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
data = pd.read_json("Sarcasm_Headlines_Dataset.json", lines = True)

In [4]:
data.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [5]:
data.tail()

Unnamed: 0,article_link,headline,is_sarcastic
26704,https://www.huffingtonpost.com/entry/american-...,american politics in moral free-fall,0
26705,https://www.huffingtonpost.com/entry/americas-...,america's best 20 hikes,0
26706,https://www.huffingtonpost.com/entry/reparatio...,reparations and obama,0
26707,https://www.huffingtonpost.com/entry/israeli-b...,israeli ban targeting boycott supporters raise...,0
26708,https://www.huffingtonpost.com/entry/gourmet-g...,gourmet gifts for the foodie 2014,0


In [6]:
data.drop(['article_link'], axis=1)

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
...,...,...
26704,american politics in moral free-fall,0
26705,america's best 20 hikes,0
26706,reparations and obama,0
26707,israeli ban targeting boycott supporters raise...,0


In [7]:
sentence = data['headline'].tolist()

In [8]:
label = data['is_sarcastic'].tolist()

In [9]:
sentence[:10]

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way',
 "advancing the world's women",
 'the fascinating case for eating lab-grown meat',
 'this ceo will send your kids to school, if you work for his company',
 'top snake handler leaves sinking huckabee campaign',
 "friday's morning email: inside trump's presser for the ages"]

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
stop_words = set(stopwords.words("english"))

#Function to remove stopwords from a single sentence
def remove_stopwords(sentence):
    words = nltk.word_tokenize(sentence)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

#Remove stopwords from each sentence in the list
sentence = [remove_stopwords(sentence) for sentence in sentence]

#Print the sentences without stopwords
sentence[:10]

["former versace store clerk sues secret 'black code ' minority shoppers",
 "'roseanne ' revival catches thorny political mood , better worse",
 "mom starting fear son 's web series closest thing grandchild",
 'boehner wants wife listen , come alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday magical way',
 "advancing world 's women",
 'fascinating case eating lab-grown meat',
 'ceo send kids school , work company',
 'top snake handler leaves sinking huckabee campaign',
 "friday 's morning email : inside trump 's presser ages"]

In [12]:
train_size = round(len(sentence)*0.75)
train_sen = sentence[0:train_size]
test_sen = sentence[train_size:]
train_label = label[0:train_size]
test_label = label[train_size:]

In [13]:
vocab_size = 10000
oov_tok = "oov" #if a word is not present in the dictionary, it will be assigned in oov_tok (token)

In [14]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token = oov_tok)
tokenizer.fit_on_texts(train_sen)
word_index = tokenizer.word_index

In [15]:
word_index

{'oov': 1,
 "'s": 2,
 "'": 3,
 'trump': 4,
 'new': 5,
 'man': 6,
 "n't": 7,
 'year': 8,
 'one': 9,
 'report': 10,
 'area': 11,
 'woman': 12,
 'u': 13,
 'day': 14,
 'donald': 15,
 'says': 16,
 'time': 17,
 's': 18,
 'first': 19,
 'obama': 20,
 'women': 21,
 'like': 22,
 'old': 23,
 'get': 24,
 'world': 25,
 'people': 26,
 'life': 27,
 'nation': 28,
 'clinton': 29,
 'house': 30,
 'back': 31,
 'white': 32,
 'could': 33,
 'still': 34,
 'make': 35,
 '5': 36,
 'americans': 37,
 'way': 38,
 'family': 39,
 'gop': 40,
 'study': 41,
 'president': 42,
 'black': 43,
 'show': 44,
 'would': 45,
 'best': 46,
 'school': 47,
 'bill': 48,
 'years': 49,
 '3': 50,
 'police': 51,
 'america': 52,
 'know': 53,
 'hillary': 54,
 'watch': 55,
 'last': 56,
 'really': 57,
 '10': 58,
 'things': 59,
 'video': 60,
 'ca': 61,
 'going': 62,
 'death': 63,
 'good': 64,
 'state': 65,
 'american': 66,
 'finds': 67,
 'mom': 68,
 'home': 69,
 'love': 70,
 'may': 71,
 'need': 72,
 'child': 73,
 'health': 74,
 'say': 75,
 '2'

## Padding

In [16]:
max_length = 100 #max length of a sentence can be 100; if not specified, then the length of the longest sentence is sent
trunc_type = 'post' #sentence ke baad truncate karenge
padding_type = 'post'

In [17]:
training_sequences = tokenizer.texts_to_sequences(train_sen)

training_padded = pad_sequences(training_sequences, max_length, padding = padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(test_sen)

testing_padded = pad_sequences(testing_sequences, maxlen = max_length, padding = padding_type, truncating=trunc_type)

In [18]:
embedding_dim = 16 #determines the context of text
model = tf.keras.Sequential([tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),

    tf.keras.layers.GlobalAveragePooling1D(), 

    tf.keras.layers.Dense(24, activation = 'relu'), 

    tf.keras.layers.Dense(1, activation='sigmoid')])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 24)                408       
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 160433 (626.69 KB)
Trainable params: 160433 (626.69 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
training_padded = np.array(training_padded)
training_labels = np.array(train_label)
testing_padded = np.array(testing_padded)
testing_labels = np.array(test_label)
#converting the list to array for tensorflow

In [20]:
#training the model

num_epochs = 50
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data = (testing_padded, testing_labels), verbose=2)

Epoch 1/50
626/626 - 6s - loss: 0.6825 - accuracy: 0.5603 - val_loss: 0.6687 - val_accuracy: 0.5660 - 6s/epoch - 9ms/step
Epoch 2/50
626/626 - 3s - loss: 0.5815 - accuracy: 0.7050 - val_loss: 0.5030 - val_accuracy: 0.7749 - 3s/epoch - 6ms/step
Epoch 3/50
626/626 - 3s - loss: 0.4148 - accuracy: 0.8244 - val_loss: 0.4419 - val_accuracy: 0.7972 - 3s/epoch - 6ms/step
Epoch 4/50
626/626 - 4s - loss: 0.3382 - accuracy: 0.8588 - val_loss: 0.4256 - val_accuracy: 0.8058 - 4s/epoch - 6ms/step
Epoch 5/50
626/626 - 4s - loss: 0.2941 - accuracy: 0.8800 - val_loss: 0.4273 - val_accuracy: 0.8075 - 4s/epoch - 6ms/step
Epoch 6/50
626/626 - 4s - loss: 0.2612 - accuracy: 0.8954 - val_loss: 0.4300 - val_accuracy: 0.8108 - 4s/epoch - 6ms/step
Epoch 7/50
626/626 - 4s - loss: 0.2392 - accuracy: 0.9047 - val_loss: 0.4398 - val_accuracy: 0.8095 - 4s/epoch - 6ms/step
Epoch 8/50
626/626 - 4s - loss: 0.2180 - accuracy: 0.9132 - val_loss: 0.4549 - val_accuracy: 0.8078 - 4s/epoch - 6ms/step
Epoch 9/50
626/626 - 4s 

In [21]:
sen = ["Coworkers at bathroom sink locked in tense standoff over who is going to wash hands longer",
      "The covid cases are rising"]
seq = tokenizer.texts_to_sequences(sen)
padded = pad_sequences(seq,maxlen=max_length, padding = padding_type, truncating = trunc_type)
print(model.predict(padded))

[[1.000000e+00]
 [9.784837e-05]]
