In [59]:
# Fetch the sarcasm-headlines JSON; TLS certificate verification stays enabled.
!wget https://storage.googleapis.com/learning-datasets/sarcasm.json \
    -O /tmp/sarcasm.json

--2024-02-24 04:34:45--  https://storage.googleapis.com/learning-datasets/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.15.251, 172.217.164.27, 172.217.0.91, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.15.251|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2024-02-24 04:34:45 (196 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [60]:
import json

# Parse the downloaded file into a list of records. The encoding is pinned to
# UTF-8 so the result does not depend on the machine's locale default.
with open("/tmp/sarcasm.json", 'r', encoding='utf-8') as f:
  data = json.load(f)

In [61]:
# Preview a handful of records instead of dumping the whole dataset into the output.
data[:5]

[{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
  'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
  'is_sarcastic': 0},
 {'article_link': 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365',
  'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
  'is_sarcastic': 0},
 {'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697',
  'headline': "mom starting to fear son's web series closest thing she will have to grandchild",
  'is_sarcastic': 1},
 {'article_link': 'https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302',
  'headline': 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
  'is_sarcastic': 1},
 {'article_link': 'https://www.huffingtonpost.com/entry/jk-rowling-w

In [62]:
# Split every record into its headline text and its sarcasm flag, keeping the
# two lists index-aligned.
sentences = [record['headline'] for record in data]
labels = [record['is_sarcastic'] for record in data]

In [63]:
# Show only the first few headlines; displaying the full list floods the output.
sentences[:15]

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way',
 "advancing the world's women",
 'the fascinating case for eating lab-grown meat',
 'this ceo will send your kids to school, if you work for his company',
 'top snake handler leaves sinking huckabee campaign',
 "friday's morning email: inside trump's presser for the ages",
 'airline passengers tackle man who rushes cockpit in bomb threat',
 'facebook reportedly working on healthcare features and apps',
 "north korea praises trump and urges us voters to reject 'dull hillary'",
 "actually, cnn's jeffrey lord has been 'indefensible' for a while",
 'barcelona holds huge protest in su

In [64]:
import nltk
# Fetch the NLTK resources used by this notebook: the 'stopwords' corpus
# (read later through stopwords.words('english')) and the 'punkt' package
# (presumably for word_tokenize, which is imported later — TODO confirm it is
# still needed).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [65]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [66]:
# Build a word-level vocabulary from the headlines; '<OOV>' is the placeholder
# token for words that were not seen while fitting.
# NOTE(review): the vocabulary is fitted here on the raw headlines, while a
# later cell lemmatizes `sentences` and strips stop words — confirm that
# fitting before that preprocessing is intended.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)

In [67]:
# Word -> integer-id mapping learned by the fitted tokenizer.
word_index = tokenizer.word_index

Stop-word removal and lemmatization

In [68]:
# Number of headlines in the corpus.
num_sentences = len(sentences)
# Backward-compatible alias: other cells reference the original, misspelled name.
num_sencetence = num_sentences

In [69]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import spacy
nlp = spacy.load('en_core_web_sm')

# English stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Replace each headline with the space-joined lemmas of its non-stop-word tokens.
# The result is built as a new list and bound to `sentences` only once it is
# complete, so an interrupted run cannot leave the list half-rotated and out of
# step with `labels` (which the previous pop(0)/append loop could do, at
# quadratic cost). nlp.pipe processes the texts in batches and yields the docs
# in input order.
sentences = [
    " ".join(token.lemma_ for token in doc if token.text.lower() not in stop_words)
    for doc in nlp.pipe(sentences)
]


In [70]:
# Spot-check the first few preprocessed headlines rather than printing all of them.
sentences[:15]

["former versace store clerk sue secret ' black code ' minority shopper",
 "' roseanne ' revival catch thorny political mood , well bad",
 "mom start fear son 's web series close thing grandchild",
 'boehner want wife listen , come alternative debt - reduction idea',
 'j.k . rowling wish snape happy birthday magical way',
 "advance world 's woman",
 'fascinating case eat lab - grow meat',
 'ceo send kid school , work company',
 'top snake handler leave sink huckabee campaign',
 "friday 's morning email : inside trump 's presser age",
 'airline passenger tackle man rush cockpit bomb threat',
 'facebook reportedly work healthcare feature app',
 "north korea praise trump urge we voter reject ' dull hillary '",
 "actually , cnn 's jeffrey lord ' indefensible '",
 'barcelona hold huge protest support refugee',
 "nuclear bomb detonate rehearsal ' spider - man ' musical",
 'cosby lawyer ask accuser not come forward smear legal team year ago',
 'stock analyst confuse , frighten boar market',
 

In [71]:
# Number of distinct entries in the tokenizer vocabulary.
len(word_index)

29657

In [72]:
# `sentences` now holds lemmatized, stop-word-filtered text, but the tokenizer
# was fitted earlier on the raw headlines. Refit it on the text that is actually
# encoded, so lemmas are not needlessly mapped to '<OOV>' and the vocabulary
# carries no entries for words that were removed.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index  # keep the exported vocabulary in sync

# Encode every headline as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(sentences)

In [73]:
from tensorflow.keras.utils import pad_sequences

In [74]:
# Turn the ragged id lists into one rectangular array; padding="post" places
# the fill values after each sequence. No maxlen is given, so the width is
# presumably that of the longest sequence — confirm against padded.shape.
padded = pad_sequences(sequences, padding="post")

In [75]:
# Dimensions of the padded matrix (number of headlines, padded sequence length).
padded.shape

(26709, 43)

In [76]:
import numpy as np


# Ordered 80/20 split without shuffling: the leading 80% of rows train the
# model and the remaining rows validate it.
assert len(labels) == len(padded), "labels and padded sequences are misaligned"

# The split index is taken from the array being split rather than from a count
# captured in an earlier cell. Despite its name, `num_test` is the number of
# *training* rows; the name is kept so other cells that use it keep working.
num_test = int(len(padded) * 0.8)
padded_train = padded[:num_test]
padded_valid = padded[num_test:]

label_train = np.array(labels[:num_test])
label_valid = np.array(labels[num_test:])

In [77]:
# Dimensions of the validation slice, as a sanity check on the split.
padded_valid.shape

(5342, 43)

BUILD MODEL

In [78]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, RNN, Dense, LSTM, Bidirectional, SimpleRNN
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
import tensorflow as tf

In [79]:
# Adam optimizer with a fixed learning rate of 1e-4.
opt = Adam(learning_rate=1e-4)

In [80]:
# Early-stopping callback that watches the validation loss. With patience=0 and
# min_delta=0 it is configured to tolerate no non-improving epoch, and
# restore_best_weights=False leaves the model with its final-epoch weights.
callbacks = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='auto', baseline=None,   # what is tracked
    min_delta=0, patience=0, start_from_epoch=0,      # when to stop
    restore_best_weights=False, verbose=0,            # what happens on stop
)

In [81]:
# +1 because Keras Tokenizer ids start at 1; id 0 is never assigned to a word
# and is what pad_sequences fills with.
vocab_size = len(word_index) + 1
embedding_size = 32  # dimensionality of each learned word vector
# Read the sequence length from the padded matrix instead of hard-coding it, so
# the model input stays correct if the data or preprocessing changes.
max_length = padded.shape[1]

In [82]:
# Binary classifier: embedding -> bidirectional LSTM -> small dense head,
# with dropout before and after the hidden dense layer and a single sigmoid
# output unit.
model4 = Sequential([
    Embedding(vocab_size, embedding_size, input_length=max_length),
    Bidirectional(LSTM(64)),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

In [85]:
# Configure training: binary cross-entropy loss, the `opt` optimizer defined
# earlier, and accuracy reported under the metric name 'acc'.
model4.compile(optimizer=opt, loss='binary_crossentropy', metrics=['acc'])

In [84]:
# Train for at most 10 epochs, evaluating on the validation split after each one.
# The EarlyStopping callback configured above is passed in explicitly; without
# it the val_loss monitor is never consulted and every epoch always runs.
model4.fit(
    padded_train, label_train,
    epochs=10,
    validation_data=(padded_valid, label_valid),
    callbacks=[callbacks],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f5fe38a6200>