In [1]:
!pip install tensorflow
import zipfile
import pandas as pd
import re
import numpy as np
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import accuracy_score, classification_report
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')



Loading Dataset

In [2]:
!mkdir -p ~/.kaggle

In [3]:
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle datasets download -d danofer/sarcasm

Dataset URL: https://www.kaggle.com/datasets/danofer/sarcasm
License(s): copyright-authors
Downloading sarcasm.zip to /content
 89% 192M/216M [00:00<00:00, 365MB/s]
100% 216M/216M [00:00<00:00, 380MB/s]


In [5]:
zip_ref=zipfile.ZipFile('sarcasm.zip','r')
zip_ref.extractall('/content')
zip_ref.close()

In [6]:
df = pd.read_csv('train-balanced-sarcasm.csv')
df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [7]:
df.shape

(1010826, 10)

In [8]:
df = df[:10000]

In [9]:
df = df[['label','comment']]
df.head()

Unnamed: 0,label,comment
0,0,NC and NH.
1,0,You do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G..."
3,0,"This meme isn't funny none of the ""new york ni..."
4,0,I could use one of those tools.


In [10]:
df.isna().sum()

Unnamed: 0,0
label,0
comment,1


In [11]:
df.dropna(inplace=True)
df.isna().sum()

Unnamed: 0,0
label,0
comment,0


In [12]:
def clean_and_lower(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    return text.lower().strip()

df['comment'] = df['comment'].apply(clean_and_lower)

In [13]:
df.head()

Unnamed: 0,label,comment
0,0,nc and nh.
1,0,you do know west teams play against west teams...
2,0,"they were underdogs earlier today, but since g..."
3,0,"this meme isn't funny none of the ""new york ni..."
4,0,i could use one of those tools.


Tokenization

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [15]:
def tokenize_data(texts, max_length=256):
    enc = tokenizer(
        texts.tolist(),
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    return {
        'input_ids': np.array(enc['input_ids']),
        'attention_mask': np.array(enc['attention_mask'])
    }


tokenized_data = tokenize_data(df['comment'])

In [16]:
tokenized_data

{'input_ids': array([[  101, 13316,  1998, ...,     0,     0,     0],
        [  101,  2017,  2079, ...,     0,     0,     0],
        [  101,  2027,  2020, ...,     0,     0,     0],
        ...,
        [  101,  5095,  2305, ...,     0,     0,     0],
        [  101, 29420,  2015, ...,     0,     0,     0],
        [  101,  2016, 28719, ...,     0,     0,     0]]),
 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])}

Splitting

In [17]:
X = tokenized_data['input_ids']
y = df['label']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,  stratify=y, random_state=42)

In [19]:
X_train.shape, X_test.shape

((7999, 256), (2000, 256))

In [20]:
unique, counts = np.unique(y_train, return_counts=True)
print(dict(zip(unique, counts)))

{np.int64(0): np.int64(5031), np.int64(1): np.int64(2968)}


Model Building (research paper architecture)

In [21]:
class HierarchicalBERT(tf.keras.Model):
    def __init__(self, bert_model, lstm_units, cnn_filters, dense_units):
        super(HierarchicalBERT, self).__init__()
        self.bert = bert_model

        # sentence encoding layer
        self.dense_sentence = tf.keras.layers.Dense(768, activation='relu')

        # context summarization layer
        self.mean_pooling = tf.keras.layers.GlobalAveragePooling1D()

        # context encoder layer
        self.bilstm_encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, return_sequences=True))

        # CNN layer
        self.cnn_layer = tf.keras.layers.Conv1D(filters=cnn_filters,kernel_size=2,activation='relu')

        # pooling layer after CNN
        self.pool = tf.keras.layers.GlobalMaxPooling1D()

        # fully connected layer
        self.dense_layer = tf.keras.layers.Dense(dense_units, activation='relu')

        # output layer
        self.output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        # BERT embeddings
        bert_embeddings = self.bert(inputs)[0]  # shape: (batch_size, seq_len, 768)

        # sentence encoding layer
        sentence_embeddings = self.dense_sentence(bert_embeddings)

        # context summarization
        context_summary = self.mean_pooling(sentence_embeddings)  # shape: (batch_size, 768)

        # expand the dimension (for LSTM input)
        context_summary = tf.expand_dims(context_summary, axis=1)  # shape: (batch_size, 1, 768)

        # context encoder layer
        context_encoding = self.bilstm_encoder(context_summary)  # shape: (batch_size, 1, 2*lstm_units)

        # squeezing the dimension (we're done with LSTM)
        context_encoding = tf.squeeze(context_encoding, axis=1)  # shape: (batch_size, 2*lstm_units)

        # adding the channel dimension to match the required shape by conv layer
        context_encoding_expanded = tf.expand_dims(context_encoding, axis=-1)  # shape: (batch_size, features, 1)

        # CNN layer
        cnn_output = self.cnn_layer(context_encoding_expanded)  # shape: (batch_size, new_len, filters)

        # pooling after CNN
        pooled_output = self.pool(cnn_output)  # shape: (batch_size, filters)

        # fully connected layer
        dense_output = self.dense_layer(pooled_output)  # shape: (batch_size, dense_units)

        # output layer
        final_output = self.output_layer(dense_output)  # shape: (batch_size, 1)

        return final_output


In [22]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [23]:
# defining the hierarchical BERT model
model = HierarchicalBERT(bert_model, lstm_units=128, cnn_filters=64, dense_units=32)

In [24]:
def focal_loss(gamma=2., alpha=0.25):
    def loss(y_true, y_pred):
        bce = K.binary_crossentropy(y_true, y_pred)
        p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
        return alpha * K.pow(1. - p_t, gamma) * bce
    return loss

In [25]:
model.compile(optimizer='adam', loss=focal_loss(gamma=2.0, alpha=0.75),metrics=['accuracy'])

In [26]:
!pip uninstall -y keras # transformers, it automatically installed the standalone keras==3.x (confusion)

Found existing installation: keras 3.8.0
Uninstalling keras-3.8.0:
  Successfully uninstalled keras-3.8.0


In [27]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, verbose=1)
]


In [28]:
model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks
)

Epoch 1/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 709ms/step - accuracy: 0.6255 - loss: 0.1251 - val_accuracy: 0.6212 - val_loss: 0.1253 - learning_rate: 0.0010
Epoch 2/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 786ms/step - accuracy: 0.6350 - loss: 0.1245 - val_accuracy: 0.6212 - val_loss: 0.1250 - learning_rate: 0.0010
Epoch 3/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 582ms/step - accuracy: 0.6300 - loss: 0.1249
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 788ms/step - accuracy: 0.6300 - loss: 0.1249 - val_accuracy: 0.6212 - val_loss: 0.1250 - learning_rate: 0.0010
Epoch 4/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 580ms/step - accuracy: 0.6376 - loss: 0.1238
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x7d929653da50>

In [29]:
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, digits=4))

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 681ms/step
              precision    recall  f1-score   support

           0     0.6290    1.0000    0.7723      1258
           1     0.0000    0.0000    0.0000       742

    accuracy                         0.6290      2000
   macro avg     0.3145    0.5000    0.3861      2000
weighted avg     0.3956    0.6290    0.4857      2000



In [31]:
model.save('/content/sarcasm_model.keras')
tokenizer.save_pretrained('/content/bert_tokenizer')

('/content/bert_tokenizer/tokenizer_config.json',
 '/content/bert_tokenizer/special_tokens_map.json',
 '/content/bert_tokenizer/vocab.txt',
 '/content/bert_tokenizer/added_tokens.json')