In [1]:
import kagglehub
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf

In [2]:
# Download latest version
path = kagglehub.dataset_download("danofer/sarcasm")

print("Path to dataset files:", path)

Path to dataset files: /home/eugen/.cache/kagglehub/datasets/danofer/sarcasm/versions/4


In [3]:
dataset = pd.read_csv(path + '/train-balanced-sarcasm.csv')
dataset.head(3)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.


In [4]:
dataset = dataset[['label', 'comment']]
dataset = dataset.sample(n=20000)
dataset.info(), dataset.head(3), dataset.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 10702 to 25661
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    20000 non-null  int64 
 1   comment  19999 non-null  object
dtypes: int64(1), object(1)
memory usage: 468.8+ KB


(None,
         label                                            comment
 10702       1                                       i forgot the
 82187       0                    And the quick goodbye to Canada
 967628      0  I'm arguing the metric for measuring how well ...,
 label      0
 comment    1
 dtype: int64)

In [5]:
dataset.dropna(inplace=True)
dataset.isna().sum()

label      0
comment    0
dtype: int64

In [6]:
dataset['comment'] = dataset['comment'].apply(lambda comment: comment.lower())
dataset['comment'] = dataset['comment'].str.replace(r'[^a-zA-Z\s]', '', regex=True)
dataset.sample(10)

Unnamed: 0,label,comment
483736,1,oh so you loved the deckbuilder one
661743,1,says the class with overpowered cold snap heals
527469,0,its a different world down there come brother ...
258498,1,ledeclined to give respect to the rim
28751,1,thanks for that mental image
79873,1,i think we can safely say the game is unplayab...
549783,0,optionf
167223,0,we should talk to the mods about getting some ...
643023,1,oh great another chore like the necrochasm tha...
372358,1,but its not a book they wrote its the literal ...


In [7]:
tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-uncased')

In [8]:
def tokenize_text(text):
    return tokenizer(
         text.tolist(),
         max_length = 100,
         truncation = True,
         padding = 'max_length',
         return_tensors = 'np'
    )

tokenized_comments = tokenize_text(dataset['comment'])
tokenized_comments

{'input_ids': array([[  101,  1045,  9471, ...,     0,     0,     0],
       [  101,  1998,  1996, ...,     0,     0,     0],
       [  101, 10047,  9177, ...,     0,     0,     0],
       ...,
       [  101, 16421,  8758, ...,     0,     0,     0],
       [  101,  2026,  5542, ...,     0,     0,     0],
       [  101,  2763,  2026, ...,     0,     0,     0]]), 'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])}

In [24]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
model = tf.keras.Sequential([
    bert_model,
    tf.keras.layers.Lambda(lambda bert_output: bert_output.pooler_output, name="extract_pooler_output"),
    tf.keras.layers.Dense(128, activation='relu'),
    #tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    #tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [25]:
from sklearn.model_selection import train_test_split


labels_tf = tf.constant(dataset['label'].values)
indices = np.arange(dataset.shape[0])
train_indices, test_indices = train_test_split(
    indices,
    test_size=0.2,
    stratify=dataset['label'].values
)

X_train = {
        'input_ids': tf.gather(tokenized_comments['input_ids'], train_indices),
        'attention_mask': tf.gather(tokenized_comments['attention_mask'], train_indices)
    }

X_test = {
    'input_ids': tf.gather(tokenized_comments['input_ids'], test_indices),
    'attention_mask': tf.gather(tokenized_comments['attention_mask'], test_indices)
}

y_train = tf.gather(labels_tf, train_indices)
y_test = tf.gather(labels_tf, test_indices)

model.fit(X_train, y_train, epochs=3, batch_size=16)

Epoch 1/3


I0000 00:00:1749135957.323350  281117 service.cc:152] XLA service 0x7f3884960b90 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1749135957.323376  281117 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Laptop GPU, Compute Capability 8.9
2025-06-05 18:05:57.330202: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1749135957.346995  281117 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1749135957.464110  281117 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7f39d84f15d0>

In [26]:
y_pred_trained = model.predict(X_train)
y_pred_test = model.predict(X_test)



In [None]:
max(y_pred_trained), min(y_pred_trained), max(y_pred_test), min(y_pred_test)

(array([0.99869955], dtype=float32),
 array([0.01339717], dtype=float32),
 array([0.99862707], dtype=float32),
 array([0.01438333], dtype=float32))

In [29]:
y_pred_train_results = (y_pred_trained > 0.5).astype(int).flatten()
y_pred_test_results = (y_pred_test > 0.5).astype(int).flatten()

full_pred = np.empty(len(dataset), dtype=int)

full_pred[train_indices] = y_pred_train_results
full_pred[test_indices] = y_pred_test_results
    
dataset['predicted'] = full_pred
dataset.sample(20)

Unnamed: 0,label,comment,predicted
589228,0,haha no if boston fans hate bart then im expec...,0
507884,1,but dont worry we made up for it with sorry day,1
512443,1,got rejected from dream resume booster plz help,0
528637,1,yeah acne is a really secret brand and the chi...,1
483220,1,positive navi comment fangay that needs to stfu,0
740575,1,shhhh nothing to see here move along people mo...,0
765010,0,sure sounds like it,0
321043,0,i hated being made to feel i was doing somethi...,0
837152,1,is it a picture of koishi,0
94617,0,i wished the cartoon looked actually good inst...,0
