# Sarcasm Detection with Fine-tuned BERT
`@author Haemin Choi`
- Set up
- Load dataset
- Tokenize dataset
- Fine-tuning
- Save model
- Load model
- Evaluation

### Set up

In [None]:
from google.colab import drive

# Mount Google Drive at the conventional Colab mount point. The Drive contents
# then appear under /content/drive/MyDrive, so the
# "/content/drive/MyDrive/UOL" paths used when saving and loading the model
# resolve to the UOL folder inside My Drive instead of to the root of a mount
# placed at that path.
drive.mount('/content/drive')

In [None]:
!pip install transformers



In [None]:
import os
!pip install --upgrade transformers
!pip install tf-keras
os.environ['TF_USE_LEGACY_KERAS']= '1'



In [None]:
import pandas as pd
import numpy as np
import urllib.request
import tensorflow as tf
import transformers
import re
import os
from tqdm import tqdm
from tensorflow import keras

### Load Dataset

In [None]:
from bs4 import BeautifulSoup
import json

# Read the headline dataset. The encoding is given explicitly so the file is
# decoded the same way regardless of the platform's default locale.
with open("sarcasm.json", 'r', encoding="utf-8") as f:
  datastore = json.load(f)

# One lower-cased headline and one is_sarcastic label per record, in file order.
# (The unused `urls` list from the earlier version was dropped.)
sentences = [item['headline'].lower() for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]

In [None]:
# Print how many headlines and how many labels were loaded, one count per line.
for loaded in (sentences, labels):
    print(len(loaded))

26709
26709


In [None]:
# Preview the first ten lower-cased headlines.
sentences[0:10]

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way',
 "advancing the world's women",
 'the fascinating case for eating lab-grown meat',
 'this ceo will send your kids to school, if you work for his company',
 'top snake handler leaves sinking huckabee campaign',
 "friday's morning email: inside trump's presser for the ages"]

In [None]:
from sklearn.model_selection import train_test_split

# Split into train/validation/test at a 6:2:2 ratio in two steps:
# first hold out 40% of the data, then cut that hold-out in half.
# Both steps use random_state=0 so the partition is repeatable.
train_X, temp_X, train_y, temp_y = train_test_split(
    sentences,
    labels,
    test_size=0.4,
    random_state=0,
)
val_X, test_X, val_y, test_y = train_test_split(
    temp_X,
    temp_y,
    test_size=0.5,
    random_state=0,
)

In [None]:
# Report the number of examples in each split, one line per split.
print(
    f'train set length: {len(train_X)}',
    f'valid set length: {len(val_X)}',
    f'test  set length: {len(test_X)}',
    sep='\n',
)

train set length: 16025
valid set length: 5342
test  set length: 5342


### Tokenize Dataset

In [None]:
from transformers import BertTokenizerFast

In [None]:
# Load the pretrained fast tokenizer of the "klue/bert-base" checkpoint.
# The earlier `max_len=50, truncation=True, padding=True` arguments were
# removed: they had no effect at load time (in the recorded run the tokenized
# training set is padded to 122 tokens, not 50). Truncation and padding are
# decided by the arguments given when the tokenizer is called on text.
# NOTE(review): klue/bert-base is, to my knowledge, a Korean-language
# checkpoint while the headlines are English. Switching to an English
# checkpoint would also require the same checkpoint name in the model-loading
# cell, so it is not changed here — confirm the intended model.
tokenizer = BertTokenizerFast.from_pretrained("klue/bert-base")  # klue/bert

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

In [None]:
# Encode the raw headlines of the train and validation splits with identical
# options (truncation on, padding on).
# Note: train_X / val_X are rebound from lists of strings to tokenizer output,
# so re-running this cell without re-running the split cell would feed
# already-encoded data back into the tokenizer.
encode_options = dict(truncation=True, padding=True)
train_X = tokenizer(train_X, **encode_options)
val_X = tokenizer(val_X, **encode_options)

In [None]:
# Wrap each split as a tf.data dataset whose elements are
# (dict of encoded features, label) pairs.
train_dataset, val_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(encoded), targets))
    for encoded, targets in ((train_X, train_y), (val_X, val_y))
)

In [None]:
# Display the dataset's repr to inspect its element spec
# (feature names, per-example shapes and dtypes).
train_dataset

<_TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(122,), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(122,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(122,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.int32, name=None))>

### Fine-tuning


In [None]:
from transformers import TFBertForSequenceClassification

In [None]:
# Size of the classification head: two classes (labels 0 and 1).
num_labels = 2
# Checkpoint to fine-tune; the tokenizer cell loads the same checkpoint name.
model_name = "klue/bert-base"
# Adam from the legacy optimizer namespace with a learning rate of 2e-5.
optimizer = tf.keras.optimizers.legacy.Adam(
    learning_rate=2e-5,
)

In [None]:
# Load the checkpoint into a TF sequence-classification model with a
# `num_labels`-way head; from_pt=True makes the loader read the PyTorch weights.
model = TFBertForSequenceClassification.from_pretrained(
    model_name,
    from_pt=True,
    num_labels=num_labels,
)

# Train with the model's own loss function and track accuracy.
model.compile(
    loss=model.hf_compute_loss,
    metrics=['accuracy'],
    optimizer=optimizer,
)

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Stop training as soon as validation accuracy fails to improve by at least
# 0.001 over the best value seen so far (patience=0 tolerates no such epoch).
# restore_best_weights=True rolls the model back to the best epoch's weights
# when the stop is triggered; without it the model keeps the weights of the
# epoch that caused the stop (the worse one), and those are what the
# "Save Model" section would write out as `best_bert`.
callback_earlystop = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.001,
    patience=0,
    restore_best_weights=True
)

In [None]:
# Shuffle with a buffer that covers the whole training set so each epoch is
# drawn in a uniformly random order (a buffer smaller than the dataset only
# shuffles within a sliding window), then group into batches of 64.
# Note: this rebinds train_dataset/val_dataset, so run the cell only once per
# freshly built dataset — running it again would batch the batches.
train_dataset = train_dataset.shuffle(buffer_size=train_dataset.cardinality())
train_dataset = train_dataset.batch(64)
# Validation metrics do not depend on example order, so the validation set is
# only batched, not shuffled.
val_dataset = val_dataset.batch(64)

In [None]:
# Fine-tune for at most 3 epochs, evaluating on the validation set after each
# epoch and letting the early-stopping callback end training sooner.
# batch_size is intentionally not passed: both datasets are already batched,
# and Keras documents that batch_size must not be specified for dataset inputs.
history = model.fit(
    train_dataset,
    epochs=3,
    validation_data=val_dataset,
    callbacks=[callback_earlystop]
)

Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/3
Epoch 3/3


### Save Model

In [None]:
MODEL_NAME = 'best_bert'
MODEL_SAVE_PATH = os.path.join("/content/drive/MyDrive/UOL", MODEL_NAME)

# Create the output folder on first use and report which case applied.
if not os.path.exists(MODEL_SAVE_PATH):
  os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
  print(f"{MODEL_SAVE_PATH} -- Folder create complete \n")
else:
  print(f"{MODEL_SAVE_PATH} -- Folder already exists \n")

# Write the fine-tuned model and the tokenizer files into the same folder so
# both can later be reloaded from that single path.
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

/content/drive/MyDrive/UOL/best_bert -- Folder create complete 



('/content/drive/MyDrive/UOL/best_bert/tokenizer_config.json',
 '/content/drive/MyDrive/UOL/best_bert/special_tokens_map.json',
 '/content/drive/MyDrive/UOL/best_bert/vocab.txt',
 '/content/drive/MyDrive/UOL/best_bert/added_tokens.json',
 '/content/drive/MyDrive/UOL/best_bert/tokenizer.json')

### Load Model

In [None]:
from transformers import TextClassificationPipeline

# Location of the fine-tuned model. Redefined here (same values as in the
# "Save Model" section) so this section does not depend on that cell.
MODEL_NAME = 'best_bert'
MODEL_SAVE_PATH = os.path.join("/content/drive/MyDrive/UOL", MODEL_NAME)

# Reload the model first, then the tokenizer saved alongside it.
loaded_model = TFBertForSequenceClassification.from_pretrained(MODEL_SAVE_PATH)
loaded_tokenizer = BertTokenizerFast.from_pretrained(MODEL_SAVE_PATH)

# TensorFlow text-classification pipeline; return_all_scores=True makes it
# return a score for every label rather than only one, which the evaluation
# loop relies on when it picks the highest-scoring label itself.
text_classifier = TextClassificationPipeline(
    model=loaded_model,
    tokenizer=loaded_tokenizer,
    framework='tf',
    return_all_scores=True,
)

Some layers from the model checkpoint at /content/drive/MyDrive/UOL/best_bert were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/UOL/best_bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


### Evaluation

In [None]:
# Put the held-out headlines and their gold labels side by side in one table.
test_set = pd.DataFrame(data=dict(sentence=test_X, label=test_y))

In [None]:
predicted_label_list = []
predicted_score_list = []

# Classify the test headlines one at a time, keeping the top label and score.
for text in tqdm(test_set['sentence']):
  # The pipeline returns one list of {label, score} entries per input text.
  label_scores = text_classifier(text)[0]

  # Highest-scoring entry; on a tie the earlier entry wins.
  top_prediction = max(label_scores, key=lambda entry: entry['score'])
  # Keep only the digits of the label name to get the integer class id
  # (presumably names look like 'LABEL_1' — verify against the model config).
  predicted_label_list.append(int(re.sub(r'[^0-9]', '', top_prediction['label'])))
  predicted_score_list.append(top_prediction['score'])

100%|██████████| 5342/5342 [37:07<00:00,  2.40it/s]


In [None]:
# Attach the predicted class and its score to the test table, then show ten
# randomly drawn rows as a spot check.
test_set['pred'], test_set['score'] = predicted_label_list, predicted_score_list
test_set.sample(10)

Unnamed: 0,sentence,label,pred,score
4629,billy bush reportedly out at 'today' and negot...,0,0,0.995132
624,flaws in how we evaluate leaders (from kahnema...,0,0,0.998795
4716,charlottesville shows that states must amend t...,0,0,0.985195
282,blm's alicia garza launches census project to ...,0,0,0.904739
3844,college graduate accepts position above parent...,1,1,0.784654
193,pastor going on little spiel about seeing how ...,1,1,0.997881
2270,how to read a bad book by a great author,0,0,0.998635
4167,eu court issues landmark data ruling,0,1,0.826723
4951,china's potemkin villages,0,0,0.894045
657,florida man killed after standing up for gay f...,0,0,0.99716


In [None]:
# Number of test rows per predicted class (non-null count in every other column).
test_set.groupby('pred').count()

Unnamed: 0_level_0,sentence,label,score
pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3056,3056,3056
1,2286,2286,2286


In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 plus the averages, printed with four
# decimal places; gold labels come from 'label', predictions from 'pred'.
report = classification_report(
    y_true=test_set['label'],
    y_pred=test_set['pred'],
    digits=4,
)
print(report)

              precision    recall  f1-score   support

           0     0.8825    0.8948    0.8886      3014
           1     0.8613    0.8458    0.8535      2328

    accuracy                         0.8735      5342
   macro avg     0.8719    0.8703    0.8711      5342
weighted avg     0.8733    0.8735    0.8733      5342



In [None]:
from sklearn.metrics import precision_score , recall_score , f1_score , confusion_matrix

# Macro-averaged metrics: each is computed per class and then averaged with
# equal weight per class.
precision = precision_score(y_true=test_set['label'], y_pred=test_set['pred'], average="macro")
recall = recall_score(y_true=test_set['label'], y_pred=test_set['pred'], average="macro")
# Macro-F1 is the mean of the per-class F1 scores. It is not the harmonic mean
# of macro precision and macro recall (the previous formula), which can give a
# different number and divides by zero when both are 0.
f1 = f1_score(y_true=test_set['label'], y_pred=test_set['pred'], average="macro")

print("Precision: {}".format(round(precision, 4)))
print("Recall: {}".format(round(recall, 4)))
print("F1 Score {}".format(round(f1, 4)))

Precision: 0.8719
Recall: 0.8703
F1 Score 0.8711
