# GPT-2 Transformer Model for Sarcasm Detection

In [None]:
import torch
import numpy as np
import pandas as pd
import os
import json
import pandas as pd
import numpy as np
import nltk
import gensim
import re
import copy
import spacy
import gc
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from tqdm import tqdm
from transformers import GPT2TokenizerFast
from tokenizers import ByteLevelBPETokenizer

# Download NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

print(torch.cuda.is_available())
print(spacy.prefer_gpu())

True
True


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/lfrostbyte/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lfrostbyte/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/lfrostbyte/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
df = pd.read_json(path_or_buf="./Sarcasm_Headlines_Dataset.json", lines=True)
df.rename(columns={"headline": "text", "is_sarcastic": "label"}, inplace=True)

# # For convenience
# train_df = train_df.head(100)

sarcastic_df = df[df['label'] == 1]
non_sarcastic_df = df[df['label'] == 0]
sarcastic_df = sarcastic_df.reset_index(drop=True)
non_sarcastic_df = non_sarcastic_df.reset_index(drop=True)

sarcastic_df

Unnamed: 0,article_link,text,label
0,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
1,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
2,https://politics.theonion.com/top-snake-handle...,top snake handler leaves sinking huckabee camp...,1
3,https://entertainment.theonion.com/nuclear-bom...,nuclear bomb detonates during rehearsal for 's...,1
4,https://www.theonion.com/cosby-lawyer-asks-why...,cosby lawyer asks why accusers didn't come for...,1
...,...,...,...
11719,https://www.theonion.com/new-bailiff-tired-of-...,new bailiff tired of hearing how old bailiff d...,1
11720,https://www.theonion.com/breaking-the-onion-in...,breaking: 'the onion' in kill range of boston ...,1
11721,https://www.theonion.com/seaworld-crowd-applau...,seaworld crowd applauds for dolphin playfully ...,1
11722,https://politics.theonion.com/pentagon-to-with...,pentagon to withhold budget figures out of res...,1


In [11]:
def train_gpt2_tokenizer_from_df(df, save_path, vocab_size=52000):
    # Create the directory if it doesn't exist
    os.makedirs(save_path, exist_ok=True)
    # Extract texts from the specified column and drop NaNs
    texts = df["text"].dropna().astype(str).tolist()
    
    # Save texts to a temporary file (ByteLevelBPETokenizer requires a file input)
    with open("train_data.txt", "w", encoding="utf-8") as f:
        for line in texts:
            f.write(line.strip() + "\n")
    
    # Train the tokenizer
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        files="train_data.txt",
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=["<pad>", "<s>", "</s>", "<unk>", "<mask>"]
    )
    
    # Save the tokenizer files
    tokenizer.save_model(save_path)
    print(f"Saved tokenizer in {save_path}")

def load_gpt2_tokenizer(path):
    # Load and return as a Hugging Face-compatible tokenizer
    return GPT2TokenizerFast.from_pretrained(path)

# train_gpt2_tokenizer_from_df(sarcastic_df, save_path="./tokenizer_sarcastic")
sarcastic_tokenizer = load_gpt2_tokenizer("./tokenizer_sarcastic")
# train_gpt2_tokenizer_from_df(non_sarcastic_df, save_path="./tokenizer_non_sarcastic")
non_sarcastic_tokenizer = load_gpt2_tokenizer("./tokenizer_non_sarcastic")

print(sarcastic_tokenizer.vocab_size)
print(non_sarcastic_tokenizer.vocab_size)

16465
16871


In [12]:
# Split into train, test, validation (80% train, 20% validation from train)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42) # Leave 20% of the dataset for Test

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42) # Leave 80(0.1)=8% of the dataset for Validation

print(len(train_texts))
print(len(train_labels))

# Reset the indexes (first column) to ensure that they are continuous
train_texts = train_texts.reset_index(drop=True) 
train_labels = train_labels.reset_index(drop=True)
val_texts = val_texts.reset_index(drop=True)
val_labels = val_labels.reset_index(drop=True)
test_texts = test_texts.reset_index(drop=True)
test_labels = test_labels.reset_index(drop=True)

19230
19230


In [13]:
print(GPT2Model.from_pretrained("gpt2")) # Display the architecture of our front layer



GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)


In [None]:
class DoubleFeedTokenizerClassifier(torch.nn.Module): # Double Feed
    def __init__(self, hidden_size: int, num_classes: int, max_seq_len: int):
        super(DoubleFeedTokenizerClassifier,self).__init__()
        self.frontLayer = GPT2Model.from_pretrained('gpt2')
        self.fc = torch.nn.Linear(in_features=max_seq_len*(hidden_size + hidden_size), out_features=num_classes)

    def forward(self, input_id_sarcastic, mask_sarcastic, input_id_non_sarcastic, mask_non_sarcastic):
        """
        Args: input_id: encoded input of ids that were sent
        """
        trf_feats_sarcastic, _ = self.frontLayer(input_ids=input_id_sarcastic, attention_mask=mask_sarcastic, return_dict=False)
        trf_feats_non_sarcastic, _ = self.frontLayer(input_ids=input_id_non_sarcastic, attention_mask=mask_non_sarcastic, return_dict=False)
        n = trf_feats_sarcastic.shape[0]
        extracted_features = torch.concat((trf_feats_sarcastic, trf_feats_non_sarcastic), dim=-1) # Shape: (n, seq_len, 768 * 2)
        extracted_features = extracted_features.view(n, -1) # Flatten features
        linear_output = self.fc(extracted_features)
        return linear_output
    
class DoubleConcatTokenizerClassifier(torch.nn.Module):
    def __init__(self, hidden_size: int, num_classes: int, max_seq_len: int):
        super(DoubleConcatTokenizerClassifier,self).__init__()
        self.frontLayer = GPT2Model.from_pretrained('gpt2')
        self.fc = torch.nn.Linear(in_features=max_seq_len*(hidden_size + hidden_size), out_features=num_classes)

    def forward(self, input_id_sarcastic, mask_sarcastic, input_id_non_sarcastic, mask_non_sarcastic):
        """
        Args: input_id: encoded input of ids that were sent
        """
        input_ids = torch.concat((input_id_sarcastic, input_id_non_sarcastic), dim=-1)
        masks = torch.concat((mask_sarcastic, mask_non_sarcastic), dim=-1)
        extracted_features, _ = self.frontLayer(input_ids=input_ids, attention_mask=masks, return_dict=False)
        extracted_features = extracted_features.view(extracted_features.shape[0], -1)
        linear_output = self.fc(extracted_features)
        return linear_output

class SarcasmDataset(Dataset):
    def __init__(self, headlines, labels, sarcastic_tokenizer, non_sarcastic_tokenizer, max_length=50):
        self.headlines = headlines
        self.labels = labels
        self.sarcastic_tokenizer = sarcastic_tokenizer
        self.non_sarcastic_tokenizer = non_sarcastic_tokenizer
        self.max_length = max_length

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.headlines)
    
    def __getitem__(self, idx):
        processed_headline = self.headlines[idx]
        # GPT2 Tokenizer
        encoded_data_sarcastic = self.sarcastic_tokenizer(processed_headline, padding='max_length', max_length=self.max_length, truncation=True, return_tensors="pt")
        encoded_data_non_sarcastic = self.non_sarcastic_tokenizer(processed_headline, padding='max_length', max_length=self.max_length, truncation=True, return_tensors="pt")

        return encoded_data_sarcastic, encoded_data_non_sarcastic, self.labels[idx]
    
def load_model(path, hidden_size, max_seq_len):
    model = DoubleFeedTokenizerClassifier(hidden_size=hidden_size, num_classes=2, max_seq_len=max_seq_len) # Remember to change the class when you are using DoubleConcat!
    model.load_state_dict(torch.load(path))
    model.eval()
    return model

def train(model, trainDataSarcastic, valDataSarcastic, lr, eps, V):
    """
    V = Sarcastic Vocab Size
    """
    trainLoaderSarcastic = DataLoader(trainDataSarcastic, batch_size=16, shuffle=True)
    valLoaderSarcastic = DataLoader(valDataSarcastic, batch_size=16, shuffle=True)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    did_not_improve_count = 0
    best_val_score = 0
    best_epoch = 0
    model.train()
    
    for epoch_num in range(100):
        # total_acc_train = 0
        total_loss_train = 0
        train_predictions = []
        train_labels = []
        for encoded_data_sarcastic, encoded_data_non_sarcastic, train_label in tqdm(trainLoaderSarcastic):
            train_label = train_label.to(device)
            
            mask_sarcastic = encoded_data_sarcastic['attention_mask'].to(device)
            input_id_sarcastic = encoded_data_sarcastic["input_ids"].squeeze(1).to(device)

            mask_non_sarcastic = encoded_data_non_sarcastic['attention_mask'].to(device)
            input_id_non_sarcastic = encoded_data_non_sarcastic["input_ids"].squeeze(1).to(device) + V # Offset the input_ids because input_id "1" refers to different tokens in each tokenizer.

            model.zero_grad()

            output = model(input_id_sarcastic, mask_sarcastic, input_id_non_sarcastic, mask_non_sarcastic)

            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            train_labels += train_label.detach().cpu().numpy().flatten().tolist()
            train_predictions += output.argmax(dim=1).detach().cpu().numpy().flatten().tolist()

            batch_loss.backward()
            optimizer.step()
        
        # total_acc_val = 0
        total_loss_val = 0
        val_predictions = []
        val_labels = []
        
        with torch.no_grad():
            for encoded_data_sarcastic, encoded_data_non_sarcastic, val_label in valLoaderSarcastic:
                val_label = val_label.to(device)
                mask_sarcastic = encoded_data_sarcastic['attention_mask'].to(device)
                input_id_sarcastic = encoded_data_sarcastic['input_ids'].squeeze(1).to(device)

                mask_non_sarcastic = encoded_data_non_sarcastic['attention_mask'].to(device)
                input_id_non_sarcastic = encoded_data_non_sarcastic["input_ids"].squeeze(1).to(device) + V
                
                output = model(input_id_sarcastic, mask_sarcastic, input_id_non_sarcastic, mask_non_sarcastic)
                
                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                
                val_labels += val_label.detach().cpu().numpy().flatten().tolist()
                val_predictions += output.argmax(dim=1).detach().cpu().numpy().flatten().tolist()
            
        f1_train = f1_score(train_labels, train_predictions, average='macro')
        f1_val = f1_score(val_labels, val_predictions, average='macro')

        if f1_val > (best_val_score + eps):
            best_val_score = f1_val
            did_not_improve_count = 0
            best_epoch = epoch_num
            print(f"Saving new best val acc {best_val_score}")
            torch.save(copy.deepcopy(model.state_dict()), f"./doubleconcat-tokenizer-gpt2-lr{lr}-iter{best_epoch+1}-tol{eps}-slen128.pt")
        else:
            did_not_improve_count += 1

        if did_not_improve_count >= 10:
            break

        print(
            f"Epochs: {epoch_num + 1} | Train Loss: {total_loss_train/len(trainDataSarcastic): .3f} \
            | Train Score: {f1_train: .3f} \
             | Val Loss: {total_loss_val / len(valDataSarcastic): .3f} \
             | Val Score: {f1_val: .3f}")
        
        gc.collect()
        torch.cuda.empty_cache()
        
def evaluate(model, testLoader, length, V):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    predictions = []
    true_labels = []

    total_acc_test = 0
    model.eval()
    with torch.no_grad():
        for encoded_data_sarcastic, encoded_data_non_sarcastic, test_label in testLoader:
            test_label = test_label.to(device)
            mask_sarcastic = encoded_data_sarcastic['attention_mask'].to(device)
            input_id_sarcastic = encoded_data_sarcastic['input_ids'].squeeze(1).to(device)

            mask_non_sarcastic = encoded_data_non_sarcastic['attention_mask'].to(device)
            input_id_non_sarcastic = encoded_data_non_sarcastic["input_ids"].squeeze(1).to(device) + V
            
            output = model(input_id_sarcastic, mask_sarcastic, input_id_non_sarcastic, mask_non_sarcastic)
                        
            acc = (output.argmax(dim=1)==test_label).sum().item()
            total_acc_test += acc

            # add original labels
            true_labels += test_label.detach().cpu().numpy().flatten().tolist()
            # get predictions to list
            predictions += output.argmax(dim=1).detach().cpu().numpy().flatten().tolist()
    test_score = f1_score(true_labels, predictions, average='macro')
    test_p = precision_score(true_labels, predictions, average='macro')
    test_r = recall_score(true_labels, predictions, average='macro')
    test_acc = total_acc_test / length
    print(f'Test Accuracy: {test_acc: .3f}, F1 Score: {test_score: .3f}, Precision: {test_p}, Recall: {test_r}')
    return true_labels, predictions

In [15]:
# Hyperparameters
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
sarcastic_tokenizer.padding_side = "left"
sarcastic_tokenizer.pad_token = sarcastic_tokenizer.eos_token

non_sarcastic_tokenizer.padding_side = "left"
non_sarcastic_tokenizer.pad_token = non_sarcastic_tokenizer.eos_token
seq_len = 128
hidden_size = 768
val_tol = 0.01
lr = 1e-5

In [None]:
model = DoubleFeedTokenizerClassifier(hidden_size=hidden_size, num_classes=2, max_seq_len=seq_len)
trainData = SarcasmDataset(headlines=train_texts, labels=train_labels, sarcastic_tokenizer=sarcastic_tokenizer, non_sarcastic_tokenizer=non_sarcastic_tokenizer, max_length=seq_len)
valData = SarcasmDataset(headlines=val_texts, labels=val_labels, sarcastic_tokenizer=sarcastic_tokenizer, non_sarcastic_tokenizer=non_sarcastic_tokenizer, max_length=seq_len)

In [9]:
train(model=model, trainDataSarcastic=trainData, valDataSarcastic=valData, lr=lr, eps=val_tol, V=sarcastic_tokenizer.vocab_size)

100%|██████████| 1202/1202 [19:18<00:00,  1.04it/s]


Saving new best val acc 0.8474624197126237
Epochs: 1 | Train Loss:  0.031             | Train Score:  0.741              | Val Loss:  0.021              | Val Score:  0.847


100%|██████████| 1202/1202 [19:45<00:00,  1.01it/s]


Saving new best val acc 0.885114375464638
Epochs: 2 | Train Loss:  0.020             | Train Score:  0.855              | Val Loss:  0.017              | Val Score:  0.885


100%|██████████| 1202/1202 [20:03<00:00,  1.00s/it]


Epochs: 3 | Train Loss:  0.017             | Train Score:  0.881              | Val Loss:  0.018              | Val Score:  0.883


100%|██████████| 1202/1202 [21:21<00:00,  1.07s/it]


Epochs: 4 | Train Loss:  0.016             | Train Score:  0.894              | Val Loss:  0.019              | Val Score:  0.875


100%|██████████| 1202/1202 [21:41<00:00,  1.08s/it]


Epochs: 5 | Train Loss:  0.015             | Train Score:  0.901              | Val Loss:  0.017              | Val Score:  0.886


100%|██████████| 1202/1202 [21:43<00:00,  1.08s/it]


Epochs: 6 | Train Loss:  0.014             | Train Score:  0.909              | Val Loss:  0.017              | Val Score:  0.871


100%|██████████| 1202/1202 [21:42<00:00,  1.08s/it]


Epochs: 7 | Train Loss:  0.015             | Train Score:  0.902              | Val Loss:  0.016              | Val Score:  0.893


100%|██████████| 1202/1202 [21:19<00:00,  1.06s/it]


Epochs: 8 | Train Loss:  0.013             | Train Score:  0.914              | Val Loss:  0.017              | Val Score:  0.885


100%|██████████| 1202/1202 [23:33<00:00,  1.18s/it]


Saving new best val acc 0.8957814702868864
Epochs: 9 | Train Loss:  0.013             | Train Score:  0.913              | Val Loss:  0.016              | Val Score:  0.896


100%|██████████| 1202/1202 [21:27<00:00,  1.07s/it]


Epochs: 10 | Train Loss:  0.013             | Train Score:  0.918              | Val Loss:  0.020              | Val Score:  0.869


100%|██████████| 1202/1202 [21:18<00:00,  1.06s/it]


Epochs: 11 | Train Loss:  0.013             | Train Score:  0.918              | Val Loss:  0.016              | Val Score:  0.892


100%|██████████| 1202/1202 [20:54<00:00,  1.04s/it]


Epochs: 12 | Train Loss:  0.013             | Train Score:  0.918              | Val Loss:  0.026              | Val Score:  0.848


100%|██████████| 1202/1202 [21:14<00:00,  1.06s/it]


Epochs: 13 | Train Loss:  0.013             | Train Score:  0.917              | Val Loss:  0.017              | Val Score:  0.893


100%|██████████| 1202/1202 [23:20<00:00,  1.16s/it]


Epochs: 14 | Train Loss:  0.012             | Train Score:  0.923              | Val Loss:  0.021              | Val Score:  0.864


100%|██████████| 1202/1202 [23:49<00:00,  1.19s/it]


Epochs: 15 | Train Loss:  0.011             | Train Score:  0.928              | Val Loss:  0.019              | Val Score:  0.880


100%|██████████| 1202/1202 [22:22<00:00,  1.12s/it]


Epochs: 16 | Train Loss:  0.011             | Train Score:  0.930              | Val Loss:  0.019              | Val Score:  0.879


100%|██████████| 1202/1202 [22:49<00:00,  1.14s/it]


Epochs: 17 | Train Loss:  0.010             | Train Score:  0.936              | Val Loss:  0.019              | Val Score:  0.886


100%|██████████| 1202/1202 [22:15<00:00,  1.11s/it]


Epochs: 18 | Train Loss:  0.010             | Train Score:  0.937              | Val Loss:  0.017              | Val Score:  0.894


100%|██████████| 1202/1202 [23:07<00:00,  1.15s/it]


In [16]:
testData = SarcasmDataset(headlines=test_texts, labels=test_labels, sarcastic_tokenizer=sarcastic_tokenizer, non_sarcastic_tokenizer=non_sarcastic_tokenizer, max_length=seq_len)
testLoader = DataLoader(testData, batch_size=16, shuffle=True)

model = load_model("./doublefeed-tokenizer-gpt2-lr1e-05-iter10-tol0.01-slen128.pt", hidden_size=768, max_seq_len=seq_len)

evaluate(model, testLoader, len(testData), V=sarcastic_tokenizer.vocab_size)

print(len(testData))

Test Accuracy:  0.904, F1 Score:  0.903, Precision: 0.9017639256549035, Recall: 0.9068206315439626
5342
