# GPT-2 Transformer Model for Sarcasm Detection

In [1]:
import torch
import numpy as np
import pandas as pd
import os
import json
import pandas as pd
import numpy as np
import nltk
import gensim
import re
import copy
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2Model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from tqdm import tqdm
# https://colab.research.google.com/drive/1dMTdO5vxdVX0NA2Qe7AV9WGEy8ZH67Xn?usp=sharing#scrollTo=afcc233b

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
# Download NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

torch.cuda.is_available()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/lfrostbyte/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lfrostbyte/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/lfrostbyte/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
train_df = pd.read_json(path_or_buf="./Sarcasm_Headlines_Dataset.json", lines=True)
train_df.rename(columns={"headline": "text", "is_sarcastic": "label"}, inplace=True)

# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word.isalpha()]  # Remove numbers and punctuation
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Stopword removal
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    
    return tokens if tokens else None

# # For convenience
train_df = train_df.head(100)

# Apply preprocessing
# train_df['processed_text'] = train_df['text'].apply(preprocess_text)
# train_df = train_df.dropna(subset=['processed_text']) # After preprocessing, there will be some rows that are empty, delete these rows


# Split into train, test, validation (80% train, 20% validation from train)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_df['text'], train_df['label'], test_size=0.2, random_state=42)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42) # Ideally test_size should be 1/8 because 0.8 * 1/8 = 0.1 so its 10% for validation, but 0.08 is also fine.

print(len(train_texts))
print(len(train_labels))

train_texts = train_texts.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
val_texts = val_texts.reset_index(drop=True)
val_labels = val_labels.reset_index(drop=True)
test_texts = test_texts.reset_index(drop=True)
test_labels = test_labels.reset_index(drop=True)

72
72


In [4]:
train_texts

0            ryan lochte apologizes for behavior in rio
1     ex-boyfriend just thought he'd check in and th...
2     5 questions i wish younger people would stop a...
3     selig counted money while baseball lost the ne...
4     mom starting to fear son's web series closest ...
                            ...                        
67                    monster undeterred by night-light
68    report: john grisham slowly but surely climbin...
69    the vicious knot of syria, the untangling proc...
70           look: world cup star attacked by giant bug
71    scott used to stop breathing nearly 40 times a...
Name: text, Length: 72, dtype: object

In [None]:
class GPT2SequenceClassifier(torch.nn.Module):
    def __init__(self, hidden_size: int, num_classes: int, max_seq_len: int):
        super(GPT2SequenceClassifier,self).__init__()
        self.frontLayer = GPT2Model.from_pretrained('gpt2')
        trf_out_size = hidden_size * max_seq_len
        self.fc = torch.nn.Linear(in_features=trf_out_size, out_features=num_classes)

    def forward(self, input_id, mask):
        """
        Args: input_id: encoded input of ids that were sent
        """
        gpt_out, _ = self.frontLayer(input_ids=input_id, attention_mask=mask, return_dict=False)
        batch_size = gpt_out.shape[0]
        linear_output = self.fc(gpt_out.view(batch_size, -1))
        return linear_output

class SarcasmDataset(Dataset):
    def __init__(self, headlines, labels, tokenizer, max_length=50):
        self.headlines = headlines
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.headlines)
    
    def __getitem__(self, idx):
        processed_headline = self.headlines[idx]
        # processed_headline = " ".join(self.headlines[idx])
        encoded_data = self.tokenizer(processed_headline, padding='max_length', max_length=self.max_length, truncation=True, return_tensors="pt")
        return encoded_data, self.labels[idx]
    
def load_model(path, hidden_size, max_seq_len):
    model = GPT2SequenceClassifier(hidden_size=hidden_size, num_classes=2, max_seq_len=max_seq_len)
    model.load_state_dict(torch.load(path))
    model.eval()
    return model

def train(model, trainData, valData, lr, max_epochs, early_stop_tol):
    trainLoader = DataLoader(trainData, batch_size=16, shuffle=True)
    valLoader = DataLoader(valData, batch_size=16, shuffle=True)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    did_not_improve_count = 0
    best_val_score = 0
    best_epoch = 0
    best_state_dict = None
    model.train()
    
    for epoch_num in range(max_epochs):
        # total_acc_train = 0
        total_loss_train = 0
        train_predictions = []
        train_labels = []
        for encoded_data, train_label in tqdm(trainLoader):
            train_label = train_label.to(device)
            mask = encoded_data['attention_mask'].to(device)
            input_id = encoded_data["input_ids"].squeeze(1).to(device)            

            model.zero_grad()

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            # acc = (output.argmax(dim=1) == train_label).sum().item()
            # total_acc_train += acc

            # add original labels
            train_labels += train_label.cpu().numpy().flatten().tolist()
            # get predicitons to list
            train_predictions += output.argmax(dim=1).cpu().numpy().flatten().tolist()

            batch_loss.backward()
            optimizer.step()
        
        # total_acc_val = 0
        total_loss_val = 0
        val_predictions = []
        val_labels = []
        
        with torch.no_grad():
            for val_input, val_label in valLoader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)
                
                output = model(input_id, mask)
                
                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                
                # acc = (output.argmax(dim=1)==val_label).sum().item()
                # total_acc_val += acc
                # add original labels
                val_labels += val_label.cpu().numpy().flatten().tolist()
                # get predicitons to list
                val_predictions += output.argmax(dim=1).cpu().numpy().flatten().tolist()
            

        # total_acc_val = total_acc_val/len(valData)
        f1_train = f1_score(train_labels, train_predictions, average='macro')
        f1_val = f1_score(val_labels, val_predictions, average='macro')

        if f1_val > (best_val_score + early_stop_tol):
            best_val_score = f1_val
            did_not_improve_count = 0
            best_epoch = epoch_num
            best_state_dict = copy.deepcopy(model.state_dict())
            print(f"Saving new best val acc {best_val_score}")
            torch.save(best_state_dict, f"./gpt2-classifier-model-lr{lr}-iter{best_epoch+1}-tol{early_stop_tol}.pt")
        else:
            did_not_improve_count += 1

        if did_not_improve_count >= 10:
            break

        print(
            f"Epochs: {epoch_num + 1} | Train Loss: {total_loss_train/len(trainData): .3f} \
            | Train Score: {f1_train: .3f} \
             | Val Loss: {total_loss_val / len(valData): .3f} \
             | Val Score: {f1_val: .3f}")
        
def evaluate(model, testLoader, length):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    predictions = []
    true_labels = []

    total_acc_test = 0
    model.eval()
    with torch.no_grad():
        for test_input, test_label in testLoader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
            
            output = model(input_id, mask)
                        
            acc = (output.argmax(dim=1)==test_label).sum().item()
            total_acc_test += acc

            # add original labels
            true_labels += test_label.cpu().numpy().flatten().tolist()
            # get predictions to list
            predictions += output.argmax(dim=1).cpu().numpy().flatten().tolist()
    test_score = f1_score(true_labels, predictions, average='macro')
    test_acc = total_acc_test / length
    print(f'Test Accuracy: {test_acc: .3f}, F1 Score: {test_score: .3f}')
    return true_labels, predictions


In [6]:
# Hyperparameters
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
seq_len = 128
hidden_size = 768
val_tol = 0.01
lr = 1e-4



In [7]:
model = GPT2SequenceClassifier(hidden_size=hidden_size, num_classes=2, max_seq_len=seq_len)
trainData = SarcasmDataset(headlines=train_texts, labels=train_labels, tokenizer=tokenizer, max_length=seq_len)
valData = SarcasmDataset(headlines=val_texts, labels=val_labels, tokenizer=tokenizer, max_length=seq_len)

In [8]:
train(model, trainData, valData, lr, 500, val_tol)

100%|██████████| 5/5 [00:01<00:00,  3.68it/s]


New best val acc 0.5
Epochs: 1 | Train Loss:  0.182             | Train Score:  0.360              | Val Loss:  0.117              | Val Score:  0.500              | Best Val Score:  0.500


100%|██████████| 5/5 [00:01<00:00,  4.64it/s]


Epochs: 2 | Train Loss:  0.051             | Train Score:  0.514              | Val Loss:  0.145              | Val Score:  0.467              | Best Val Score:  0.500


100%|██████████| 5/5 [00:01<00:00,  4.69it/s]


Epochs: 3 | Train Loss:  0.057             | Train Score:  0.529              | Val Loss:  0.086              | Val Score:  0.500              | Best Val Score:  0.500


100%|██████████| 5/5 [00:01<00:00,  4.71it/s]


New best val acc 0.75
Epochs: 4 | Train Loss:  0.049             | Train Score:  0.612              | Val Loss:  0.079              | Val Score:  0.750              | Best Val Score:  0.750


100%|██████████| 5/5 [00:01<00:00,  4.61it/s]


Epochs: 5 | Train Loss:  0.071             | Train Score:  0.500              | Val Loss:  0.137              | Val Score:  0.273              | Best Val Score:  0.750


100%|██████████| 5/5 [00:01<00:00,  4.72it/s]


Epochs: 6 | Train Loss:  0.048             | Train Score:  0.610              | Val Loss:  0.128              | Val Score:  0.273              | Best Val Score:  0.750


100%|██████████| 5/5 [00:01<00:00,  4.59it/s]


Epochs: 7 | Train Loss:  0.040             | Train Score:  0.663              | Val Loss:  0.098              | Val Score:  0.500              | Best Val Score:  0.750


100%|██████████| 5/5 [00:01<00:00,  4.68it/s]


Epochs: 8 | Train Loss:  0.033             | Train Score:  0.743              | Val Loss:  0.086              | Val Score:  0.365              | Best Val Score:  0.750


100%|██████████| 5/5 [00:01<00:00,  4.69it/s]


Epochs: 9 | Train Loss:  0.033             | Train Score:  0.776              | Val Loss:  0.135              | Val Score:  0.273              | Best Val Score:  0.750


100%|██████████| 5/5 [00:01<00:00,  4.68it/s]


Epochs: 10 | Train Loss:  0.024             | Train Score:  0.845              | Val Loss:  0.141              | Val Score:  0.333              | Best Val Score:  0.750


100%|██████████| 5/5 [00:01<00:00,  4.66it/s]


Epochs: 11 | Train Loss:  0.019             | Train Score:  0.871              | Val Loss:  0.169              | Val Score:  0.365              | Best Val Score:  0.750


100%|██████████| 5/5 [00:01<00:00,  4.67it/s]


Epochs: 12 | Train Loss:  0.016             | Train Score:  0.928              | Val Loss:  0.139              | Val Score:  0.500              | Best Val Score:  0.750


100%|██████████| 5/5 [00:01<00:00,  4.58it/s]


Epochs: 13 | Train Loss:  0.017             | Train Score:  0.861              | Val Loss:  0.520              | Val Score:  0.273              | Best Val Score:  0.750


100%|██████████| 5/5 [00:01<00:00,  4.69it/s]


In [9]:
testData = SarcasmDataset(headlines=test_texts, labels=test_labels, tokenizer=tokenizer, max_length=seq_len)
testLoader = DataLoader(testData, batch_size=16, shuffle=True)

model = load_model("./gpt2-classifier-model-lr1e-05-iter3-tol0.05-preprocessing.pt", hidden_size=768, max_seq_len=seq_len)

evaluate(model, testLoader, len(testData))

print(len(testData))



Test Accuracy:  0.700, F1 Score:  0.600
20
