# ColBERT Model

In [5]:
import os

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
from torch import optim
import torchtext
from torchtext.vocab import Vocab
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd

from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from nltk import tokenize


  from .autonotebook import tqdm as notebook_tqdm


#### Loading the combined dataset

In [6]:

# Load the dataset.
# The working directory is switched to the datasets folder only when we are not
# already inside it: an unconditional chdir made this cell fail when re-run,
# because the second "../datasets/" was resolved relative to the new directory.
if os.path.basename(os.getcwd()) != "datasets":
    os.chdir("../datasets/")
data = pd.read_parquet("combined.parquet")

# Display the first few rows of the dataset for a quick overview
data.head()

Unnamed: 0,sentence,is_sarcastic
0,thirtysomething scientists unveil doomsday clo...,1.0
1,dem rep. totally nails why congress is falling...,0.0
2,eat your veggies: 9 deliciously different recipes,0.0
3,inclement weather prevents liar from getting t...,1.0
4,mother comes pretty close to using word 'strea...,1.0


In [7]:
# Number of missing entries in every column of the dataset
null_check = data.isna().sum()

# Relative frequency of each value of the 'is_sarcastic' label
label_column = data["is_sarcastic"]
label_distribution = label_column.value_counts(normalize=True)

# Show both summaries as the cell output
(null_check, label_distribution)

(sentence        0
 is_sarcastic    0
 dtype: int64,
 is_sarcastic
 0.0    0.521391
 1.0    0.478609
 Name: proportion, dtype: float64)

In [8]:
import re
from sklearn.model_selection import train_test_split

# Data cleaning: collapse runs of newline / carriage-return / tab characters
# into a single space
data["sentence"] = data["sentence"].str.replace(r"[\n\r\t]+", " ", regex=True)

# Max sentences to take into account
MAX_SENTENCES = 10

# Number of sentences in every sample. The counts are computed straight from the
# column values (not via data["sentence"][i] with i in range(len(data))), so the
# cell no longer assumes that the index labels are exactly 0..n-1 and can be
# re-run after rows have been dropped.
sentence_counts = data["sentence"].map(
    lambda text: len(tokenize.sent_tokenize(str(text)))
)

# Largest sentence count seen in the dataset (0 for an empty frame)
count_sentences = int(sentence_counts.max()) if len(sentence_counts) else 0

# Dropping the samples that are too long
idx = sentence_counts.index[sentence_counts > MAX_SENTENCES].tolist()
data = data.drop(idx)

# Splitting the dataset into training (70%), validation (15%), and testing (15%) sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

# Showing the size of each split
train_size, val_size, test_size = len(train_data), len(val_data), len(test_data)
train_size, val_size, test_size




(27828, 5963, 5964)

In [9]:

# Initialize the BERT tokenizer from the "bert-base-uncased" pretrained checkpoint.
# This single tokenizer instance is shared by the train / validation / test datasets.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


class SarcasticSentencesDataset(Dataset):
    """
    A custom PyTorch Dataset for the sarcastic sentences dataset.

    Every sample is encoded as one flat, fixed-length sequence laid out as::

        [ full text (full_max_len) | sentence 1 (sentence_max_len) | ... | zero padding ]

    so that every item has exactly
    ``full_max_len + MAX_SENTENCES * sentence_max_len`` token ids and can be
    stacked by the default DataLoader collate function.

    Args:
        sentences: Indexable collection of raw texts.
        labels: Indexable collection of labels, aligned with ``sentences``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        MAX_SENTENCES: Maximum number of individual sentences encoded per
            sample; extra sentences are ignored.
        full_max_len: Token budget for the encoding of the whole text.
        sentence_max_len: Token budget for each individual sentence.
        sentence_splitter: Optional callable ``str -> list[str]``. When omitted,
            ``nltk.tokenize.sent_tokenize`` (imported as ``tokenize``) is used.
    """

    def __init__(
        self,
        sentences,
        labels,
        tokenizer,
        MAX_SENTENCES,
        full_max_len=60,
        sentence_max_len=20,
        sentence_splitter=None,
    ):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.MAX_SENTENCES = MAX_SENTENCES
        self.full_max_len = full_max_len
        self.sentence_max_len = sentence_max_len
        self.sentence_splitter = sentence_splitter

    def __len__(self):
        """Return the number of samples."""
        return len(self.sentences)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` tokens; return (input_ids, attention_mask) as 1-D tensors."""
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=max_length,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",  # Return PyTorch tensors
            truncation=True,
        )
        return encoding["input_ids"].flatten(), encoding["attention_mask"].flatten()

    def __getitem__(self, item):
        """
        Encode the sample at position ``item``.

        Returns:
            dict with the raw ``sentence``, the flat ``input_ids`` and
            ``attention_mask`` tensors (same integer dtype as produced by the
            tokenizer) and the ``labels`` tensor (dtype ``torch.long``).
        """
        sentence = str(self.sentences[item])
        label = self.labels[item]

        splitter = self.sentence_splitter
        if splitter is None:
            splitter = tokenize.sent_tokenize
        # Keep at most MAX_SENTENCES sentences: more would overflow the fixed
        # output length and make the padding size negative.
        split_sentences = list(splitter(sentence))[: self.MAX_SENTENCES]

        # Encoding the full text first, then each sentence individually
        full_ids, full_mask = self._encode(sentence, self.full_max_len)
        input_ids_list = [full_ids]
        attention_list = [full_mask]
        for single_sentence in split_sentences:
            ids, mask = self._encode(single_sentence, self.sentence_max_len)
            input_ids_list.append(ids)
            attention_list.append(mask)

        input_ids_tensor = torch.cat(input_ids_list)
        attention_tensor = torch.cat(attention_list)

        # Right-pad with zeros up to the fixed total length. The padding is
        # created with the dtype of the encoded tensors: a default float
        # torch.zeros would promote the concatenated token ids to float, which
        # an embedding lookup cannot index.
        target_len = self.full_max_len + self.MAX_SENTENCES * self.sentence_max_len
        size_to_extend = target_len - input_ids_tensor.size(0)
        input_ids_tensor = torch.cat(
            [input_ids_tensor, torch.zeros(size_to_extend, dtype=input_ids_tensor.dtype)]
        )
        attention_tensor = torch.cat(
            [attention_tensor, torch.zeros(size_to_extend, dtype=attention_tensor.dtype)]
        )

        return {
            "sentence": sentence,
            "input_ids": input_ids_tensor,
            "attention_mask": attention_tensor,
            "labels": torch.tensor(label, dtype=torch.long),
        }


# Constants
MAX_LEN = 128  # Maximum length of the tokens list (not referenced by the code in this cell)
BATCH_SIZE = 16

# One SarcasticSentencesDataset per split, built in train / validation / test order
train_dataset, val_dataset, test_dataset = (
    SarcasticSentencesDataset(
        split_frame["sentence"].to_numpy(),
        split_frame["is_sarcastic"].to_numpy(),
        tokenizer,
        MAX_SENTENCES,
    )
    for split_frame in (train_data, val_data, test_data)
)

# DataLoaders: only the training loader shuffles its samples
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Sanity check: pull the first training batch and report the type of its input ids
print(type(next(iter(train_loader))["input_ids"]))

<class 'torch.Tensor'>


In [10]:
import torch
from torch.utils.data import DataLoader
from transformers import (
    BertForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
)
from torch.nn import CrossEntropyLoss
from tqdm import tqdm  # for displaying progress
import numpy as np

In [11]:
# Pick the compute backend in order of preference: CUDA GPU, then Apple MPS,
# then plain CPU as the fallback.
if torch.cuda.is_available():
    _backend_name = "cuda"
elif torch.backends.mps.is_available():
    _backend_name = "mps"
else:
    _backend_name = "cpu"

device = torch.device(_backend_name)

In [12]:
def train_epoch(model, data_loader, optimizer, device, scheduler, loss_fn, n_examples):
    """
    Train ``model`` for one pass over ``data_loader``.

    Each batch must be a mapping with ``"input_ids"``, ``"attention_mask"`` and
    ``"labels"`` tensors. The loss is taken from ``outputs.loss`` as computed by
    the model itself from the labels.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches supporting ``len()``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        loss_fn: Unused; kept for signature compatibility.
        n_examples: Number of examples used as the accuracy denominator.

    Returns:
        Tuple ``(accuracy, mean_loss)``: ``accuracy`` is a scalar float tensor
        (correct predictions / ``n_examples``), ``mean_loss`` is the mean of the
        per-batch losses, or NaN when the loader yields no batch.
    """
    model = model.train()
    losses = []
    # Tensor accumulator so that an empty loader still returns a tensor instead
    # of failing on ``int.float()``.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for batch in tqdm(data_loader, total=len(data_loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Clear gradients BEFORE the backward pass: stale gradients left on the
        # parameters (e.g. by an interrupted previous run) must not leak into
        # the first update.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )

        loss = outputs.loss
        losses.append(loss.item())

        preds = outputs.logits.argmax(dim=1)
        correct_predictions += (preds == labels).sum()

        loss.backward()
        optimizer.step()
        scheduler.step()

    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.float() / n_examples, mean_loss

In [13]:
def eval_model(model, data_loader, device, loss_fn, n_examples):
    """
    Evaluate ``model`` over ``data_loader`` without computing gradients.

    Each batch must be a mapping with ``"input_ids"``, ``"attention_mask"`` and
    ``"labels"`` tensors. The loss is taken from ``outputs.loss`` as computed by
    the model itself from the labels.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches supporting ``len()``.
        device: Device the batch tensors are moved to.
        loss_fn: Unused; kept for signature compatibility.
        n_examples: Number of examples used as the accuracy denominator.

    Returns:
        Tuple ``(accuracy, mean_loss)``: ``accuracy`` is a scalar float tensor
        (correct predictions / ``n_examples``), ``mean_loss`` is the mean of the
        per-batch losses, or NaN when the loader yields no batch.
    """
    model = model.eval()
    losses = []
    # Tensor accumulator so that an empty loader still returns a tensor instead
    # of failing on ``int.float()``.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids, attention_mask=attention_mask, labels=labels
            )

            losses.append(outputs.loss.item())

            preds = outputs.logits.argmax(dim=1)
            correct_predictions += (preds == labels).sum()

    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.float() / n_examples, mean_loss

In [None]:
# Load the pre-trained BERT encoder with a 2-class classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)  # Send the model to GPU if available

# Optimizer: PyTorch's own AdamW replaces the deprecated `transformers.AdamW`.
# eps and weight_decay are spelled out to mirror what are believed to be the
# deprecated class's defaults -- verify them against the transformers
# documentation; torch's defaults are eps=1e-8 and weight_decay=1e-2.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

num_epochs = 3

# Total number of training steps (one optimizer/scheduler step per batch)
total_steps = len(train_loader) * num_epochs

# Scheduler: learning rate decays linearly to 0 over total_steps, without warm-up
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

# Loss function (passed through to the helpers, which use the loss computed by the model)
loss_fn = CrossEntropyLoss()

# Training and Validation
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print("-" * 10)

    train_acc, train_loss = train_epoch(
        model, train_loader, optimizer, device, scheduler, loss_fn, len(train_dataset)
    )

    print(f"Train loss {train_loss} accuracy {train_acc}")

    val_acc, val_loss = eval_model(model, val_loader, device, loss_fn, len(val_dataset))

    print(f"Validation loss {val_loss} accuracy {val_acc}")
    print()

# Save the weights of the model as it stands after the last epoch
torch.save(model.state_dict(), "Colbert_model.pth")