In [None]:
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer
import torch.nn as nn
from transformers import BertModel
import numpy as np
from sklearn.metrics import  f1_score, recall_score, precision_score, classification_report
import os
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup

# Number of training epochs (full passes over the training set).
# NOTE(review): this constant is not referenced anywhere in the visible code;
# fine_tune_model_on_data_loaders takes its own `epochs` argument.
EPOCHS = 5

# Batch size used for both the training and the validation DataLoader.
BATCH_SIZE = 32

# Learning rate passed to the optimizer (step size of each parameter update).
LEARNING_RATE = 2e-5

# Epsilon passed to the optimizer: a small term added to the denominator for numerical stability.
EPSILON = 1e-8

# Number of warmup steps passed to the linear learning-rate scheduler (0 means no warmup phase).
WARMUP_STEPS = 0

# Token length every tokenized input is truncated or padded to (used with padding="max_length").
TOKENIZER_MAX_LENGTH = 128

class BertForSequenceClassificationCustom(nn.Module):
    """
    Sequence classifier made of a pre-trained BERT encoder and a small MLP head.

    Attributes:
        num_labels (int): The number of labels for the classification task.
        bert (BertModel): Pre-trained BERT model from Hugging Face Transformers.
        dropout (nn.Dropout): Dropout applied to BERT's pooled output.
        classifier (nn.Sequential): Linear -> ReLU -> Dropout -> Linear head that
            maps the pooled output to one logit per label.
    """
    def __init__(self, num_labels=2, model_name='bert-base-uncased', dropout=0.1,
                 classifier_hidden_size=512):
        """
        Initializes the model by setting up the layers.

        Parameters:
            num_labels (int): Number of target labels for classification.
            model_name (str): Name or path of the pre-trained BERT checkpoint to load.
            dropout (float): Dropout probability used before and inside the classifier.
            classifier_hidden_size (int): Width of the classifier's hidden layer.
        """
        super().__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)  # Dropout layer for regularizing the model

        # Take the encoder width from the loaded checkpoint instead of hard-coding
        # 768, so checkpoints with a different hidden size work as well.
        encoder_hidden_size = self.bert.config.hidden_size

        # Custom classifier that is added on top of the BERT model
        self.classifier = nn.Sequential(
            nn.Linear(encoder_hidden_size, classifier_hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(classifier_hidden_size, num_labels),  # One logit per label
        )

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """
        Forward pass of the model.

        Parameters:
            input_ids (torch.Tensor): Indices of input sequence tokens.
            attention_mask (torch.Tensor): Mask to avoid performing attention on padding tokens.
            token_type_ids (torch.Tensor): Segment token indices.

        Returns:
            torch.Tensor: Logits from the classifier (un-normalized, one per label).
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Element 1 of the encoder output is the pooled representation that is
        # typically used for classification tasks.
        pooled_output = self.dropout(outputs[1])
        return self.classifier(pooled_output)


def predict_on_example(model, tokenizer, sentence1, sentence2, device):
    """
    Classify a single sentence pair with the given model.

    Parameters:
        model (BertForSequenceClassificationCustom): The trained model for prediction.
        tokenizer (Tokenizer): Tokenizer used to encode the sentence pair.
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.
        device (torch.device): The device the encoded inputs are moved to.

    Returns:
        Tuple: The softmax probabilities tensor and the predicted label as an int.
    """
    # Inference only: put the model in evaluation mode.
    model.eval()

    # Encode the pair as one sequence, truncated/padded to the fixed length.
    encoded = tokenizer(
        sentence1,
        sentence2,
        return_tensors="pt",
        max_length=TOKENIZER_MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    # No gradients are needed for a prediction.
    with torch.no_grad():
        logits = model(
            on_device['input_ids'],
            attention_mask=on_device['attention_mask'],
        )

    # Normalize the logits over the label dimension, then pick the top label.
    probabilities = torch.softmax(logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).item()

    return probabilities, predicted_label


def get_base_model():
    """
    Build the custom two-label BERT classifier and place it on the available device.

    Returns:
        BertForSequenceClassificationCustom: The initialized model, on CUDA when
        available and on the CPU otherwise.
    """
    # Prefer the GPU whenever one is visible to PyTorch
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    classifier_model = BertForSequenceClassificationCustom(num_labels=2)
    classifier_model.to(target_device)
    return classifier_model

def save_model(model, name):
    """
    Save the PyTorch model state dictionary under the 'models/' directory.

    Parameters:
    model (torch.nn.Module): The model to be saved.
    name (str): File name (optionally with sub-directories) relative to 'models/'.
    """
    target_path = os.path.join("models", name)
    # exist_ok avoids the check-then-create race, and creating the parent of the
    # target (not just 'models') lets `name` contain sub-directories.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    torch.save(model.state_dict(), target_path)
    print("Model saved.")  # Confirmation message


def flat_accuracy(preds, labels):
    """
    Compute the fraction of samples whose highest-scoring class equals the label.

    Parameters:
    preds (numpy.ndarray): Per-class scores with shape (num_samples, num_classes).
    labels (numpy.ndarray): Ground truth labels; flattened before comparison.

    Returns:
    float: Number of correct predictions divided by the number of labels.
    """
    # The predicted class of a row is the index of its largest score
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()

    hits = np.sum(predicted_classes == true_classes)
    return hits / len(true_classes)


def evaluate_model(model, dataloader, device):
    """
    Evaluate the model's accuracy on a dataset.

    Accuracy is computed over all samples (correct predictions / total samples)
    rather than as a mean of per-batch accuracies, so a smaller final batch does
    not get more weight than the others.

    Parameters:
    model (torch.nn.Module): The model to be evaluated.
    dataloader (torch.utils.data.DataLoader): Iterable of (input_ids, attention_mask, labels) batches.
    device (torch.device): The device to perform the evaluation on.

    Returns:
    float: The fraction of correctly classified samples, or 0.0 when the
    dataloader yields no samples.
    """
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0

    # No gradients are needed anywhere during evaluation
    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

            logits = model(b_input_ids, attention_mask=b_input_mask)
            predicted = torch.argmax(logits, dim=1)

            # Labels are flattened so (N,) and (N, 1) label tensors both work
            flat_labels = b_labels.reshape(-1)
            correct += (predicted == flat_labels).sum().item()
            total += flat_labels.numel()

    if total == 0:
        return 0.0
    return correct / total

def full_model_evaluation(model, dataloader, device):
    """
    Run the model over a whole dataset and report several classification metrics.

    Parameters:
    model (torch.nn.Module): The model to be evaluated.
    dataloader (torch.utils.data.DataLoader): The DataLoader containing the evaluation dataset.
    device (torch.device): The device to perform the evaluation on.

    Returns:
    dict: Keys 'accuracy', 'f1_score', 'recall', 'precision' (the last three
    weighted-averaged over classes) and 'class_report' (per-class text report).
    """
    model.eval()  # Evaluation mode for the whole pass
    collected_logits, collected_labels = [], []

    for batch in dataloader:
        # Each batch is (input ids, attention mask, labels), moved to the device
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # Forward pass without gradient tracking
        with torch.no_grad():
            batch_logits = model(b_input_ids, attention_mask=b_input_mask)

        # Keep per-sample logits and labels on the CPU as NumPy rows
        collected_logits.extend(batch_logits.detach().cpu().numpy())
        collected_labels.extend(b_labels.to('cpu').numpy())

    # Build the arrays once and reuse them for every metric
    scores = np.array(collected_logits)
    y_true = np.array(collected_labels)
    y_pred = np.argmax(scores, axis=1)

    return {
        'accuracy': flat_accuracy(scores, y_true),
        'f1_score': f1_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'class_report': classification_report(y_true, y_pred),
    }

def fine_tune_model_on_data_loaders(model, train_dataloader, device, epochs=5):
    """
    Fine-tunes a pre-trained BERT model on a given dataset.

    Only the last three encoder layers and the classification head are trained;
    every other parameter is frozen. The model is modified in place.

    Parameters:
        model (torch.nn.Module): The pre-trained BERT model for sequence classification.
            Must expose `bert.encoder.layer`; a `classifier` attribute, if present,
            is trained as well.
        train_dataloader (torch.utils.data.DataLoader): DataLoader yielding
            (input_ids, attention_mask, labels) batches.
        device (torch.device): The device to train the model on (CPU or GPU).
        epochs (int): Number of epochs to train the model.

    Returns:
        torch.nn.Module: The fine-tuned BERT model (the same object that was passed in).

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    num_batches = len(train_dataloader)
    if num_batches == 0:
        # Fail early instead of dividing by zero after a no-op epoch
        raise ValueError("train_dataloader yields no batches; nothing to fine-tune on.")

    # Freeze all parameters of the model to prevent them from being updated during training
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last three encoder layers ...
    trainable_modules = list(model.bert.encoder.layer[-3:])
    # ... and the classification head. The head starts from random weights, so
    # leaving it frozen would force the encoder to adapt to a random projection.
    classifier_head = getattr(model, "classifier", None)
    if classifier_head is not None:
        trainable_modules.append(classifier_head)
    for module in trainable_modules:
        for param in module.parameters():
            param.requires_grad = True

    # Optimize only the unfrozen parameters. torch.optim.AdamW replaces the
    # deprecated transformers.AdamW; weight_decay=0.0 keeps that class's default
    # (PyTorch's own default is 0.01).
    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=LEARNING_RATE,
        eps=EPSILON,
        weight_decay=0.0,
    )

    # Calculate the total number of training steps and set up the learning rate scheduler
    total_steps = num_batches * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_STEPS,
        num_training_steps=total_steps
    )

    # The loss module is stateless, so build it once instead of once per step
    loss_fn = nn.CrossEntropyLoss()

    # Switch the model to training mode
    model.train()
    for epoch in tqdm(range(epochs), desc="Epochs"):
        total_loss = 0  # Track the total loss for each epoch
        for batch in train_dataloader:
            # Move the batch tensors to the specified device and unpack them
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

            model.zero_grad()  # Reset gradients before performing backpropagation

            # Forward pass: compute the model output (logits) and the loss
            logits = model(b_input_ids, attention_mask=b_input_mask)
            loss = loss_fn(logits, b_labels)

            total_loss += loss.item()  # Accumulate the loss
            loss.backward()  # Perform backpropagation
            optimizer.step()  # Update model parameters
            scheduler.step()  # Update the learning rate

        # Report the average loss over all batches for the current epoch
        avg_train_loss = total_loss / num_batches
        print(f"Average training loss: {avg_train_loss:.4f}")

    print("Finished fine-tuning.")
    return model  # Return the fine-tuned model

def get_data_loaders():
    """
    Load the GLUE MRPC training split, tokenize it and build train/validation dataloaders.

    The sentence pairs are tokenized with the 'bert-base-uncased' tokenizer and
    truncated/padded to TOKENIZER_MAX_LENGTH tokens. 10% of the MRPC *train*
    split is held out (random_state=0) as the validation set. Each batch is a
    tuple (input_ids, attention_mask, labels).

    Returns:
        Tuple[DataLoader, DataLoader]: The training dataloader (reshuffled every
        epoch) and the validation dataloader (fixed order).
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    train_split = load_dataset("glue", "mrpc")["train"]

    # Tokenize all sentence pairs in one batched call instead of one call per example
    encoded = tokenizer(
        list(train_split["sentence1"]),
        list(train_split["sentence2"]),
        truncation=True,
        padding="max_length",
        max_length=TOKENIZER_MAX_LENGTH,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"]
    attention_masks = encoded["attention_mask"]
    labels = torch.tensor(list(train_split["label"]))

    # Split all three arrays in a single call so they are guaranteed to use the
    # same indices (previously two calls relied on identical seeds to stay aligned).
    (
        train_inputs, validation_inputs,
        train_masks, validation_masks,
        train_labels, validation_labels,
    ) = train_test_split(
        input_ids,
        attention_masks,
        labels,
        random_state=0,
        test_size=0.1
    )

    # as_tensor is a no-op for tensors and converts anything else, avoiding the
    # copy-construct warning torch.tensor() emits when given a tensor.
    train_data = TensorDataset(
        torch.as_tensor(train_inputs),
        torch.as_tensor(train_masks),
        torch.as_tensor(train_labels),
    )
    validation_data = TensorDataset(
        torch.as_tensor(validation_inputs),
        torch.as_tensor(validation_masks),
        torch.as_tensor(validation_labels),
    )

    # Shuffle training batches each epoch; keep validation order deterministic
    train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    validation_dataloader = DataLoader(validation_data, batch_size=BATCH_SIZE)

    return train_dataloader, validation_dataloader

In [None]:
# Notebook cell: evaluate the base model, fine-tune it, and evaluate it again.
# `device`, `tokenizer` and `model_after_fine_tuning` stay defined as notebook
# globals after this cell runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device is ", device)

# Initialize the tokenizer from the pre-trained BERT model
# (get_data_loaders() loads its own tokenizer instance internally)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# The validation loader is a 10% hold-out of the MRPC train split
train_loader, validation_loader = get_data_loaders()

model_without_fine_tuning = get_base_model()

# Baseline: metrics of the model before any fine-tuning
pre_fine_tune_metrics = full_model_evaluation(
    model_without_fine_tuning,
    validation_loader,
    device
)
print("Metrics before fine-tuning")
for item in pre_fine_tune_metrics:
    print(f"{item}: ", pre_fine_tune_metrics[item])

# Snapshot the weights before fine-tuning
save_model(model_without_fine_tuning, "model.pt")

# NOTE: fine-tuning returns the model object it was given, so after this call
# `model_after_fine_tuning` and `model_without_fine_tuning` refer to the same
# model. epochs=20 is passed explicitly; the EPOCHS constant is not used here.
model_after_fine_tuning = fine_tune_model_on_data_loaders(
    model_without_fine_tuning,
    train_loader,
    device,
    epochs=20
)
save_model(model_after_fine_tuning, "model_fine_tuned.pt")

# Evaluate the fine-tuned model on the validation set
post_fine_tune_metrics = full_model_evaluation(
    model_after_fine_tuning,
    validation_loader,
    device
)
print("Post fine tune metrics")
for item in post_fine_tune_metrics:
    print(f"{item}: ", post_fine_tune_metrics[item])

In [None]:
# Notebook cell: score the fine-tuned model on a tab-separated STS test file and
# correlate its class-1 probability with the binarized gold similarity scores.
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from tqdm.notebook import tqdm

# Zero-based columns of the TSV file that this cell reads
SCORE_COLUMN = 4
SENTENCE1_COLUMN = 5
SENTENCE2_COLUMN = 6
MIN_COLUMNS = SENTENCE2_COLUMN + 1

data = []

# The file is split on tabs by hand, line by line.
with open('/kaggle/input/sts-tests/sts-test.tsv', 'r', encoding='utf-8') as f:
    for line in f:
        row = line.strip().split('\t')
        # str.split never raises, so malformed or blank lines must be detected
        # explicitly; otherwise they surface later as missing sentences/scores.
        if len(row) < MIN_COLUMNS:
            print(f"Skipping line: {line!r}. Error: expected at least {MIN_COLUMNS} columns, got {len(row)}")
            continue
        data.append(row)

df = pd.DataFrame(data)

raw_probabilities = []
predictions = []

# Iterate over the rows of the DataFrame with tqdm to display progress
for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
    sentence1 = row[SENTENCE1_COLUMN]
    sentence2 = row[SENTENCE2_COLUMN]

    # Get the probability tensor and the predicted label for this pair
    probabilities, prediction = predict_on_example(model_after_fine_tuning, tokenizer, sentence1, sentence2, device)

    raw_probabilities.append(probabilities)
    predictions.append(prediction)

# Each probability tensor has shape (1, num_labels); keep the class-1 entry and
# convert it to a plain Python float.
raw_predictions = [probs[0][1] for probs in raw_probabilities]
raw_predictions_cpu = [prediction.item() for prediction in raw_predictions]

actual_scores = df.iloc[:, SCORE_COLUMN].astype(float).values

# Binarize the gold scores: below the threshold -> 0, otherwise -> 1
threshold = 2.5
actual_scores_rounded = np.where(actual_scores < threshold, 0, 1)

# Pearson correlation between the class-1 probability and the binarized scores
correlation, _ = pearsonr(raw_predictions_cpu, actual_scores_rounded)
print("Pearson correlation coefficient after rounding:", correlation)