In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [10]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import classification_report

# Load the dataset
data = pd.read_csv('/kaggle/input/new-dataset/nlp_dataset.csv')

# Data Preparation: Select relevant columns
data = data[['hypothesis', 'premise', 'label']].dropna()
data['label'] = data['label'].map({'n': 0, 'c': 1})  # Map 'n' to 0 and 'c' to 
data.head()

Unnamed: 0,hypothesis,premise,label
0,further the paper makes several misleading cla...,the paper is rather well written but it strong...,0
1,4 .i like the key idea and the speedup is very...,review scores reflect this reviewers impressio...,0
2,the idea to use sampling is nice but the analy...,review scores reflect this reviewers impressio...,0
3,to summarize i think this paper give some empi...,in my opinion the overall quality of the paper...,0
4,to summarize i think this paper give some empi...,the context and relevance as well as the contr...,0


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [12]:
# Split the data into train, validation, and test sets
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)


In [13]:
from transformers import DistilBertTokenizer, DistilBertModel

# Dataset Class for DataLoader
class ReviewPairDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        inputs = self.tokenizer(
            row['hypothesis'],
            row['premise'],
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'label': torch.tensor(row['label'], dtype=torch.long)
        }

# Load pre-trained tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")

In [14]:
# Create DataLoaders
train_dataset = ReviewPairDataset(train_data, tokenizer)
val_dataset = ReviewPairDataset(val_data, tokenizer)
test_dataset = ReviewPairDataset(test_data, tokenizer)

In [15]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

In [16]:
class ContrastiveClassifier(nn.Module):
    def __init__(self, model_name, embedding_dim=768):
        super(ContrastiveClassifier, self).__init__()
        self.encoder = DistilBertModel.from_pretrained(model_name)
        self.fc = nn.Linear(embedding_dim, 2)  # Binary classification

    def forward(self, input_ids, attention_mask):
        # Get the hidden state of the [CLS] token
        outputs = self.encoder(input_ids, attention_mask)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token embeddings
        outputs = self.fc(cls_embeddings)
        return outputs

model_name = "distilbert-base-uncased"
model = ContrastiveClassifier(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


ContrastiveClassifier(
  (encoder): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1

In [17]:
# Define Optimizer and Loss Function
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()



In [18]:
from tqdm import tqdm
import time

# Training Function with TQDM and Timing
def train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=3):
    for epoch in range(epochs):
        start_time = time.time()  # Start timing the epoch
        
        # Training phase
        model.train()
        train_loss = 0
        train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch + 1} Training", leave=False)
        
        for batch in train_loader_tqdm:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            # Update TQDM description with current batch loss
            train_loader_tqdm.set_postfix(loss=loss.item())

        avg_train_loss = train_loss / len(train_loader)
        print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss:.4f}")

        # Validation phase
        model.eval()
        val_loss = 0
        val_preds, val_labels = [], []
        val_loader_tqdm = tqdm(val_loader, desc=f"Epoch {epoch + 1} Validation", leave=False)
        
        with torch.no_grad():
            for batch in val_loader_tqdm:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                preds = torch.argmax(outputs, dim=1)
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

                # Update TQDM description with current batch loss
                val_loader_tqdm.set_postfix(loss=loss.item())

        avg_val_loss = val_loss / len(val_loader)
        print(f"Validation Loss: {avg_val_loss:.4f}")
        print(classification_report(val_labels, val_preds))

        # Calculate and print epoch time
        end_time = time.time()
        epoch_time = end_time - start_time
        print(f"Epoch {epoch + 1} completed in {epoch_time:.2f} seconds")

# Train the Model
train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=3)


Epoch 1 Training:   0%|          | 0/1050 [00:00<?, ?it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1 Training:   0%|          | 3/1050 [00:01<06:54,  2.53it/s, loss=0.469]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1 Training:   1%|          | 8/1050 [00:03<06:48,  2.55it/s, loss=0.315]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 

Epoch 1, Train Loss: 0.3436


Epoch 1 Validation:   0%|          | 1/225 [00:00<00:30,  7.46it/s, loss=0.246]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1 Validation:   1%|▏         | 3/225 [00:00<00:30,  7.25it/s, loss=0.445]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1 Validation:   2%|▏         | 5/225 [00:00<00:30,  7.20it/s, loss=0.235]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. s

Validation Loss: 0.3277
              precision    recall  f1-score   support

           0       0.88      1.00      0.94      6346
           1       0.67      0.00      0.01       850

    accuracy                           0.88      7196
   macro avg       0.77      0.50      0.47      7196
weighted avg       0.86      0.88      0.83      7196

Epoch 1 completed in 427.18 seconds


Epoch 2 Training:   0%|          | 3/1050 [00:01<06:40,  2.62it/s, loss=0.243]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2 Training:   1%|          | 6/1050 [00:02<06:34,  2.64it/s, loss=0.269]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2 Training:   1%|          | 7/1050 [00:02<06:34,  2.65it/s, loss=0.178]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequ

Epoch 2, Train Loss: 0.3057


Epoch 2 Validation:   0%|          | 1/225 [00:00<00:32,  6.86it/s, loss=0.234]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2 Validation:   1%|▏         | 3/225 [00:00<00:32,  6.92it/s, loss=0.443]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2 Validation:   2%|▏         | 5/225 [00:00<00:31,  7.09it/s, loss=0.224]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. s

Validation Loss: 0.3131
              precision    recall  f1-score   support

           0       0.90      0.97      0.93      6346
           1       0.47      0.18      0.26       850

    accuracy                           0.88      7196
   macro avg       0.68      0.58      0.60      7196
weighted avg       0.85      0.88      0.85      7196

Epoch 2 completed in 425.56 seconds


Epoch 3 Training:   0%|          | 0/1050 [00:00<?, ?it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 3 Training:   1%|          | 13/1050 [00:04<06:30,  2.66it/s, loss=0.168] Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 3 Training:   2%|▏         | 16/1050 [00:06<06:28,  2.66it/s, loss=0.43] Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 3 Training:   2%|▏         | 19/1050 [00:07<06:26,  2.67it/s, loss=0.304]Be aware, overflowing toke

Epoch 3, Train Loss: 0.2617


Epoch 3 Validation:   0%|          | 1/225 [00:00<00:29,  7.61it/s, loss=0.19]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 3 Validation:   1%|▏         | 3/225 [00:00<00:30,  7.27it/s, loss=0.453]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 3 Validation:   2%|▏         | 5/225 [00:00<00:30,  7.27it/s, loss=0.199]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. se

Validation Loss: 0.3143
              precision    recall  f1-score   support

           0       0.91      0.96      0.93      6346
           1       0.50      0.26      0.34       850

    accuracy                           0.88      7196
   macro avg       0.70      0.61      0.64      7196
weighted avg       0.86      0.88      0.86      7196

Epoch 3 completed in 425.40 seconds




In [19]:
# Save the model
torch.save(model.state_dict(), "contrastive_classifier.pt")

# Inference on Test Data with TQDM
def evaluate_model(model, test_loader, device):
    model.eval()
    test_preds, test_labels = [], []
    test_loader_tqdm = tqdm(test_loader, desc="Evaluating on Test Data", leave=False)

    with torch.no_grad():
        for batch in test_loader_tqdm:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            test_preds.extend(preds.cpu().numpy())
            test_labels.extend(labels.cpu().numpy())

            # Optionally, display intermediate batch-level accuracy in the TQDM bar
            batch_accuracy = (preds.cpu().numpy() == labels.cpu().numpy()).mean()
            test_loader_tqdm.set_postfix(batch_accuracy=batch_accuracy)

    print("Test Results:")
    print(classification_report(test_labels, test_preds))

# Evaluate on Test Data
evaluate_model(model, test_loader, device)

Evaluating on Test Data:   0%|          | 0/225 [00:00<?, ?it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating on Test Data:   2%|▏         | 4/225 [00:00<00:32,  6.78it/s, batch_accuracy=0.812]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating on Test Data:   3%|▎         | 7/225 [00:01<00:31,  7.02it/s, batch_accuracy=0.719]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have

Test Results:
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      6313
           1       0.46      0.23      0.31       884

    accuracy                           0.87      7197
   macro avg       0.68      0.60      0.62      7197
weighted avg       0.84      0.87      0.85      7197



