In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mred-dataset/full_data_info.txt
/kaggle/input/bert-base/bert-base-uncased/tokenizer_config.json
/kaggle/input/bert-base/bert-base-uncased/special_tokens_map.json
/kaggle/input/bert-base/bert-base-uncased/vocab.txt
/kaggle/input/nlp_model/transformers/default/1/argscichat_train_dev/fold_0_train.json
/kaggle/input/nlp_model/transformers/default/1/argscichat_train_dev/fold_4_train.json
/kaggle/input/nlp_model/transformers/default/1/argscichat_train_dev/fold_0_test.json
/kaggle/input/nlp_model/transformers/default/1/argscichat_train_dev/fold_3_val.json
/kaggle/input/nlp_model/transformers/default/1/argscichat_train_dev/fold_0_val.json
/kaggle/input/nlp_model/transformers/default/1/argscichat_train_dev/fold_1_train.json
/kaggle/input/nlp_model/transformers/default/1/argscichat_train_dev/fold_2_test.json
/kaggle/input/nlp_model/transformers/default/1/argscichat_train_dev/fold_2_val.json
/kaggle/input/nlp_model/transformers/default/1/argscichat_train_dev/fold_3_train.json
/kaggl

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import classification_report

# Load the dataset
data = pd.read_csv('/kaggle/input/nlp-dataset/nlp_dataset.csv')

# Data Preparation: Select relevant columns
data = data[['hypothesis', 'premise', 'label']].dropna()
data['label'] = data['label'].map({'n': 0, 'c': 1})  # Map 'n' to 0 and 'c' to 
data.head()

Unnamed: 0,hypothesis,premise,label
0,further the paper makes several misleading cla...,the paper is rather well written but it strong...,0
1,4 .i like the key idea and the speedup is very...,review scores reflect this reviewers impressio...,0
2,the idea to use sampling is nice but the analy...,review scores reflect this reviewers impressio...,0
3,to summarize i think this paper give some empi...,in my opinion the overall quality of the paper...,0
4,to summarize i think this paper give some empi...,the context and relevance as well as the contr...,0


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
# Split the data into train, validation, and test sets
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)


In [5]:
# Dataset Class for DataLoader
class ReviewPairDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        inputs = self.tokenizer(
            row['hypothesis'],
            row['premise'],
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'label': torch.tensor(row['label'], dtype=torch.long)
        }

# Load pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [6]:
# Create DataLoaders
train_dataset = ReviewPairDataset(train_data, tokenizer)
val_dataset = ReviewPairDataset(val_data, tokenizer)
test_dataset = ReviewPairDataset(test_data, tokenizer)

In [7]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

In [None]:
# Model Definition
class ContrastiveClassifier(nn.Module):
    def __init__(self, model_name, embedding_dim=384):
        super(ContrastiveClassifier, self).__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.fc = nn.Linear(embedding_dim, 2)  # Binary classification

    def forward(self, input_ids, attention_mask):
        embeddings = self.encoder(input_ids, attention_mask)['pooler_output']
        outputs = self.fc(embeddings)
        return outputs

# Instantiate the model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = ContrastiveClassifier(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [9]:
# Define Optimizer and Loss Function
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()



In [11]:
from tqdm import tqdm
import time

# Training Function with TQDM and Timing
def train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=3):
    for epoch in range(epochs):
        start_time = time.time()  # Start timing the epoch
        
        # Training phase
        model.train()
        train_loss = 0
        train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch + 1} Training", leave=False)
        
        for batch in train_loader_tqdm:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            # Update TQDM description with current batch loss
            train_loader_tqdm.set_postfix(loss=loss.item())

        avg_train_loss = train_loss / len(train_loader)
        print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss:.4f}")

        # Validation phase
        model.eval()
        val_loss = 0
        val_preds, val_labels = [], []
        val_loader_tqdm = tqdm(val_loader, desc=f"Epoch {epoch + 1} Validation", leave=False)
        
        with torch.no_grad():
            for batch in val_loader_tqdm:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                preds = torch.argmax(outputs, dim=1)
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

                # Update TQDM description with current batch loss
                val_loader_tqdm.set_postfix(loss=loss.item())

        avg_val_loss = val_loss / len(val_loader)
        print(f"Validation Loss: {avg_val_loss:.4f}")
        print(classification_report(val_labels, val_preds))

        # Calculate and print epoch time
        end_time = time.time()
        epoch_time = end_time - start_time
        print(f"Epoch {epoch + 1} completed in {epoch_time:.2f} seconds")

# Train the Model
train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=3)


                                                                                  

Epoch 1, Train Loss: 0.1938


                                                                                  

Validation Loss: 0.3088
              precision    recall  f1-score   support

           0       0.92      0.95      0.94      6346
           1       0.51      0.39      0.44       850

    accuracy                           0.88      7196
   macro avg       0.72      0.67      0.69      7196
weighted avg       0.87      0.88      0.88      7196

Epoch 1 completed in 138.27 seconds


                                                                                  

Epoch 2, Train Loss: 0.1616


                                                                                  

Validation Loss: 0.3412
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      6346
           1       0.48      0.41      0.45       850

    accuracy                           0.88      7196
   macro avg       0.70      0.68      0.69      7196
weighted avg       0.87      0.88      0.87      7196

Epoch 2 completed in 138.40 seconds


                                                                                  

Epoch 3, Train Loss: 0.1300


                                                                                  

Validation Loss: 0.3609
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      6346
           1       0.47      0.48      0.47       850

    accuracy                           0.87      7196
   macro avg       0.70      0.70      0.70      7196
weighted avg       0.87      0.87      0.87      7196

Epoch 3 completed in 138.40 seconds


In [12]:
# Save the model
torch.save(model.state_dict(), "contrastive_classifier.pt")

# Inference on Test Data with TQDM
def evaluate_model(model, test_loader, device):
    model.eval()
    test_preds, test_labels = [], []
    test_loader_tqdm = tqdm(test_loader, desc="Evaluating on Test Data", leave=False)

    with torch.no_grad():
        for batch in test_loader_tqdm:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            test_preds.extend(preds.cpu().numpy())
            test_labels.extend(labels.cpu().numpy())

            # Optionally, display intermediate batch-level accuracy in the TQDM bar
            batch_accuracy = (preds.cpu().numpy() == labels.cpu().numpy()).mean()
            test_loader_tqdm.set_postfix(batch_accuracy=batch_accuracy)

    print("Test Results:")
    print(classification_report(test_labels, test_preds))

# Evaluate on Test Data
evaluate_model(model, test_loader, device)

                                                                                                

Test Results:
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      6313
           1       0.47      0.48      0.47       884

    accuracy                           0.87      7197
   macro avg       0.70      0.70      0.70      7197
weighted avg       0.87      0.87      0.87      7197

