<a href="https://colab.research.google.com/github/human-ai2025/nlp_projects/blob/master/SetenceSimilarity_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Kaggle Stuff

In [1]:
! pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
! mkdir ~/.kaggle

In [4]:
!cp /content/drive/MyDrive/ColabNotebooks/tokens/kaggle.json ~/.kaggle/kaggle.json

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

## Downloading Dataset

In [6]:
!kaggle competitions download -c quora-question-pairs

Downloading quora-question-pairs.zip to /content
 99% 306M/309M [00:01<00:00, 217MB/s]
100% 309M/309M [00:01<00:00, 182MB/s]


In [7]:
! unzip quora-question-pairs.zip

Archive:  quora-question-pairs.zip
  inflating: sample_submission.csv.zip  
  inflating: test.csv                
  inflating: test.csv.zip            
  inflating: train.csv.zip           


In [8]:
! unzip train.csv.zip
! unzip test.csv.zip

Archive:  train.csv.zip
  inflating: train.csv               
Archive:  test.csv.zip
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: r
new name: test_new.csv
  inflating: test_new.csv            


## Code Stuff

In [None]:
!pip install git+https://github.com/PyTorchLightning/pytorch-lightning

In [None]:
# Transformers installation
! pip install transformers

In [38]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl


pl.seed_everything(41)

INFO:lightning_lite.utilities.seed:Global seed set to 41


41

Approach is concat the two sentences and do sequence classification. 
- Divide the train into train, val and test
- The test dataset will be for final testing. 

In [27]:
data = pd.read_csv('/content/train.csv')
data = data.dropna()
print("The percentage of non similar question pairs is : ")
print(len(data[data['is_duplicate']==0].index)*100/len(data.index))
print("The percentage of similar question pairs is : ")
print(len(data[data['is_duplicate']==1].index)*100/len(data.index))

The percentage of non similar question pairs is : 
63.07994073517081
The percentage of similar question pairs is : 
36.92005926482919


In [35]:
class SimilarSentences(Dataset):
    def __init__(self, tokenizer, qone, qtwo, label, maxsize):
        self.tokenizer = tokenizer
        self.question1 = qone
        self.question2 = qtwo
        self.label = label
        self.uniqueId = self.dataframe['id']
        self.maxlen = maxsize

    def __len__(self):
        return len(self.question1) + len(self.question2) + 3

    def __getitem__(self, idx):
        text = str(self.question1[idx]) + ' [SEP] ' + str(self.question2[idx]) 
        encodedText = self.tokenizer.encode_plus(
            text, 
            add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
            max_length = self.maxlen,   # Pad & truncate all sentences.
            pad_to_max_length = True,
            return_attention_mask = True,   # Construct attn. masks.
            return_tensors = 'pt',     # Return pytorch tensors.
        )
        ids = encodedText['input_ids']
        mask = encodedText['attention_mask']
        token_type_ids = encodedText['token_type_ids']

        return {
            'UID': self.uniqueId[idx],
            'dataNeeded':{
                'ids': torch.tensor(ids, dtype=torch.long), 
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'labels': torch.tensor(self.label[idx], dtype=torch.float)
            }
            
        }


In [39]:
class SentenceSimilarityDataModule(pl.LightningDataModule):
    def __init__(self, path,  batchTrainSize: 4, batchValSize: 1, maxTokenLen: 512):
      self.path = path
      self.trainBatch = batchTrainSize
      self.testBatch = batchValSize
      self.max_len = maxTokenLen
      self.tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
      
    def setup(self, stage):

        # Assign train/val datasets for use in dataloaders
        if stage == "fit" or stage is None:
            self.dataframe = pd.read_csv(self.path)
            qone = list(self.dataframe['question1'].values)
            qtwo = list(self.dataframe['question2'].values)
            labels = list(self.dataframe['is_duplicate'].values)

            # lets do stratified splitting 
            self.train_inp_q1, self.val_inp_q1, self.train_inp_q2, self.val_inp_q2, self.train_label, self.val_label  = train_test_split(qone,
                                                                                                                                         qtwo,
                                                                                                                                         labels,
                                                                                                                                         random_state=2022,
                                                                                                                                         test_size = 0.1,
                                                                                                                                         stratify=labels)
                                                                                    
            self.train_inp_q1, self.test_inp_q1, self.train_inp_q2, self.test_inp_q2, self.train_label, self.test_label  = train_test_split(self.train_inp_q1,
                                                                                                                                            self.train_inp_q2,
                                                                                                                                            self.train_label,
                                                                                                                                            random_state=2022,
                                                                                                                                            test_size = 0.2,
                                                                                                                                            stratify=self.train_label)
            self.train_dataset = SimilarSentences(self.tokenizer, self.train_inp_q1, self.train_inp_q2, self.train_label, self.max_len)
            self.val_dataset = SimilarSentences(self.tokenizer, self.val_inp_q1, self.val_inp_q2, self.val_label, self.max_len)

        # Assign test dataset for use in dataloader(s)
        if stage == "test" or stage is None:
          self.test_dataset = SimilarSentences(self.tokenizer, self.test_inp_q1, self.test_inp_q2, self.test_label, self.max_len)


    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.trainBatch)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.testBatch, shuffle=False)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.testBatch, shuffle=False)



In [40]:
class SimilarSentenceModel(torch.nn.Module):
  def __init__(self, config):
    super().__init()
    self.config = config
    self.pretrainedModel = AutoModelForSequenceClassification.from_pretrained('distilroberta-base', 
                                                                              retutn_dict = True, 
                                                                              num_labels=2
                                                                              )
    # the loss we use here is Binary Cross Entropy with logitics Loss 
    # its 60 40 
    # if it doesnot work well then try weighted loss 

    self.loss_func = torch.nn.BCEWithLogitsLoss()

    def forward(self, inputIds, attentionMask, labels=None):
      output = self.pretrainedModel(input_ids = inputIds,attention_mask = attentionMask)
      logits = output.logits 
      loss = 0
      if labels:
        loss = self.loss_func(logits.view(-1, 2), labels.view(-1, 2))
      return loss, logits


In [None]:
class SimilarSentenceModelPL(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def training_step(self, batch, batch_idx):
        pass

    def validation_step(self, batch, batch_idx):
        pass

    def test_step(self, batch, batch_idx):
        pass

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        pass

    def configure_optimizers(self):
        pass