<a href="https://colab.research.google.com/github/jayteaftw/NLP-quora-duplicate-detection/blob/main/quora_duplicate_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We are only using test.csv(400k training examples). We are gonna do train, cv, test split on it. Lets just do a simple logistic regression on this and set a baseline performance for our model

In [1]:
try:
    from google.colab import drive
    drive.mount('/content/gdrive/')
    path = '/content/gdrive/MyDrive/NLP_Final_Project/' 
except:
    path = ''

print(f"Path is {path}")


Mounted at /content/gdrive/
Path is /content/gdrive/MyDrive/NLP_Final_Project/


In [2]:
import numpy as np 
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from collections import OrderedDict
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset, DataLoader

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


# Init

In [9]:
df = pd.read_csv(path+'train.csv')
y = np.array(df['is_duplicate'])
X = df[['id', 'qid1', 'qid2', 'question1', 'question2']]
X = np.array(X)
print(y.shape)
print(X.shape)

(404290,)
(404290, 5)


In [10]:
X_train = df[['question1','question2' ]]
X_train = np.array(X_train)
X_train[:,0]

array(['What is the step by step guide to invest in share market in india?',
       'What is the story of Kohinoor (Koh-i-Noor) Diamond?',
       'How can I increase the speed of my internet connection while using a VPN?',
       ..., 'What is one coin?',
       'What is the approx annual cost of living while studying in UIC Chicago, for an Indian student?',
       'What is like to have sex with cousin?'], dtype=object)

In [11]:
def train(train_dataset, test_dataset,feature_size):
    train_len = len(train_dataset)
    train_dataloader = DataLoader(train_dataset,batch_size=64)
    
    if test_dataset != None:
        test_len = len(test_dataset)
        test_dataloader = DataLoader(test_dataset,batch_size=64)

    model = LogisticModel(feature_size)
    model.to(device)
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2,betas=(0.9, 0.999))


    epochs = 500
    epoch_history = []
    fold_train_loss_history, fold_val_loss_history = [], []
    fold_val_acc_history = []
    
    for e in range(1,epochs+1):
        
        total_train_loss = 0
        total_train_correct = 0
        for batch_x, batch_y in train_dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
    
            # Compute prediction error
            pred = torch.squeeze(model(batch_x))
            loss = loss_fn(pred, batch_y)
            
            
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            #Save Training Loss and total correct
            total_train_loss += loss.item()
            out = (pred>0.5).float()
            total_train_correct += sum(out==batch_y).float().sum()
     
        if e == 1 or e % 10 == 0:
            with torch.no_grad():
                epoch_history.append(e)

                train_acc = total_train_correct / train_len
                avg_train_loss = total_train_loss / len(train_dataloader)
                fold_train_loss_history.append(avg_train_loss)
                if test_dataset != None:
                    total_val_correct = 0
                    total_val_loss = 0
                    for val_x, val_y in test_dataloader:
                        val_x, val_y = val_x.to(device), val_y.to(device)
                        val_pred = torch.squeeze(model(val_x))
                        val_out = (val_pred>0.5).float()
                        val_loss = loss_fn(val_pred, val_y)
                        total_val_loss += val_loss.cpu()
                        total_val_correct += sum(val_out==val_y).float().sum()

                    val_acc = total_val_correct / test_len
                    avg_val_loss = total_val_loss / len(test_dataloader)
                    fold_val_acc_history.append(val_acc.cpu())
                    fold_val_loss_history.append(avg_val_loss)
               

                    print(f"epoch: {e}, Train loss: {avg_train_loss:>4f},  Train Acc: {train_acc:>4f}, Val Loss: {avg_val_loss::>4f}, Val Acc: {val_acc:>4f}")
                else:
                    print(f"epoch: {e}, Train loss: {avg_train_loss:>4f},  Train Acc: {train_acc:>4f}")   
    return fold_train_loss_history, fold_val_loss_history, fold_val_acc_history, epoch_history,model

# Glove50 Logsitic Regregression Test


In [None]:
class CustomDataset():
    def __init__(self,X,y )-> None:
        

        word_embeddings = pd.read_csv(path+'glove.6B.50d.txt.zip',
                               header=None, sep=' ', index_col=0,
                               nrows=100000, compression='zip', encoding='utf-8', quoting=3)
        # Build a dict that will map from string word to 50-dim vector
        word_list = word_embeddings.index.values.tolist()
        word2vec = OrderedDict(zip(word_list, word_embeddings.values))
        

        q1 = []
        q2 = []
        a = []
        for (x1, x2),y1 in zip(X,y):
          if isinstance(x1, str) and isinstance(x2, str):
            q1.append(x1)
            q2.append(x2)
            a.append(y1)
          else:
            print(x1,x2)

        pair_len = len(q1)

        x_embed_q1 = np.zeros((pair_len, 50))
        x_embed_q2 = np.zeros((pair_len, 50))        
        for idx, (x1, x2) in enumerate(zip(q1,q2)):

          question1 = x1.strip().split(" ")
          question2 = x2.strip().split(" ")

          for word in question1:
            if word in word2vec:
              x_embed_q1[idx] += word2vec[word]
            x_embed_q1[idx] /= len(question1)

          for word in question2:
            if word in word2vec:
              x_embed_q2[idx] += word2vec[word]
            x_embed_q2[idx] /= len(question2)
        

        x_embed = np.concatenate((x_embed_q1,x_embed_q2),axis=1)
        print(x_embed.shape)

        self.feature_size = x_embed.shape[1]
        self.x_train = torch.Tensor(x_embed)[:100000]
        self.y_train = torch.Tensor(a)[:100000]
        print(self.feature_size, self.x_train.shape, self.y_train.shape)

    def __len__(self):
        return self.x_train.shape[0]
    
    def __getitem__(self, idx):
        return self.x_train[idx], self.y_train[idx]


In [22]:
class LogisticModel(nn.Module):
    def __init__(self,input_size):
        super(LogisticModel,self).__init__()
        self.w = nn.Linear(input_size,1)

    def forward(self, x):
        return F.sigmoid(self.w(x))

In [None]:
dataset = CustomDataset(X_train,y)


How can I develop android app? nan
How can I create an Android app? nan
nan My Chinese name is Haichao Yu. What English name is most suitable for me considering the pronounciation of my Chinese name?
(404287, 100)
100 torch.Size([100000, 100]) torch.Size([100000])
cuda


In [None]:
#For Testing
kfold = KFold(n_splits=10, shuffle=True)
kfold.split(dataset)
train_loss_history, val_loss_history = [], []
val_acc_history = []
for fold, (train_idxs, test_idxs) in enumerate(kfold.split(dataset)):

    print(f'Starting fold {fold}')
    train_dataset = torch.utils.data.Subset(dataset,train_idxs)
    test_dataset = torch.utils.data.Subset(dataset,test_idxs)
    
    fold_train_loss_history, fold_val_loss_history, fold_val_acc_history, epoch_history, _ = train(train_dataset=train_dataset,test_dataset=test_dataset,feature_size=dataset.feature_size)

    train_loss_history.append(fold_train_loss_history)
    val_loss_history.append(fold_val_loss_history)
    val_acc_history.append(fold_val_acc_history) 

# Sentence Bert Model

In [None]:
!pip install -U sentence-transformers

In [5]:
import pickle
import os
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

In [6]:
class SentenceTransformerDataset():
    def __init__(self,X,y, pair_len=0)-> None:
        
        model = SentenceTransformer('all-MiniLM-L6-v2')
        
        q1 = []
        q2 = []
        a = []
        for (x1, x2),y1 in zip(X,y):
          if isinstance(x1, str) and isinstance(x2, str):
            q1.append(x1)
            q2.append(x2)
            a.append(y1)
          else:
            print(x1,x2)

        pair_len = pair_len if pair_len else len(q1)

        q1, q2, a = q1[:pair_len], q2[:pair_len], a[:pair_len]
        x_embed_q1 = np.zeros((pair_len, 384))[:pair_len]
        x_embed_q2 = np.zeros((pair_len, 384)) [:pair_len]

        pool = model.start_multi_process_pool()


        #Start the multi-process pool on all available CUDA devices
        pool = model.start_multi_process_pool()

        step_size = 10000
        for start in tqdm(range(0, pair_len, step_size),total=pair_len//step_size):
          stop = start + step_size if (start + step_size) < pair_len else (pair_len - 1)
          x_embed_q1[start:stop] = model.encode_multi_process(q1[start:stop], pool)
          x_embed_q2[start:stop] = model.encode_multi_process(q2[start:stop], pool)

        #Optional: Stop the proccesses in the pool
        model.stop_multi_process_pool(pool)

        x_embed = np.concatenate((x_embed_q1,x_embed_q2),axis=1)
        
        print(x_embed.shape)

        self.feature_size = x_embed.shape[1]
        self.setences = [q1,q2]
        self.x_train = torch.Tensor(x_embed)
        self.y_train = torch.Tensor(a)
        print(self.feature_size, self.x_train.shape, self.y_train.shape)

    def __len__(self):
        return self.x_train.shape[0]
    
    def __getitem__(self, idx):
        return self.x_train[idx], self.y_train[idx]



print()






In [13]:

create_bert_encoding = True
file_name = 'bert_encoding_max.pkl'


run = 'y'
if create_bert_encoding: 
  if os.path.isfile(path+file_name):
    run = input(f"'{file_name}' exists in the drive! Would you like to replace? (y/n)")
  if run == 'y':
    bert_dataset = SentenceTransformerDataset(X_train,y,pair_len=0)
    # Save the object to a file
    print(f'Writing Dataset to Drive')
    with open(path+file_name, 'wb') as f:
        pickle.dump(bert_dataset, f)
    print(f'Dataset Written!')

if not create_bert_encoding or run != 'y':
  # Load the object from the file
  if not os.path.isfile(path+file_name):
    raise ValueError("No Dataset in drive")
  else:
    print(f'Loading Dataset from Drive')
    with open(path+file_name, 'rb') as f:
        bert_dataset = pickle.load(f)
    print(f'Dataset Load!')


'bert_encoding_max.pkl' exists in the drive! Would you like to replace? (y/n)n
Loading Dataset from Drive
Dataset Load!


In [32]:
from sentence_transformers import util

x_train, y_train = bert_dataset.x_train.numpy(), bert_dataset.y_train.numpy()
feature_size = bert_dataset.feature_size//2

sum = 0
for x, y in tqdm(zip(x_train, y_train), total=len(bert_dataset)):
  q1 = x[:feature_size]
  q2 = x[feature_size:]
  cosine_scores = util.cos_sim(q1, q2).item()
  sum += 1 if (cosine_scores >= 0.5)==y else 0 


percent = sum / len(bert_dataset)
print(f"Cosine Sim Acc: {percent}")



  0%|          | 0/404287 [00:00<?, ?it/s]

Cosine Sim Acc: 0.5915030658913099


In [28]:
#For Testing
kfold = KFold(n_splits=10, shuffle=True)
train_loss_history, val_loss_history = [], []
val_acc_history = []
for fold, (train_idxs, test_idxs) in enumerate(kfold.split(bert_dataset)):

    print(f'Starting fold {fold}')
    train_dataset = torch.utils.data.Subset(bert_dataset,train_idxs)
    test_dataset = torch.utils.data.Subset(bert_dataset,test_idxs)
    
    fold_train_loss_history, fold_val_loss_history, fold_val_acc_history, epoch_history, _ = train(train_dataset=train_dataset,test_dataset=test_dataset,feature_size=bert_dataset.feature_size)

    train_loss_history.append(fold_train_loss_history)
    val_loss_history.append(fold_val_loss_history)
    val_acc_history.append(fold_val_acc_history) 

Starting fold 0
epoch: 1, Train loss: 0.572008,  Train Acc: 0.709213, Val Loss: 0.568460, Val Acc: 0.710876


KeyboardInterrupt: ignored