In [1]:
import numpy as np
import pandas as pd
from math import *
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import time as ti
from sklearn.model_selection import train_test_split
import re
from string import punctuation
from nltk.stem import SnowballStemmer
!pip install gensim --upgrade 
import gensim
from copy import deepcopy

Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 44.8 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0


In [2]:
class Quora_dataset2(torch.utils.data.Dataset):
    # enable to create a dataset for the data loader
    # https://towardsdatascience.com/building-efficient-custom-datasets-in-pytorch-2563b946fd9f
    def __init__(self, df1, vocab):
        self.df1 = df1
        self.MAX_SEQUENCE_LENGTH = 30
        self.vocab = vocab
        self.prepare_loader()

    def prepare_loader(self):
        # create to dictionnary for each row of the dataframe
        values = self.df1.values
        cles = self.df1.columns
        questions = []
        for vals in values:
            #converts into a list:
            question1_list = gensim.utils.simple_preprocess(vals[0].encode('utf-8'))
            question1_list = [word for word in question1_list if word in self.vocab][:self.MAX_SEQUENCE_LENGTH]
            question2_list = gensim.utils.simple_preprocess(vals[0].encode('utf-8'))
            question2_list = [word for word in question2_list if word in self.vocab][:self.MAX_SEQUENCE_LENGTH]
            questions.append({cles[0] : question1_list, cles[1] : question2_list})
        self.sample = list(zip(questions, values[:, 2]))

    def __len__(self):
        return len(self.sample)

    def __getitem__(self, idx):
        return self.sample[idx]

In [7]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import os
os.chdir('/content/drive/My Drive/')
df = pd.read_csv("train.csv.zip")
df = df.dropna()

## The importance of cleaning the text

https://www.kaggle.com/code/currie32/the-importance-of-cleaning-text/notebook

In [9]:
stop_words = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']

def text_to_wordlist(text, remove_stop_words = False, stem_words = False):
    # Clean the text, with the option to remove stop_words and to stem words.

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    # text = re.sub(r"what's", "", text)
    # text = re.sub(r"What's", "", text)
    # text = re.sub(r"\'s", " ", text)
    text = re.sub(r'\s+', " ", text) # remove new lines
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"switzerland", "Switzerland", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text) 
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", "direct messages ", text)  
    text = re.sub(r"demonitization", "demonetization", text) 
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text) 
    text = re.sub(r" upvotes ", " up votes ", text)
    # text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text) 
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text) 
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text) 
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)
    
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [10]:
#https://www.kaggle.com/code/talha1503/quora-question-pairs-bi-lstm-pytorch
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am",'i\'m':'i am', "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled'}

def clean_contractions(text, mapping):
    text = text.lower()
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else mapping[t.lower()] if t.lower() in mapping else t for t in text.split(" ")])
    return text

df['question1'] = df['question1'].apply(lambda x: clean_contractions(str(x),contraction_mapping))
df['question2'] = df['question2'].apply(lambda x: clean_contractions(str(x),contraction_mapping))
questions1 = df["question1"].values
questions2 = df["question2"].values
questions1 = [text_to_wordlist(question) for question in questions1]
questions2 = [text_to_wordlist(question) for question in questions2]
df["question1"] = questions1
df["question2"] = questions2
columns = ["question1", "question2", "is_duplicate"] #only keeps relevant columns
df = df[columns]
df.to_csv("clean_train.csv")

## We train a Word2Vec model

As we have many sentences

https://www.kaggle.com/code/liananapalkova/simply-about-word2vec

In [11]:
All_questions = np.concatenate([df["question1"].values, df["question2"].values])
temps1 = ti.perf_counter()
All_questions = [gensim.utils.simple_preprocess(question.encode('utf-8')) for question in All_questions]
# converts questions to a list of words
print(ti.perf_counter() - temps1)

27.305018055000005


In [12]:
v_size = 200 #size of the embedding
model_W2V = gensim.models.Word2Vec(vector_size=v_size, window=10, min_count=2, sg=1, workers=10)
model_W2V.build_vocab(All_questions)  # prepare the model vocabulary

temps1 = ti.perf_counter()
model_W2V.train(All_questions, total_examples = len(All_questions), epochs = 10)
print(ti.perf_counter() - temps1)
model_W2V.save("quoraW2V.model")

441.22376084499996


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
df = pd.read_csv("clean_train.csv", index_col=[0])
print(1 - df["is_duplicate"].mean())
questions1 = df["question1"]
questions2 = df["question2"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

0.630799407351708
cuda


In [31]:
v_size = 200
MAX_SEQUENCE_LENGTH = 30
model_W2V = gensim.models.Word2Vec.load("quoraW2V.model")

df_train, df_test0 = train_test_split(df, test_size = 0.1, random_state = 0)
df_test, df_valid = train_test_split(df_test0, test_size = 0.05, random_state = 0)
print(df_train.shape, df_test0.shape, df_valid.shape)

(363858, 3) (40429, 3) (2022, 3)


## We train a Word2Vec model

As we have many sentences

https://www.kaggle.com/code/liananapalkova/simply-about-word2vec

In [32]:
vocab = model_W2V.wv.index_to_key #list of vectorized words

In [34]:
def custom_collate(data):
    """Custom method to treat the mini-batch before loading"""
    text_batch, labels = deepcopy(zip(*data)) #need to copy otherwise changes in dataset
    bs = len(text_batch)
    tensor1 = torch.zeros(bs, MAX_SEQUENCE_LENGTH, v_size)
    tensor2 = torch.zeros(bs, MAX_SEQUENCE_LENGTH, v_size)
    for n, text_dico in enumerate(text_batch):
        question1 = text_dico["question1"]
        question2 = text_dico["question2"]
        try:
            embedding_array1 = model_W2V.wv[question1]
            embedding_array2 = model_W2V.wv[question2]
        except ValueError:
            continue #keeps 0
        tensor1[n, :len(question1)] = torch.from_numpy(embedding_array1)
        tensor2[n, :len(question2)] = torch.from_numpy(embedding_array2)
    dico_return = {"question1" : tensor1, "question2" : tensor2}
    labels = torch.tensor(labels).float().reshape(1, -1)
    return dico_return, labels

In [35]:
train_set = Quora_dataset2(df_train, vocab)
valid_set = Quora_dataset2(df_valid, vocab)

In [36]:
bs = 32
params = {'batch_size' : bs,
          'shuffle' : True,
          'num_workers' : 0,
          'collate_fn' : custom_collate}
trainloader = torch.utils.data.DataLoader(train_set, **params)
valid_loader = torch.utils.data.DataLoader(valid_set, **params)

In [37]:
class SimpleModel(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm1 = nn.LSTM(v_size, hidden_size, batch_first = True, bidirectional = True)
        self.lstm2 = nn.LSTM(v_size, hidden_size, batch_first = True, bidirectional = True)
        self.fc1 = nn.Linear(hidden_size * 4 * MAX_SEQUENCE_LENGTH, hidden_size * 2)
        self.fc2 = nn.Linear(hidden_size * 2, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)
        self.drop = nn.Dropout(.2)

    def __call__(self, x_dic):
        x1 = x_dic["question1"]
        x2 = x_dic["question2"]
        x1 = x1.cuda()
        x2 = x2.cuda()
        x11 = self.lstm1(x1)
        x22 = self.lstm2(x2)
        x = torch.cat([x11[0], x22[0]], axis = 2) #concatenate for FC layer
        x = x.view(-1, MAX_SEQUENCE_LENGTH * self.hidden_size * 4) #flattens
        #print(x.shape)
        x = F.relu(self.fc1(self.drop(x)))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [40]:
model_simple = SimpleModel(50)
model_simple.to(device)
n_epochs = 30
lr = .001
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_simple.parameters(),lr = lr) 

In [41]:
temps1 = ti.perf_counter()
total = len(train_set)
dis_loss = 500
for epoch in range(n_epochs):
    running_loss = 0
    model_simple.train()
    for i, data in enumerate(trainloader):
        x_dic, labels = data
        optimizer.zero_grad()
        outputs = model_simple(x_dic).T
        loss = criterion(outputs, labels.cuda())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 20 == 0:
            print('\r', '{:.2%}'.format(i / (total // bs)), end = '', sep = '')
        if i % dis_loss == dis_loss - 1 or i == 0:    # print every dis_loss mini-batches
            print('\r', f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / dis_loss / bs:.4f}')
            running_loss = 0.0
        # if i == 0:
        #     break

    running_loss = 0
    running_accuracy = 0
    model_simple.eval()
    for i, data in enumerate(valid_loader):
        x_dic, labels = data
        outputs = model_simple(x_dic).T
        loss = criterion(outputs, labels.cuda())
        running_loss += loss.item()
        sigmoid = nn.Sigmoid()(outputs)
        predictions = torch.round(sigmoid)
        running_accuracy += (predictions == labels.cuda()).sum()
    print('\r', f"validation loss: {running_loss / df_valid.shape[0]:.4f}")
    print('\r', f"validation accuracy: {running_accuracy / df_valid.shape[0]:.4f}")

print("    ", ti.perf_counter() - temps1)
print('Finished Training')

 [1,     1] loss: 0.0000
 [1,   500] loss: 0.0185
 [1,  1000] loss: 0.0175
 [1,  1500] loss: 0.0174
 [1,  2000] loss: 0.0172
 [1,  2500] loss: 0.0168
 [1,  3000] loss: 0.0169
 [1,  3500] loss: 0.0168
 [1,  4000] loss: 0.0168
 [1,  4500] loss: 0.0165
 [1,  5000] loss: 0.0166
 [1,  5500] loss: 0.0165
 [1,  6000] loss: 0.0165
 [1,  6500] loss: 0.0166
 [1,  7000] loss: 0.0165
 [1,  7500] loss: 0.0161
 [1,  8000] loss: 0.0164
 [1,  8500] loss: 0.0162
 [1,  9000] loss: 0.0161
 [1,  9500] loss: 0.0161
 [1, 10000] loss: 0.0161
 [1, 10500] loss: 0.0161
 [1, 11000] loss: 0.0161
 validation loss: 0.0161
 validation accuracy: 0.7537
 [2,     1] loss: 0.0000
 [2,   500] loss: 0.0156
 [2,  1000] loss: 0.0155
 [2,  1500] loss: 0.0153
 [2,  2000] loss: 0.0154
 [2,  2500] loss: 0.0154
 [2,  3000] loss: 0.0154
 [2,  3500] loss: 0.0157
 [2,  4000] loss: 0.0154
 [2,  4500] loss: 0.0152
 [2,  5000] loss: 0.0155
 [2,  5500] loss: 0.0155
 [2,  6000] loss: 0.0153
 [2,  6500] loss: 0.0154
 [2,  7000] loss: 0.0

In [42]:
#Possibility to verify on the test part
test_set = Quora_dataset2(df_test, vocab)

bs = 32
params = {'batch_size' : bs,
          'shuffle' : True,
          'num_workers' : 0,
          'collate_fn' : custom_collate}

test_loader = torch.utils.data.DataLoader(test_set, **params)

running_loss = 0
running_accuracy = 0
model_simple.eval()
for i, data in enumerate(test_loader):
    x_dic, labels = data
    outputs = model_simple(x_dic).T
    loss = criterion(outputs, labels.cuda())
    running_loss += loss.item()
    sigmoid = nn.Sigmoid()(outputs)
    predictions = torch.round(sigmoid)
    running_accuracy += (predictions == labels.cuda()).sum()

In [43]:
print('\r', f"testing accuracy: {running_accuracy / df_test.shape[0]:.4f}")

 testing accuracy: 0.7822
