In [3]:
from datasets import load_dataset
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
from gensim.models import Word2Vec

In [4]:
# Load the MS MARCO dataset from Hugging Face
dataset = load_dataset('ms_marco', 'v1.1')

# Split the dataset into training, validation, and test sets
train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

# ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers']

# Extract queries and documents
queries = train_dataset['query']
relevant_docs = train_dataset['passages']

test_queries = test_dataset['query']
test_relevant_docs = test_dataset['passages']

val_queries = val_dataset['query']
val_relevant_docs = val_dataset['passages']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
df = pd.DataFrame({'query': queries, 'relevant_docs': relevant_docs})
train_df = pd.DataFrame({'query': test_queries, 'relevant_docs': test_relevant_docs})
val_df = pd.DataFrame({'query': val_queries, 'relevant_docs': val_relevant_docs})

In [6]:
"""import os

# Set the desired directory path
directory = '/Users/kailashkumar/Downloads'

# Create the directory if it doesn't exist
os.makedirs(directory, exist_ok=True)

# Save the CSV files
df.to_csv(os.path.join(directory, 'train_ms_marco_data.csv'), index=False)
train_df.to_csv(os.path.join(directory, 'test_ms_marco_data.csv'), index=False)
val_df.to_csv(os.path.join(directory, 'val_ms_marco_data.csv'), index=False)"""


df.head()

Unnamed: 0,query,relevant_docs
0,what is rba,"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]..."
1,was ronald reagan a democrat,"{'is_selected': [0, 1, 0, 0, 0, 0, 0], 'passag..."
2,how long do you need for sydney and surroundin...,"{'is_selected': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]..."
3,price to install tile in shower,"{'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 1], '..."
4,why conversion observed in body,"{'is_selected': [0, 0, 1, 0, 0, 0, 0, 0], 'pas..."


In [7]:
# Print the sizes of each split
print("Training set size:", len(train_dataset))
print("Validation set size:", len(val_dataset))
print("Test set size:", len(test_dataset))

#Sample_queries
sample_size = 1000
sample_queries = queries[0:sample_size]
sample_relevant_docs = relevant_docs[0:sample_size]

print("\nShape of sample queries:", len(sample_queries))
print("Shape of sample relevant documents:", len(sample_relevant_docs))

Training set size: 82326
Validation set size: 10047
Test set size: 9650

Shape of sample queries: 1000
Shape of sample relevant documents: 1000


In [8]:
sample_queries[0], sample_relevant_docs[0]

('what is rba',
 {'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
  'passage_text': ["Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site.",
   "The Reserve Bank of Australia (RBA) came into being on 14 January 1960 as Australia 's central bank and banknote issuing authority, when the Reserve Bank Act 1959 removed the central banking functions from the Commonwealth Bank. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its h

In [9]:
new_sample_relevant_docs = []
for i in range(len(sample_relevant_docs)):
  new_sample_relevant_docs.append(sample_relevant_docs[i]['passage_text'])


all_passages = []
for i in new_sample_relevant_docs:
  for j in i:
    all_passages.append(j)

len(sample_queries), len(new_sample_relevant_docs), len(all_passages)

(1000, 1000, 8251)

In [10]:
new_sample_relevant_docs[0][0], all_passages[0]

("Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site.",
 "Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site.")

In [11]:
import numpy as np

def select_irrelevant_passages(relevant_docs):
    # Get the size of relevant passages
    size = len(relevant_docs)

    #Filter out passages that are present in relevant_passages
    irrelevant_passages_indices = [i for i in range(len(all_passages)) if all_passages[i] not in relevant_docs]

    #Select irrelevant passages randomly from filtered passages
    irrelevant_indices = np.random.choice(irrelevant_passages_indices, size, replace=False)
    irrelevant_passages = [all_passages[i] for i in irrelevant_indices]
    return irrelevant_passages

def generate_triplets(queries, relevant_docs):
    triplets = []
    for i, query in enumerate(queries):
        relevant_doc = relevant_docs[i]
        negative_doc = select_irrelevant_passages(relevant_doc)
        triplets.append((query, relevant_doc, negative_doc))
    return triplets

triplets_train = generate_triplets(sample_queries, new_sample_relevant_docs)

triplets_train[0]

('what is rba',
 ["Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site.",
  "The Reserve Bank of Australia (RBA) came into being on 14 January 1960 as Australia 's central bank and banknote issuing authority, when the Reserve Bank Act 1959 removed the central banking functions from the Commonwealth Bank. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumpti

In [12]:
# Ensuring relevant and  non-relevant documents have same size
for i in range(len(triplets_train)):
  if len(triplets_train[i][1]) != len(triplets_train[i][2]):
    print("mismatch")

In [13]:
count = 0
for sent in triplets_train:
  count += len(sent[1])

print(count)
len(triplets_train)

8251


1000

In [14]:
transformed_list = []
for item in triplets_train:
  lent = len(item[1])
  for i in range(lent):
    transformed_list.append((item[0],item[1][i],item[2][i]))



len(transformed_list), transformed_list[:5]

(8251,
 [('what is rba',
   "Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site.",
   'There is a trend to modernize data centers in order to take advantage of the performance and energy efficiency increases of newer IT equipment and capabilities, such as cloud computing. This process is also known as data center transformation. In May 2011, data center research organization Uptime Institute reported that 36 percent of the large companies it surveyed expect to exhaust IT capacity within the next 18 months. Data center transformation takes a step-by-step 

In [15]:
type(transformed_list[0][0]), type(transformed_list[0][1]), type(transformed_list[0][2])

(str, str, str)

In [16]:
# Sentencepiece
import sentencepiece as spm
import torchtext

spm.SentencePieceTrainer.Train('--input=wiki_text.txt --model_prefix=model_1 --vocab_size=10000 --model_type=unigram --model_type=word')
sp = spm.SentencePieceProcessor()
sp.load('model_1.model')

sp.get_piece_size()

10000

In [17]:
print(sp.encode_as_pieces('Eli lilly!'))
print(sp.encode_as_ids('Eli lilly!'))
print(sp.decode_pieces(['▁E', 'li', '▁li', 'lly', '!']))
print(sp.decode_ids([74, 1610, 1714, 1599, 976]))

['▁E', 'li', '▁li', 'lly', '!']
[74, 1610, 1714, 1599, 976]
Eli lilly!
Eli lilly!


In [18]:
# Embedding using gensim in word2vec format
embedding_size = 300

def tokenize_and_embed(filepath):
  with open(filepath, 'r') as f:
    text = f.read()
  tokens = sp.encode_as_pieces(text)

  model = Word2Vec(sentences = [tokens], vector_size = embedding_size, min_count = 1)
  word2vec_embedding = model.wv
  return word2vec_embedding

embeddings = tokenize_and_embed("/content/wiki_text.txt")
print(f"The length of the embedding is {len(embeddings)}")

The length of the embedding is 9746


In [19]:
embeddings.index_to_key[:10]

[',', '▁the', '.', 's', '▁', '▁of', '▁and', '▁in', '▁a', '▁to']

##Embedding

In [20]:
missing_embedding = []
properly_embedded = []

def get_embedding(word, embeddings):
  if word in embeddings:
    properly_embedded.append(word)
    return embeddings[word]
  else:
    missing_embedding.append(word)
    return np.zeros(embeddings.vector_size)

def tokenize(text):
  text = text.lower().replace('\n','')
  tokens = sp.encode_as_pieces(text)
  return tokens

# Create a list to store word embeddings for the new data
query_embeddings, rel_data_embeddings, non_rel_data_embeddings = [], [], []

for i,j,k in transformed_list:
  # Get word embeddings for each token and average them
  query_embedding = [get_embedding(token, embeddings) for token in tokenize(i)]
  query_embeddings.append(query_embedding)

  rel_data_embedding = [get_embedding(token, embeddings) for token in tokenize(j)]
  rel_data_embeddings.append(rel_data_embedding)

  non_rel_data_embedding = [get_embedding(token, embeddings) for token in tokenize(k)]
  non_rel_data_embeddings.append(non_rel_data_embedding)

print(f"The missing_embedding are {len(missing_embedding)} in total")
print(f"The properly_embedded are {len(properly_embedded)} in total\n")

len(query_embeddings), len(rel_data_embeddings), len(non_rel_data_embeddings), len(query_embeddings[0]), len(rel_data_embeddings[0]), len(non_rel_data_embeddings[0])


The missing_embedding are 14180 in total
The properly_embedded are 2126294 in total



(8251, 8251, 8251, 5, 144, 123)

In [21]:
query_embeddings[0]

[array([-5.25803538e-04,  2.73320312e-03,  9.59745259e-04,  1.60894473e-03,
        -2.72297626e-03,  1.06581450e-04, -2.62446888e-03,  1.73834758e-03,
        -5.61952584e-05,  2.94386555e-04,  5.81946399e-04, -9.32405004e-04,
         1.19037228e-03, -9.67824453e-05, -2.88602151e-03, -3.18788202e-03,
         2.94460007e-03, -2.22537247e-03,  7.82318937e-04,  2.86475779e-03,
         2.07802490e-03, -2.86427489e-03,  3.07741167e-04, -8.40481545e-04,
         2.68015219e-03,  2.24711897e-04,  3.29057849e-03,  9.32275434e-04,
        -1.84917846e-03, -1.01081526e-03, -2.93991994e-03,  3.82645521e-04,
        -1.32458447e-03, -2.51732790e-03,  5.54287457e-04,  3.19824857e-03,
         2.48474511e-03,  3.31913191e-03, -2.78100139e-03,  6.71157439e-04,
         2.54838634e-03,  5.05964330e-04,  1.03178900e-03, -2.93698115e-03,
         2.58134725e-03,  2.32951995e-03, -6.82234764e-04,  2.36323080e-03,
        -1.73923571e-03,  1.31844636e-03,  5.69057447e-05,  1.08566124e-03,
        -1.8

In [None]:
# creating tensors out of array
query_tensors = [torch.tensor(embedding).float() for embedding in query_embeddings]
rel_data_tensors = [torch.tensor(embedding).float() for embedding in rel_data_embeddings]
non_rel_data_tensors = [torch.tensor(embedding).float() for embedding in non_rel_data_embeddings]

len(query_tensors), len(rel_data_tensors), len(non_rel_data_tensors), query_tensors[66].shape, rel_data_tensors[66].shape, non_rel_data_tensors[66].shape

  query_tensors = [torch.tensor(embedding).float() for embedding in query_embeddings]


In [None]:
# Pad sequences to a common length (assuming you want to pad to the maximum sequence length)
que_max_seq_length = max(len(embedding) for embedding in query_embeddings)
print(f"max query sequence len is {que_max_seq_length}")

doc_max_seq_length = max(len(embedding) for embedding in rel_data_embeddings + non_rel_data_embeddings)
print(f"max query sequence len is {doc_max_seq_length}")

query_padded = [F.pad(embedding, pad=(0, 0, 0, que_max_seq_length - embedding.size(0))) for embedding in query_tensors]
rel_data_padded = [F.pad(embedding, pad=(0, 0, 0, doc_max_seq_length - embedding.size(0))) for embedding in rel_data_tensors]
non_rel_data_padded = [F.pad(embedding, pad=(0, 0, 0, doc_max_seq_length - embedding.size(0))) for embedding in non_rel_data_tensors]


len(query_padded), len(rel_data_padded), len(non_rel_data_padded), query_padded[66].shape, rel_data_padded[66].shape, non_rel_data_padded[66].shape

In [None]:
# Stack tensors into batches
query_batch = torch.stack(query_padded)
rel_data_batch = torch.stack(rel_data_padded)
non_rel_data_batch = torch.stack(non_rel_data_padded)

len(query_batch), len(rel_data_batch), len(non_rel_data_batch), query_batch[66].shape, rel_data_batch[66].shape, non_rel_data_batch[66].shape

## RNN, GRU, LSTM

In [None]:
class TwoTowerModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_layers, output_size, batch_first=True):
        super(TwoTowerModel, self).__init__()
        # RNN encoders for queries and documents
        self.query_encoder = nn.GRU(embedding_dim, hidden_dim, num_layers, batch_first=batch_first)
        self.doc_encoder = nn.GRU(embedding_dim, hidden_dim, num_layers, batch_first=batch_first)

        # Final linear layer to project last hidden state to output size
        self.query_fc = nn.Linear(hidden_dim, output_size, dtype=torch.float, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
        self.doc_fc = nn.Linear(hidden_dim, output_size, dtype=torch.float, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

    def forward(self, query_batch = None, rel_data_batch = None, non_rel_data_batch = None):
        query_representation = None
        rel_doc_representation = None
        non_rel_doc_representation = None

        if query_batch is not None:
          query_encoding, _ = self.query_encoder(query_batch)
          last_query_hidden = query_encoding[:, -1, :]  # Get the last hidden state from the sequence output # Select last element from sequence dimension
          query_representation = self.query_fc(last_query_hidden)  # Apply the final linear layer to project to desired output size

        if rel_data_batch is not None:
          rel_doc_encoding, _ = self.doc_encoder(rel_data_batch)
          last_rel_doc_hidden = rel_doc_encoding[:, -1, :]
          rel_doc_representation = self.doc_fc(last_rel_doc_hidden)

        if non_rel_data_batch is not None:
          non_rel_doc_encoding, _ = self.doc_encoder(non_rel_data_batch)
          last_non_rel_doc_hidden = non_rel_doc_encoding[:, -1, :]
          non_rel_doc_representation = self.doc_fc(last_non_rel_doc_hidden)

        return query_representation, rel_doc_representation, non_rel_doc_representation


def triplet_loss(query_encoding, rel_doc_encoding, non_rel_doc_encoding, margin):
      dist_rel = 1 - F.cosine_similarity(query_encoding, rel_doc_encoding)
      dist_non_rel = 1 - F.cosine_similarity(query_encoding, non_rel_doc_encoding)
      # Calculate triplet loss
      loss = torch.clamp(margin + dist_rel - dist_non_rel, min = 0)
      return loss.mean()

# Create an instance of the two-tower model
embedding_dim = 50
hidden_dim = 128
output_size = 64
num_layers = 1
batch_first = True
num_epochs = 50

# Define optimizer
learning_rate = 0.01
momentum = 0.9
loss_margin = 1.0

# Instantiate the model
two_tower_model = TwoTowerModel(embedding_dim, hidden_dim, num_layers, output_size, batch_first)

# Set the optimizer
optimizer = optim.SGD(two_tower_model.parameters(), lr=learning_rate, momentum = momentum)

loss_data = []
for epoch in range(num_epochs):
    query_encoding, rel_doc_encoding, non_rel_doc_encoding = two_tower_model(query_batch, rel_data_batch, non_rel_data_batch)
    if epoch == 0:
      print("\nThe encoded outputs look like")
      print(query_encoding.shape, rel_doc_encoding.shape, non_rel_doc_encoding.shape)
      print("\n")
    loss = triplet_loss(query_encoding, rel_doc_encoding, non_rel_doc_encoding, margin=loss_margin)
    loss_data.append(loss)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (epoch % 1 == 0):
      print(f"Epoch {epoch+1}, Loss: {loss.item()}")

print("Training completed!")

In [None]:
import matplotlib.pyplot as plt

loss_values = [loss.item() for loss in loss_data] #converting tensors
# Plotting the loss data
plt.plot(loss_values)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss over Epochs')
plt.show()

## Inference

In [None]:
# Cachings docs
docs = []

for i,j,k in transformed_list:
  docs.append(j)
print(docs[1])

new_docs = list(set(docs))
new_docs = new_docs[:10]
print(f"The unique docs are {len(new_docs)}")
print(new_docs[0])
print(new_docs[1])

def preprocess_doc(docs):
  doc_embedding = []
  for i in docs:
    data_embeddings = [get_embedding(token, embeddings) for token in tokenize(i)]
    doc_embedding.append(data_embeddings)
  data_tensors = [torch.tensor(embedding).float() for embedding in doc_embedding]
  data_padded = [F.pad(embedding, pad=(0, 0, 0, doc_max_seq_length - embedding.size(0))) for embedding in data_tensors]
  data_batch = torch.stack(data_padded)
  return data_batch

cached_docs = preprocess_doc(docs)
# Encode documents (assuming cached_docs is preprocessed)
_, document_encodings, _ = two_tower_model.forward(None, cached_docs, None)

docs_dict = {item: document_encoding for item, document_encoding in zip(new_docs, document_encodings)}

cached_docs.shape, cached_docs[0][0], document_encodings[0], document_encodings[1], document_encodings[0] == document_encodings[1]

In [None]:
que_max_seq_length = 8

def preprocess_query(queries):
  query_embedding = []
  for i in queries:
    query_embeddings = [get_embedding(token, embeddings) for token in tokenize(i)]
    query_embedding.append(query_embeddings)

  query_tensors = [torch.tensor(embedding).float() for embedding in query_embedding]
  query_padded = [F.pad(embedding, pad=(0, 0, 0, que_max_seq_length - embedding.size(0))) for embedding in query_tensors]
  query_batch = torch.stack(query_padded)
  return query_batch


test_queries = ["Where is india"]
test_query_batch = preprocess_query(test_queries)

# Encode the query using a separate call
query_encodings, _, _ = two_tower_model.forward(test_query_batch, None, None)

test_query_batch.shape, test_query_batch[0][0]

In [None]:
def predict(query_encodings, docs_dict, k):
    # Calculate cosine similarity scores
    similarity_scores = [F.cosine_similarity(query_encodings.unsqueeze(0), docs_dict[doc_id].unsqueeze(0), dim=1) for doc_id in docs_dict.keys()]
    similarity_scores = torch.cat(similarity_scores, dim=1)

    # Rank documents by similarity
    ranked_documents = sorted(zip(docs_dict.keys(), similarity_scores[0]), key=lambda x: x[1], reverse=True)

    # Retrieve top-K documents
    top_k_documents = ranked_documents[:k]

    return top_k_documents

k = 10
prediction = predict(query_encodings, docs_dict, k)
for i,j in enumerate(prediction):
  print(f"\nPrediction {i+1}:\n")