In [1]:
pip install SentencePiece

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
from tqdm import tqdm

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU available")
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU")

GPU available


In [3]:
dataset = load_dataset('ms_marco', 'v1.1')
dataset

DatasetDict({
    validation: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 10047
    })
    train: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 82326
    })
    test: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 9650
    })
})

In [4]:
train_data = dataset['train']
validation_data = dataset['validation']
test_data = dataset['test']

train_df = train_data.to_pandas()
train_df.drop(['query_id', 'query_type', 'wellFormedAnswers'], axis=1, inplace=True)
train_df = train_df[:100] #Using small amount of data
train_df.head()

Unnamed: 0,answers,passages,query
0,[Results-Based Accountability is a disciplined...,"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]...",what is rba
1,[Yes],"{'is_selected': [0, 1, 0, 0, 0, 0, 0], 'passag...",was ronald reagan a democrat
2,[20-25 minutes],"{'is_selected': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]...",how long do you need for sydney and surroundin...
3,[$11 to $22 per square foot],"{'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 1], '...",price to install tile in shower
4,[Due to symptoms in the body],"{'is_selected': [0, 0, 1, 0, 0, 0, 0, 0], 'pas...",why conversion observed in body


In [5]:
passage = []
answer = []

for i in range(len(train_df)):
    x = train_df['answers'][i].tolist()
    if len(x)==0:
        x = "-"
    passage.append((train_df['passages'][i]['passage_text']).tolist())
    answer.append(x[0])

train_df['passages'] = passage
train_df['answers'] = answer
train_df = train_df[['query', 'answers', 'passages']]
train_df.head()

Unnamed: 0,query,answers,passages
0,what is rba,Results-Based Accountability is a disciplined ...,"[Since 2007, the RBA's outstanding reputation ..."
1,was ronald reagan a democrat,Yes,"[In his younger years, Ronald Reagan was a mem..."
2,how long do you need for sydney and surroundin...,20-25 minutes,"[Sydney, New South Wales, Australia is located..."
3,price to install tile in shower,$11 to $22 per square foot,"[In regards to tile installation costs, consum..."
4,why conversion observed in body,Due to symptoms in the body,"[Conclusions: In adult body CT, dose to an org..."


In [6]:
unique_docs = []
for i in range(len(train_df)):
    x = train_df['passages'][i]
    for j in range(len(x)):
        unique_docs.append(x[j])
print(len(unique_docs))
unique_docs = list(set(unique_docs))
unique_docs[:5]

814


["Metabolic acidosis, as a disruption of the body's acid/base balance, can be a mild symptom brought on by a lack of insulin, a starvation diet, or a gastrointestinal disorder like vomiting and diarrhea. Metabolic acidosis can indicate a more serious problem with a major organ like the liver, heart, or kidneys. In compensated acidosis pH may be normal, with a high blood bicarbonate concentration (due to increased renal retention) when the cause was respiratory, or with a low blood carbon dioxide (due to hyperventilation) when the cause was metabolic. See also glycolysis, lactic acid.",
 "4. Get work at a private detective agency. This is typically the first step before you can go out on your own. Most of these agencies are small, without much room for advancement, and they do not have definite steps or ranks that you can follow to move up in your career. Gain work experience. Once you've become a police officer, you'll need to put in your best effort for at least three years before you

In [93]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer

# Load pre-trained models and tokenizers
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')
sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')

# Load and encode documents
all_docs = ["Involuntary muscles are muscles that are not controllable consciously.",
            "Muscles ",
            "Voluntary ",
            "paloma"]
    
all_docs = all_docs
doc_embeddings = sentence_transformer.encode(all_docs)

# Create FAISS index and store document embeddings
import faiss
index = faiss.IndexFlatIP(doc_embeddings.shape[1])
index.add(doc_embeddings)

# Encode query and retrieve top-k relevant documents
query = "example of involuntary muscle tissue is"
answer = "yes, you are right"
query_embedding = sentence_transformer.encode([query])[0]
k = 2
_, topk_doc_indices = index.search(query_embedding.reshape(1, -1), k)
top_docs = [all_docs[idx] for idx in topk_doc_indices[0]]
print(f"\ntop_docs are :")
for i in top_docs:
    print(i)

# Compute softmax scores for top-k documents
topk_doc_scores, _ = index.search(query_embedding.reshape(1, -1), k)
topk_doc_scores = F.softmax(torch.tensor(topk_doc_scores[0]), dim=0)
print(f"\nsoftmax_scores are \n{topk_doc_scores}")

# Prepare input for encoder-decoder model
inputs = []
for i, doc_idx in enumerate(topk_doc_indices[0]):
    inputs.append((f"{query} {all_docs[doc_idx]}", answer, topk_doc_scores[i].item()))
print(f"\ninputs are :")
for i in inputs:
    print(i)

# Define custom dataset and dataloader
class QADataset(torch.utils.data.Dataset):
    def __init__(self, inputs):
        self.inputs = inputs

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx]

dataset = QADataset(inputs)
print(f"\ndataset is {dataset}")
for i, j, k in dataset:
    print(i)
    print(j)
    print(k)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)
print(f"\ndataloader is {dataloader}")
for i, j, k in dataloader:
    print(i)
    print(j)
    print(k)

# Define model
class QAModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.t5 = t5_model

    def forward(self, input_ids, attention_mask, decoder_input_ids, labels=None):
        outputs = self.t5(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            # decoder_attention_mask=decoder_attention_mask,
            labels=labels
        )
        return outputs

model = QAModel()
epochs = 1
# Training loop
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

for epoch in range(epochs):
    for inputs, answer, score in dataloader:
        inputs_dict = t5_tokenizer(inputs, return_tensors='pt', padding=True, truncation=True, max_length=512)
        input_ids = inputs_dict['input_ids']
        attention_mask = inputs_dict['attention_mask']

        print(f"\ninput_ids are {input_ids.shape}\n{input_ids}")
        print(f"input attention_mask are {attention_mask.shape}\n{attention_mask}")

        answer_dict = t5_tokenizer(answer, return_tensors='pt', padding=True, truncation=True, max_length=512)
        answer_dict_ids = answer_dict['input_ids']
        answer_dict_attention_mask = answer_dict['attention_mask']
        print(f"\nanaswer_dict are {answer_dict_ids.shape}\n{answer_dict_ids}")
        print(f"anaswer_dict_attention_mask are {answer_dict_attention_mask.shape}\n{answer_dict_attention_mask}")
        
        print("" + "-" * 100)

        decoder_inputs = answer_dict_ids[:, :-1].contiguous()
        print(f"\ndecoder_inputs is {decoder_inputs.shape}\n{decoder_inputs}")
        labels = answer_dict_ids[:, 1:].contiguous()
        print(f"labels is {labels.shape}\n{labels}")
        print(f"\nscore is {score.shape}\n{score}")

        outputs = model(input_ids, attention_mask, decoder_inputs, labels=labels)
        print("Decoder done" + "-" * 100)
        print(f"\noutputs is {outputs.keys()}\n\nloss is {outputs['loss']}\n\nlogits is {outputs['logits'].shape}\n{outputs['logits']}")
        print("Printed outputs" + "-" * 100)
        loss = outputs.loss # I wont be interested in this loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



top_docs are :
Involuntary muscles are muscles that are not controllable consciously.
Muscles 

softmax_scores are 
tensor([0.5555, 0.4445])

inputs are :
('example of involuntary muscle tissue is Involuntary muscles are muscles that are not controllable consciously.', 'yes, you are right', 0.5554922223091125)
('example of involuntary muscle tissue is Muscles ', 'yes, you are right', 0.44450777769088745)

dataset is <__main__.QADataset object at 0x7f2df7d81270>
example of involuntary muscle tissue is Involuntary muscles are muscles that are not controllable consciously.
yes, you are right
0.5554922223091125
example of involuntary muscle tissue is Muscles 
yes, you are right
0.44450777769088745

dataloader is <torch.utils.data.dataloader.DataLoader object at 0x7f2dfb61c100>
('example of involuntary muscle tissue is Muscles ', 'example of involuntary muscle tissue is Involuntary muscles are muscles that are not controllable consciously.')
('yes, you are right', 'yes, you are right')
ten

In [83]:
answer = "yes, you are right"
tokens = ['▁yes', ',', '▁you', '▁are', '▁right']
print(t5_tokenizer.tokenize(answer), t5_tokenizer.convert_tokens_to_ids(tokens))


['▁yes', ',', '▁you', '▁are', '▁right'] [4273, 6, 25, 33, 269]


In [43]:
t5_tokenizer.decode(1), t5_tokenizer.decode(0),  t5_tokenizer.pad_token_id

('</s>', '<pad>', 0)

In [27]:
inputs_dict = t5_tokenizer(inputs, return_tensors='pt', padding=True, truncation=True, max_length=512)
input_ids = inputs_dict['input_ids']
attention_mask = inputs_dict['attention_mask']

print(f"\ninput_ids are \n{input_ids}")
print(f"attention_mask are \n{attention_mask}")


input_ids are 
tensor([[  677,    13,    16,  4571, 14016,   651,  5467,  6316,    19,    86,
          4571, 14016,   651,  7654,    33,  7654,    24,    33,    59,   610,
            40,   179,     3, 26355,     5,     1,     0],
        [  677,    13,    16,  4571, 14016,   651,  5467,  6316,    19,  6887,
          2482,     7,    33, 12910,   321,  5014,   120,    38,   893, 18545,
            42,    16,  4571, 14016,   651,     5,     1]])
attention_mask are 
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])
