# Importing Libraries

In [None]:
!pip install faiss-cpu

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [1]:
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import Counter
import os
import re
import string
from transformers import DistilBertTokenizer, DistilBertModel, GPT2LMHeadModel, AutoTokenizer
import faiss

# Loading data for content specific answer

In [2]:
# Loading Text files for content specific answers
directory = '/content/drive/MyDrive/en'

# Function to clean text
def clean_text(text):
    text = re.sub(r'\[\d+\]', '', text)  # Removing citations
    text = re.sub(r'\(\d+\)', '', text)
    text = re.sub(r'\n+', ' ', text)  # Removing new lines
    text = re.sub(r'\s+', ' ', text)  # Replacing multiple spaces with one space
    text = text.translate(str.maketrans('', '', string.punctuation))  # Removing punctuations
    text = re.sub(r'\d+', '', text)  # Removing any numbers
    text = text.lower()  # Making text Lowercase
    return text

# Function to chunk a document into smaller chunks
def chunk_document(text, chunk_size=100):
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

# List to store all chunked documents
chunked_documents = []
document_ids = []  # Store file names or IDs to map chunks back to their source

# Dividind the books into small 100 word paragraphs so as to reduce the tokens to serve as an input to LLM
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        file_path = os.path.join(directory, filename)

        # Read the text file
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Clean the text content
        cleaned_text = clean_text(content)

        # Chunk the cleaned text
        chunks = chunk_document(cleaned_text, chunk_size=100)  # Chunk size of 100 words

        # Store the chunks and corresponding file name (as ID)
        chunked_documents.extend(chunks)
        document_ids.extend([filename] * len(chunks))  # Map each chunk to its document

print(f"Loaded {len(chunked_documents)} chunks from {len(os.listdir(directory))} documents.")

Loaded 2298 chunks from 2 documents.


# Generating Embeddings

In [5]:
# Initialize the BERT tokenizer and model to generate embeddings
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased').cuda()  # Load model on GPU

# Function to generate embeddings for a batch of text chunks
def generate_embeddings_batch(text_chunks):

    inputs = tokenizer(text_chunks, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Move the inputs to the GPU
    inputs = {key: val.cuda() for key, val in inputs.items()}

    with torch.no_grad():
        with torch.cuda.amp.autocast():  # Use mixed precision for inference
            outputs = model(**inputs)

    # Return the mean pooling of token embeddings
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move back to CPU

# Process chunks in batches to optimize GPU utilization
batch_size = 16
embeddings = []

for i in range(0, len(chunked_documents), batch_size):
    batch_chunks = chunked_documents[i:i + batch_size]
    batch_embeddings = generate_embeddings_batch(batch_chunks)
    embeddings.extend(batch_embeddings)

print(f"Generated {len(embeddings)} embeddings from {len(chunked_documents)} chunks.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

  with torch.cuda.amp.autocast():  # Use mixed precision for inference


Generated 2298 embeddings from 2298 chunks.


In [7]:
embeddings = np.array(embeddings).astype('float32')

# FAISS
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)

index.add(embeddings)

faiss.write_index(index, 'faiss_embeddings.index')

print(f"FAISS index created with {index.ntotal} embeddings and saved to 'faiss_embeddings.index'.")

FAISS index created with 2298 embeddings and saved to 'faiss_embeddings.index'.


# Loading our FIne-Tuned GPT model

In [8]:
# Load your fine-tuned GPT-2 model and tokenizer
fine_tuned_model_path = "/content/drive/MyDrive/gpt2-finetuned-med-exam"
model = GPT2LMHeadModel.from_pretrained(fine_tuned_model_path)
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)

# Predicting the output for random question

In [27]:
# Input Question
user_input_question = "Which of the following processes is primarily responsible for activating transcription by loosening DNA coiling?"
# Input Options
options = ["DNA methylation", "Histone deacetylation", "Histone acetylation", "Phosphorylation of histones"]

# Function to generate an answer from GPT-2 with attention_mask and max_new_tokens
def generate_gpt2_answer(chunk, question, options, model, tokenizer):
    input_text = f"Question: {question}\nContext: {chunk}\nOptions: {', '.join(options)}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

    # Set attention_mask to properly handle padding
    attention_mask = inputs['attention_mask']

    # Generate output with `max_new_tokens`
    outputs = model.generate(inputs.input_ids, attention_mask=attention_mask, max_new_tokens=50, num_return_sequences=1)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Perform FAISS search and retrieve the top-k chunks
k = 5  #top 5 chunks
D, I = index.search(user_input_embedding.astype('float32'), k)
retrieved_chunks = [chunked_documents[i] for i in I[0] if i < len(chunked_documents)]

In [28]:
# Generate GPT-2 answers for each chunk using your loaded GPT-2 model and tokenizer
gpt2_answers = [generate_gpt2_answer(chunk, user_input_question, options, model, tokenizer) for chunk in retrieved_chunks]

option_embeddings = generate_embeddings(options, distilbert_tokenizer, distilbert_model)

# Predict the Final Answer
final_answers = []
for gpt2_answer in gpt2_answers:
    gpt2_answer_embedding = generate_embeddings([gpt2_answer], distilbert_tokenizer, distilbert_model)
    similarities = cosine_similarity(gpt2_answer_embedding, option_embeddings)
    most_similar_option = options[np.argmax(similarities)]
    final_answers.append(most_similar_option)

final_answer = Counter(final_answers).most_common(1)[0][0]

print(f"The most likely answer is: {final_answer}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The most likely answer is: Histone deacetylation
