<a href="https://colab.research.google.com/github/kdhenderson/msds_colab_notebooks/blob/main/RAG_workshop_part1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Retrieval Augmented Generation
## Part 1



# Step 0: Install and import useful packages

In [None]:
# PyMuPDF -> digest pdfs; tranformers -> hugging face models; faiss-cpu (facebook pkg) -> vectorize
pip install PyMuPDF transformers faiss-cpu

In [None]:
# nltk supplies the sentence tokenizer (sent_tokenize) imported in the next cell.
%pip install nltk  # natural language toolkit

In [None]:
import os
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModel
import torch
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import faiss
import numpy as np

# Step 1: Read PDF Files

In [None]:
# Mount Google Drive so the notebook can read files stored there.
# This import only resolves inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

# Folder in the mounted Drive that holds the PDFs to index.
# read_pdfs() below scans every *.pdf directly inside this folder.
folder_path = '/content/drive/My Drive/PDFs/'  # Adjust this to your folder path
# Example of a single-file path (unused in this notebook):
#file_path = '/content/drive/MyDrive/documents/my_pdf_file.pdf'

def read_pdfs(folder_path):
    """Extract the text of every PDF found directly inside ``folder_path``.

    Files are visited in sorted name order so that the result (and any chunk
    numbering derived from it) is reproducible between runs. The extension
    check is case-insensitive, so ``REPORT.PDF`` is picked up as well.

    Args:
        folder_path: Directory to scan; sub-directories are not searched.

    Returns:
        A list of ``(file_name, text)`` tuples, one per PDF that could be
        read. A PDF that fails to open or parse is reported on stdout and
        skipped.
    """
    pdf_texts = []
    for file_name in sorted(os.listdir(folder_path)):  # can put many pdfs in here (will slow it down)
        if not file_name.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            # The context manager closes the document even if a page fails to parse.
            with fitz.open(file_path) as doc:
                text = "".join(page.get_text() for page in doc)
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole batch.
            print(f"Error reading {file_name}: {e}")
        else:
            pdf_texts.append((file_name, text))
    return pdf_texts

# Run the function
pdf_contents = read_pdfs(folder_path)

# Display the results
for file_name, text in pdf_contents:
    print(f"Contents of {file_name}:\n{text[:1000]}...")  # Display first 1000 characters for preview

In [None]:
# Read the PDFs again into `pdf_texts`, the name the chunking cell below iterates over.
pdf_texts = read_pdfs(folder_path)

# Bare expression: the notebook echoes the list of (file_name, text) tuples as the cell output.
pdf_texts

# Step 2: Chunk Text

In [None]:
# Step 2: Chunk Text
def chunk_text(text, chunk_size=100):  # chunk_size = hyperparameter (target number of words per chunk)
    """Group the sentences of ``text`` into chunks of about ``chunk_size`` words.

    Sentences are never split: whole sentences are appended to the current
    chunk until adding the next one would exceed ``chunk_size`` words, at
    which point the current chunk is closed. A single sentence longer than
    ``chunk_size`` therefore becomes its own, oversized chunk.

    Args:
        text: Raw document text.
        chunk_size: Soft upper bound on whitespace-separated words per chunk.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Each
        chunk is also printed, for inspection in the notebook.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        words = sentence.split()
        # Only flush a non-empty buffer. Without this guard a sentence longer
        # than chunk_size arriving while the buffer is empty would emit an
        # empty-string chunk, which would then be embedded and indexed.
        if current_chunk and current_length + len(words) > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.extend(words)
        current_length += len(words)

    # Flush whatever is left after the last sentence.
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    # Print out each chunk
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i}: {chunk}\n")

    return chunks

In [None]:
# Extra NLTK tokenizer resource; presumably what sent_tokenize needs on newer NLTK releases
# (the import cell only downloads 'punkt') — confirm against the installed NLTK version.
nltk.download('punkt_tab')

# Flat list of every chunk across all PDFs, in the order they will be embedded.
all_chunks = []
# One (pdf_name, chunks) entry per PDF, in the same order as all_chunks was filled,
# so a position in the flat list can be traced back to its source file.
chunk_mapping = []

for pdf_name, text in pdf_texts:
    chunks = chunk_text(text)
    all_chunks.extend(chunks)
    chunk_mapping.append((pdf_name, chunks))

# Step 3: Create Embeddings / Vectorization

In [None]:
# Step 3: Create Embeddings
def create_embeddings(text_chunks, tokenizer, model):
    """Embed each text chunk by mean-pooling the model's last hidden state.

    Args:
        text_chunks: Iterable of strings, embedded one at a time.
        tokenizer: Callable that turns a string into model inputs
            (called with ``return_tensors='pt'`` and ``max_length=512``).
        model: Callable accepting those inputs as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.

    Returns:
        ``np.array`` built from the per-chunk vectors, one row per chunk.
        Every vector is also printed, for inspection in the notebook.
    """
    def _embed(piece):
        # Encode a single chunk; inputs longer than 512 tokens are truncated.
        encoded = tokenizer(piece, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():  # inference only, no gradients needed
            result = model(**encoded)
        # Average over dim 1 (assumed to be the token axis — TODO confirm), then drop size-1 dims.
        pooled = result.last_hidden_state.mean(dim=1)
        return pooled.squeeze().numpy()

    vectors = [_embed(piece) for piece in text_chunks]

    # Print out each embedding
    for position, vector in enumerate(vectors):
        print(f"Embedding {position}: {vector}\n")

    return np.array(vectors)

In [None]:
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 sentence transformer model to 384 dim vector
# Keep the checkpoint name under its own variable so `model` only ever refers to the loaded network
# (previously `model` was first a string and then rebound to the model object).
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Create embeddings
embeddings = create_embeddings(all_chunks, tokenizer, model)

# Step 4: Index Vectors / Embeddings

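Indexing embeddings allows for efficient retrieval of relevant text chunks: the index stores the vectors in a structure built for nearest-neighbor search, so at query time we hand it the question embedding and get back the closest chunks. Note that the `IndexFlatL2` index used below is an exact, brute-force index — it still compares the query embedding against all stored embeddings (by L2 distance), which is fine for a handful of PDFs. For large collections that exhaustive comparison is computationally expensive, and approximate FAISS indexes (e.g. IVF or HNSW) are used to avoid it.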

In [None]:
# Step 4: Index Embeddings
def index_embeddings(embeddings):
    """Build a FAISS ``IndexFlatL2`` over the chunk embeddings.

    Args:
        embeddings: 2-D array-like of shape ``(n_chunks, dim)``.

    Returns:
        A ``faiss.IndexFlatL2`` holding every row of ``embeddings`` in order,
        so the id returned by a search is the row (flat chunk) position.

    Raises:
        ValueError: If ``embeddings`` is empty or not 2-D (e.g. no chunks
            were produced), instead of an opaque shape/index error.
    """
    # FAISS only accepts C-contiguous float32 matrices; normalise here so
    # float64 or non-contiguous input does not fail inside index.add().
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError(
            f"embeddings must be a non-empty 2-D array, got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
# IndexFlatL2 does an exact (brute-force) search ranked by squared Euclidean (L2) distance —
# not cosine similarity. For cosine ranking, L2-normalise the vectors and use an inner-product index.

In [None]:
 # Build the FAISS index over all chunk embeddings and echo it as the cell output.
 # NOTE(review): every line of this cell carries one leading space. That seems to be tolerated
 # in a notebook cell, but it should be removed if this is ever exported to a plain .py script.
 index = index_embeddings(embeddings)
 index

# Step 5: Retrieve and return relevant chunks.
### Note that there is no LLM to provide a refined answer here... we will add this later.

In [None]:
# Step 5: Answer Questions
def answer_question(question, pdf_texts, index, embeddings, tokenizer, model, top_k=3):
    """Retrieve the ``top_k`` chunks closest to ``question`` and report their sources.

    No LLM is involved: the "answer" is simply the retrieved chunks joined
    together.

    Args:
        question: Natural-language query string.
        pdf_texts: List of ``(pdf_name, chunks)`` tuples, where ``chunks`` is
            the list of chunk strings for that PDF (the notebook passes
            ``chunk_mapping``). The order must match the order in which the
            chunks were added to ``index``.
        index: Search index exposing ``search(queries, k=...)`` and returning
            ``(distances, ids)``.
        embeddings: Unused; kept so existing callers keep working.
        tokenizer: Tokenizer used to encode the question.
        model: Encoder whose output has a ``last_hidden_state`` tensor.
        top_k: Number of neighbours to request from the index.

    Returns:
        A string ``"Answer: <chunks>\\nSources: [<pdf>, Chunk <n>, ...]"``.

    Raises:
        IndexError: If the index returns an id beyond the chunks described
            by ``pdf_texts``.
    """
    # Create embedding for the question (same mean pooling as the chunks)
    inputs = tokenizer(question, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        question_embedding = model(**inputs).last_hidden_state.mean(dim=1).squeeze().numpy()

    # Search for the nearest text chunks
    _, indices = index.search(np.array([question_embedding]), k=top_k)

    # Collect top-k chunks
    retrieved_chunks = []
    sources = []
    for idx in indices[0]:
        chunk_offset = int(idx)
        # FAISS pads the result with -1 when the index holds fewer than top_k
        # vectors; a negative id must not be treated as a Python index.
        if chunk_offset < 0:
            continue

        # Walk the per-PDF chunk lists to turn the flat id into
        # (pdf, position-within-pdf).
        for pdf_name, chunks in pdf_texts:
            if chunk_offset < len(chunks):
                retrieved_chunks.append(chunks[chunk_offset])
                sources.append(f"{pdf_name}, Chunk {chunk_offset}")
                break
            chunk_offset -= len(chunks)
        else:
            raise IndexError(f"chunk id {int(idx)} is outside the chunk mapping")

    combined_text = ' '.join(retrieved_chunks)
    return f"Answer: {combined_text}\nSources: {sources}"

In [None]:
 # Answer question
# Retrieve the 3 closest chunks for a sample question and print them with their sources.
# `chunk_mapping` (the per-PDF chunk lists) is what answer_question receives as its `pdf_texts` argument.
question = "What percent of the overall grade is the homework grade worth in DS 6371?"
answer = answer_question(question, chunk_mapping, index, embeddings, tokenizer, model, top_k=3)
print(answer)

# All Together

In [None]:
# Step 1: Read PDF Files
def read_pdfs(folder_path):
    """Return ``(file_name, text)`` for each readable PDF directly inside ``folder_path``.

    Files are processed in sorted name order (so downstream chunk ids are
    stable across runs) and matched by a case-insensitive ``.pdf`` suffix.
    PDFs that cannot be opened or parsed are reported on stdout and skipped.

    Args:
        folder_path: Directory to scan; sub-directories are not searched.

    Returns:
        List of ``(file_name, text)`` tuples.
    """
    pdf_texts = []
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            # Close the document deterministically, even on a parse error.
            with fitz.open(file_path) as doc:
                text = "".join(page.get_text() for page in doc)
        except Exception as e:
            # Best effort: keep going when a single PDF is unreadable.
            print(f"Error reading {file_name}: {e}")
        else:
            pdf_texts.append((file_name, text))
    return pdf_texts

# Step 2: Chunk Text
def chunk_text(text, chunk_size=100):
    """Group the sentences of ``text`` into chunks of about ``chunk_size`` words.

    Whole sentences are accumulated until the next one would push the chunk
    past ``chunk_size`` words. Sentences are never split, so one that is
    longer than ``chunk_size`` becomes its own oversized chunk.

    Args:
        text: Raw document text.
        chunk_size: Soft upper bound on whitespace-separated words per chunk.

    Returns:
        List of chunk strings (words re-joined with single spaces); never
        contains an empty string.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        words = sentence.split()
        # Flush only a non-empty buffer; otherwise an over-long sentence
        # arriving while the buffer is empty would emit an empty chunk.
        if current_chunk and current_length + len(words) > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.extend(words)
        current_length += len(words)

    # Flush the remainder after the last sentence.
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


# Step 3: Create Embeddings
def create_embeddings(text_chunks, tokenizer, model):
    """Return one mean-pooled embedding per chunk as a NumPy array.

    Args:
        text_chunks: Iterable of strings, embedded one at a time.
        tokenizer: Callable producing model inputs for a single string.
        model: Callable returning an object with ``last_hidden_state``.

    Returns:
        ``np.array`` of the per-chunk vectors, one row per chunk.
    """
    vectors = []
    for piece in text_chunks:
        encoded = tokenizer(
            piece,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=512,
        )
        with torch.no_grad():  # inference only
            hidden_states = model(**encoded).last_hidden_state
        # Average over dim 1 (assumed to be the token axis — TODO confirm).
        pooled = hidden_states.mean(dim=1)
        vectors.append(pooled.squeeze().numpy())
    return np.array(vectors)

# Step 4: Index Embeddings
def index_embeddings(embeddings):
    """Build an exact L2-distance FAISS index (``IndexFlatL2``) over ``embeddings``.

    Args:
        embeddings: 2-D array-like of shape ``(n_chunks, dim)``.

    Returns:
        A ``faiss.IndexFlatL2`` holding the rows in order, so a search id is
        the row (flat chunk) position.

    Raises:
        ValueError: If ``embeddings`` is empty or not 2-D.
    """
    # FAISS requires C-contiguous float32 input.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError(
            f"embeddings must be a non-empty 2-D array, got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

# Step 5: Answer Questions
def answer_question(question, pdf_texts, index, embeddings, tokenizer, model, top_k=3):
    """Return the ``top_k`` chunks nearest to ``question`` together with their sources.

    Args:
        question: Natural-language query string.
        pdf_texts: List of ``(pdf_name, chunks)`` tuples (``main`` passes its
            ``chunk_mapping``), in the order the chunks were indexed.
        index: Search index exposing ``search(queries, k=...)`` and returning
            ``(distances, ids)``.
        embeddings: Unused; kept for interface compatibility.
        tokenizer: Tokenizer used to encode the question.
        model: Encoder whose output has a ``last_hidden_state`` tensor.
        top_k: Number of neighbours to request.

    Returns:
        ``"Answer: <joined chunks>\\nSources: [...]"``.

    Raises:
        IndexError: If a returned id lies beyond the chunks in ``pdf_texts``.
    """
    # Create embedding for the question
    inputs = tokenizer(question, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        question_embedding = model(**inputs).last_hidden_state.mean(dim=1).squeeze().numpy()

    # Search for the nearest text chunks
    _, indices = index.search(np.array([question_embedding]), k=top_k)

    # Collect top-k chunks
    retrieved_chunks = []
    sources = []
    for idx in indices[0]:
        chunk_offset = int(idx)
        # FAISS returns -1 for missing neighbours (index smaller than top_k);
        # skip them rather than letting -1 select the last chunk of the first PDF.
        if chunk_offset < 0:
            continue

        # Translate the flat id into (pdf, position-within-pdf).
        for pdf_name, chunks in pdf_texts:
            if chunk_offset < len(chunks):
                retrieved_chunks.append(chunks[chunk_offset])
                sources.append(f"{pdf_name}, Chunk {chunk_offset}")
                break
            chunk_offset -= len(chunks)
        else:
            raise IndexError(f"chunk id {int(idx)} is outside the chunk mapping")

    combined_text = ' '.join(retrieved_chunks)
    return f"Answer: {combined_text}\nSources: {sources}"


# Main function to tie everything together
def main(folder_path, question, model):
    """Run the whole retrieval pipeline for one question and print the result.

    Steps: read the PDFs, chunk them, embed the chunks with the given
    checkpoint, index the embeddings, then retrieve the chunks nearest to
    the question.

    Args:
        folder_path: Directory containing the PDFs to search.
        question: Natural-language query.
        model: Hugging Face checkpoint name (e.g. ``'distilbert-base-uncased'``)
            used for both the tokenizer and the encoder.

    Returns:
        None. The formatted answer (or a notice that nothing could be
        indexed) is printed.
    """
    # Read and chunk PDFs
    pdf_texts = read_pdfs(folder_path)
    all_chunks = []
    chunk_mapping = []

    for pdf_name, text in pdf_texts:
        chunks = chunk_text(text)
        all_chunks.extend(chunks)
        chunk_mapping.append((pdf_name, chunks))

    # Without chunks there is nothing to embed or index; stop with a clear
    # message instead of failing later on an empty embedding array.
    if not all_chunks:
        print(f"No text chunks could be extracted from PDFs in {folder_path}")
        return

    # Keep the checkpoint name and the loaded network under separate names
    # rather than rebinding the `model` parameter from str to model object.
    model_name = model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name)

    # Create and index embeddings
    embeddings = create_embeddings(all_chunks, tokenizer, encoder)
    index = index_embeddings(embeddings)

    # Answer question
    answer = answer_question(question, chunk_mapping, index, embeddings, tokenizer, encoder)
    print(answer)


# Comparing Different Models

In [None]:
# Query passed to main() in the model-comparison cells below.
# Uncomment one of the alternatives (and comment out the active line) to try a different question.
#question = 'What does the "Check drainage" code mean on the washer?'
#question = 'What is Campus Caring Connections?'
question = "What percent of the overall grade is the homework grade worth in DS 6371?"
#question = "What determines the  largest percent of the grade?"
#question = "What is the FLS assignment?"

__DistilBERT Variants__
  - __distilbert-base-uncased:__ A distilled version of the original BERT model, which is optimized for speed and reduced size, while retaining much of the performance of the larger BERT models.
  - __distilroberta-base:__ A distilled version of the RoBERTa model, offering similar benefits in terms of size and speed.

In [None]:
# Run the full pipeline using the 'distilbert-base-uncased' checkpoint for embeddings.
main(folder_path, question, 'distilbert-base-uncased')

In [None]:
# Run the full pipeline using the 'distilroberta-base' checkpoint for embeddings.
main(folder_path, question, 'distilroberta-base')

__BERT Variants:__
  - __bert-large-uncased:__ A larger version of BERT with more parameters, which can provide better embeddings and improved performance.
  - __roberta-large:__ A robustly optimized BERT approach with more parameters and improved training techniques.

In [None]:
# Run the full pipeline using the 'bert-large-uncased' checkpoint for embeddings.
main(folder_path, question, 'bert-large-uncased')

__Sentence Transformers:__

  - __all-MiniLM-L6-v2:__ A lightweight model optimized for generating sentence embeddings efficiently.
  - __all-mpnet-base-v2:__ A variant of MPNet optimized for generating high-quality sentence embeddings.

In [None]:
# Run the full pipeline using the 'sentence-transformers/all-MiniLM-L6-v2' checkpoint for embeddings.
main(folder_path, question, 'sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Run the full pipeline using the 'sentence-transformers/all-mpnet-base-v2' checkpoint for embeddings.
main(folder_path, question, 'sentence-transformers/all-mpnet-base-v2')