In [1]:
import os
import re
import tqdm
import PyPDF2
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def text_formatter(text: str) -> str:
    """Flatten raw text into a single, evenly spaced line.

    Newlines are turned into spaces, surrounding whitespace is trimmed, and
    every remaining run of whitespace is collapsed to one space.

    Args:
        text: The raw text to normalize.

    Returns:
        The normalized text.
    """
    flattened = text.replace("\n", " ").strip()
    return re.sub(r'\s+', ' ', flattened)

def split_into_sentences(text: str) -> list[str]:
    """Break text into sentences at whitespace that follows '.', '!' or '?'.

    Args:
        text: The text to split.

    Returns:
        The stripped, non-empty sentence fragments in their original order.
    """
    sentences = []
    for fragment in re.split(r'(?<=[.!?])\s+', text):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences

In [3]:
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """Extract and normalize the text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One dict per page, in page order, with keys ``"page_number"``
        (zero-based page index) and ``"text"`` (the page text after
        ``text_formatter``).
    """
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_indices = range(len(reader.pages))
        return [
            {
                "page_number": index,
                "text": text_formatter(reader.pages[index].extract_text()),
            }
            for index in tqdm.tqdm(page_indices, desc="Reading PDF")
        ]

def split_list(input_list: list, slice_size: int, overlap: int = 5) -> list[list[str]]:
    """Cut a list into windows of up to ``slice_size`` items that overlap.

    A new window starts every ``slice_size - overlap`` positions, so
    consecutive windows share ``overlap`` items. Windows near the end of the
    list may be shorter than ``slice_size``.

    Args:
        input_list: The items to split (sentences, in this notebook).
        slice_size: Maximum number of items per window; must be positive.
        overlap: Number of items shared by consecutive windows; must be
            smaller than ``slice_size``.

    Returns:
        The windows, in order. An empty input yields an empty list.

    Raises:
        ValueError: If ``slice_size`` is not positive or ``overlap`` is not
            smaller than ``slice_size``.
    """
    if slice_size <= 0:
        raise ValueError(f"slice_size must be positive, got {slice_size}")

    step = slice_size - overlap
    if step <= 0:
        # A negative step would make range() empty and silently discard every
        # item; a zero step would raise an opaque range() error instead.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than slice_size ({slice_size})"
        )

    # slice_size > 0 and start < len(input_list), so every window is non-empty.
    return [
        input_list[start:start + slice_size]
        for start in range(0, len(input_list), step)
    ]

def chunk_pdf_text(pages_and_texts: list[dict], num_sentence_chunk_size: int = 20, overlap: int = 9) -> pd.DataFrame:
    """Split each page's text into overlapping sentence chunks.

    Each page dict is updated in place with two extra keys: ``"sentences"``
    (the page's sentences) and ``"sentence_chunks"`` (the sentence windows
    produced by ``split_list``).

    Args:
        pages_and_texts: Page dicts with at least ``"page_number"`` and
            ``"text"`` keys.
        num_sentence_chunk_size: Maximum number of sentences per chunk.
        overlap: Number of sentences shared by consecutive chunks.

    Returns:
        A DataFrame with one row per chunk and the columns ``page_number``,
        ``sentence_chunk``, ``chunk_char_count``, ``chunk_word_count`` and
        ``chunk_token_count``. The columns are present even when there are no
        chunks, so downstream column lookups keep working on empty input.
    """
    chunk_columns = [
        "page_number",
        "sentence_chunk",
        "chunk_char_count",
        "chunk_word_count",
        "chunk_token_count",
    ]

    pages_and_chunks = []
    for item in pages_and_texts:
        item["sentences"] = split_into_sentences(item["text"])
        item["sentence_chunks"] = split_list(
            input_list=item["sentences"],
            slice_size=num_sentence_chunk_size,
            overlap=overlap
        )
        for sentence_chunk in item["sentence_chunks"]:
            joined_sentence_chunk = " ".join(sentence_chunk)
            joined_sentence_chunk = re.sub(r'\s+', ' ', joined_sentence_chunk)
            pages_and_chunks.append({
                "page_number": item["page_number"],
                "sentence_chunk": joined_sentence_chunk,
                "chunk_char_count": len(joined_sentence_chunk),
                "chunk_word_count": len(joined_sentence_chunk.split(" ")),
                # Token count is estimated as characters / 4.
                "chunk_token_count": len(joined_sentence_chunk) / 4.0
            })

    # Passing the columns explicitly keeps the schema stable when no chunks
    # were produced; without it an empty list gives a column-less frame.
    return pd.DataFrame(pages_and_chunks, columns=chunk_columns)

def embed_chunks(df: pd.DataFrame, embedding_model_name: str = "all-mpnet-base-v2", min_token_length: int = 30) -> list[dict]:
    """Embed every chunk whose estimated token count exceeds a threshold.

    Args:
        df: Chunk table with at least ``chunk_token_count`` and
            ``sentence_chunk`` columns.
        embedding_model_name: Name of the SentenceTransformer model to load
            (on CPU).
        min_token_length: Rows with ``chunk_token_count`` at or below this
            value are dropped.

    Returns:
        The surviving rows as dicts, each extended with an ``"embedding"``
        entry holding the encoder output for its ``"sentence_chunk"``.
    """
    long_enough = df["chunk_token_count"] > min_token_length
    records = df[long_enough].to_dict(orient="records")
    encoder = SentenceTransformer(embedding_model_name, device="cpu")

    # Chunks are encoded one at a time so the progress bar advances per chunk.
    for record in tqdm.tqdm(records, desc="Generating Embeddings"):
        record["embedding"] = encoder.encode(record["sentence_chunk"])

    return records

def store_in_chroma(chunks: list[dict], collection_name: str, chroma_db_path: str = "./chromadb", embedding_model_name: str = "all-mpnet-base-v2") -> None:
    """Upsert embedded chunks into a persistent Chroma collection.

    The collection is fetched or created with a cosine ``hnsw:space`` setting
    and a SentenceTransformer embedding function; the embeddings already
    attached to the chunks are what gets stored.

    Args:
        chunks: Dicts with ``"sentence_chunk"``, ``"page_number"`` and
            ``"embedding"`` (an object exposing ``.tolist()``) entries.
        collection_name: Name of the collection to write to.
        chroma_db_path: Directory of the persistent Chroma store.
        embedding_model_name: Model name given to the collection's embedding
            function.

    Note:
        Ids are the chunks' positions in ``chunks`` ("0", "1", ...), so a later
        call on the same collection upserts over the same ids.
    """
    client = chromadb.PersistentClient(path=chroma_db_path)
    collection = client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=embedding_model_name
        ),
        metadata={"hnsw:space": "cosine"},
    )

    payload = {
        "ids": [str(position) for position in range(len(chunks))],
        "documents": [chunk["sentence_chunk"] for chunk in chunks],
        # Page numbers are stored as strings in the metadata.
        "metadatas": [{"page_number": str(chunk["page_number"])} for chunk in chunks],
        "embeddings": [chunk["embedding"].tolist() for chunk in chunks],
    }
    collection.upsert(**payload)

    print(f"Embeddings stored in Chroma collection '{collection_name}' at {chroma_db_path}.")


In [4]:
def query_chroma(query: str, collection_name: str, chroma_db_path: str = "./chromadb", embedding_model_name: str = "all-mpnet-base-v2", n_results: int = 3) -> dict:
    """Embed a query string and look up its nearest entries in a Chroma collection.

    Args:
        query: The text to search for.
        collection_name: Name of an existing collection.
        chroma_db_path: Directory of the persistent Chroma store.
        embedding_model_name: SentenceTransformer model used to embed the
            query (loaded on CPU on every call).
        n_results: Number of results to request.

    Returns:
        The result object returned by the collection's ``query`` call.
    """
    # Embed the query with a freshly loaded model.
    encoder = SentenceTransformer(embedding_model_name, device="cpu")
    query_vector = encoder.encode(query).tolist()

    # Open the store and fetch the (already existing) collection.
    collection = chromadb.PersistentClient(path=chroma_db_path).get_collection(
        name=collection_name
    )

    return collection.query(query_embeddings=[query_vector], n_results=n_results)


In [9]:
import json
# Load the question/chunk records used for training and validation. The
# encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default codec.
with open('train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('validation.json', 'r', encoding='utf-8') as f:
    validation_data = json.load(f)


In [5]:
# Open the on-disk Chroma store and fetch the existing "pdf_embeddings"
# collection; `pdf_collection` is read as a global by
# retrieve_documents_embeddings.
# NOTE(review): the recorded output of this cell is
# "OperationalError: no such column: collections.topic" — presumably the store
# under ./chromadb was written by a different chromadb version; confirm before
# re-running.
client = chromadb.PersistentClient(path="./chromadb")
pdf_collection = client.get_collection(name="pdf_embeddings")

OperationalError: no such column: collections.topic

In [11]:
def retrieve_documents_embeddings(query_embedding, k=10):
    """Return the documents nearest to an embedding in the global ``pdf_collection``.

    Args:
        query_embedding: The query vector; must expose ``.tolist()``.
        k: Number of results to request.

    Returns:
        The ``'documents'`` entry of the query result for this single query.
    """
    response = pdf_collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=k,
    )
    # Only one query was sent, so its documents are the first entry.
    top_documents = response['documents'][0]
    return top_documents

In [12]:
def reciprocal_rank(retrieved_docs, ground_truth, k):
    """Score a ranking by the reciprocal of the ground truth's position.

    Args:
        retrieved_docs: Ranked list of retrieved documents, best first.
        ground_truth: The document that should have been retrieved.
        k: Cut-off rank; matches ranked below ``k`` score zero.

    Returns:
        ``1 / rank`` (1-based) of the first match if that rank is at most
        ``k``, otherwise ``0.0``.
    """
    if ground_truth not in retrieved_docs:
        return 0.0

    position = retrieved_docs.index(ground_truth) + 1
    if position <= k:
        return 1.0 / position
    return 0.0

In [13]:
def hit_rate(retrieved_docs, ground_truth, k):
    """Report whether the ground truth is among the first ``k`` results.

    Args:
        retrieved_docs: Ranked list of retrieved documents, best first.
        ground_truth: The document that should have been retrieved.
        k: Number of leading results to inspect.

    Returns:
        ``1.0`` on a hit, ``0.0`` otherwise.
    """
    top_k = retrieved_docs[:k]
    if ground_truth in top_k:
        return 1.0
    return 0.0

In [14]:
from sentence_transformers import SentenceTransformer

# Baseline encoder: the stock 'all-mpnet-base-v2' checkpoint. It produces the
# un-adapted retrieval metrics and the embeddings the linear adapter is
# trained on.
base_model = SentenceTransformer('all-mpnet-base-v2')

In [15]:
import numpy as np

def validate_embedding_model(validation_data, base_model, k=10):
    """Measure retrieval quality of an embedding model on labelled questions.

    Each question is embedded with ``base_model`` and the top ``k`` documents
    are retrieved; hit rate and reciprocal rank are then computed against the
    expected chunk.

    Args:
        validation_data: Iterable of dicts with ``'question'`` and ``'chunk'``
            entries.
        base_model: Model exposing ``encode(text)``.
        k: Number of documents to retrieve and the metric cut-off.

    Returns:
        Dict with ``'average_hit_rate'`` and ``'average_reciprocal_rank'``,
        each the mean over all questions.
    """
    per_question_scores = []

    for example in validation_data:
        query_text = example['question']
        expected_chunk = example['chunk']

        # Embed the question, then fetch its nearest documents.
        query_vector = base_model.encode(query_text)
        candidates = retrieve_documents_embeddings(query_vector, k)

        per_question_scores.append((
            hit_rate(candidates, expected_chunk, k),
            reciprocal_rank(candidates, expected_chunk, k),
        ))

    return {
        'average_hit_rate': np.mean([hit for hit, _ in per_question_scores]),
        'average_reciprocal_rank': np.mean([rr for _, rr in per_question_scores]),
    }

# Baseline metrics for the un-adapted model; the call relies on the default
# k=10, which is what the "@10" in the printed labels refers to.
results = validate_embedding_model(validation_data, base_model)
print(f"Average Hit Rate @10: {results['average_hit_rate']}")
print(f"Mean Reciprocal Rank @10: {results['average_reciprocal_rank']}")

Average Hit Rate @10: 0.8636363636363636
Mean Reciprocal Rank @10: 0.5737628384687208


In [21]:
import random
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from torch.nn.utils import clip_grad_norm_

In [17]:
# Process the NVIDIA PDF: extract the normalized text of each page, then split
# it into overlapping sentence chunks (one DataFrame row per chunk).
nvidia_pages_and_texts = open_and_read_pdf("pdfs/nvidia-rand.pdf")
nvidia_df = chunk_pdf_text(nvidia_pages_and_texts)

# Keep only chunks whose estimated token count exceeds 30 (the same value as
# embed_chunks' default threshold) and extract their text. random_negative()
# samples from this global list.
nvidia_chunks = nvidia_df[nvidia_df["chunk_token_count"] > 30]["sentence_chunk"].tolist()
print(f"Generated {len(nvidia_chunks)} chunks from the NVIDIA PDF")

Reading PDF: 100%|██████████| 96/96 [00:02<00:00, 47.63it/s]

Generated 209 chunks from the NVIDIA PDF





In [22]:
def random_negative():
    """Return one chunk picked uniformly at random from the global ``nvidia_chunks``.

    The pick is independent of any particular question, so it can coincide
    with that question's positive chunk.
    """
    return random.choice(nvidia_chunks)

In [23]:
# Draw one sample to see what a sampled negative chunk looks like.
random_negative()

"EXHIBIT 32.2 CERTIFICATION Pursuant to the requirement set forth in Rule 13a-14(b) of the Securities Exchange Act of 1934, as amended (the “Exchange Act”), and Section 1350 of Chapter 63 of Title 18 of the United States Code (18 U.S.C. § 1350), Colette M. Kress, Executive Vice President and Chief Financial Officer of NVIDIA Corporation (the “Company”), hereby certifies that, to the best of her knowledge: 1. The Company's Annual Report on Form 10-K for the year ended January 28, 2024, to which this Certification is attached as Exhibit 32.2 (the “Periodic Report”), fully complies with the requirements of Section 13(a) or Section 15(d) of the Exchange Act; and 2. The information contained in the Periodic Report fairly presents, in all material respects, the financial condition of the Company at the end of the period covered by the Periodic Report and results of operations of the Company for the period covered by the Periodic Report. Date: February 21, 2024 /s/ COLETTE M. KRESS Colette M.

In [24]:
class LinearAdapter(nn.Module):
    """A single square linear layer applied to an embedding.

    The layer maps ``input_dim`` features to ``input_dim`` features, so the
    output has the same dimensionality as the input.
    """

    def __init__(self, input_dim):
        """Create the layer.

        Args:
            input_dim: Size of the vectors fed to, and produced by, the adapter.
        """
        super().__init__()
        # The attribute name determines the state_dict keys ("linear.weight",
        # "linear.bias") used by saved checkpoints, so keep it stable.
        self.linear = nn.Linear(input_dim, input_dim)
    
    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)

In [25]:
class TripletDataset(Dataset):
    """Dataset yielding (query, positive, negative) embedding triplets.

    Each item pairs a record's question and chunk with a freshly sampled
    negative text, all embedded with ``base_model``. Embeddings are memoized
    per text, so a text is only run through the base model the first time it
    is seen instead of on every access in every epoch.

    Args:
        data: Indexable collection of dicts with ``'question'`` and ``'chunk'``
            entries.
        base_model: Model exposing ``encode(text, convert_to_tensor=True)``.
        negative_sampler: Zero-argument callable returning a negative text.

    Note:
        The cache assumes ``base_model`` returns the same embedding for the
        same text for the lifetime of the dataset (i.e. its weights are not
        updated while the dataset is in use).
    """

    def __init__(self, data, base_model, negative_sampler):
        self.data = data
        self.base_model = base_model
        self.negative_sampler = negative_sampler
        # Maps a text to its embedding tensor; filled lazily by _embed().
        self._embedding_cache = {}

    def __len__(self):
        """Return the number of (question, chunk) records."""
        return len(self.data)

    def _embed(self, text):
        """Return the embedding of ``text``, encoding it only on first use."""
        cached = self._embedding_cache.get(text)
        if cached is None:
            cached = self.base_model.encode(text, convert_to_tensor=True)
            self._embedding_cache[text] = cached
        return cached

    def __getitem__(self, idx):
        """Return the ``(query, positive, negative)`` embeddings for record ``idx``.

        A new negative is drawn from ``negative_sampler`` on every call, so
        repeated accesses to the same index may return different negatives.
        """
        item = self.data[idx]
        query = item['question']
        positive = item['chunk']
        negative = self.negative_sampler()

        query_emb = self._embed(query)
        positive_emb = self._embed(positive)
        negative_emb = self._embed(negative)

        return query_emb, positive_emb, negative_emb

In [26]:
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """Build a scheduler with linear warmup followed by linear decay.

    The learning-rate multiplier rises linearly from 0 to 1 over the first
    ``num_warmup_steps`` steps, then falls linearly and reaches 0 at
    ``num_training_steps`` (it never goes below 0).

    Args:
        optimizer: The optimizer whose learning rate is scheduled.
        num_warmup_steps: Number of warmup steps.
        num_training_steps: Total number of training steps.

    Returns:
        A ``LambdaLR`` scheduler wrapping ``optimizer``.
    """
    # Denominators are clamped to 1 to avoid division by zero.
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def lr_lambda(current_step):
        in_warmup = current_step < num_warmup_steps
        if in_warmup:
            return float(current_step) / warmup_span
        remaining = float(num_training_steps - current_step)
        return max(0.0, remaining / decay_span)

    return LambdaLR(optimizer, lr_lambda)

def train_linear_adapter(base_model, train_data, negative_sampler, num_epochs=10, batch_size=32,
                         learning_rate=2e-5, warmup_steps=100, max_grad_norm=1.0, margin=1.0):
    """Train a ``LinearAdapter`` on query embeddings with a triplet margin loss.

    Only the query embedding is passed through the adapter; the positive and
    negative embeddings are compared as produced by ``base_model``. Training
    runs on CUDA when available, otherwise on CPU, and prints the mean loss of
    each epoch.

    Args:
        base_model: Embedding model; also supplies the embedding dimension.
        train_data: Records with ``'question'`` and ``'chunk'`` entries.
        negative_sampler: Zero-argument callable returning a negative text.
        num_epochs: Number of passes over ``train_data``.
        batch_size: Triplets per optimization step.
        learning_rate: Peak AdamW learning rate.
        warmup_steps: Steps of linear learning-rate warmup.
        max_grad_norm: Gradient-norm clipping threshold.
        margin: Margin of the triplet loss.

    Returns:
        The trained adapter (on the training device).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model, objective and optimizer.
    embedding_dim = base_model.get_sentence_embedding_dimension()
    adapter = LinearAdapter(embedding_dim).to(device)
    criterion = nn.TripletMarginLoss(margin=margin, p=2)
    optimizer = AdamW(adapter.parameters(), lr=learning_rate)

    # Data pipeline.
    loader = DataLoader(
        TripletDataset(train_data, base_model, negative_sampler),
        batch_size=batch_size,
        shuffle=True,
    )
    batches_per_epoch = len(loader)

    # The schedule spans every optimization step of the whole run.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=batches_per_epoch * num_epochs,
    )

    for epoch in range(num_epochs):
        running_loss = 0
        for batch in loader:
            anchors, positives, negatives = [tensor.to(device) for tensor in batch]

            # Only the anchor (query) side goes through the adapter.
            loss = criterion(adapter(anchors), positives, negatives)

            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm_(adapter.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()

        mean_loss = running_loss / batches_per_epoch
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {mean_loss:.4f}")

    return adapter

In [27]:
# Define the kwargs dictionary
# NOTE(review): with num_epochs=1 and batch_size=32, warmup_steps=100 is only
# reached if train_data has more than 3200 records; otherwise the learning
# rate stays in warmup for the whole run — confirm this is intended.
adapter_kwargs = {
    'num_epochs': 1,
    'batch_size': 32,
    'learning_rate': 0.003,
    'warmup_steps': 100,
    'max_grad_norm': 1.0,
    'margin': 1.0
}

# Train the adapter using the kwargs dictionary
trained_adapter = train_linear_adapter(base_model, train_data, random_negative, **adapter_kwargs)

# Create a dictionary to store both the adapter state_dict and the kwargs
save_dict = {
    'adapter_state_dict': trained_adapter.state_dict(),
    'adapter_kwargs': adapter_kwargs
}

# Save the combined dictionary. The file name must match the path the loading
# cell reads ('adapters/linear_adapter_1epoch.pth'); the directory is created
# first because torch.save does not create missing parent directories.
os.makedirs('adapters', exist_ok=True)
torch.save(save_dict, 'adapters/linear_adapter_1epoch.pth')

KeyboardInterrupt: 

In [None]:
# Function to encode query using the adapter
def encode_query(query, base_model, adapter):
    """Embed a query with the base model and pass it through the adapter.

    Args:
        query: The query text.
        base_model: Model exposing ``encode(text, convert_to_tensor=True)``.
        adapter: Trained adapter module; its device is used for the forward
            pass.

    Returns:
        The adapted embedding as a NumPy array on the CPU.
    """
    adapter_device = next(adapter.parameters()).device
    base_embedding = base_model.encode(query, convert_to_tensor=True)
    adapted = adapter(base_embedding.to(adapter_device))
    # Detach from the autograd graph before converting to NumPy.
    return adapted.detach().cpu().numpy()

In [None]:
# Later, loading and using the saved information.
# map_location='cpu' lets a checkpoint saved from a CUDA run load on a machine
# without a GPU; the adapter below is created on the CPU anyway.
loaded_dict = torch.load('adapters/linear_adapter_1epoch.pth', map_location='cpu')

# Recreate the adapter with the same embedding dimension, then restore its weights
loaded_adapter = LinearAdapter(base_model.get_sentence_embedding_dimension())
loaded_adapter.load_state_dict(loaded_dict['adapter_state_dict'])

# Access the training parameters
training_params = loaded_dict['adapter_kwargs']

print("Adapter loaded successfully.")
print("Training parameters used:")
for key, value in training_params.items():
    print(f"{key}: {value}")

In [None]:
def evaluate_adapter(validation_data, base_model, adapter, k=10):
    """Measure retrieval quality when queries are passed through an adapter.

    Each question is embedded with ``encode_query`` (base model followed by
    the adapter) and the top ``k`` documents are retrieved; hit rate and
    reciprocal rank are then computed against the expected chunk.

    Args:
        validation_data: Iterable of dicts with ``'question'`` and ``'chunk'``
            entries.
        base_model: Embedding model handed to ``encode_query``.
        adapter: Adapter module handed to ``encode_query``.
        k: Number of documents to retrieve and the metric cut-off.

    Returns:
        Dict with ``'average_hit_rate'`` and ``'average_reciprocal_rank'``,
        each the mean over all questions.
    """
    per_question_scores = []

    for example in validation_data:
        query_text = example['question']
        expected_chunk = example['chunk']

        # Adapted query embedding, then nearest documents.
        query_vector = encode_query(query_text, base_model, adapter)
        candidates = retrieve_documents_embeddings(query_vector, k)

        per_question_scores.append((
            hit_rate(candidates, expected_chunk, k),
            reciprocal_rank(candidates, expected_chunk, k),
        ))

    return {
        'average_hit_rate': np.mean([hit for hit, _ in per_question_scores]),
        'average_reciprocal_rank': np.mean([rr for _, rr in per_question_scores]),
    }

# Same metrics as the baseline run, but with queries passed through the loaded
# adapter. This rebinds the global `results` used by the baseline cell.
results = evaluate_adapter(validation_data, base_model, loaded_adapter, k=10)
print(f"Average Hit Rate @10: {results['average_hit_rate']}")
print(f"Mean Reciprocal Rank @10: {results['average_reciprocal_rank']}")