<a href="https://colab.research.google.com/github/joepareti54/joepareti54/blob/main/llm_rag_test5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install pymupdf
!pip install transformers
!pip install faiss-gpu
!pip install torch  # Ensure PyTorch is installed

import fitz  # PyMuPDF for handling PDFs
import os
import numpy as np
import faiss
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from google.colab import drive
drive.mount('/content/drive')

# Set environment variable to optimize PyTorch CUDA memory allocation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ''.join(page.get_text() for page in doc)
    doc.close()
    return text

# Function to load and process a limited number of documents from a given directory
def load_and_process_documents(directory_path, limit=15):
    documents = []
    for i, filename in enumerate(os.listdir(directory_path)):
        if i >= limit:  # Limit the number of files processed
            break
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(directory_path, filename)
            text = extract_text_from_pdf(pdf_path)
            if text:
                documents.append(text)
    return documents

# Define path to your directory containing PDF files
directory_path = '/content/drive/My Drive/All_Finance_PDF_files_old/'

# Load and process documents (limit to 15)
documents = load_and_process_documents(directory_path)

# Initialize model and tokenizer
model_name = "Qwen/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map='auto')
model.eval()  # Set model to evaluation mode
torch.set_grad_enabled(False)  # Disable gradients to save memory

# Function to create embeddings with memory optimization
def create_embeddings(texts, batch_size=2):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True).to('cuda')
        with torch.no_grad():
            outputs = model(**inputs)
            batch_embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()  # Take the first token ([CLS]) embeddings
            embeddings.extend(batch_embeddings)
        del inputs, outputs  # Free up memory immediately
        torch.cuda.empty_cache()  # Clear CUDA cache
    return np.array(embeddings)

# Generate embeddings for all loaded documents
embeddings = create_embeddings(documents)

# Setup FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings.astype('float32'))

# Retrieval function based on embeddings
def retrieve_documents(query, k=5):
    query_embedding = create_embeddings([query])[0]  # Get embedding for query
    _, indices = index.search(np.array([query_embedding]), k)
    return [(documents[i], indices[0][i]) for i in indices[0]]

# Example query and retrieval
query = "What is the impact of climate change?"
retrieved_docs = retrieve_documents(query)
for doc, score in retrieved_docs:
    print(doc[:200], "Score:", score)  # Print first 200 characters of each document and its score


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]



OutOfMemoryError: CUDA out of memory. Tried to allocate 1.54 GiB. GPU 0 has a total capacity of 14.75 GiB of which 1.09 GiB is free. Process 34498 has 13.65 GiB memory in use. Of the allocated memory 13.26 GiB is allocated by PyTorch, and 275.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)