<a href="https://colab.research.google.com/github/joepareti54/joepareti54/blob/main/lm_rag_gpt2_test2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Install necessary libraries
!pip install pymupdf sentence-transformers faiss-gpu transformers

import fitz  # PyMuPDF for handling PDFs
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

from google.colab import drive
drive.mount('/content/drive')

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ''.join(page.get_text() for page in doc)
    doc.close()
    return text

# Function to load and process documents from a given directory
def load_and_process_documents(directory_path, limit=15):
    documents = []
    for i, filename in enumerate(os.listdir(directory_path)):
        if i >= limit:
            break
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(directory_path, filename)
            text = extract_text_from_pdf(pdf_path)
            if text:
                documents.append(text)
    return documents

# Define path to your directory containing PDF files
directory_path = '/content/drive/My Drive/All_Finance_PDF_files_old/'

# Load and process documents
documents = load_and_process_documents(directory_path)

# Initialize the Sentence Transformer model for embeddings
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to create embeddings using Sentence Transformers
def create_embeddings(texts):
    return embed_model.encode(texts, show_progress_bar=True)

# Generate embeddings for all loaded documents
embeddings = create_embeddings(documents)

# Setup FAISS index for efficient similarity search
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings).astype('float32'))

# Initialize GPT-2 model and tokenizer for text generation
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
def retrieve_and_generate(query, k=5):
    # Generate query embedding
    query_embedding = create_embeddings([query])[0]
    k = min(k, len(documents))  # Ensure the number of documents does not exceed available documents
    _, indices = index.search(np.array([query_embedding]), k)
    retrieved_texts = " ".join(documents[i] for i in indices[0])
    combined_text = query + " " + retrieved_texts

    # Tokenize the combined text and handle out-of-range token indices
    input_ids = tokenizer.encode(combined_text, truncation=True, max_length=1024, return_tensors='pt')

    # Ensure all input IDs are within the range of model's embeddings
    max_index = model.config.vocab_size - 1  # Get the maximum index GPT-2 can handle
    input_ids = torch.clamp(input_ids, max=max_index)  # Clamp all tokens to the max index

    # Generate the response with limited new tokens

    outputs = model.generate(
        input_ids,
        max_new_tokens=150,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Run the query
query = "What are the latest trends in financial markets?"
response = retrieve_and_generate(query, k=15)
print(response)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

IndexError: index out of range in self