<a href="https://colab.research.google.com/github/joepareti54/joepareti54/blob/main/lm_rag_gpt2_test4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install pymupdf sentence-transformers faiss-gpu transformers

import fitz  # PyMuPDF for handling PDFs
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

from google.colab import drive
drive.mount('/content/drive')

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ''.join(page.get_text() for page in doc)
    doc.close()
    return text

# Function to load and process documents from a given directory
def load_and_process_documents(directory_path, limit=15):
    documents = []
    for i, filename in enumerate(os.listdir(directory_path)):
        if i >= limit:
            break
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(directory_path, filename)
            text = extract_text_from_pdf(pdf_path)
            if text:
                documents.append(text)
    return documents

# Define path to your directory containing PDF files
directory_path = '/content/drive/My Drive/All_Finance_PDF_files_old/'

# Load and process documents
documents = load_and_process_documents(directory_path)

# Print diagnostic information about documents
print(f"Number of documents loaded: {len(documents)}")
if documents:
    print(f"Sample document length: {len(documents[0])}")

# Initialize the Sentence Transformer model for embeddings
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to create embeddings using Sentence Transformers
def create_embeddings(texts):
    return embed_model.encode(texts, show_progress_bar=True)

# Generate embeddings for all loaded documents
embeddings = create_embeddings(documents)

# Print diagnostic information about embeddings
print(f"Embeddings shape: {embeddings.shape}")

# Setup FAISS index for efficient similarity search
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings).astype('float32'))

# Print diagnostic information about FAISS index
print(f"FAISS index size: {index.ntotal}")

# Initialize GPT-2 model and tokenizer for text generation
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Add padding token
tokenizer.pad_token = tokenizer.eos_token

def retrieve_and_generate(query, k=5):
    # Generate query embedding
    query_embedding = create_embeddings([query])[0]
    k = min(k, len(documents))
    _, indices = index.search(np.array([query_embedding]), k)

    # Limit the retrieved texts length
    retrieved_texts = []
    total_tokens = 0
    max_tokens = 800  # Leave room for query and generation

    for idx in indices[0]:
        tokens = tokenizer.encode(documents[idx])
        if total_tokens + len(tokens) > max_tokens:
            break
        retrieved_texts.append(documents[idx])
        total_tokens += len(tokens)

    print(f"Debug - Number of retrieved texts: {len(retrieved_texts)}")
    print(f"Debug - Total tokens before combination: {total_tokens}")

    # Combine texts
    combined_text = query + " " + " ".join(retrieved_texts)

    # Tokenize with proper truncation
    input_ids = tokenizer.encode(
        combined_text,
        truncation=True,
        max_length=1024,
        padding=True,
        return_tensors='pt'
    )

    print(f"Debug - Input shape after tokenization: {input_ids.shape}")

    # Create attention mask
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

    try:
        # Move model to GPU if available
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Generate with more conservative parameters
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=100,
            num_beams=1,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
            max_length=1024,
            early_stopping=True,
            do_sample=False,  # Use greedy decoding
            temperature=1.0
        )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the generated portion (after the input query)
        if query in generated_text:
            response = generated_text[generated_text.index(query) + len(query):].strip()
        else:
            response = generated_text.strip()

        return response

    except Exception as e:
        print(f"Debug - Input shape: {input_ids.shape}")
        print(f"Debug - Max token id: {input_ids.max().item()}")
        print(f"Debug - Full error: {str(e)}")
        return f"Error generating response: {str(e)}"

def test_system():
    # Test queries
    test_queries = [
        "What are the latest trends in financial markets?",
        "Explain the concept of inflation",
        "How do interest rates affect the economy?"
    ]

    print("Testing system with multiple queries...")
    for query in test_queries:
        print("\n" + "="*50)
        print(f"Query: {query}")
        try:
            response = retrieve_and_generate(query, k=3)  # Using k=3 for testing
            print(f"Response: {response}")
        except Exception as e:
            print(f"Error processing query: {str(e)}")
        print("="*50)

# Run system test
print("Starting system test...")
test_system()

# Example usage
query = "What are the latest trends in financial markets?"
print('\nRunning final test query:', query)
response = retrieve_and_generate(query, k=5)
print('Final response:', response)

Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu, pymupdf
Successfully installed faiss-gpu-1.7.2 pymupdf-1.25.1
Mounted at /content/drive
Number of documents loaded: 15
Sample document length: 9279


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings shape: (15, 384)
FAISS index size: 15


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Starting system test...
Testing system with multiple queries...

Query: What are the latest trends in financial markets?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2060 > 1024). Running this sequence through the model will result in indexing errors


Debug - Number of retrieved texts: 0
Debug - Total tokens before combination: 0
Debug - Input shape after tokenization: torch.Size([1, 10])


Both `max_new_tokens` (=100) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Response: What is the most important news for investors?
The latest news is that the US Federal Reserve is now raising interest rates to keep the economy growing. 
What do you think about the Fed's decision to raise interest rate?

Query: Explain the concept of inflation


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Both `max_new_tokens` (=100) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Debug - Number of retrieved texts: 0
Debug - Total tokens before combination: 0
Debug - Input shape after tokenization: torch.Size([1, 7])
Response: and how it works.
The first thing to understand is that inflation is a function of the price of goods and services. The price is the sum of all the goods that are sold in the market. If you buy a car, you get a dollar for every dollar you spend on it. So if you have a house, a home, or a truck, the cost of a gallon of gasoline is $1.50. You can buy anything you want, but you can't buy it

Query: How do interest rates affect the economy?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Both `max_new_tokens` (=100) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Debug - Number of retrieved texts: 0
Debug - Total tokens before combination: 0
Debug - Input shape after tokenization: torch.Size([1, 9])
Response: The answer is that they do.    The Fed's interest rate policy is based on the assumption that the Fed will raise rates in the near term, and that it will do so in a way that will reduce the cost of borrowing.
The Fed has been using interest-rate policy to reduce its borrowing costs for the past several years.   The reason for this is simple: the Federal Reserve has not been able to raise interest on its own.  The Federal Open Market Committee (F

Running final test query: What are the latest trends in financial markets?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Both `max_new_tokens` (=100) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Debug - Number of retrieved texts: 0
Debug - Total tokens before combination: 0
Debug - Input shape after tokenization: torch.Size([1, 10])
Final response: What is the most important news for investors?
The latest news is that the US Federal Reserve is now raising interest rates to keep the economy growing. 
What do you think about the Fed's decision to raise interest rate?
