<a href="https://colab.research.google.com/github/joepareti54/joepareti54/blob/main/lm_rag_gpt2_test5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install pymupdf sentence-transformers faiss-gpu transformers

import fitz  # PyMuPDF for handling PDFs
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from google.colab import drive

# Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Function to extract text from PDFs with error handling
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ' '.join(page.get_text() for page in doc)
        doc.close()
        return text.strip()
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""

# Enhanced function to load and process documents
def load_and_process_documents(directory_path, limit=15):
    documents = []

    # Check if directory exists
    if not os.path.exists(directory_path):
        print(f"Directory not found: {directory_path}")
        return documents

    # List and filter PDF files
    files = os.listdir(directory_path)
    pdf_files = [f for f in files if f.endswith('.pdf')]
    print(f"Found {len(pdf_files)} PDF files in directory")

    # Process each PDF file
    for i, filename in enumerate(pdf_files):
        if i >= limit:
            break
        try:
            pdf_path = os.path.join(directory_path, filename)
            print(f"Processing {filename}...")
            text = extract_text_from_pdf(pdf_path)

            if text and len(text.strip()) > 0:
                documents.append(text)
                print(f"Successfully loaded: {filename} ({len(text)} characters)")
            else:
                print(f"Warning: Empty text from {filename}")

        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

    print(f"\nSuccessfully loaded {len(documents)} documents")
    return documents

# Define path to your directory containing PDF files
directory_path = '/content/drive/My Drive/All_Finance_PDF_files_old/'
print(f"\nLoading documents from: {directory_path}")

# Load and process documents
documents = load_and_process_documents(directory_path)

# Verify documents were loaded
if not documents:
    print("No documents were loaded! Please check your directory path and PDF files.")
    # You might want to raise an exception here or handle this case appropriately
else:
    print(f"\nLoaded {len(documents)} documents")
    print("First document preview:")
    print(documents[0][:500] + "...")

# Initialize the Sentence Transformer model for embeddings
print("\nInitializing Sentence Transformer model...")
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to create embeddings using Sentence Transformers
def create_embeddings(texts):
    return embed_model.encode(texts, show_progress_bar=True)

# Generate embeddings for all loaded documents
print("\nGenerating embeddings for documents...")
embeddings = create_embeddings(documents)
print(f"Embeddings shape: {embeddings.shape}")

# Setup FAISS index for efficient similarity search
print("\nSetting up FAISS index...")
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings).astype('float32'))
print(f"FAISS index size: {index.ntotal}")

# Initialize GPT-2 model and tokenizer for text generation
print("\nInitializing GPT-2 model and tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"Using device: {device}")

def retrieve_and_generate(query, k=5):
    print(f"\nProcessing query: {query}")

    # Generate query embedding
    query_embedding = create_embeddings([query])[0]
    k = min(k, len(documents))

    if len(documents) == 0:
        print("Warning: No documents available for retrieval")
        context = ""
        retrieved_texts = []
    else:
        # Retrieve similar documents
        _, indices = index.search(np.array([query_embedding]).astype('float32'), k)
        retrieved_texts = []
        total_tokens = 0
        max_tokens = 800  # Leave room for query and generation

        # Build context from retrieved documents
        for idx in indices[0]:
            text = documents[idx]
            tokens = tokenizer.encode(text)
            if total_tokens + len(tokens) > max_tokens:
                break
            retrieved_texts.append(text)
            total_tokens += len(tokens)

        context = " ".join(retrieved_texts)

    print(f"Debug - Retrieved texts count: {len(retrieved_texts)}")
    print(f"Debug - Context length: {len(context) if context else 0}")

    # Combine query with context
    combined_text = query + " " + context

    # Tokenize with proper truncation
    input_ids = tokenizer.encode(
        combined_text,
        truncation=True,
        max_length=1024,
        padding=True,
        return_tensors='pt'
    )

    print(f"Debug - Input shape after tokenization: {input_ids.shape}")

    # Move input_ids to the same device as the model
    input_ids = input_ids.to(device)

    # Create attention mask
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)

    try:
        # Generate with more conservative parameters
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=100,
            num_beams=1,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
            early_stopping=True,
            do_sample=False,  # Use greedy decoding
            temperature=1.0
        )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the generated portion (after the input query)
        if query in generated_text:
            response = generated_text[generated_text.index(query) + len(query):].strip()
        else:
            response = generated_text.strip()

        return response

    except Exception as e:
        print(f"Error during generation: {str(e)}")
        return f"Error generating response: {str(e)}"

def test_system():
    # Test queries
    test_queries = [
        "What are the latest trends in financial markets?",
        "Explain the concept of inflation",
        "How do interest rates affect the economy?"
    ]

    print("\nTesting system with multiple queries...")
    for query in test_queries:
        print("\n" + "="*50)
        print(f"Query: {query}")
        try:
            response = retrieve_and_generate(query, k=3)
            print(f"Response: {response}")
        except Exception as e:
            print(f"Error processing query: {str(e)}")
        print("="*50)

# Run system test
print("\nStarting system test...")
test_system()

# Example usage
query = "What are the latest trends in financial markets?"
print('\nRunning final test query:', query)
response = retrieve_and_generate(query, k=5)
print('Final response:', response)

Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu, pymupdf
Successfully installed faiss-gpu-1.7.2 pymupdf-1.25.1
Mounting Google Drive...
Mounted at /content/drive

Loading documents from: /content/drive/My Drive/All_Finance_PDF_files_old/
Found 2672 PDF files in directory
Processing China’s Covid-19 Surge Shuts Dow

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Generating embeddings for documents...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings shape: (15, 384)

Setting up FAISS index...
FAISS index size: 15

Initializing GPT-2 model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using device: cuda

Starting system test...

Testing system with multiple queries...

Query: What are the latest trends in financial markets?

Processing query: What are the latest trends in financial markets?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2058 > 1024). Running this sequence through the model will result in indexing errors


Debug - Retrieved texts count: 0
Debug - Context length: 0
Debug - Input shape after tokenization: torch.Size([1, 10])




Response: What is the most important news for investors?
The latest news is that the US Federal Reserve is now raising interest rates to keep the economy growing. 
What do you think about the Fed's decision to raise interest rate?

Query: Explain the concept of inflation

Processing query: Explain the concept of inflation


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Debug - Retrieved texts count: 0
Debug - Context length: 0
Debug - Input shape after tokenization: torch.Size([1, 7])
Response: and how it works.
The first thing to understand is that inflation is a function of the price of goods and services. The price is the sum of all the goods that are sold in the market. If you buy a car, you get a dollar for every dollar you spend on it. So if you have a house, a home, or a truck, the cost of a gallon of gasoline is $1.50. You can buy anything you want, but you can't buy it

Query: How do interest rates affect the economy?

Processing query: How do interest rates affect the economy?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Debug - Retrieved texts count: 0
Debug - Context length: 0
Debug - Input shape after tokenization: torch.Size([1, 9])
Response: The answer is that they do.    The Fed's interest rate policy is based on the assumption that the Fed will raise rates in the near term, and that it will do so in a way that will reduce the cost of borrowing.
The Fed has been using interest-rate policy to reduce its borrowing costs for the past several years.   The reason for this is simple: the Federal Reserve has not been able to raise interest on its own.  The Federal Open Market Committee (F

Running final test query: What are the latest trends in financial markets?

Processing query: What are the latest trends in financial markets?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Debug - Retrieved texts count: 0
Debug - Context length: 0
Debug - Input shape after tokenization: torch.Size([1, 10])
Final response: What is the most important news for investors?
The latest news is that the US Federal Reserve is now raising interest rates to keep the economy growing. 
What do you think about the Fed's decision to raise interest rate?


In [2]:
# Add these diagnostic functions at the beginning of the code
def check_drive_mounting():
    try:
        drive_path = '/content/drive'
        if not os.path.exists(drive_path):
            print("Google Drive is not mounted!")
            return False
        print("Google Drive is mounted successfully")
        return True
    except Exception as e:
        print(f"Error checking drive mount: {str(e)}")
        return False

def check_directory_content(directory_path):
    try:
        if not os.path.exists(directory_path):
            print(f"Directory does not exist: {directory_path}")
            return False

        files = os.listdir(directory_path)
        pdf_files = [f for f in files if f.endswith('.pdf')]

        print(f"Directory contents:")
        print(f"Total files: {len(files)}")
        print(f"PDF files: {len(pdf_files)}")
        if pdf_files:
            print("First few PDF files:")
            for pdf in pdf_files[:5]:
                print(f"- {pdf}")
        return True
    except Exception as e:
        print(f"Error checking directory: {str(e)}")
        return False

# Modify the document loading function to be more verbose
def load_and_process_documents(directory_path, limit=15):
    documents = []

    print("\nChecking drive and directory...")
    if not check_drive_mounting():
        return documents

    if not check_directory_content(directory_path):
        return documents

    print("\nAttempting to load documents...")
    files = [f for f in os.listdir(directory_path) if f.endswith('.pdf')]

    for i, filename in enumerate(files):
        if i >= limit:
            break

        try:
            pdf_path = os.path.join(directory_path, filename)
            print(f"\nProcessing {filename}...")

            # Check if file exists and is readable
            if not os.path.exists(pdf_path):
                print(f"File not found: {pdf_path}")
                continue

            # Extract text
            text = extract_text_from_pdf(pdf_path)

            # Check extracted text
            if not text:
                print(f"No text extracted from {filename}")
                continue

            if len(text.strip()) < 100:  # Arbitrary minimum length
                print(f"Warning: Very short text ({len(text)} chars) from {filename}")

            documents.append(text)
            print(f"Successfully loaded: {filename} ({len(text)} characters)")

        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

    print(f"\nDocument loading summary:")
    print(f"Total documents loaded: {len(documents)}")
    if documents:
        print(f"Average document length: {sum(len(d) for d in documents)/len(documents):.0f} characters")

    return documents

# Use this to test the document loading
def test_document_loading():
    print("Testing document loading...")
    directory_path = '/content/drive/My Drive/All_Finance_PDF_files_old/'

    # Check drive mounting
    if not check_drive_mounting():
        return

    # Check directory
    if not check_directory_content(directory_path):
        return

    # Load documents
    documents = load_and_process_documents(directory_path)

    # Print results
    if documents:
        print("\nDocument loading successful!")
        print(f"Number of documents loaded: {len(documents)}")
        print("\nFirst document preview:")
        print(documents[0][:500] + "...")
    else:
        print("\nNo documents were loaded!")

# Run the test before proceeding with the rest of the code
test_document_loading()

Testing document loading...
Google Drive is mounted successfully
Directory contents:
Total files: 2673
PDF files: 2672
First few PDF files:
- China’s Covid-19 Surge Shuts Down Plants in Manufacturing Hubs Shenzhen and Changchun - WSJ.pdf
- Russian Prosecutors Warn Western Companies of Arrests, Asset Seizures - WSJ.pdf
- Können russische Oligarchen Sanktionen mit Kryptowährungen umgehen.pdf
- TikTok Influencers Get Spotlight in Information Battle Over the Russia-Ukraine War - WSJ.pdf
- Chip Makers Stockpiled Key Materials Ahead of Russian Invasion of Ukraine - WSJ.pdf

Checking drive and directory...
Google Drive is mounted successfully
Directory contents:
Total files: 2673
PDF files: 2672
First few PDF files:
- China’s Covid-19 Surge Shuts Down Plants in Manufacturing Hubs Shenzhen and Changchun - WSJ.pdf
- Russian Prosecutors Warn Western Companies of Arrests, Asset Seizures - WSJ.pdf
- Können russische Oligarchen Sanktionen mit Kryptowährungen umgehen.pdf
- TikTok Influencers Get