<a href="https://colab.research.google.com/github/joepareti54/joepareti54/blob/main/lm_rag_gpt2_test5a.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install pymupdf sentence-transformers faiss-gpu transformers

import fitz  # PyMuPDF for handling PDFs
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from google.colab import drive

# Diagnostic functions
def check_drive_mounting():
    """
    Verifies that Google Drive is properly mounted and accessible.
    Returns: bool indicating success
    """
    try:
        drive_path = '/content/drive'
        if not os.path.exists(drive_path):
            print("Google Drive is not mounted!")
            return False
        print("Google Drive is mounted successfully")
        return True
    except Exception as e:
        print(f"Error checking drive mount: {str(e)}")
        return False

def check_directory_content(directory_path):
    """
    Analyzes and reports the contents of the specified directory.
    Args:
        directory_path: str, path to check
    Returns: bool indicating if directory is accessible and contains PDFs
    """
    try:
        if not os.path.exists(directory_path):
            print(f"Directory does not exist: {directory_path}")
            return False

        files = os.listdir(directory_path)
        pdf_files = [f for f in files if f.endswith('.pdf')]

        print(f"\nDirectory contents:")
        print(f"Total files: {len(files)}")
        print(f"PDF files: {len(pdf_files)}")
        if pdf_files:
            print("First few PDF files:")
            for pdf in pdf_files[:5]:
                print(f"- {pdf}")
        return bool(pdf_files)
    except Exception as e:
        print(f"Error checking directory: {str(e)}")
        return False

def extract_text_from_pdf(pdf_path):
    """
    Extracts text content from a PDF file.
    Args:
        pdf_path: str, path to PDF file
    Returns: str, extracted text
    """
    try:
        doc = fitz.open(pdf_path)
        text = ' '.join(page.get_text() for page in doc)
        doc.close()
        return text.strip()
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""

def load_and_process_documents(directory_path, limit=15):
    """
    Loads and processes PDF documents from the specified directory.
    Args:
        directory_path: str, path to directory containing PDFs
        limit: int, maximum number of documents to load
    Returns: list of processed document texts
    """
    documents = []

    print("\nChecking drive and directory...")
    if not check_drive_mounting():
        return documents

    if not check_directory_content(directory_path):
        return documents

    print("\nAttempting to load documents...")
    files = [f for f in os.listdir(directory_path) if f.endswith('.pdf')]

    for i, filename in enumerate(files):
        if i >= limit:
            break

        try:
            pdf_path = os.path.join(directory_path, filename)
            print(f"\nProcessing {filename}...")

            if not os.path.exists(pdf_path):
                print(f"File not found: {pdf_path}")
                continue

            text = extract_text_from_pdf(pdf_path)

            if not text:
                print(f"No text extracted from {filename}")
                continue

            if len(text.strip()) < 100:
                print(f"Warning: Very short text ({len(text)} chars) from {filename}")

            documents.append(text)
            print(f"Successfully loaded: {filename} ({len(text)} characters)")

        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

    print(f"\nDocument loading summary:")
    print(f"Total documents loaded: {len(documents)}")
    if documents:
        print(f"Average document length: {sum(len(d) for d in documents)/len(documents):.0f} characters")

    return documents

def create_embeddings(texts):
    """
    Creates embeddings for the provided texts using Sentence Transformer.
    Args:
        texts: list of str, texts to embed
    Returns: numpy array of embeddings
    """
    return embed_model.encode(texts, show_progress_bar=True)

def retrieve_and_generate(query, k=5):
    """
    Retrieves relevant documents and generates a response based on the query.
    Args:
        query: str, the user's question
        k: int, number of documents to retrieve
    Returns: str, generated response
    """
    print(f"\nProcessing query: {query}")

    # Generate query embedding
    query_embedding = create_embeddings([query])[0]
    k = min(k, len(documents))

    # Retrieve similar documents
    distances, indices = index.search(np.array([query_embedding]).astype('float32'), k)

    # Build context from retrieved documents
    retrieved_texts = []
    total_tokens = 0
    max_tokens = 800

    print("\nRetrieved documents:")
    for i, idx in enumerate(indices[0]):
        text = documents[idx]
        preview = text[:100] + "..."
        print(f"{i+1}. Score: {distances[0][i]:.4f}\nPreview: {preview}\n")

        tokens = tokenizer.encode(text)
        if total_tokens + len(tokens) > max_tokens:
            print(f"Reached token limit after {i} documents")
            break
        retrieved_texts.append(text)
        total_tokens += len(tokens)

    context = " ".join(retrieved_texts)

    print(f"Debug - Retrieved texts count: {len(retrieved_texts)}")
    print(f"Debug - Total tokens in context: {total_tokens}")

    # Construct prompt with explicit structure
    combined_text = (
        f"Based on the following articles, {query}\n\n"
        f"Articles:\n{context}\n\n"
        "Answer:"
    )

    # Tokenize with proper truncation
    input_ids = tokenizer.encode(
        combined_text,
        truncation=True,
        max_length=1024,
        padding=True,
        return_tensors='pt'
    )

    print(f"Debug - Input shape after tokenization: {input_ids.shape}")

    # Move to GPU if available
    input_ids = input_ids.to(device)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)

    try:
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=150,
            num_beams=3,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
            early_stopping=True,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.9
        )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the generated answer
        if "Answer:" in generated_text:
            response = generated_text.split("Answer:")[-1].strip()
        else:
            response = generated_text.strip()

        return response

    except Exception as e:
        print(f"Error during generation: {str(e)}")
        return f"Error generating response: {str(e)}"

# Main execution
if __name__ == "__main__":
    print("Starting document processing and model initialization...")

    # Mount Google Drive
    drive.mount('/content/drive')

    # Define directory path
    directory_path = '/content/drive/My Drive/All_Finance_PDF_files_old/'

    # Load documents
    documents = load_and_process_documents(directory_path)
    if not documents:
        raise Exception("No documents loaded!")

    # Initialize models
    print("\nInitializing models...")
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token

    # Setup GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    print(f"Using device: {device}")

    # Create embeddings and FAISS index
    print("\nCreating embeddings...")
    embeddings = create_embeddings(documents)

    print("\nSetting up FAISS index...")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings).astype('float32'))

    # Test the system
    test_queries = [
        "What are the recent developments in Russia's economy and sanctions?",
        "What's happening with the stock market and gold prices?",
        "What are the latest developments in cryptocurrency?"
    ]

    print("\nTesting system with sample queries...")
    for query in test_queries:
        print("\n" + "="*80)
        print(f"Query: {query}")
        try:
            response = retrieve_and_generate(query, k=3)
            print(f"\nResponse: {response}")
        except Exception as e:
            print(f"Error processing query: {str(e)}")
        print("="*80)

Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu, pymupdf
Successfully installed faiss-gpu-1.7.2 pymupdf-1.25.1
Starting document processing and model initialization...
Mounted at /content/drive

Checking drive and directory...
Google Drive is mounted successfully

Directory contents:
Total files: 2673
PDF files: 26

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using device: cuda

Creating embeddings...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Setting up FAISS index...

Testing system with sample queries...

Query: What are the recent developments in Russia's economy and sanctions?

Processing query: What are the recent developments in Russia's economy and sanctions?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3093 > 1024). Running this sequence through the model will result in indexing errors



Retrieved documents:
1. Score: 0.6182
Preview: This copy is for your personal, non-commercial use only. To order presentation-ready copies for dist...

Reached token limit after 0 documents
Debug - Retrieved texts count: 0
Debug - Total tokens in context: 0
Debug - Input shape after tokenization: torch.Size([1, 27])

Response: Russia's economy is growing at an average annual rate of 3.7% per year, according to the International Monetary Fund (IMF).

According to the IMF, the Russian economy grew at a rate of 2.5% in the first quarter of 2014, compared to a 2.3% growth rate in the same period last year.

In the first three months of 2015, Russia's GDP grew by 1.1% per annum, while the average annual growth rate was 1.5%.

Source: IMF

What are the current economic trends in Russia?


Question 1: Is the economy growing at a faster pace than the rest of the world, or at a slower pace than other countries

Query: What's happening with the stock market and gold prices?

Processing query: W

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Retrieved documents:
1. Score: 0.9697
Preview: The Dow Jones Industrial Average slid nearly 800 points Monday, marking a 10% decline
from January’s...

Reached token limit after 0 documents
Debug - Retrieved texts count: 0
Debug - Total tokens in context: 0
Debug - Input shape after tokenization: torch.Size([1, 26])

Response: The stock market has been in a downward spiral for a long time. It has been going up and down for a while now, but it has never been higher than it is now. The stock market is the most volatile asset class in the world, and it has always been so. It is the only asset class that has been able to maintain its current level of volatility.

In the past few years, there has been a lot of speculation about what the future holds for the stock markets. There have been a number of articles that have been written on the subject, but the most important one is this one from Bloomberg:
. . .

"The price of gold has gone up by more than 20 percent in the past

Query: What are

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Retrieved documents:
1. Score: 1.0238
Preview: DOW JONES, A NEWS CORP COMPANY
About WSJ
POLITICS
Bitcoin Price Surges on Biden’s Crypto Executive O...

Reached token limit after 0 documents
Debug - Retrieved texts count: 0
Debug - Total tokens in context: 0
Debug - Input shape after tokenization: torch.Size([1, 23])

Response: Cryptocurrency is not a new concept. It has been around for a long time and has been used for many different purposes. In fact, it is one of the most popular and widely used cryptocurrencies.

In the past, there were two main types of cryptocurrencies. The first type was called Bitcoin. The second type of cryptocurrency was called Monero. These two cryptocurrencies were called "cryptocurrencies" because they were both based on the concept of "monero".

Monero is a cryptocurrency that is based on Monero protocol. It is the first cryptocurrency to use the Monero network. Monero was created by Satoshi Nakamoto. It was created to solve a problem that was solved by t