<a href="https://colab.research.google.com/github/joepareti54/joepareti54/blob/main/lm_rag_gpt2_test5b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install pymupdf sentence-transformers faiss-gpu transformers

import fitz  # PyMuPDF for handling PDFs
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from google.colab import drive
import warnings
warnings.filterwarnings('ignore')

# Diagnostic functions
def check_drive_mounting():
    """
    Report whether the Google Drive mount point is present.

    Only the existence of '/content/drive' is tested; a status line is
    printed for every outcome.
    Returns: bool, True when the path exists, False when it does not or
             when the check itself raises
    """
    try:
        if os.path.exists('/content/drive'):
            print("Google Drive is mounted successfully")
            return True

        print("Google Drive is not mounted!")
        return False
    except Exception as err:
        # Best-effort diagnostic: any failure is reported as "not mounted".
        print(f"Error checking drive mount: {str(err)}")
        return False

def check_directory_content(directory_path):
    """
    Print a short summary of a directory and tell whether it holds PDFs.

    The summary lists the total entry count, the number of entries whose
    name ends in '.pdf' (case-sensitive), and up to five of those names.
    Args:
        directory_path: str, path to check
    Returns: bool, True only when the directory exists and contains at
             least one '.pdf' entry; False otherwise, including on errors
    """
    try:
        if os.path.exists(directory_path):
            entries = os.listdir(directory_path)
            pdf_names = [name for name in entries if name.endswith('.pdf')]

            print("\nDirectory contents:")
            print(f"Total files: {len(entries)}")
            print(f"PDF files: {len(pdf_names)}")
            if not pdf_names:
                return False

            print("First few PDF files:")
            for name in pdf_names[:5]:
                print(f"- {name}")
            return True

        print(f"Directory does not exist: {directory_path}")
        return False
    except Exception as err:
        # E.g. the path is a regular file or is not readable.
        print(f"Error checking directory: {str(err)}")
        return False

def extract_text_from_pdf(pdf_path):
    """
    Extracts text content from a PDF file.

    The text of every page is joined with single spaces and stripped of
    leading/trailing whitespace.
    Args:
        pdf_path: str, path to PDF file
    Returns: str, extracted text, or "" when the file cannot be opened or
             read (the error is printed, not raised)
    """
    try:
        # The context manager closes the document even when reading a page
        # fails; previously close() was only reached on the success path.
        with fitz.open(pdf_path) as doc:
            text = ' '.join(page.get_text() for page in doc)
        return text.strip()
    except Exception as e:
        # Deliberate best-effort: one unreadable PDF must not stop the batch.
        print(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""

def load_and_process_documents(directory_path, limit=15):
    """
    Loads and processes PDF documents from the specified directory.

    Files are visited in sorted name order so that the same subset is
    selected on every run when the directory holds more PDFs than `limit`.
    Args:
        directory_path: str, path to directory containing PDFs
        limit: int or None, maximum number of PDF files to attempt
               (None means no limit)
    Returns: list of str, extracted text of each successfully read PDF;
             empty when the drive/directory checks fail
    """
    documents = []

    print("\nChecking drive and directory...")
    if not check_drive_mounting():
        return documents

    if not check_directory_content(directory_path):
        return documents

    print("\nAttempting to load documents...")
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # selection under `limit` reproducible.
    files = sorted(f for f in os.listdir(directory_path) if f.endswith('.pdf'))

    for i, filename in enumerate(files):
        if limit is not None and i >= limit:
            break

        try:
            pdf_path = os.path.join(directory_path, filename)
            print(f"\nProcessing {filename}...")

            # The file may have disappeared since the directory was listed.
            if not os.path.exists(pdf_path):
                print(f"File not found: {pdf_path}")
                continue

            text = extract_text_from_pdf(pdf_path)

            if not text:
                print(f"No text extracted from {filename}")
                continue

            # Short documents are still kept; the warning only flags PDFs
            # that are probably scanned images or near-empty.
            if len(text.strip()) < 100:
                print(f"Warning: Very short text ({len(text)} chars) from {filename}")

            documents.append(text)
            print(f"Successfully loaded: {filename} ({len(text)} characters)")

        except Exception as e:
            # Best-effort: report and move on to the next file.
            print(f"Error processing {filename}: {str(e)}")

    print("\nDocument loading summary:")
    print(f"Total documents loaded: {len(documents)}")
    if documents:
        print(f"Average document length: {sum(len(d) for d in documents)/len(documents):.0f} characters")

    return documents

def create_embeddings(texts):
    """
    Encode texts with the module-level Sentence Transformer `embed_model`.

    `embed_model` is a global assigned by main(); this function must not be
    called before it has been set. A progress bar is shown while encoding.
    Args:
        texts: list of str, texts to embed
    Returns: the result of embed_model.encode for the given texts
             (documented by the original author as a numpy array)
    """
    vectors = embed_model.encode(texts, show_progress_bar=True)
    return vectors

def preprocess_text(text, max_tokens=250):
    """
    Cut text down to at most `max_tokens` tokens of the global `tokenizer`.

    Text that already fits is returned untouched; longer text is encoded,
    sliced to the first `max_tokens` token ids and decoded back to a string.
    Args:
        text: str, input text
        max_tokens: int, maximum number of tokens
    Returns: str, the original text or its decoded, truncated prefix
    """
    token_ids = tokenizer.encode(text)
    if len(token_ids) <= max_tokens:
        return text
    return tokenizer.decode(token_ids[:max_tokens])

def retrieve_and_generate(query, k=5):
    """
    Retrieves relevant documents and generates a response based on the query.

    Relies on the module-level globals `documents`, `index`, `tokenizer`,
    `model` and `device` that main() assigns, and on create_embeddings()
    and preprocess_text().
    Args:
        query: str, the user's question
        k: int, number of documents to retrieve (capped at len(documents))
    Returns: str, the text generated after the prompt, or a string starting
             with "Error generating response:" when generation raises
    """
    print(f"\nProcessing query: {query}")

    max_context_tokens = 500
    max_new_tokens = 150
    # GPT-2 has 1024 position embeddings shared by prompt and continuation.
    model_max_positions = 1024

    # Generate query embedding
    query_embedding = create_embeddings([query])[0]
    k = min(k, len(documents))

    # Retrieve similar documents
    distances, indices = index.search(np.array([query_embedding]).astype('float32'), k)

    # Build context from retrieved documents
    retrieved_texts = []
    total_tokens = 0

    print("\nRetrieved documents:")
    for i, idx in enumerate(indices[0]):
        if idx < 0:
            # FAISS pads missing neighbours with -1; indexing documents[-1]
            # would silently return the last document instead.
            continue

        text = documents[idx]
        preview = text[:100] + "..."
        print(f"{i+1}. Score: {distances[0][i]:.4f}\nPreview: {preview}\n")

        # Preprocess and truncate text
        processed_text = preprocess_text(text)
        tokens = tokenizer.encode(processed_text)

        # Stop at the first document that would overflow the context budget.
        if total_tokens + len(tokens) > max_context_tokens:
            print(f"Reached token limit after {i} documents")
            break

        retrieved_texts.append(processed_text)
        total_tokens += len(tokens)

    context = " ".join(retrieved_texts)

    print(f"Debug - Retrieved texts count: {len(retrieved_texts)}")
    print(f"Debug - Total tokens in context: {total_tokens}")

    # Construct prompt
    combined_text = (
        f"Based on the provided articles, give a concise summary about {query}\n\n"
        f"Articles:\n{context}\n\n"
        "Summary:"
    )

    # Tokenize, leaving room for the tokens that will be generated so that
    # prompt + continuation never exceeds the model's position limit.
    input_ids = tokenizer.encode(
        combined_text,
        truncation=True,
        max_length=model_max_positions - max_new_tokens,
        padding=False,
        return_tensors='pt'
    )

    print(f"Debug - Final input shape: {input_ids.shape}")

    # Move to GPU if available
    input_ids = input_ids.to(device)
    # Single unpadded sequence, so every position is attended to.
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)

    try:
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            num_beams=3,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
            early_stopping=True,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            length_penalty=1.0,
            repetition_penalty=1.2
        )

        # The returned sequence starts with the prompt; decode only what was
        # generated after it. Splitting the full text on "Summary:" loses
        # output when the model repeats that marker, and returns the whole
        # prompt when the marker was truncated away.
        prompt_length = input_ids.shape[-1]
        generated_ids = outputs[0][prompt_length:]
        response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        return response

    except Exception as e:
        # Errors are returned as text so interactive callers keep running.
        print(f"Error during generation: {str(e)}")
        return f"Error generating response: {str(e)}"

def test_system(custom_queries=None):
    """
    Run a batch of queries through retrieve_and_generate and print results.

    Each query is framed by 80-character '=' rules; a failure on one query
    is printed and does not stop the remaining ones.
    Args:
        custom_queries: list of str, optional queries to run; when empty or
                        None the five built-in finance queries are used
    """
    separator = "=" * 80
    queries = custom_queries or [
        "What are the recent developments in Russia's economy and sanctions?",
        "What's happening with the stock market and gold prices?",
        "What are the latest developments in cryptocurrency?",
        "What's happening with global supply chains and manufacturing?",
        "How are international relations affecting the economy?"
    ]

    print("\nTesting system with queries...")
    for question in queries:
        print(f"\n{separator}")
        print(f"Query: {question}")
        try:
            answer = retrieve_and_generate(question, k=3)
        except Exception as err:
            print(f"Error processing query: {str(err)}")
        else:
            print(f"\nResponse: {answer}")
        print(separator)

def main():
    """
    Main execution function that sets up and runs the system.

    Mounts Google Drive, loads the PDFs, initialises the embedding and
    GPT-2 models, builds the FAISS index, runs test_system() and then
    enters an interactive query loop. The loaded objects are published as
    module-level globals because the retrieval helpers read them from there.
    Raises:
        RuntimeError: when no document could be loaded
    """
    global documents, embed_model, tokenizer, model, device, index

    print("Starting document processing and model initialization...")

    # Mount Google Drive
    drive.mount('/content/drive')

    # Define directory path
    directory_path = '/content/drive/My Drive/All_Finance_PDF_files_old/'

    # Load documents
    documents = load_and_process_documents(directory_path)
    if not documents:
        raise RuntimeError("No documents loaded!")

    # Initialize models
    print("\nInitializing models...")
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    # GPT-2 ships without a pad token; reuse EOS.
    tokenizer.pad_token = tokenizer.eos_token

    # Setup GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    print(f"Using device: {device}")

    # Create embeddings and FAISS index
    print("\nCreating embeddings...")
    embeddings = create_embeddings(documents)

    print("\nSetting up FAISS index...")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings).astype('float32'))

    # Run system test
    test_system()

    # Interactive mode
    while True:
        try:
            query = input("\nEnter your query (or 'quit' to exit): ").strip()
        except (KeyboardInterrupt, EOFError):
            # EOF means stdin is closed; retrying would loop forever.
            print("\nExiting interactive mode...")
            break
        except Exception as e:
            # Any other input failure will repeat on retry, so stop here.
            print(f"Error: {str(e)}")
            break

        if not query:
            continue
        if query.lower() == 'quit':
            break

        try:
            response = retrieve_and_generate(query, k=3)
            print(f"\nResponse: {response}")
        except KeyboardInterrupt:
            print("\nExiting interactive mode...")
            break
        except Exception as e:
            # A failed query should not end the session.
            print(f"Error: {str(e)}")

# Script entry point: run the pipeline and report any uncaught failure as a
# single line instead of a traceback.
if __name__ == "__main__":
    try:
        main()
    except Exception as fatal:
        print(f"Fatal error: {str(fatal)}")

Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu, pymupdf
Successfully installed faiss-gpu-1.7.2 pymupdf-1.25.1
Starting document processing and model initialization...
Mounted at /content/drive

Checking drive and directory...
Google Drive is mounted successfully

Directory contents:
Total files: 2673
PDF files: 26

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using device: cuda

Creating embeddings...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Setting up FAISS index...

Testing system with queries...

Query: What are the recent developments in Russia's economy and sanctions?

Processing query: What are the recent developments in Russia's economy and sanctions?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3093 > 1024). Running this sequence through the model will result in indexing errors



Retrieved documents:
1. Score: 0.6182
Preview: This copy is for your personal, non-commercial use only. To order presentation-ready copies for dist...

2. Score: 0.6817
Preview: Manezhnaya Square in Moscow. Analysts expect Russia’s economy to contract as much as 20% this quarte...

3. Score: 1.1016
Preview: DOW JONES, A NEWS CORP COMPANY
About WSJ
HEALTH
Western Drugmakers Are Still Providing Medicines to ...

Reached token limit after 2 documents
Debug - Retrieved texts count: 2
Debug - Total tokens in context: 500
Debug - Final input shape: torch.Size([1, 532])

Response: 1. Russia's financial system is crippled by sanctions.
2. The United States and European Union imposed sanctions on Russia in response
to the annexation of Crimea and the destabilization of Ukraine.
3. The Russian economy has been hit hard by the sanctions. It is now in a state of recession.
4. The West has cut off access to Russia's banking system and cut off
the ability of Russian banks to conduct business.
5. Th

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Retrieved documents:
1. Score: 0.9697
Preview: The Dow Jones Industrial Average slid nearly 800 points Monday, marking a 10% decline
from January’s...

2. Score: 1.0038
Preview: Dow Fell, Oil and Gold Surged—and What
Else Happened in the Stock Market Today
By
Jack Denton
Follow...

3. Score: 1.1438
Preview: DOW JONES, A NEWS CORP COMPANY
About WSJ
POLITICS
Bitcoin Price Surges on Biden’s Crypto Executive O...

Reached token limit after 2 documents
Debug - Retrieved texts count: 2
Debug - Total tokens in context: 500
Debug - Final input shape: torch.Size([1, 531])

Response: The Dow Jones industrial average
slowed by more than 600 points to close
in correction territory Monday.
This was the second day in a row that the Dow
was down more than 500 points. On Monday, it was

down more than 1,200 points, and on Tuesday it was down more

than 1,300 points.
On Monday, the S&C 500 lost more than

1,500 points,

and on Tuesday, it fell more than 2,000 points, a

decrease of more than 4%.
On Fr

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Retrieved documents:
1. Score: 1.0238
Preview: DOW JONES, A NEWS CORP COMPANY
About WSJ
POLITICS
Bitcoin Price Surges on Biden’s Crypto Executive O...

2. Score: 1.4777
Preview: DOW JONES, A NEWS CORP COMPANY
About WSJ
A third of the world’s palladium comes from Russia, analyst...

3. Score: 1.4997
Preview: Dow Fell, Oil and Gold Surged—and What
Else Happened in the Stock Market Today
By
Jack Denton
Follow...

Reached token limit after 2 documents
Debug - Retrieved texts count: 2
Debug - Total tokens in context: 500
Debug - Final input shape: torch.Size([1, 529])

Response: The U.N. Security Council has approved a resolution authorizing the
submission of a draft resolution to the Organization for Security and Cooperation in

Eurasia (OSCE) to consider whether to impose sanctions on Russia for its

illegal annexation of Crimea from Ukraine in March 2014. The

resolution, which was approved by the Security Council on Wednesday, is expected to be adopted

by the end of the year.
The reso

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Retrieved documents:
1. Score: 1.0434
Preview: DOW JONES, A NEWS CORP COMPANY
About WSJ
BUSINESS
China’s Covid-19 Surge Shuts Down Plants in Manufa...

2. Score: 1.1895
Preview: China is racing ahead in building the infrastructure of 5G networks, but it is inside
factories, coa...

3. Score: 1.2198
Preview: DOW JONES, A NEWS CORP COMPANY
About WSJ
HEALTH
Western Drugmakers Are Still Providing Medicines to ...

Reached token limit after 2 documents
Debug - Retrieved texts count: 2
Debug - Total tokens in context: 500
Debug - Final input shape: torch.Size([1, 531])

Response: China's 5G network will be the world's most advanced and reliable. It will be able to

automatically connect to a wide range of industries, including food processing,

manufacturing, logistics, transportation, energy, pharmaceuticals, health care, and

oil and gas. It could also be used to connect to the rest of the world, including

the United States, Canada, Australia, New Zealand, Japan, South Korea, and the Eur

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Retrieved documents:
1. Score: 1.1460
Preview: This copy is for your personal, non-commercial use only. To order presentation-ready copies for dist...

2. Score: 1.2321
Preview: Manezhnaya Square in Moscow. Analysts expect Russia’s economy to contract as much as 20% this quarte...

3. Score: 1.3907
Preview: DOW JONES, A NEWS CORP COMPANY
About WSJ
POLITICS
Bitcoin Price Surges on Biden’s Crypto Executive O...

Reached token limit after 2 documents
Debug - Retrieved texts count: 2
Debug - Total tokens in context: 500
Debug - Final input shape: torch.Size([1, 528])

Response: The United States and European Union imposed sanctions on Russia over the Ukraine crisis,
and the Russian government responded with a series of measures designed to weaken the
U.S.-led NATO alliance. The sanctions were aimed at preventing Russia from expanding its influence in
the region, and to prevent the United States from using its military might to
escalate the conflict.
Russia's economy has been crippled by t

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Retrieved documents:
1. Score: 1.0721
Preview: DOW JONES, A NEWS CORP COMPANY
About WSJ
BUSINESS
China’s Covid-19 Surge Shuts Down Plants in Manufa...

2. Score: 1.1527
Preview: Manezhnaya Square in Moscow. Analysts expect Russia’s economy to contract as much as 20% this quarte...

3. Score: 1.2566
Preview: This copy is for your personal, non-commercial use only. To order presentation-ready copies for dist...

Reached token limit after 2 documents
Debug - Retrieved texts count: 2
Debug - Total tokens in context: 500
Debug - Final input shape: torch.Size([1, 524])

Response: Russia's economy contracted by more than 20% in the first quarter of this year, according to data from the International Monetary Fund.
Russia is the world's second-largest exporter of crude oil, after the United States, and its

oil production is expected to grow by at least 3.5% this year. It is also the third-largest producer of natural gas

in the world after China and the United Arab Emirates.
In the first hal

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Retrieved documents:
1. Score: 1.2917
Preview: DOW JONES, A NEWS CORP COMPANY
About WSJ
POLITICS
Bitcoin Price Surges on Biden’s Crypto Executive O...

2. Score: 1.3996
Preview: Dow Fell, Oil and Gold Surged—and What
Else Happened in the Stock Market Today
By
Jack Denton
Follow...

3. Score: 1.4250
Preview: The Dow Jones Industrial Average slid nearly 800 points Monday, marking a 10% decline
from January’s...

Reached token limit after 2 documents
Debug - Retrieved texts count: 2
Debug - Total tokens in context: 500
Debug - Final input shape: torch.Size([1, 528])

Response: The Dow Jones industrial average (DJIA) is up 4.3% over the past

week. The Nasdaq composite (DJI) is down 2.9%. The

US dollar is down 1.2%. The Euro is down 0.7%. The Japanese yen

is down 1%. The Russian ruble is up 0.5%. The Chinese yuan is up

0.4% and the Chinese renminbi is up 1.3%.
The Nikkei 225 is up 2.5% and is up 3.1%

on the day. The Shanghai Composite (S&P) has

tied up 1% against the US dollar. The J