<a href="https://colab.research.google.com/github/joepareti54/joepareti54/blob/main/lm_rag_gpt2_test5c.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Required installations
!pip install pymupdf sentence-transformers faiss-gpu transformers torch numpy

import fitz
import os
import numpy as np
import faiss
import torch
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from google.colab import drive
import warnings
import re
warnings.filterwarnings('ignore')

class DocumentProcessor:
    def __init__(self, directory_path):
        self.directory_path = directory_path
        self.documents = []
        self.embed_model = None
        self.tokenizer = None
        self.model = None
        self.device = None
        self.index = None

    def init_models(self):
        """Initialize all required models and move to appropriate device."""
        print("\nInitializing models...")
        self.embed_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.model = GPT2LMHeadModel.from_pretrained('gpt2')
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Setup GPU if available
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        print(f"Using device: {self.device}")

    def check_drive_mounting(self):
        """Verify Google Drive is mounted and accessible."""
        try:
            if not os.path.exists('/content/drive'):
                print("Google Drive is not mounted!")
                return False
            return True
        except Exception as e:
            print(f"Error checking drive mount: {str(e)}")
            return False

    def extract_text_from_pdf(self, pdf_path):
        """Extract text from PDF file with error handling."""
        try:
            doc = fitz.open(pdf_path)
            text = ' '.join(page.get_text() for page in doc)
            doc.close()
            return self.clean_text(text)
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {str(e)}")
            return ""

    def clean_text(self, text):
        """Clean and normalize text content."""
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        return text.strip()

    def load_documents(self, limit=15):
        """Load and process documents from the specified directory."""
        print("\nLoading documents...")

        if not self.check_drive_mounting():
            return False

        if not os.path.exists(self.directory_path):
            print(f"Directory not found: {self.directory_path}")
            return False

        files = [f for f in os.listdir(self.directory_path) if f.endswith('.pdf')]

        for i, filename in enumerate(files):
            if i >= limit:
                break

            try:
                pdf_path = os.path.join(self.directory_path, filename)
                print(f"\nProcessing {filename}...")

                text = self.extract_text_from_pdf(pdf_path)
                if not text:
                    continue

                self.documents.append(text)
                print(f"Successfully loaded: {filename} ({len(text)} characters)")

            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

        print(f"\nLoaded {len(self.documents)} documents")
        return len(self.documents) > 0

    def create_embeddings(self):
        """Create embeddings for all documents."""
        print("\nCreating document embeddings...")
        embeddings = self.embed_model.encode(self.documents, show_progress_bar=True)

        # Initialize FAISS index
        dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(np.array(embeddings).astype('float32'))
        print("Embeddings created and indexed successfully")

    def preprocess_text(self, text, max_tokens=200):
        """Preprocess text to fit within token limits."""
        tokens = self.tokenizer.encode(text)
        if len(tokens) > max_tokens:
            tokens = tokens[:max_tokens]
            text = self.tokenizer.decode(tokens)
        return text

    def retrieve_and_generate(self, query, k=3):
        """Retrieve relevant documents and generate a response."""
        print(f"\nProcessing query: {query}")

        # Generate query embedding and retrieve similar documents
        query_embedding = self.embed_model.encode([query])[0]
        distances, indices = self.index.search(
            np.array([query_embedding]).astype('float32'),
            min(k, len(self.documents))
        )

        # Build context from retrieved documents
        retrieved_texts = []
        total_tokens = 0
        max_context_tokens = 400

        print("\nRetrieved documents:")
        for i, idx in enumerate(indices[0]):
            text = self.documents[idx]
            preview = text[:100] + "..."
            print(f"{i+1}. Score: {distances[0][i]:.4f}\nPreview: {preview}\n")

            # Extract most relevant paragraphs
            paragraphs = text.split('\n\n')
            relevant_text = ' '.join(paragraphs[:3])

            processed_text = self.preprocess_text(relevant_text)
            tokens = self.tokenizer.encode(processed_text)

            if total_tokens + len(tokens) > max_context_tokens:
                break

            retrieved_texts.append(processed_text)
            total_tokens += len(tokens)

        context = " ".join(retrieved_texts)

        # Construct prompt
        prompt = (
            f"Based on the following information, provide a clear and concise summary about {query}. "
            f"Focus on the most important facts and recent developments.\n\n"
            f"Information:\n{context}\n\n"
            "Concise summary:"
        )

        # Generate response
        try:
            input_ids = self.tokenizer.encode(
                prompt,
                truncation=True,
                max_length=800,
                padding=False,
                return_tensors='pt'
            ).to(self.device)

            attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(self.device)

            outputs = self.model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=100,
                num_beams=4,
                no_repeat_ngram_size=3,
                pad_token_id=self.tokenizer.eos_token_id,
                early_stopping=True,
                do_sample=True,
                temperature=0.6,
                top_k=40,
                top_p=0.85,
                length_penalty=1.2,
                repetition_penalty=1.3
            )

            generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract and clean response
            if "Concise summary:" in generated_text:
                response = generated_text.split("Concise summary:")[-1].strip()
            else:
                response = generated_text.strip()

            return response.replace('\n\n', '\n').strip()

        except Exception as e:
            print(f"Error during generation: {str(e)}")
            return f"Error generating response: {str(e)}"

def main():
    """Main execution function."""
    # Mount Google Drive
    drive.mount('/content/drive')

    # Initialize document processor
    directory_path = '/content/drive/My Drive/All_Finance_PDF_files_old/'
    processor = DocumentProcessor(directory_path)

    # Initialize models and load documents
    processor.init_models()
    if not processor.load_documents():
        print("Failed to load documents. Exiting.")
        return

    # Create embeddings
    processor.create_embeddings()

    # Interactive query loop
    print("\nEnter your queries (type 'quit' to exit)")
    while True:
        try:
            query = input("\nQuery: ").strip()
            if query.lower() == 'quit':
                break

            if not query:
                print("Please enter a valid query")
                continue

            response = processor.retrieve_and_generate(query)
            print(f"\nResponse:\n{response}")

        except KeyboardInterrupt:
            print("\nExiting...")
            break
        except Exception as e:
            print(f"Error processing query: {str(e)}")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f"Fatal error: {str(e)}")

Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu, pymupdf
Successfully installed faiss-gpu-1.7.2 pymupdf-1.25.1
Mounted at /content/drive

Initializing models...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using device: cuda

Loading documents...

Processing China’s Covid-19 Surge Shuts Down Plants in Manufacturing Hubs Shenzhen and Changchun - WSJ.pdf...
Successfully loaded: China’s Covid-19 Surge Shuts Down Plants in Manufacturing Hubs Shenzhen and Changchun - WSJ.pdf (9180 characters)

Processing Russian Prosecutors Warn Western Companies of Arrests, Asset Seizures - WSJ.pdf...
Successfully loaded: Russian Prosecutors Warn Western Companies of Arrests, Asset Seizures - WSJ.pdf (11192 characters)

Processing Können russische Oligarchen Sanktionen mit Kryptowährungen umgehen.pdf...
Successfully loaded: Können russische Oligarchen Sanktionen mit Kryptowährungen umgehen.pdf (14744 characters)

Processing TikTok Influencers Get Spotlight in Information Battle Over the Russia-Ukraine War - WSJ.pdf...
Successfully loaded: TikTok Influencers Get Spotlight in Information Battle Over the Russia-Ukraine War - WSJ.pdf (11983 characters)

Processing Chip Makers Stockpiled Key Materials Ahead o

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings created and indexed successfully

Enter your queries (type 'quit' to exit)

Query: explain the influence of politics on bitcoin


Token indices sequence length is longer than the specified maximum sequence length for this model (2049 > 1024). Running this sequence through the model will result in indexing errors



Processing query: explain the influence of politics on bitcoin

Retrieved documents:
1. Score: 0.9038
Preview: DOW JONES, A NEWS CORP COMPANY About WSJ POLITICS Bitcoin Price Surges on Bidens Crypto Executive Or...

2. Score: 1.3765
Preview: Manezhnaya Square in Moscow. Analysts expect Russias economy to contract as much as 20 this quarter....

3. Score: 1.4421
Preview: This copy is for your personal, non-commercial use only. To order presentation-ready copies for dist...


Response:
The Russian government has imposed sanctions on banks and other financial institutions that have failed to comply with the country's anti-money laundering (AML) laws. The sanctions were imposed on banks that fail to meet certain requirements, such as compliance with the Foreign Account Tax Compliance Act (FATCA), which requires foreign banks to report suspicious activity to the authorities. The FATCA requires banks to file reports with the Federal Deposit Insurance Corporation (FDIC) on suspicious activit