<a href="https://colab.research.google.com/github/joepareti54/joepareti54/blob/main/lm_rag_gpt2_test5d.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pymupdf sentence-transformers faiss-gpu transformers torch numpy
import fitz
import os
import numpy as np
import faiss
import torch
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from google.colab import drive
import warnings
import re
from typing import List, Dict, Any, Optional
from datetime import datetime

class FinanceNewsProcessor:
    """Minimal retrieval-augmented generation (RAG) pipeline over a folder of PDFs.

    Intended call order::

        processor = FinanceNewsProcessor(directory)
        processor.init_models()
        processor.load_documents()
        processor.create_embeddings()
        answer = processor.retrieve_and_generate("some question")

    Documents are embedded with a SentenceTransformer model, indexed in a flat
    FAISS L2 index, and the best matches are condensed into a prompt for GPT-2.
    """

    # Function words that carry no topical signal; they are dropped when keyword
    # terms are derived from a natural-language query.
    _STOPWORDS = frozenset({
        'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can', 'did',
        'do', 'does', 'for', 'from', 'has', 'have', 'how', 'in', 'is', 'it',
        'its', 'of', 'on', 'or', 'that', 'the', 'this', 'to', 'was', 'were',
        'what', 'when', 'where', 'which', 'who', 'why', 'will', 'with',
    })

    def __init__(self, directory_path: str):
        """Create an empty processor bound to ``directory_path``.

        No models are loaded and no files are read here; see ``init_models``
        and ``load_documents``.
        """
        # Folder scanned for ``*.pdf`` files by ``load_documents``.
        self.directory_path = directory_path
        # Cleaned full text of every loaded PDF; parallel to ``document_metadata``.
        self.documents: List[str] = []
        # One metadata dict per entry of ``documents`` (see ``extract_metadata``).
        self.document_metadata: List[Dict[str, Any]] = []
        # Populated by ``init_models``.
        self.embed_model: Optional[SentenceTransformer] = None
        self.tokenizer: Optional[GPT2Tokenizer] = None
        self.model: Optional[GPT2LMHeadModel] = None
        self.device: Optional[torch.device] = None
        # Populated by ``create_embeddings``.
        self.index: Optional[faiss.IndexFlatL2] = None

    def init_models(self) -> None:
        """Load the embedding model and GPT-2, then move GPT-2 to CUDA if available.

        Raises:
            RuntimeError: If any model or tokenizer fails to load. The original
                exception is chained as ``__cause__``.
        """
        print("\nInitializing models...")
        try:
            self.embed_model = SentenceTransformer('all-MiniLM-L6-v2')
            self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            self.model = GPT2LMHeadModel.from_pretrained('gpt2')
            # Reuse EOS as the pad token so padding-aware calls have one.
            self.tokenizer.pad_token = self.tokenizer.eos_token

            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            self.model.to(self.device)
            # The model is only used for inference in this class.
            self.model.eval()
            print(f"Using device: {self.device}")
        except Exception as e:
            raise RuntimeError(f"Error initializing models: {str(e)}") from e

    def clean_text(self, text: str) -> str:
        """Return ``text`` with unusual symbols removed and whitespace collapsed.

        Word characters, whitespace and the punctuation ``. , ! ? ; : ( ) - ' "``
        are kept; every other run of characters becomes a single space. All
        whitespace runs (including newlines) collapse to one space, and a space
        directly before a comma or period is removed.
        """
        # Remove special characters while preserving essential punctuation
        text = re.sub(r'[^\w\s.,!?;:()\-\'\"]+', ' ', text)
        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text)
        # Fix common OCR issues
        text = text.replace(' ,', ',').replace(' .', '.')
        return text.strip()

    def extract_metadata(self, filename: str, text: str) -> Dict[str, Any]:
        """Build a metadata record for one document.

        Args:
            filename: File name; a ``YYYY-MM-DD`` substring, if present, becomes
                the ``date`` field.
            text: Document text used for the title and length.

        Returns:
            Dict with keys ``filename``, ``date`` (str or None), ``title``
            (first non-blank line, at most 100 characters, or ``filename`` when
            the text has no non-blank line), ``length`` (character count of
            ``text``) and ``processed_date`` (ISO timestamp of this call).
        """
        date_match = re.search(r'\d{4}-\d{2}-\d{2}', filename)
        date = date_match.group(0) if date_match else None

        # Use the first line that actually has content so a leading blank line
        # does not force the fallback to the file name.
        first_line = next(
            (line.strip() for line in text.splitlines() if line.strip()),
            '',
        )
        title = first_line[:100] if first_line else filename

        return {
            'filename': filename,
            'date': date,
            'title': title,
            'length': len(text),
            'processed_date': datetime.now().isoformat()
        }

    def extract_text_from_pdf(self, pdf_path: str) -> Optional[str]:
        """Extract the cleaned text of every non-empty page of a PDF.

        Returns:
            The cleaned text (possibly ``''`` when no page has text), or
            ``None`` if the file could not be opened or parsed; the error is
            printed rather than raised.
        """
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_path) as doc:
                page_texts = [page.get_text() for page in doc]

            text_parts = [page_text for page_text in page_texts if page_text.strip()]
            return self.clean_text(' '.join(text_parts))

        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {str(e)}")
            return None

    def load_documents(self, limit: int = 15) -> bool:
        """Load up to ``limit`` PDFs from ``directory_path`` into memory.

        Files are visited in sorted name order so the selection is reproducible.
        ``limit`` bounds the number of files attempted; a file that yields no
        text still counts against it. Loaded texts are appended to
        ``documents`` and their metadata to ``document_metadata``.

        Returns:
            True if at least one document is held after loading.

        Raises:
            FileNotFoundError: If ``directory_path`` is not an existing directory.
        """
        print("\nLoading documents...")

        if not os.path.isdir(self.directory_path):
            raise FileNotFoundError(f"Directory not found: {self.directory_path}")

        files = sorted(
            f for f in os.listdir(self.directory_path) if f.lower().endswith('.pdf')
        )

        for filename in files[:max(limit, 0)]:
            try:
                pdf_path = os.path.join(self.directory_path, filename)
                print(f"\nProcessing {filename}...")

                text = self.extract_text_from_pdf(pdf_path)
                if not text:
                    continue

                # Build metadata first so both lists only grow together.
                metadata = self.extract_metadata(filename, text)
                self.documents.append(text)
                self.document_metadata.append(metadata)

                print(f"Successfully loaded: {filename} ({len(text)} characters)")

            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

        print(f"\nLoaded {len(self.documents)} documents")
        return len(self.documents) > 0

    def create_embeddings(self) -> None:
        """Embed every loaded document and build the FAISS index.

        Raises:
            RuntimeError: If models are not initialised, no documents are
                loaded, or encoding/indexing fails (original error chained).
        """
        print("\nCreating document embeddings...")
        if self.embed_model is None:
            raise RuntimeError(
                "Error creating embeddings: models not initialized; call init_models() first"
            )
        if not self.documents:
            raise RuntimeError("Error creating embeddings: no documents loaded")

        try:
            # NOTE(review): the embedding model presumably truncates long inputs,
            # so only the start of each document may influence retrieval —
            # consider chunking documents; confirm against the model card.
            embeddings = np.ascontiguousarray(
                self.embed_model.encode(self.documents, show_progress_bar=True),
                dtype='float32',
            )

            index = faiss.IndexFlatL2(embeddings.shape[1])
            index.add(embeddings)
            # Publish the index only once it is fully built.
            self.index = index

            print("Embeddings created and indexed successfully")
        except Exception as e:
            raise RuntimeError(f"Error creating embeddings: {str(e)}") from e

    def preprocess_text(self, text: str, max_tokens: int = 150) -> str:
        """Return ``text`` cut to at most ``max_tokens`` GPT-2 tokens.

        Text already within the limit is returned unchanged; otherwise the
        first ``max_tokens`` tokens are decoded back into a string.
        """
        tokens = self.tokenizer.encode(text)
        if len(tokens) > max_tokens:
            tokens = tokens[:max_tokens]
            text = self.tokenizer.decode(tokens)
        return text

    def extract_relevant_context(self, text: str, query_terms: set, max_chars: int = 1000) -> str:
        """Return the sentences of ``text`` that mention any query term.

        Matching is case-insensitive and on whole terms, so ``"rate"`` does not
        match ``"corporate"``. Sentences are split at ``.``, ``!`` or ``?``
        followed by whitespace or end of text, which keeps decimals such as
        ``1.5`` intact. Matching sentences are taken in document order until the
        next one would push the result past ``max_chars``.

        Args:
            text: Source document text.
            query_terms: Terms to look for; blank terms are ignored.
            max_chars: Upper bound on the length of the returned string.

        Returns:
            Matching sentences joined with ``". "`` and closed with ``"."``, or
            ``text[:max_chars]`` when nothing matches.
        """
        terms = {term.strip().lower() for term in query_terms if term and term.strip()}
        if not terms:
            return text[:max_chars]

        # One alternation, guarded so a term never matches inside a longer word.
        term_pattern = re.compile(
            r'(?<!\w)(?:' + '|'.join(re.escape(term) for term in sorted(terms)) + r')(?!\w)'
        )

        relevant_sentences = []
        result_length = 0  # length of the joined output built so far

        for sentence in re.split(r'[.!?]+(?:\s+|$)', text):
            sentence = sentence.strip()
            if not sentence or not term_pattern.search(sentence.lower()):
                continue

            # Count what the join adds: ". " between sentences, "." at the end.
            projected = result_length + len(sentence) + (2 if relevant_sentences else 1)
            if projected > max_chars:
                break
            relevant_sentences.append(sentence)
            result_length = projected

        return '. '.join(relevant_sentences) + '.' if relevant_sentences else text[:max_chars]

    @classmethod
    def _query_terms(cls, query: str) -> set:
        """Return lower-cased keyword terms of ``query`` without stopwords.

        Falls back to all words when the query consists only of stopwords.
        """
        words = set(re.findall(r'\w+', query.lower()))
        keywords = {word for word in words if word not in cls._STOPWORDS}
        return keywords or words

    def retrieve_and_generate(self, query: str, k: int = 3, max_distance: float = 1.5,
                              max_context_tokens: int = 300) -> str:
        """Retrieve the documents closest to ``query`` and generate a summary.

        Args:
            query: Natural-language question.
            k: Maximum number of documents to retrieve.
            max_distance: Hits with a larger index distance are discarded.
            max_context_tokens: Token budget for the retrieved context.

        Returns:
            The generated summary, a fixed "no relevant information" message
            when nothing usable was retrieved, or an error message if
            generation itself fails.

        Raises:
            RuntimeError: If models or the index have not been initialised.
        """
        print(f"\nProcessing query: {query}")

        if self.embed_model is None or self.tokenizer is None or self.model is None:
            raise RuntimeError("Models not initialized; call init_models() first")
        if self.index is None or not self.documents:
            raise RuntimeError(
                "No indexed documents; call load_documents() and create_embeddings() first"
            )

        top_k = min(k, len(self.documents))
        if top_k < 1:
            return "No relevant information found for this query."

        # Generate query embedding and retrieve similar documents
        query_embedding = np.ascontiguousarray(
            self.embed_model.encode([query]), dtype='float32'
        )
        distances, indices = self.index.search(query_embedding, top_k)

        # Build context from retrieved documents
        query_terms = self._query_terms(query)
        retrieved_texts = []
        total_tokens = 0

        print("\nRetrieved relevant documents:")
        for rank, (distance, idx) in enumerate(zip(distances[0], indices[0]), start=1):
            # Defensive: skip negative ids (NOTE(review): FAISS is understood to
            # pad missing hits with -1 — confirm) and hits beyond the threshold.
            if idx < 0 or distance > max_distance:
                continue

            text = self.documents[idx]
            metadata = self.document_metadata[idx]
            preview = text[:100] + "..."
            print(f"{rank}. Score: {distance:.4f}")
            print(f"Title: {metadata['title']}")
            print(f"Preview: {preview}\n")

            # Extract relevant context
            relevant_text = self.extract_relevant_context(text, query_terms)
            processed_text = self.preprocess_text(relevant_text, max_tokens=150)
            token_count = len(self.tokenizer.encode(processed_text))

            if total_tokens + token_count > max_context_tokens:
                break

            retrieved_texts.append(processed_text)
            total_tokens += token_count

        if not retrieved_texts:
            return "No relevant information found for this query."

        context = " ".join(retrieved_texts)

        # Generate response
        prompt = (
            f"Based on recent news articles, provide a clear and focused summary about {query}. "
            f"Include key developments and their significance.\n\n"
            f"Articles:\n{context}\n\n"
            "Summary:"
        )

        try:
            # Calling the tokenizer (rather than ``encode``) also yields the
            # attention mask; passing it avoids the "attention mask is not set"
            # warning raised when the pad and EOS tokens are identical.
            encoded = self.tokenizer(
                prompt,
                truncation=True,
                max_length=512,
                padding=False,
                return_tensors='pt'
            ).to(self.device)
            prompt_length = encoded['input_ids'].shape[1]

            outputs = self.model.generate(
                input_ids=encoded['input_ids'],
                attention_mask=encoded['attention_mask'],
                max_new_tokens=150,
                num_beams=4,
                no_repeat_ngram_size=3,
                pad_token_id=self.tokenizer.eos_token_id,
                early_stopping=True,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                length_penalty=1.0,
                repetition_penalty=1.2
            )

            # Decode only the continuation: slicing off the prompt is robust
            # even if the model itself emits the text "Summary:" again.
            response = self.tokenizer.decode(
                outputs[0][prompt_length:], skip_special_tokens=True
            ).strip()

            # Clean up response
            response = response.replace('\n\n', '\n').strip()
            if not response:
                return "No response could be generated for this query."
            if not response.endswith(('.', '!', '?')):
                response += '.'

            return response

        except Exception as e:
            print(f"Error during generation: {str(e)}")
            return f"Error generating response: {str(e)}"

def main():
    """Mount Google Drive, build the RAG pipeline and run an interactive query loop.

    Steps: mount the drive, initialise models, load PDFs from the hard-coded
    finance folder, index them, then answer queries read from standard input
    until the user types ``quit``, presses Ctrl-C, or input reaches end-of-file.
    Any failure during setup is reported as a fatal error instead of raising.
    """
    try:
        # Mount Google Drive
        drive.mount('/content/drive')

        # Initialize processor
        directory_path = '/content/drive/My Drive/All_Finance_PDF_files_old/'
        processor = FinanceNewsProcessor(directory_path)

        # Initialize models and load documents
        processor.init_models()
        if not processor.load_documents():
            print("Failed to load documents. Exiting.")
            return

        # Create embeddings
        processor.create_embeddings()

        # Interactive query loop
        print("\nEnter your queries (type 'quit' to exit)")
        while True:
            try:
                query = input("\nQuery: ").strip()
            except (KeyboardInterrupt, EOFError):
                # EOFError means stdin is closed; retrying would fail forever,
                # so treat it like an interrupt and leave the loop.
                print("\nExiting...")
                break

            if query.lower() == 'quit':
                break

            if not query:
                print("Please enter a valid query")
                continue

            try:
                response = processor.retrieve_and_generate(query)
                print(f"\nResponse:\n{response}")
            except KeyboardInterrupt:
                print("\nExiting...")
                break
            except Exception as e:
                # A failed query should not end the session.
                print(f"Error processing query: {str(e)}")

    except Exception as e:
        print(f"Fatal error: {str(e)}")

# Script entry point: run the interactive pipeline only when executed directly
# (or as a notebook cell), not when this module is imported.
if __name__ == "__main__":
    # Installs a blanket "ignore" filter, so every Python warning category is
    # suppressed process-wide for the rest of the session.
    warnings.filterwarnings('ignore')
    main()

Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m89.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu, pymupdf
Successfully installed faiss-gpu-1.7.2 pymupdf-1.25.1
Mounted at /content/drive

Initializing models...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using device: cuda

Loading documents...

Processing China’s Covid-19 Surge Shuts Down Plants in Manufacturing Hubs Shenzhen and Changchun - WSJ.pdf...
Successfully loaded: China’s Covid-19 Surge Shuts Down Plants in Manufacturing Hubs Shenzhen and Changchun - WSJ.pdf (9214 characters)

Processing Russian Prosecutors Warn Western Companies of Arrests, Asset Seizures - WSJ.pdf...
Successfully loaded: Russian Prosecutors Warn Western Companies of Arrests, Asset Seizures - WSJ.pdf (11234 characters)

Processing Können russische Oligarchen Sanktionen mit Kryptowährungen umgehen.pdf...
Successfully loaded: Können russische Oligarchen Sanktionen mit Kryptowährungen umgehen.pdf (14759 characters)

Processing TikTok Influencers Get Spotlight in Information Battle Over the Russia-Ukraine War - WSJ.pdf...
Successfully loaded: TikTok Influencers Get Spotlight in Information Battle Over the Russia-Ukraine War - WSJ.pdf (12042 characters)

Processing Chip Makers Stockpiled Key Materials Ahead o

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings created and indexed successfully

Enter your queries (type 'quit' to exit)

Query: what is the business impact of 5g network technology 


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Processing query: what is the business impact of 5g network technology

Retrieved relevant documents:
1. Score: 0.8230
Title: China is racing ahead in building the infrastructure of 5G networks, but it is inside factories, coa
Preview: China is racing ahead in building the infrastructure of 5G networks, but it is inside factories, coa...


Response:
5G networks will be used by more than 1.5 billion people around the world by 2020, according to the World Economic Forum (WEF).
By 2020, China will have the world's largest 5G network by population. By 2030, it will have more than half of all the global 5G infrastructure. By 2040, China's 5G footprint will exceed that of the rest of the world. By 2050, the total number of people using 5G will surpass that of all other developed countries. By 2025, China is expected to have the second-largest 5G market in the world, followed by the United States, Japan, South Korea, and the United Kingdom. By the end of the decade, China has.

Query: discus