In [20]:
import os
import re
from typing import List, Dict, Any
from pathlib import Path
import logging
import fitz  
from sentence_transformers import SentenceTransformer
import chromadb
import anthropic
from dotenv import load_dotenv

# Load settings from a local .env file (if present) before anything reads the
# environment; FinancialRAG.__init__ looks up ANTHROPIC_API_KEY via os.getenv.
load_dotenv()

True

In [21]:
# Configure the root logger at INFO level; as the captured output shows, this
# also surfaces INFO records from libraries (sentence_transformers, httpx).
logging.basicConfig(level=logging.INFO)
# Module-level logger used by FinancialRAG for its progress messages.
logger = logging.getLogger(__name__)

class FinancialRAG:
    """Retrieval-augmented question answering over financial-statement PDFs.

    Pipeline: PDF pages are extracted with PyMuPDF, split into word-based
    chunks, embedded with a SentenceTransformer model, stored in a ChromaDB
    collection, and the closest chunks are passed to Claude as context.
    """

    # Tunables kept in one place instead of being scattered as literals.
    COLLECTION_NAME = "financial_docs"
    EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # Lightweight model
    CLAUDE_MODEL = "claude-3-5-sonnet-20241022"
    MIN_PAGE_CHARS = 100   # pages with this many characters or fewer are dropped
    MIN_CHUNK_CHARS = 50   # chunks with this many characters or fewer are dropped
    CONTEXT_CHUNKS = 6     # chunks retrieved as prompt context per question
    MAX_SOURCES = 3        # references echoed back alongside each answer

    def __init__(self):
        """
        Initialize the Financial RAG system with:
        - Anthropic API client
        - Sentence Transformer for embeddings
        - ChromaDB for vector storage

        Raises:
            ValueError: If ANTHROPIC_API_KEY is missing from the environment.
        """
        api_key = os.getenv('ANTHROPIC_API_KEY')
        if not api_key:
            raise ValueError("ANTHROPIC_API_KEY not found in environment variables")

        self.anthropic_client = anthropic.Anthropic(api_key=api_key)
        self.embedding_model = SentenceTransformer(self.EMBEDDING_MODEL_NAME)

        # Simple in-memory vector store using ChromaDB.
        self.chroma_client = chromadb.Client()
        # get_or_create (instead of create) so that building a second instance
        # in the same kernel reuses the collection rather than failing because
        # the name is already taken.
        self.collection = self.chroma_client.get_or_create_collection(
            name=self.COLLECTION_NAME,
            metadata={"hnsw:space": "cosine"}  # Cosine similarity for text
        )

    def extract_text_from_pdf(self, pdf_path: str) -> List[Dict[str, Any]]:
        """
        Extract and clean text from each page of a PDF document.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            List of page dictionaries with:
            - content: Text with every whitespace run collapsed to one space
            - page_num: Page number (1-indexed)
            - source: PDF filename (directory part stripped)

            Pages whose cleaned text is MIN_PAGE_CHARS characters or shorter
            are omitted.
        """
        source_name = Path(pdf_path).name
        pages = []

        doc = fitz.open(pdf_path)
        try:
            for page_index in range(len(doc)):
                raw_text = doc.load_page(page_index).get_text()

                # Clean up text: collapse newlines/tabs/multiple spaces.
                text = re.sub(r'\s+', ' ', raw_text).strip()

                if len(text) > self.MIN_PAGE_CHARS:  # Only keep pages with substantial content
                    pages.append({
                        'content': text,
                        'page_num': page_index + 1,
                        'source': source_name
                    })
        finally:
            # Release the file handle even when a page fails to load or parse.
            doc.close()

        logger.info(f"Extracted {len(pages)} pages from {pdf_path}")
        return pages

    def create_chunks(self, pages: List[Dict], chunk_size: int = 1000) -> List[Dict[str, Any]]:
        """
        Split page content into manageable chunks for embedding.

        Chunks never span pages: each page is split independently into
        consecutive, non-overlapping runs of `chunk_size` words.

        NOTE(review): the default of 1000 words is far longer than the input
        window the all-MiniLM-L6-v2 model card describes (256 word pieces), so
        the tail of each chunk is probably ignored by the embedding. Confirm
        and consider a smaller default.

        Args:
            pages: List of page dictionaries from extract_text_from_pdf
            chunk_size: Target number of words per chunk (must be positive)

        Returns:
            List of chunk dictionaries with:
            - id: "<source>_page<page_num>_chunk<index>"
            - content: Chunk text
            - source/page_num: Reference back to original document

            Chunks of MIN_CHUNK_CHARS characters or fewer are skipped.

        Raises:
            ValueError: If chunk_size is not a positive integer.
        """
        if chunk_size <= 0:
            # A negative step would silently yield no chunks at all and zero
            # makes range() fail with an unrelated message; reject both.
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

        chunks = []

        for page in pages:
            words = page['content'].split()

            for chunk_index, start in enumerate(range(0, len(words), chunk_size)):
                chunk_text = ' '.join(words[start:start + chunk_size])

                if len(chunk_text) > self.MIN_CHUNK_CHARS:  # Skip tiny chunks
                    chunks.append({
                        'id': f"{page['source']}_page{page['page_num']}_chunk{chunk_index}",
                        'content': chunk_text,
                        'source': page['source'],
                        'page_num': page['page_num']
                    })

        return chunks

    def add_documents(self, pdf_paths: List[str]):
        """
        Process multiple PDFs and add them to the vector database.

        Steps:
        1. Extract text from each PDF
        2. Chunk the content
        3. Generate embeddings
        4. Store in ChromaDB with metadata

        Chunk ids are derived from the PDF file name, page and chunk index, so
        indexing the same file again replaces its existing entries.
        NOTE(review): two different PDFs that share a file name would produce
        identical ids; confirm inputs have unique names.

        Args:
            pdf_paths: List of paths to PDF files
        """
        all_chunks = []

        for pdf_path in pdf_paths:
            logger.info(f"Processing {pdf_path}")
            pages = self.extract_text_from_pdf(pdf_path)
            all_chunks.extend(self.create_chunks(pages))

        if not all_chunks:
            logger.warning("No content found in documents")
            return

        # Create embeddings
        logger.info("Creating embeddings...")
        texts = [chunk['content'] for chunk in all_chunks]
        embeddings = self.embedding_model.encode(texts, show_progress_bar=True)

        # upsert instead of add: re-indexing a document overwrites the stored
        # chunks rather than colliding with ids that are already present.
        self.collection.upsert(
            embeddings=embeddings.tolist(),
            documents=texts,
            metadatas=[{
                'source': chunk['source'],
                'page_num': chunk['page_num']
            } for chunk in all_chunks],
            ids=[chunk['id'] for chunk in all_chunks]
        )

        logger.info(f"Added {len(all_chunks)} chunks to database")

    def search(self, question: str, n_results: int = 5) -> List[Dict[str, Any]]:
        """
        Semantic search for relevant document chunks.

        Args:
            question: Natural language question
            n_results: Maximum number of results to return

        Returns:
            List of relevant chunks (closest first, as returned by the
            collection) with:
            - content: Matching text
            - source/page_num: Document reference
            - relevance_score: 1 - cosine distance

            An empty list when nothing has been indexed yet.
        """
        # Never ask the index for more neighbours than it holds, and skip the
        # embedding work entirely when it is empty.
        available = self.collection.count()
        if available == 0:
            return []

        question_embedding = self.embedding_model.encode([question])

        results = self.collection.query(
            query_embeddings=question_embedding.tolist(),
            n_results=min(n_results, available),
            include=['documents', 'metadatas', 'distances']
        )

        # Results are nested per query; exactly one query was sent, hence [0].
        return [
            {
                'content': doc,
                'source': meta['source'],
                'page_num': meta['page_num'],
                'relevance_score': 1 - distance  # Higher is better
            }
            for doc, meta, distance in zip(
                results['documents'][0],
                results['metadatas'][0],
                results['distances'][0]
            )
        ]

    def answer_question(self, question: str) -> Dict[str, Any]:
        """
        End-to-end question answering pipeline.

        1. Perform semantic search for relevant context
        2. Construct prompt with context
        3. Query Claude model
        4. Format response with sources

        Args:
            question: Financial question to answer

        Returns:
            Dictionary with:
            - answer: Model response; a fixed message when no context was
              found, or "Error getting answer: ..." if the API call failed
            - sources: Up to MAX_SOURCES top-ranked retrieved chunks as
              {'document', 'page', 'relevance'}. These are the best search
              hits, not necessarily the passages the model cited.
            - question: Echo of original question
        """
        # Find relevant information
        relevant_chunks = self.search(question, n_results=self.CONTEXT_CHUNKS)

        if not relevant_chunks:
            # Same keys as the normal return so callers can rely on them.
            return {
                'answer': "No relevant information found.",
                'sources': [],
                'question': question
            }

        # Prepare context
        context = "\n\n".join([
            f"[From {chunk['source']}, Page {chunk['page_num']}]:\n{chunk['content']}"
            for chunk in relevant_chunks
        ])

        # Claude prompt
        prompt = f"""You are analyzing financial statements. Use the provided excerpts to answer the question.

FINANCIAL STATEMENT EXCERPTS:
{context}

QUESTION: {question}

Instructions:
- Give specific numbers when comparing periods
- Calculate percentage changes when relevant  
- Always mention which document and page your information comes from
- If you can't find the exact information needed, say so clearly: "I don't know"

ANSWER:"""

        try:
            response = self.anthropic_client.messages.create(
                model=self.CLAUDE_MODEL,
                max_tokens=1000,
                temperature=0.1,
                messages=[{"role": "user", "content": prompt}]
            )

            answer = response.content[0].text

        except Exception as e:
            # Deliberately best-effort: the caller still gets a result dict,
            # but the failure is recorded with its traceback.
            logger.exception("Claude request failed")
            answer = f"Error getting answer: {str(e)}"

        # Prepare response with sources
        sources = [
            {
                'document': chunk['source'],
                'page': chunk['page_num'],
                'relevance': f"{chunk['relevance_score']:.2f}"
            }
            for chunk in relevant_chunks[:self.MAX_SOURCES]
        ]

        return {
            'answer': answer,
            'sources': sources,
            'question': question
        }


In [26]:
def main(pdf_files=None, questions=None, rag=None):
    """Index financial statements and print an answer for each question.

    All parameters are optional; calling main() with no arguments uses the
    built-in sample document and question.

    Args:
        pdf_files: Paths of the PDFs to index. Defaults to the sample report
            under knowledge-base/.
        questions: Questions to ask, in order. Defaults to one sample
            question.
        rag: An existing pipeline object exposing add_documents() and
            answer_question(). When omitted, a new FinancialRAG is created,
            which loads the embedding model and needs ANTHROPIC_API_KEY.
    """
    if pdf_files is None:
        pdf_files = [
            'knowledge-base/gazprom-ifrs-2024-12mnth-en.pdf'
        ]
    if questions is None:
        questions = [
            "How much did property, plant and equipment change between periods?"
        ]
    if rag is None:
        rag = FinancialRAG()

    print("Loading financial statements...")
    rag.add_documents(pdf_files)

    separator = "=" * 50
    for question in questions:
        print("\n" + separator)
        print(f"Q: {question}")
        print(separator)

        result = rag.answer_question(question)

        print(f"Answer: {result['answer']}")
        print("\nSources:")
        for source in result['sources']:
            print(f"- {source['document']}, Page {source['page']}")

if __name__ == "__main__":
    main()

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:__main__:Processing knowledge-base/gazprom-ifrs-2024-12mnth-en.pdf
INFO:__main__:Extracted 57 pages from knowledge-base/gazprom-ifrs-2024-12mnth-en.pdf
INFO:__main__:Creating embeddings...


Loading financial statements...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:__main__:Added 57 chunks to database



Q: How much did property, plant and equipment change between periods?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Answer: Based on the financial statement excerpts from gazprom-ifrs-2024-12mnth-en.pdf (Page 29), I can track the net book value of total Property, Plant and Equipment across three periods:

31 December 2022: 17,419,060 million rubles
31 December 2023: 18,436,207 million rubles
31 December 2024: 20,779,950 million rubles

Changes between periods:
- From 2022 to 2023: Increased by 1,017,147 million rubles (+5.8%)
- From 2023 to 2024: Increased by 2,343,743 million rubles (+12.7%)
- Total change from 2022 to 2024: Increased by 3,360,890 million rubles (+19.3%)

The largest year-over-year increase was between 2023 and 2024, showing significant growth in the company's property, plant and equipment assets.

Sources:
- gazprom-ifrs-2024-12mnth-en.pdf, Page 29
- gazprom-ifrs-2024-12mnth-en.pdf, Page 38
- gazprom-ifrs-2024-12mnth-en.pdf, Page 31


In [25]:
import chromadb

# Best-effort reset: drop the "financial_docs" collection so that a fresh
# FinancialRAG() can create it again in this kernel.
client = chromadb.Client()
try:
    client.delete_collection("financial_docs")
except Exception as exc:
    # NOTE(review): the exception raised for a missing collection appears to
    # differ between chromadb releases; narrow this once the version is pinned.
    # Catching Exception (not a bare except) keeps KeyboardInterrupt and
    # SystemExit propagating, and the reason is reported instead of hidden.
    print(f"Collection 'financial_docs' was not deleted: {exc}")

In [19]:
# Ad-hoc query: build a new pipeline, index the annual report, ask one question.
rag = FinancialRAG()
rag.add_documents(['knowledge-base/gazprom-ifrs-2024-12mnth-en.pdf'])
result = rag.answer_question("Which indicator had the greatest impact on the company's net profit?")
# Only the answer text is printed; result['sources'] holds the page references.
print(result['answer'])

INFO:__main__:Processing knowledge-base/gazprom-ifrs-2024-12mnth-en.pdf
INFO:__main__:Extracted 57 pages from knowledge-base/gazprom-ifrs-2024-12mnth-en.pdf
INFO:__main__:Creating embeddings...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:__main__:Added 57 chunks to database


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Based on the financial statement excerpts, I'll analyze the major factors impacting net profit:

From page 45 (Note 30 "Net Cash from Operating Activities"), we can see several key indicators and their changes:

1. Depreciation: RUB 1,377,774 million in 2024 vs RUB 982,058 million in 2023 (+40% increase)

2. Net finance expense: RUB 35,462 million in 2024 vs RUB 649,745 million in 2023 (significant decrease of 95%)

3. Impairment loss on assets and change in provision for post-employment benefits: RUB 516,519 million in 2024 vs RUB 1,490,124 million in 2023 (65% decrease)

4. Share of profit of associates and joint ventures: RUB 242,008 million in 2024 vs RUB 354,364 million in 2023 (32% decrease)

Looking at the magnitude of these changes, the most significant impact on net profit came from the reduction in impairment loss on assets and change in provision for post-employment benefits, which decreased by RUB 973,605 million (from RUB 1,490,124 million to RUB 516,519 million). This rep