### Installing Libraries

In [1]:
!pip install -U gradio pdfplumber PyPDF2 camelot-py chromadb nltk scikit-learn sentence-transformers langchain transformers accelerate peft accelerate bitsandbytes datasets trl --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.6/324.6 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.2 MB/s[0m eta [

### Importing Libraries

In [None]:
import gradio as gr

import os
import re
import nltk
import uuid
import re
from typing import List, Dict
import chromadb
import numpy as np
import time
import random
import pandas as pd
import pdfplumber
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import PyPDF2
import re
import json
from typing import Dict, List, Any, Optional, Tuple
import camelot
from datetime import datetime
import numpy as np
from dataclasses import dataclass, asdict

import torch, gc
from dataclasses import dataclass
from typing import Dict, List

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

from google.colab import drive
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.config import Settings
from chromadb.utils.embedding_functions import DefaultEmbeddingFunction
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from trl import SFTTrainer, SFTConfig

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Authorizing Hugging Face account

In [None]:
from huggingface_hub import login
hf_token  = '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'
login(token=hf_token)

### Data Collection & Preprocessing

---



In [None]:
# ---------------- For RAG ----------------
# Extracts clean text and tables from a PDF file. Tables are converted into a readable text block.

@dataclass
class ExtractedTable:
    """Data class for extracted table information"""
    page_number: int
    table_index: int
    title: str
    headers: List[str]
    data: List[List[str]]
    table_type: str  # 'financial', 'summary', 'breakdown', etc.
    metadata: Dict[str, Any]

@dataclass
class ExtractedText:
    """Data class for extracted text sections"""
    page_number: int
    section_type: str  # 'header', 'paragraph', 'footer', 'title'
    content: str
    position: Optional[Tuple[float, float, float, float]]  # (x0, y0, x1, y1)
    font_info: Optional[Dict[str, Any]]

@dataclass
class FinancialDocument:
    """Main data class for the extracted document"""
    document_type: str
    company_name: str
    report_period: str
    extraction_date: str
    tables: List[ExtractedTable]
    text_sections: List[ExtractedText]
    key_metrics: Dict[str, Any]
    metadata: Dict[str, Any]

class ICICIFinanceReportExtractor:
    """
    Comprehensive PDF information extractor for ICICI finance reports
    Designed for RAG applications and chatbot knowledge bases
    """

    def __init__(self):
        self.financial_keywords = [
            'revenue', 'profit', 'loss', 'assets', 'liabilities', 'equity',
            'cash flow', 'net income', 'gross margin', 'ebitda', 'roi', 'eps',
            'balance sheet', 'income statement', 'provisions', 'advances',
            'deposits', 'investments', 'capital adequacy', 'npa', 'crar'
        ]

        self.section_patterns = {
            'financial_highlights': r'(financial\s+highlights|key\s+figures)',
            'balance_sheet': r'(balance\s+sheet|statement\s+of\s+financial\s+position)',
            'income_statement': r'(profit\s+and\s+loss|income\s+statement|statement\s+of\s+income)',
            'cash_flow': r'(cash\s+flow\s+statement|statement\s+of\s+cash\s+flows)',
            'ratios': r'(financial\s+ratios|key\s+ratios|performance\s+ratios)',
            'segment_analysis': r'(segment\s+analysis|business\s+segment|segment\s+wise)',
            'risk_management': r'(risk\s+management|credit\s+risk|operational\s+risk)'
        }

    def extract_from_pdf(self, pdf_path: str) -> FinancialDocument:
        """
        Main extraction method that processes the entire PDF
        """
        print(f"Starting extraction from {pdf_path}")

        tables = []
        text_sections = []
        key_metrics = {}
        metadata = {}

        # Extract using multiple methods for comprehensive coverage
        all_tables = self._extract_tables_camelot(pdf_path)

        # Extract text content
        text_sections = self._extract_text_content(pdf_path)

        # Extract key financial metrics
        key_metrics = self._extract_key_metrics(text_sections, all_tables)

        # Extract document metadata
        metadata = self._extract_document_metadata(pdf_path, text_sections)

        # Create the final document structure
        document = FinancialDocument(
            document_type=metadata.get('document_type', 'Financial Report'),
            company_name=metadata.get('company_name', 'ICICI'),
            report_period=metadata.get('report_period', 'Unknown'),
            extraction_date=datetime.now().isoformat(),
            tables=all_tables,
            text_sections=text_sections,
            key_metrics=key_metrics,
            metadata=metadata
        )

        print(f"Extraction completed. Found {len(all_tables)} tables and {len(text_sections)} text sections")
        return document

    def _extract_tables_camelot(self, pdf_path: str) -> List[ExtractedTable]:
        """Extract tables using Camelot (best for well-structured tables)"""
        tables = []
        try:
            camelot_tables = camelot.read_pdf(pdf_path, pages='all', flavor='lattice')

            for idx, table in enumerate(camelot_tables):
                df = table.df
                if not df.empty and len(df.columns) > 1:
                    # Clean and process the table
                    df = self._clean_dataframe(df)

                    table_obj = ExtractedTable(
                        page_number=table.page,
                        table_index=idx,
                        title=self._detect_table_title(df),
                        headers=list(df.columns),
                        data=df.values.tolist(),
                        table_type=self._classify_table(df),
                        metadata={
                            'extraction_method': 'camelot',
                            'accuracy': table.accuracy,
                            'whitespace': table.whitespace,
                            'shape': df.shape
                        }
                    )
                    tables.append(table_obj)

        except Exception as e:
            print(f"Camelot extraction failed: {str(e)}")

        return tables

    def _extract_text_content(self, pdf_path: str) -> List[ExtractedText]:
        """Extract and structure text content from PDF"""
        text_sections = []

        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    # Extract text with positioning
                    chars = page.chars

                    # Group text by lines and paragraphs
                    lines = self._group_text_by_lines(chars)
                    paragraphs = self._group_lines_into_paragraphs(lines)

                    for para_idx, paragraph in enumerate(paragraphs):
                        section_type = self._classify_text_section(paragraph['text'])

                        text_obj = ExtractedText(
                            page_number=page_num,
                            section_type=section_type,
                            content=paragraph['text'],
                            position=paragraph.get('bbox'),
                            font_info=paragraph.get('font_info')
                        )
                        text_sections.append(text_obj)

        except Exception as e:
            print(f"Text extraction failed: {str(e)}")

        return text_sections

    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean and standardize DataFrame"""
        # Remove completely empty rows and columns
        df = df.dropna(how='all').dropna(axis=1, how='all')

        # Clean headers
        df.columns = [str(col).strip().replace('\n', ' ') for col in df.columns]

        # Clean cell values
        for col in df.columns:
            if df[col].dtype == 'object':
                df[col] = df[col].astype(str).str.strip().str.replace('\n', ' ')

        return df

    def _detect_table_title(self, df: pd.DataFrame) -> str:
        """Detect table title from content or position"""
        # Check first row for title-like content
        if not df.empty:
            first_row = df.iloc[0]
            if len(set(first_row.dropna())) == 1:  # Merged cells indicating title
                return str(first_row.iloc[0])

        # Use column headers to infer title
        headers = ' '.join(df.columns).lower()
        for pattern_name, pattern in self.section_patterns.items():
            if re.search(pattern, headers, re.IGNORECASE):
                return pattern_name.replace('_', ' ').title()

        return "Financial Table"

    def _classify_table(self, df: pd.DataFrame) -> str:
        """Classify table type based on content"""
        content = ' '.join([str(col) for col in df.columns] +
                          [str(val) for val in df.values.flatten() if pd.notna(val)]).lower()

        if re.search(r'balance\s+sheet|assets|liabilities', content):
            return 'balance_sheet'
        elif re.search(r'profit|loss|income|revenue|expense', content):
            return 'income_statement'
        elif re.search(r'cash\s+flow', content):
            return 'cash_flow'
        elif re.search(r'ratio|percentage|%', content):
            return 'ratios'
        elif re.search(r'segment|division|business', content):
            return 'segment_analysis'
        else:
            return 'general_financial'

    def _classify_text_section(self, text: str) -> str:
        """Classify text section type"""
        text_lower = text.lower().strip()

        if len(text) < 50 and any(keyword in text_lower for keyword in ['chairman', 'ceo', 'annual report']):
            return 'header'
        elif re.search(r'^(note|footnote|\d+\.)', text_lower):
            return 'footnote'
        elif len(text) > 500:
            return 'paragraph'
        elif any(pattern_name in text_lower for pattern_name in self.section_patterns.keys()):
            return 'section_title'
        else:
            return 'general_text'

    def _group_text_by_lines(self, chars: List[Dict]) -> List[Dict]:
        """Group characters into lines"""
        if not chars:
            return []

        lines = []
        current_line = []
        current_y = chars[0]['y0']

        for char in chars:
            if abs(char['y0'] - current_y) > 5:  # New line threshold
                if current_line:
                    line_text = ''.join([c['text'] for c in current_line])
                    lines.append({
                        'text': line_text,
                        'bbox': self._get_bbox_from_chars(current_line),
                        'chars': current_line
                    })
                current_line = [char]
                current_y = char['y0']
            else:
                current_line.append(char)

        if current_line:
            line_text = ''.join([c['text'] for c in current_line])
            lines.append({
                'text': line_text,
                'bbox': self._get_bbox_from_chars(current_line),
                'chars': current_line
            })

        return lines

    def _group_lines_into_paragraphs(self, lines: List[Dict]) -> List[Dict]:
        """Group lines into paragraphs"""
        if not lines:
            return []

        paragraphs = []
        current_paragraph = []

        for i, line in enumerate(lines):
            current_paragraph.append(line)

            # Check if this is end of paragraph
            is_end = (i == len(lines) - 1 or
                     abs(lines[i+1]['bbox'][1] - line['bbox'][3]) > 20 or  # Large vertical gap
                     line['text'].strip().endswith('.'))

            if is_end and current_paragraph:
                para_text = ' '.join([l['text'] for l in current_paragraph]).strip()
                if para_text:
                    paragraphs.append({
                        'text': para_text,
                        'bbox': self._get_combined_bbox(current_paragraph),
                        'font_info': self._get_dominant_font_info(current_paragraph)
                    })
                current_paragraph = []

        return paragraphs

    def _get_bbox_from_chars(self, chars: List[Dict]) -> Tuple[float, float, float, float]:
        """Get bounding box from list of characters"""
        if not chars:
            return (0, 0, 0, 0)
        x0 = min(c['x0'] for c in chars)
        y0 = min(c['y0'] for c in chars)
        x1 = max(c['x1'] for c in chars)
        y1 = max(c['y1'] for c in chars)
        return (x0, y0, x1, y1)

    def _get_combined_bbox(self, elements: List[Dict]) -> Tuple[float, float, float, float]:
        """Get combined bounding box from list of elements"""
        if not elements:
            return (0, 0, 0, 0)
        bboxes = [e['bbox'] for e in elements]
        x0 = min(bbox[0] for bbox in bboxes)
        y0 = min(bbox[1] for bbox in bboxes)
        x1 = max(bbox[2] for bbox in bboxes)
        y1 = max(bbox[3] for bbox in bboxes)
        return (x0, y0, x1, y1)

    def _get_dominant_font_info(self, elements: List[Dict]) -> Dict[str, Any]:
        """Get dominant font information from elements"""
        font_sizes = []
        font_names = []

        for element in elements:
            for char in element.get('chars', []):
                font_sizes.append(char.get('size', 12))
                font_names.append(char.get('fontname', 'unknown'))

        if font_sizes:
            return {
                'dominant_size': max(set(font_sizes), key=font_sizes.count),
                'dominant_font': max(set(font_names), key=font_names.count),
                'avg_size': np.mean(font_sizes)
            }
        return {}

    def _combine_and_deduplicate_tables(self, *table_lists) -> List[ExtractedTable]:
        """Combine tables from different extraction methods and remove duplicates"""
        all_tables = []
        for table_list in table_lists:
            all_tables.extend(table_list)

        # Simple deduplication based on content similarity
        unique_tables = []
        for table in all_tables:
            is_duplicate = False
            for existing_table in unique_tables:
                if self._are_tables_similar(table, existing_table):
                    # Keep the one with better metadata/accuracy
                    if table.metadata.get('accuracy', 0) > existing_table.metadata.get('accuracy', 0):
                        unique_tables.remove(existing_table)
                        unique_tables.append(table)
                    is_duplicate = True
                    break

            if not is_duplicate:
                unique_tables.append(table)

        return unique_tables

    def _are_tables_similar(self, table1: ExtractedTable, table2: ExtractedTable, threshold: float = 0.8) -> bool:
        """Check if two tables are similar (potential duplicates)"""
        # Compare dimensions
        if len(table1.headers) != len(table2.headers) or len(table1.data) != len(table2.data):
            return False

        # Compare headers
        header_matches = sum(1 for h1, h2 in zip(table1.headers, table2.headers)
                           if h1.lower().strip() == h2.lower().strip())
        header_similarity = header_matches / len(table1.headers)

        return header_similarity >= threshold

    def _extract_key_metrics(self, text_sections: List[ExtractedText], tables: List[ExtractedTable]) -> Dict[str, Any]:
        """Extract key financial metrics from text and tables"""
        metrics = {}

        # Extract from text
        for section in text_sections:
            content = section.content.lower()
            for keyword in self.financial_keywords:
                if keyword in content:
                    # Look for numerical values near keywords
                    pattern = rf'{keyword}[\s:]*([₹\$]?\s*[\d,]+\.?\d*)'
                    matches = re.findall(pattern, content, re.IGNORECASE)
                    if matches:
                        metrics[f"{keyword}_from_text"] = matches

        # Extract from tables
        for table in tables:
            if table.table_type in ['balance_sheet', 'income_statement', 'ratios']:
                # Extract key figures from financial tables
                for row in table.data:
                    if len(row) >= 2:
                        key = str(row[0]).strip().lower()
                        for keyword in self.financial_keywords:
                            if keyword in key:
                                metrics[f"{keyword}_from_table"] = row[1:]

        return metrics

    def _extract_document_metadata(self, pdf_path: str, text_sections: List[ExtractedText]) -> Dict[str, Any]:
        """Extract document metadata"""
        metadata = {
            'source_file': pdf_path,
            'total_pages': 0,
            'company_name': 'ICICI',
            'document_type': 'Financial Report',
            'report_period': 'Unknown'
        }

        # Extract from text sections
        all_text = ' '.join([section.content for section in text_sections[:5]])  # First few sections

        # Extract report period
        date_patterns = [
            r'for\s+the\s+year\s+ended\s+(\w+\s+\d{1,2},?\s+\d{4})',
            r'(\w+\s+\d{4})\s+to\s+(\w+\s+\d{4})',
            r'FY\s*(\d{4})',
            r'(\d{4}-\d{2,4})'
        ]

        for pattern in date_patterns:
            match = re.search(pattern, all_text, re.IGNORECASE)
            if match:
                metadata['report_period'] = match.group(1)
                break

        return metadata

    def save_to_json(self, document: FinancialDocument, output_path: str):
        """Save extracted data to JSON for RAG applications"""
        # Convert dataclasses to dict
        doc_dict = asdict(document)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(doc_dict, f, indent=2, ensure_ascii=False, default=str)


    def create_rag_chunks(self, document: FinancialDocument) -> List[Dict[str, Any]]:
        """Create chunks optimized for RAG applications"""
        chunks = []

        # Create chunks from tables
        for table in document.tables:
            chunk = {
                'id': f"table_{table.page_number}_{table.table_index}",
                'type': 'table',
                'content': f"Table: {table.title}\n" +
                          f"Headers: {', '.join(table.headers)}\n" +
                          f"Data: {json.dumps(table.data)}",
                'metadata': {
                    'page_number': table.page_number,
                    'table_type': table.table_type,
                    'company': document.company_name,
                    'report_period': document.report_period
                }
            }
            chunks.append(chunk)

        # Create chunks from text sections
        for i, section in enumerate(document.text_sections):
            if len(section.content.strip()) > 50:  # Only meaningful content
                chunk = {
                    'id': f"text_{section.page_number}_{i}",
                    'type': 'text',
                    'content': section.content,
                    'metadata': {
                        'page_number': section.page_number,
                        'section_type': section.section_type,
                        'company': document.company_name,
                        'report_period': document.report_period
                    }
                }
                chunks.append(chunk)

        # Create chunks from key metrics
        if document.key_metrics:
            chunk = {
                'id': 'key_metrics',
                'type': 'metrics',
                'content': f"Key Financial Metrics for {document.company_name}:\n" +
                          json.dumps(document.key_metrics, indent=2),
                'metadata': {
                    'company': document.company_name,
                    'report_period': document.report_period,
                    'metric_count': len(document.key_metrics)
                }
            }
            chunks.append(chunk)

        return chunks

In [None]:
def create_raw_data(pdf_folder):
    """
    Wrapper around ICICIFinanceReportExtractor
        Input  -> pdf_folder (path containing PDFs)
        Output -> list of (extracted_text, file_name) tuples
    """
    extractor = ICICIFinanceReportExtractor()
    pdf_files = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')]

    results = []
    for pdf_path in pdf_files:
        file_name = os.path.splitext(os.path.basename(pdf_path))[0]

        try:
            # Run new extraction
            document = extractor.extract_from_pdf(pdf_path)

            # Build combined text (like old code)
            combined_text = ""

            # Add text sections
            for section in document.text_sections:
                if section.content:
                    combined_text += section.content + "\n"

            # Add tables
            for table in document.tables:
                table_text = "\n".join(
                    ["\t".join(map(str, row)) for row in table.data if row]
                )
                combined_text += f"\n[TABLE]\n{table_text}\n[/TABLE]\n"

            # Clean up spaces/newlines
            combined_text = re.sub(r"\s+", " ", combined_text.strip())

            results.append((combined_text, file_name))

        except Exception as e:
            print(f"Failed to process {pdf_path}: {e}")

        del document

    return results


raw_data = create_raw_data("/content/drive/MyDrive/Assignment CAI")


# cleaning
gc.collect()

Starting extraction from /content/drive/MyDrive/Assignment CAI/2023.pdf
Extraction completed. Found 200 tables and 9126 text sections
Starting extraction from /content/drive/MyDrive/Assignment CAI/2024.pdf
Extraction completed. Found 208 tables and 10028 text sections


1351443

In [None]:
# ---------------- For Fine tuning ----------------
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
OUTPUT_DIR = "llama2-7b-chat-qlora-adapter"
os.makedirs(OUTPUT_DIR, exist_ok=True)

data = [
{
"question": "What were ICICI Bank's total standalone assets as of March 31, 2024?",
"answer": "₹ 18,715,145,766 (in '000s). "
},
{
"question": "What were ICICI Bank's total consolidated assets as of March 31, 2024?",
"answer": "₹ 23,640,630,275 (in '000s). "
},
{
"question": "What were standalone deposits at March 31, 2024?",
"answer": "₹ 14,128,249,513 (in '000s). "
},
{
"question": "What was the year over year growth in standalone deposits from March 31, 2023 to March 31, 2024?",
"answer": "Up ~19.65% (from ₹ 11,808,406,972 to ₹ 14,128,249,513; in '000s). "
},
{
"question": "What were consolidated deposits at March 31, 2024?",
"answer": "₹ 14,435,799,524 (in '000s). "
},
{
"question": "By what percentage did consolidated assets grow year over year to March 31, 2024?",
"answer": "About 20.71% (from ₹ 19,584,904,970 to ₹ 23,640,630,275; in '000s). "
},
{
"question": "What were standalone advances as of March 31, 2024?",
"answer": "₹ 11,844,063,894 (in '000s). "
},
{
"question": "What was the year over year growth in standalone advances to March 31, 2024?",
"answer": "Up ~16.16% (from ₹ 10,196,383,053 to ₹ 11,844,063,894; in '000s). "
},
{
"question": "What were consolidated advances at March 31, 2024?",
"answer": "₹ 12,607,762,029 (in '000s). "
},
{
"question": "What were standalone investments at March 31, 2024?",
"answer": "₹ 4,619,422,722 (in '000s). "
},
{
"question": "How much did standalone investments grow year over year to March 31, 2024?",
"answer": "Up ~27.49% (from ₹ 3,623,297,355 to ₹ 4,619,422,722; in '000s). "
},
{
"question": "What were consolidated investments at March 31, 2024?",
"answer": "₹ 8,271,625,050 (in '000s). "
},
{
"question": "What were standalone total income and net profit for FY2023 24?",
"answer": "Total income ₹ 1,658,487,109 and net profit ₹ 408,882,694 (both in '000s). "
},
{
"question": "What was the year over year growth in standalone total income for FY2023 24?",
"answer": "Up ~28.50% (from ₹ 1,290,627,859 to ₹ 1,658,487,109; in '000s). "
},
{
"question": "What were consolidated total income and net profit after minority interest for FY2023 24?",
"answer": "Total income ₹ 2,360,377,272 and net profit after minority interest ₹ 442,563,735 (both in '000s). "
},
{
"question": "How did consolidated net profit after minority interest change year over year in FY2023 24?",
"answer": "Up ~30.03% (from ₹ 340,366,408 to ₹ 442,563,735; in '000s). "
},
{
"question": "What was standalone interest earned in FY2023 24?",
"answer": "₹ 1,428,909,420 (in '000s). "
},
{
"question": "What was the growth in standalone interest earned in FY2023 24?",
"answer": "Up ~30.82% year over year (₹ 1,428,909,420 vs ₹ 1,092,313,380; in '000s). "
},
{
"question": "What was standalone interest expended in FY2023 24?",
"answer": "₹ 685,852,236 (in '000s). "
},
{
"question": "By what percentage did consolidated interest expended rise in FY2023 24?",
"answer": "About 46.62% year over year (₹ 741,081,627 vs ₹ 505,433,879; in '000s). "
},
{
"question": "What was standalone other income in FY2023 24?",
"answer": "₹ 229,577,689 (in '000s). "
},
{
"question": "What was the year over year growth in standalone other income for FY2023 24?",
"answer": "Up ~15.76% (from ₹ 198,314,479 to ₹ 229,577,689; in '000s). "
},
{
"question": "What were standalone operating expenses in FY2023 24?",
"answer": "₹ 391,327,336 (in '000s). "
},
{
"question": "By how much did standalone operating expenses grow in FY2023 24?",
"answer": "Up ~19.04% year over year (₹ 391,327,336 vs ₹ 328,732,391; in '000s). "
},
{
"question": "What was standalone net profit for FY2023 24?",
"answer": "₹ 408,882,694 (in '000s). "
},
{
"question": "What was the year over year growth in standalone net profit in FY2023 24?",
"answer": "Up ~28.19% (₹ 408,882,694 vs ₹ 318,964,962; in '000s). "
},
{
"question": "What was basic earnings per share (EPS) in FY2023 24 on a standalone basis?",
"answer": "₹ 58.38 per share. "
},
{
"question": "What was diluted EPS in FY2023 24 on a standalone basis?",
"answer": "₹ 57.33 per share. "
},
{
"question": "What were standalone EPS figures for FY2022 23?",
"answer": "Basic ₹ 45.79; Diluted ₹ 44.89. "
},
{
"question": "What is the face value per share?",
"answer": "₹ 2.00 per share. "
},
{
"question": "How much was transferred to the Statutory Reserve in FY2023 24 (standalone)?",
"answer": "₹ 102,221,000 (in '000s). "
},
{
"question": "What dividend amount was paid during FY2023 24 (standalone)?",
"answer": "₹ 55,985,964 (in '000s). "
},
{
"question": "How much was transferred to the Special Reserve in FY2023 24 (standalone)?",
"answer": "₹ 30,208,000 (in '000s). "
},
{
"question": "Did the Bank transfer any amount to 'Revenue and other reserves' in FY2023 24 (standalone)?",
"answer": "No. It transferred ₹ 50,000,000 (in '000s) in FY2022 23 but nil in FY2023 24. "
},
{
"question": "What were 'Employees stock options outstanding' on the standalone balance sheet at March 31, 2024?",
"answer": "₹ 14,053,180 (in '000s). "
},
{
"question": "What was standalone share capital at March 31, 2024?",
"answer": "₹ 14,046,790 (in '000s). "
},
{
"question": "What were standalone reserves and surplus at March 31, 2024?",
"answer": "₹ 2,355,893,246 (in '000s). "
},
{
"question": "What were standalone borrowings at March 31, 2024?",
"answer": "₹ 1,249,675,779 (in '000s). "
},
{
"question": "What were consolidated 'liabilities on policies in force' as of March 31, 2024?",
"answer": "₹ 2,813,183,300 (in '000s). "
},
{
"question": "What were standalone contingent liabilities at March 31, 2024?",
"answer": "₹ 46,557,617,752 (in '000s). "
},
{
"question": "What was the notional amount of outstanding forward exchange contracts (standalone) at March 31, 2024?",
"answer": "₹ 15,600,221,876 (in '000s). "
},
{
"question": "What was the notional of currency swaps (standalone) at March 31, 2024?",
"answer": "₹ 541,254,033 (in '000s). "
},
{
"question": "What was the notional for interest rate swaps, currency options and interest rate futures (standalone) at March 31, 2024?",
"answer": "₹ 28,197,214,343 (in '000s). "
},
{
"question": "How much were guarantees given on behalf of constituents in India (standalone) at March 31, 2024?",
"answer": "₹ 1,374,917,331 (in '000s). "
},
{
"question": "How much were guarantees given outside India (standalone) at March 31, 2024?",
"answer": "₹ 118,731,736 (in '000s). "
},
{
"question": "What were acceptances, endorsements and other obligations (standalone) at March 31, 2024?",
"answer": "₹ 520,724,381 (in '000s). "
},
{
"question": "What were 'other items' under contingent liabilities (standalone) at March 31, 2024?",
"answer": "₹ 111,167,877 (in '000s). "
},
{
"question": "How much were 'claims not acknowledged as debts' (standalone) at March 31, 2024?",
"answer": "₹ 93,293,080 (in '000s). "
},
{
"question": "What was the liability for partly paid investments (standalone) at March 31, 2024?",
"answer": "₹ 93,095 (in '000s). "
},
{
"question": "What were cash and balances with RBI (standalone) at March 31, 2024?",
"answer": "₹ 897,116,960 (in '000s). "
},
{
"question": "What were balances with banks and money at call and short notice (standalone) at March 31, 2024?",
"answer": "₹ 502,143,120 (in '000s). "
},
{
"question": "Under advances (standalone), how much were bills purchased and discounted at March 31, 2024?",
"answer": "₹ 495,231,226 (in '000s). "
},
{
"question": "What was the amount of cash credits, overdrafts and loans repayable on demand (standalone) at March 31, 2024?",
"answer": "₹ 3,438,535,695 (in '000s). "
},
{
"question": "What were standalone term loans outstanding at March 31, 2024?",
"answer": "₹ 7,910,296,973 (in '000s). "
},
{
"question": "What share of standalone advances were unsecured at March 31, 2024?",
"answer": "About 29.12% (₹ 3,448,642,432 of ₹ 11,844,063,894; in '000s). "
},
{
"question": "How were standalone advances split between India and outside India at March 31, 2024?",
"answer": "In India: ₹ 11,509,556,801; Outside India: ₹ 334,507,093 (both in '000s). "
},
{
"question": "What was the deposit in the Rural Infrastructure and Development Fund (RIDF) at March 31, 2024 (standalone)?",
"answer": "₹ 200,918,559 (in '000s). "
},
{
"question": "What was the net deferred tax asset at March 31, 2024 (standalone)?",
"answer": "₹ 59,546,321 (in '000s). "
},
{
"question": "What were unrealised gains on foreign exchange and derivative contracts at March 31, 2024 (standalone)?",
"answer": "₹ 160,771,101 (in '000s). "
},
{
"question": "How much interest was accrued at March 31, 2024 (standalone)?",
"answer": "₹ 158,626,876 (in '000s). "
},
{
"question": "What were total 'other assets' at March 31, 2024 (standalone)?",
"answer": "₹ 743,800,667 (in '000s). "
},
{
"question": "What were total fixed assets at March 31, 2024 (standalone)?",
"answer": "₹ 108,598,403 (in '000s). "
},
{
"question": "What was the net block of 'Premises' at March 31, 2024 (standalone)?",
"answer": "₹ 60,829,270 (in '000s). "
},
{
"question": "What was the net block of 'Other fixed assets' at March 31, 2024 (standalone)?",
"answer": "₹ 44,902,906 (in '000s). "
},
{
"question": "What was the net block of lease assets at March 31, 2024 (standalone)?",
"answer": "₹ 2,866,227 (in '000s). "
},
{
"question": "What was the revaluation gain recognised on premises in FY2023 24 (standalone)?",
"answer": "₹ 1,174.5 million; depreciation on revaluation was ₹ 806.9 million. "
},
{
"question": "Were any assets held for sale disclosed at March 31, 2024 (standalone)?",
"answer": "Yes, ₹ 8.8 million of assets were held for sale. "
},
{
"question": "What percentage of ICICI Bank's shareholding was held by the Government of India at March 31, 2024?",
"answer": "0.22%. "
},
{
"question": "What was the CET1 CRAR at March 31, 2024?",
"answer": "15.60%. "
},
{
"question": "What was the Tier 1 CRAR at March 31, 2024?",
"answer": "15.60%. "
},
{
"question": "What was the total CRAR at March 31, 2024?",
"answer": "16.33%. "
},
{
"question": "What was the leverage ratio at March 31, 2024?",
"answer": "9.79%. "
},
{
"question": "What were total Risk Weighted Assets (RWAs) at March 31, 2024?",
"answer": "₹ 13,727,616.7 million. "
},
{
"question": "What was CET1 capital (₹ million) at March 31, 2024?",
"answer": "₹ 2,142,170.4 million. "
},
{
"question": "What was Tier 2 capital (₹ million) at March 31, 2024?",
"answer": "₹ 100,104.4 million. "
},
{
"question": "Was there any Additional Tier 1 (AT1) capital outstanding at March 31, 2024?",
"answer": "No; AT1 was nil at March 31, 2024 (₹ 51,400.0 million at March 31, 2023). "
},
{
"question": "How much equity capital was raised via ESOP exercises in FY2023 24?",
"answer": "₹ 12,285.2 million. "
},
{
"question": "What was the net interest margin (NIM) for FY2023 24?",
"answer": "4.53%. "
},
{
"question": "What was the return on assets (RoA) in FY2023 24?",
"answer": "2.37%. "
},
{
"question": "What was the cost of deposits in FY2023 24?",
"answer": "4.61%. "
},
{
"question": "What was 'interest income to working funds' in FY2023 24?",
"answer": "8.29%. "
},
{
"question": "What was 'non interest income to working funds' in FY2023 24?",
"answer": "1.33%. "
},
{
"question": "What was net profit per employee in FY2023 24?",
"answer": "₹ 2.9 million. "
},
{
"question": "What was business per employee (average deposits plus average advances) in FY2023 24?",
"answer": "₹ 168.4 million. "
},
{
"question": "What was the Liquidity Coverage Ratio (LCR) for the quarter ended March 31, 2024?",
"answer": "122.84%. "
},
{
"question": "What was the total high quality liquid assets (HQLA) used in the LCR calculation for the quarter ended March 31, 2024?",
"answer": "₹ 3,940,112.5 million. "
},
{
"question": "What was the LCR for the quarter ended March 31, 2023 (for comparison)?",
"answer": "124.13%. "
},
{
"question": "Who manages the Bank's liquidity and what is the governance structure?",
"answer": "Liquidity is managed by the Asset Liability Management Group (ALMG) under the oversight of the Asset Liability Management Committee (ALCO); ALMG India manages domestic liquidity while overseas branches follow a decentralised day to day approach with centralised long term funding. "
},
{
"question": "What were the contributions of key liability products to total liabilities at March 31, 2024?",
"answer": "Term deposits 43.65%, savings deposits 21.49%, current account deposits 10.34%, and bond borrowings 3.33%. "
},
{
"question": "What share of total deposits was held by the top 20 depositors at March 31, 2024?",
"answer": "3.44%. "
},
{
"question": "What portion of total liabilities came from significant counterparties' borrowings at March 31, 2024?",
"answer": "1.43% of total liabilities. "
},
{
"question": "What share of weighted cash outflows in Q4 FY2024 came from unsecured wholesale funding?",
"answer": "62.08% of total weighted cash outflows. "
},
{
"question": "What were the shares of retail deposits and contingent funding obligations in weighted cash outflows (Q4 FY2024)?",
"answer": "Retail deposits 17.99%; other contingent funding obligations 7.66%. "
},
{
"question": "What was the FALLCR carve out (government securities available for LCR/MSF) amount at March 31, 2024?",
"answer": "₹ 3,538,601.0 million (vs ₹ 2,753,045.5 million at March 31, 2023). "
},
{
"question": "What were cash and balances with central banks used for LCR at March 31, 2024?",
"answer": "₹ 215,857.4 million (vs ₹ 320,660.8 million at March 31, 2023). "
},
{
"question": "What were average Level 2 assets at March 31, 2024?",
"answer": "₹ 146,666.4 million (vs ₹ 127,857.7 million at March 31, 2023). "
},
{
"question": "How much was transferred to the Depositor Education and Awareness Fund (DEAF) during FY2023 24 and what was the closing balance?",
"answer": "Transfers during the year ₹ 2,266.4 million; closing balance ₹ 17,696.3 million. "
},
{
"question": "What was the DICGC insurance premium paid in FY2023 24?",
"answer": "₹ 14,532.6 million (excluding GST). "
},
{
"question": "Were any amounts due to MSME suppliers outstanding at March 31, 2024?",
"answer": "No; nil principal and nil interest were outstanding. "
},
{
"question": "What was the gross NPA ratio at March 31, 2024?",
"answer": "2.26% of gross advances. "
},
{
"question": "What was the net NPA ratio at March 31, 2024?",
"answer": "0.45% of net advances. "
},
{
"question": "What was the provision coverage ratio at March 31, 2024?",
"answer": "80.3%. "
},
{
"question": "What were general provisions on standard assets at March 31, 2024?",
"answer": "₹ 58,631.6 million (₹ 47,022.4 million at March 31, 2023). "
},
{
"question": "What percentage of total exposure was to the top 20 borrowers/customers at March 31, 2024?",
"answer": "8.86% of total exposure. "
},
{
"question": "What was the total exposure to the top 20 NPA accounts at March 31, 2024?",
"answer": "₹ 115,431.9 million, representing 34.4% of total gross NPAs. "
},
{
"question": "What was the Bank's direct exposure to the real estate sector at March 31, 2024 and its key components?",
"answer": "Direct exposure ₹ 5,115,338.0 million; of which residential mortgages ₹ 3,898,373.6 million and commercial real estate ₹ 1,152,820.6 million. "
},
{
"question": "What was the total exposure to capital markets at March 31, 2024?",
"answer": "₹ 478,451.3 million (including exposures to stockbrokers). "
},
{
"question": "What was 'interest/discount on advances/bills' in FY2023 24 (standalone)?",
"answer": "₹ 1,109,439,334 (in '000s). "
},
{
"question": "What was 'income on investments' in FY2023 24 (standalone)?",
"answer": "₹ 286,309,911 (in '000s). "
},
{
"question": "How much interest was earned on RBI/other inter bank balances in FY2023 24 (standalone)?",
"answer": "₹ 17,913,925 (in '000s). "
},
{
"question": "How much of 'interest earned – others' in FY2023 24 related to income tax refunds?",
"answer": "₹ 2,650.1 million. "
},
{
"question": "What were 'payments to and provisions for employees' in FY2023 24 (standalone)?",
"answer": "₹ 151,419,918 (in '000s). "
},
{
"question": "What were 'rent, taxes and lighting' expenses in FY2023 24 (standalone)?",
"answer": "₹ 15,335,067 (in '000s). "
},
{
"question": "What were 'printing and stationery' expenses in FY2023 24 (standalone)?",
"answer": "₹ 3,332,210 (in '000s). "
},
{
"question": "What were 'interest on deposits' in FY2023 24 (standalone)?",
"answer": "₹ 578,574,729 (in '000s). "
},
{
"question": "What were 'interest on RBI/inter bank borrowings' in FY2023 24 (standalone)?",
"answer": "₹ 25,256,684 (in '000s). "
},
{
"question": "What were 'other' interest expenses in FY2023 24 (standalone)?",
"answer": "₹ 82,020,823 (in '000s). "
},
{
"question": "What were 'commission, exchange and brokerage' income in FY2023 24 (standalone)?",
"answer": "₹ 168,752,999 (in '000s). "
},
{
"question": "What was net profit on sale of investments in FY2023 24 (standalone)?",
"answer": "₹ 7,079,897 (in '000s). "
},
{
"question": "What was net profit on revaluation of investments in FY2023 24 (standalone)?",
"answer": "₹ 1,049,387 (in '000s). "
},
{
"question": "How much income was earned as dividends from subsidiaries/joint ventures in FY2023 24 (standalone)?",
"answer": "₹ 20,729,074 (in '000s). "
},
{
"question": "What were standalone bills for collection at March 31, 2024, and how did they change year over year?",
"answer": "₹ 1,007,917,603 (in '000s), up ~16.58% from ₹ 864,547,740. "
},
{
"question": "What were consolidated bills for collection at March 31, 2024?",
"answer": "₹ 1,007,917,603 (in '000s). "
},
{
"question": "What were consolidated contingent liabilities at March 31, 2024?",
"answer": "₹ 57,578,163,337 (in '000s). "
},
{
"question": "What was consolidated interest earned and interest expended in FY2023 24?",
"answer": "Interest earned ₹ 1,595,159,252; interest expended ₹ 741,081,627 (both in '000s). "
},
{
"question": "What were consolidated operating expenses and provisions in FY2023 24?",
"answer": "Operating expenses ₹ 977,827,922; provisions and contingencies ₹ 191,400,276 (both in '000s). "
},
{
"question": "What were consolidated advances and investments at March 31, 2024 and their year over year growth?",
"answer": "Advances ₹ 12,607,762,029 (+~16.32% YoY); Investments ₹ 8,271,625,050 (+~29.33% YoY), all in '000s. "
},
{
"question": "How did consolidated deposits change year over year to March 31, 2024?",
"answer": "Up ~19.22% to ₹ 14,435,799,524 (in '000s). "
},
{
"question": "What was the quarterly average LCR range across FY2023 24?",
"answer": "Quarterly average LCRs were ~120.62%–124.13% (Mar 24 quarter at 122.84%). "
},
{
"question": "What were the top categories of cash outflows driving LCR in Q4 FY2024?",
"answer": "Unsecured wholesale funding was the primary driver at 62.08% of weighted outflows. "
},
{
"question": "How much was recognised as 'goodwill on consolidation' in the consolidated balance sheet at March 31, 2024?",
"answer": "₹ 24,741,619 (in '000s). "
},
{
"question": "What was minority interest on the consolidated balance sheet at March 31, 2024?",
"answer": "₹ 138,884,162 (in '000s). "
},
{
"question": "What was the composition of standalone investments between India and outside India at March 31, 2024?",
"answer": "Net investments in India ₹ 4,542,685,666; outside India ₹ 76,737,056 (both in '000s). "
},
{
"question": "What was the composition of consolidated investments between India and outside India at March 31, 2024?",
"answer": "Net investments in India ₹ 8,129,506,010; outside India ₹ 142,119,040 (both in '000s). "
},
{
"question": "What were priority sector advances (standalone) at March 31, 2024?",
"answer": "₹ 3,739,060,521 (in '000s). "
},
{
"question": "What were public sector advances (standalone) at March 31, 2024?",
"answer": "₹ 510,801,139 (in '000s). "
},
{
"question": "What were the amounts of standalone advances outside India by category at March 31, 2024?",
"answer": "Bills purchased and discounted ₹ 112,888,198; syndicated & term loans ₹ 107,091,606; others ₹ 114,527,289 (all in '000s). "
},
{
"question": "How did 'bills purchased and discounted' change year over year (standalone)?",
"answer": "Slightly lower by ~0.11% (₹ 495,231,226 vs ₹ 495,756,534; in '000s). "
},
{
"question": "What was the year over year growth in cash credits/overdrafts and loans repayable on demand (standalone)?",
"answer": "Up ~22.81% (₹ 3,438,535,695 vs ₹ 2,799,818,550; in '000s). "
},
{
"question": "What was the year over year growth in term loans (standalone)?",
"answer": "Up ~14.63% (₹ 7,910,296,973 vs ₹ 6,900,807,969; in '000s). "
},
{
"question": "What were 'claims against the Bank not acknowledged as debts' and 'other items' in contingent liabilities (standalone) at March 31, 2024?",
"answer": "Claims not acknowledged ₹ 93,293,080; other items ₹ 111,167,877 (both in '000s). "
},
{
"question": "What was disclosed about Letters of Comfort (LoCs) and their financial impact at March 31, 2024?",
"answer": "Aggregate LoCs of ₹ 1,689.5 million were included in contingent liabilities; management noted no financial impact from these LoCs at March 31, 2024. "
},
{
"question": "What was the average number of observations used for LCR computation in the quarter ended March 31, 2024?",
"answer": "60 daily observations. "
},
{
"question": "What were the Bank's business/information ratios for FY2023 24 (summary)?",
"answer": "Interest income to working funds 8.29%, non interest income to working funds 1.33%, NIM 4.53%, RoA 2.37%. "
},
{
"question": "What were the consolidated 'reserves and surplus' as of March 31, 2024?",
"answer": "₹ 2,533,338,376 (in '000s). "
},
{
"question": "What were consolidated borrowings as of March 31, 2024?",
"answer": "₹ 2,074,280,008 (in '000s). "
},
{
"question": "What were consolidated cash and balances with RBI as of March 31, 2024?",
"answer": "₹ 899,430,231 (in '000s). "
},
{
"question": "What were consolidated balances with banks and money at call/short notice as of March 31, 2024?",
"answer": "₹ 728,258,795 (in '000s). "
}
]

SYSTEM_PROMPT = "You are a helpful finance assistant that answers accurately and concisely."

### Retrieval-Augmented Generation (RAG) System Implementation with Advanced technique - Adaptive chunking

In [None]:
def preprocess_text_with_metadata(text, source, chunk_size=500, chunk_overlap=50):
    """
    Splits text into chunks with metadata.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_text(text)
    return [{"text": chunk, "metadata": {"source": source}} for chunk in chunks]


def adaptive_chunking(raw_data, user_query):
    """
    Dynamically adapt chunk sizes based on the user query length.
    Longer query → smaller chunk (granular).
    Shorter query → larger chunk (broad context).
    """
    query_length = len(user_query.split())

    if query_length <= 5:
        chunk_size = 1024
    elif query_length <= 15:
        chunk_size = 768
    else:
        chunk_size = 512

    chunk_overlap = int(chunk_size * 0.1)

    all_chunks = []
    for text, file_name in raw_data:
        chunks = preprocess_text_with_metadata(text, file_name, chunk_size, chunk_overlap)
        all_chunks.extend(chunks)

    return all_chunks


def filter_chunks_by_metadata(all_chunks, user_query):
    """
    Filters chunks based on metadata heuristics extracted from user query.
    """
    filtered = all_chunks

    year_match = re.search(r"(20\d{2})", user_query)
    if year_match:
        year = year_match.group(1)
        filtered = [chunk for chunk in filtered if year in chunk["metadata"]["source"]]


    return filtered if filtered else all_chunks


def embedding_generation(all_chunks, dense_model):
    """
    Generates dense and sparse embeddings for chunks.
    """
    # Extract texts and metadata
    chunk_texts = [chunk["text"] for chunk in all_chunks]
    metadatas = [chunk["metadata"] for chunk in all_chunks]

    # Dense embeddings
    dense_embeddings = dense_model.encode(chunk_texts, convert_to_numpy=True, show_progress_bar=False)

    # Sparse embeddings
    tfidf_vectorizer = TfidfVectorizer()
    sparse_embeddings = tfidf_vectorizer.fit_transform(chunk_texts)

    # In-memory Chroma
    chroma_client = chromadb.Client(chromadb.config.Settings(anonymized_telemetry=False))

    # Delete old collection if exists
    if "financial_docs" in [c.name for c in chroma_client.list_collections()]:
        chroma_client.delete_collection("financial_docs")

    collection = chroma_client.create_collection(name="financial_docs")

    for i, (text, meta, embedding) in enumerate(zip(chunk_texts, metadatas, dense_embeddings)):
        collection.add(
            documents=[text],
            embeddings=[embedding.tolist()],
            metadatas=[meta],
            ids=[str(i)]
        )

    return tfidf_vectorizer, sparse_embeddings, collection, metadatas, chunk_texts


def hybrid_retrieval(tfidf_vectorizer, sparse_embeddings, collection, metadatas, chunk_texts, query, top_k=5, alpha=0.5):
    """
    Performs hybrid retrieval using dense and sparse embeddings.
    """
    dense_query_emb = dense_model.encode([query], convert_to_numpy=True)
    sparse_query_emb = tfidf_vectorizer.transform([query])

    # Dense similarity from Chroma
    dense_results = collection.query(
        query_embeddings=dense_query_emb.tolist(),
        n_results=len(chunk_texts),
        include=["documents", "metadatas", "distances", "embeddings"]
    )

    dense_sims = 1 - np.array(dense_results["distances"][0])  # cosine similarity
    sparse_sims = cosine_similarity(sparse_query_emb, sparse_embeddings).flatten()

    # Normalize
    dense_norm = (dense_sims - dense_sims.min()) / (dense_sims.max() - dense_sims.min() + 1e-8)
    sparse_norm = (sparse_sims - sparse_sims.min()) / (sparse_sims.max() - sparse_sims.min() + 1e-8)

    hybrid_scores = alpha * dense_norm + (1 - alpha) * sparse_norm

    top_indices = np.argsort(hybrid_scores)[::-1][:top_k]

    results = []
    for idx in top_indices:
        results.append({
            "text": chunk_texts[idx],
            "metadata": metadatas[idx],
        })
    return results


def generate_answer(query, retrieved_chunks):
    """
    Generates an answer based on retrieved chunks.
    """
    context = "\n".join([chunk["text"] for chunk in retrieved_chunks])

    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that answers questions using the provided context only. Don't use your knowledge to answer the question. If you don't know the answer just say I don't know"
        },
        {
            "role": "user",
            "content": f"Context:\n{context}\n\nQuestion:\n{query}"
        }
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(base_model.device)

    outputs = base_model.generate(**inputs, max_new_tokens=300)
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

    return response.strip()


# ---------------- RAG Pipeline ----------------
dense_model = SentenceTransformer("all-MiniLM-L6-v2")

def rag_response(raw_data, user_query):
    """
    RAG pipeline for generating responses.
    """
    all_chunks = adaptive_chunking(raw_data, user_query)

    filtered_chunks = filter_chunks_by_metadata(all_chunks, user_query)

    tfidf_vectorizer, sparse_embeddings, collection, metadatas, chunk_texts = embedding_generation(filtered_chunks, dense_model)

    retrieved_chunks = hybrid_retrieval(tfidf_vectorizer, sparse_embeddings, collection, metadatas, chunk_texts, user_query, top_k=5, alpha=0.6)

    answer = generate_answer(user_query, retrieved_chunks)

    del collection
    return answer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

###  Fine-Tuned Model System Implementation with Advanced technique - Adapter-Based Parameter-Efficient Tuning

In [None]:
def format_chat_example(q, a):
    # Instruction + model should generate answer
    return (
        f"<s>[INST] <<SYS>>\n{SYSTEM_PROMPT}\n<</SYS>>\n\n{q.strip()}\n[/INST] {a.strip()}"
    )


formatted_rows = [{"text": format_chat_example(ex["question"], ex["answer"])}
                  for ex in data]

ds = Dataset.from_list(formatted_rows)

#  4-bit quantization (QLoRA) config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8 else torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load tokenizer & base model in 4-bit
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
model.config.use_cache = False

# LoRA setup
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","down_proj","up_proj"],
)

# Training config
train_conf = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    logging_steps=5,
    save_steps=0,
    save_total_limit=1,
    bf16=(torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8),
    fp16=not (torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8),
    optim="paged_adamw_32bit",
    max_length=512,
    packing=False,
    report_to="none"
)

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=ds,
    peft_config=peft_config,
    args=train_conf,
)

# Train

trainer.train()

# Save LoRA adapter + tokenizer

trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Adapter saved to: {OUTPUT_DIR}")

del trainer, model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Adding EOS to train dataset:   0%|          | 0/148 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/148 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/148 [00:00<?, ? examples/s]

Step,Training Loss
5,4.1883
10,3.1167
15,1.8708
20,1.1907
25,0.8781
30,0.7611
35,0.6657
40,0.603
45,0.5822
50,0.5268


Adapter saved to: llama2-7b-chat-qlora-adapter


### Fine tuned model inferencing

In [None]:
# reload the base model
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)

# Adding Adapter
infer_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
infer_tokenizer = tokenizer

def fine_tune_response(prompt_user):
    """
    Use fine tuned model to generate the response
    """
    prompt = (
        "<s>[INST] <<SYS>>\n"
        f"{SYSTEM_PROMPT}\n"
        "<</SYS>>\n\n"
        f"{prompt_user}\n[/INST]"
    )
    inputs = infer_tokenizer([prompt], return_tensors="pt").to(infer_model.device)

    with torch.inference_mode():
        gen_ids = infer_model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            top_p=0.9,
            temperature=0.1,
            repetition_penalty=1.1,
            eos_token_id=infer_tokenizer.eos_token_id,
            pad_token_id=infer_tokenizer.pad_token_id,
        )

    decoded = infer_tokenizer.decode(gen_ids[0], skip_special_tokens=True)

    if "[/INST]" in decoded:
        decoded = decoded.split("[/INST]")[-1].strip()
    return decoded

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Resposne from RAG and Fine tune model

In [None]:
user_query = "What were the total deposits for ICICI Bank in fiscal year 2023?"
print("RAG Output")
print(rag_response(raw_data, user_query))
print("Fine Tune Model Output")
print(fine_tune_response(user_query))

RAG Output
Unsecured wholesale funding contributed 60.57% of total weighted cash outflows.
Fine Tune Model Output
₹ 15,486,792,220 (in '000s).


### Testing, Evaluation & Comparison with Guardrail Implementation

In [None]:
def validate_query(query):
    """
    Guardrail Implementation for validating input queries.
    """

    messages = [
            {
                "role": "system",
                "content": " You are a guardrail system. Evaluate the following user query. Decide if the query is SAFE and RELEVANT for an AI assistant. \nRules:- Reject if query is harmful, offensive, illegal, or irrelevant to finance domain.\n- Accept if query is safe and meaningful. \nReply strictly with either ACCEPT or REJECT."
            },
            {
                "role": "user",
                "content": f"Query:\n{query}"
            }
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(base_model.device)

    outputs = base_model.generate(**inputs, max_new_tokens=10, temperature=0.01)
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

    return response.strip().startswith("ACCEPT")

questions = [
    {"query": "What were the total assets of ICICI Bank Limited as of March 31, 2023?"}, # Relevant, high-confidence: Clear fact in data.
    {"query": "What is EBITDA?"}, # Relevant, low-confidence: Ambiguous or sparse information.
    {"query": "How to make a bomb?"}, # Irrelevant and Harmful
]


for question in questions:
    print(f"Question: {question['query']}\n")

    if validate_query(question['query']):
        print(f"RAG Response: {rag_response(raw_data, question['query'])} \n")
        print(f"Fine Tune Model Response: {fine_tune_response(question['query'])} \n")

    else:
      print('The query is harmful, offensive, illegal, or irrelevant to finance domain \n')

Question: What were the total assets of ICICI Bank Limited as of March 31, 2023?

RAG Response: Total assets ₹ 20,509,789,307 (in '000s). 

Fine Tune Model Response: ₹ 18,956,746,202 (in '000s). 

Question: What is EBITDA?

RAG Response: EBITDA ₹ 728,693,909 (in '000s). 

Fine Tune Model Response: ₹ 1,498,327 (in '000s). 

Question: How to make a bomb?

The query is harmful, offensive, illegal, or irrelevant to finance domain 



### Results Table (10 Samples)

In [None]:
def rag_system(query):
    """
    RAG pipeline for generating responses.
    """
    start = time.time()
    answer = rag_response(raw_data, query)
    return answer, time.time() - start


def finetuned_system(query):
    """
    Fine tune pipeline for generating responses.
    """
    start = time.time()
    answer = fine_tune_response(query)
    return answer, time.time() - start


# LLM as a Judge
def judge_correctness(query, real_answer, model_answer):
    """
    Using LLM as a judge to evaluate the correctness of the generated answer.
    """
    messages = [
            {
                "role": "system",
                "content": """You are a strict evaluator that compares the Generated Answer with the Actual Answer.
Your task is ONLY to decide if the Generated Answer matches the Actual Answer in terms of financial values and meaning.
Rules:
- If every financial number and relevant unit in the Generated Answer matches exactly with the Actual Answer, output "Yes".
- If there is any mismatch (wrong number, missing number, extra number, different unit, or incorrect calculation), output "No".
- Do not provide any explanation or reasoning.
- Your response must be either exactly "Yes" or "No".
"""
            },
            {
                "role": "user",
                "content": f"Question:\n{query}\n\nActual Answer:\n{real_answer}\n\nGenerated Answer:\n{model_answer}"
            }
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(base_model.device)

    outputs = base_model.generate(**inputs, max_new_tokens=10, temperature=0.01)
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

    result = response.strip()

    match_result = re.search(r"\b(Yes|No)\b", result, re.IGNORECASE)
    if match_result:
        return match_result.group(1).capitalize()
    else:
        return "No"



def judge_confidence(query, real_answer, model_answer):
    """
    Using LLM as a judge to evaluate the confidence of the generated answer.
    """
    messages = [
            {
                "role": "system",
                "content": """You are an evaluator that compares the Generated Answer with the Actual Answer.
Your task is to rate your confidence, between 0 and 1, that the Generated Answer matches the Actual Answer in terms of financial numbers and meaning.
Output only a single numeric confidence score between 0 and 1.
Do not provide explanations or any other text.
                """
            },
            {
                "role": "user",
                "content": f"Question:\n{query}\n\nActual Answer:\n{real_answer}\n\nGenerated Answer:\n{model_answer}"
            }
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(base_model.device)

    outputs = base_model.generate(**inputs, max_new_tokens=10, temperature=0.01)
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

    return response.strip()


# Evaluation Questions

questions = [
   {
"question": "What were consolidated fixed assets as of March 31, 2024?",
"answer": "₹ 10,260,788,803 (in '000s). "
},
{
"question": "What were consolidated other assets as of March 31, 2024?",
"answer": "₹ 976,409,788 (in '000s). "
},
{
"question": "What were standalone contingent liabilities for 'interest rate swaps/currency options/interest rate futures' at March 31, 2024",
"answer": "₹ 46,557,617,752 (in '000s)."
},
{
"question": "What was the LCR net cash outflows and LCR ratio for the March 2024 quarter?",
"answer": "Total net cash outflows averaged ₹ 3,207,423.5 million; LCR 122.84%. "
},
{
"question": "What was the Bank's priority sector advances gross NPAs at March 31, 2024?",
"answer": "Gross NPAs 1.27% of gross advances in that sector."
},
{
"question": "Within priority sector at March 31, 2024, what were agriculture gross NPAs?",
"answer": "₹ 35,889.6 million; GNPA ratio 4.33%. "
},
{
"question": "What was housing GNPA within priority sector personal loans at March 31, 2024?",
"answer": "Housing GNPA within priority sector personal loans were ₹ 412,150.5 at March 31, 2024 (in '000s). "
},
{
"question": "Within non priority sector at March 31, 2024, what was infrastructure advances?",
"answer": "Infrastructure advances ₹ 829,107.0 million (at March 31, 2024). "
},
{
"question": "What was the Bank's exposure to wholesale trade within services (non priority) at March 31, 2024?",
"answer": "Wholesale trade advances ₹ 321,761.4 million. "
},
{
"question": "What is the total amount of Loans and Advances (Net of Provision) as of March 31, 2023",
"answer": "The total amount of Loans and Advances (Net of Provision) as at 31 March 2023 is ₹10,196,383,053 (in ‘000s). "
}
   ]

# Run Evaluation
results = []

for q in questions:
    query, real = q["question"], q["answer"]

    # RAG
    rag_ans, rag_time = rag_system(query)
    rag_correct = judge_correctness(query, real, rag_ans)
    rag_judge_conf = judge_confidence(query, real, rag_ans)

    # Fine-tuned
    ft_ans, ft_time = finetuned_system(query)
    ft_correct = judge_correctness(query, real, ft_ans)
    ft_judge_conf = judge_confidence(query, real, ft_ans)

    results.append({
        "Question": query,
        "Actual": real,
        "Method": "RAG",
        "Answer": rag_ans,
        "Judge Conf": rag_judge_conf,
        "Time (s)": round(rag_time, 2),
        "Correct (Y/N)": rag_correct,
    })

    results.append({
        "Question": query,
        "Actual": real,
        "Method": "Fine-Tune",
        "Answer": ft_ans,
        "Judge Conf": ft_judge_conf,
        "Time (s)": round(ft_time, 2),
        "Correct (Y/N)": ft_correct,
    })


df = pd.DataFrame(results)
df

Unnamed: 0,Question,Actual,Method,Answer,Judge Conf,Time (s),Correct (Y/N)
0,What were consolidated fixed assets as of Marc...,"₹ 10,260,788,803 (in '000s).",RAG,"Fixed assets ₹ 10,260,788,803 (in '000s).",0.98,22.92,Yes
1,What were consolidated fixed assets as of Marc...,"₹ 10,260,788,803 (in '000s).",Fine-Tune,"₹ 7,689,529,826 (in '000s).",0.85,3.12,Yes
2,What were consolidated other assets as of Marc...,"₹ 976,409,788 (in '000s).",RAG,I don't know.,0.98,19.86,No
3,What were consolidated other assets as of Marc...,"₹ 976,409,788 (in '000s).",Fine-Tune,"₹ 789,657,697 (in '000s).",0.85,3.11,Yes
4,What were standalone contingent liabilities fo...,"₹ 46,557,617,752 (in '000s).",RAG,"₹ 46,557,617,752 (in '000s).",0.98,25.79,Yes
5,What were standalone contingent liabilities fo...,"₹ 46,557,617,752 (in '000s).",Fine-Tune,"₹ 589,678,627 (in '000s).",0.98,2.76,Yes
6,What was the LCR net cash outflows and LCR rat...,"Total net cash outflows averaged ₹ 3,207,423.5...",RAG,"LCR net cash outflows ₹ 867,927,609; LCR ratio...",0.88,23.14,Yes
7,What was the LCR net cash outflows and LCR rat...,"Total net cash outflows averaged ₹ 3,207,423.5...",Fine-Tune,"Net cash outflows ₹ 1,358,976,029; LCR 118.6%.",0.98,3.82,Yes
8,What was the Bank's priority sector advances g...,Gross NPAs 1.27% of gross advances in that sec...,RAG,Gross NPAs 1.27% of total advances in that sec...,0.98,21.61,Yes
9,What was the Bank's priority sector advances g...,Gross NPAs 1.27% of gross advances in that sec...,Fine-Tune,"₹ 5,789,619 (in '000s).",0.98,2.47,Yes


### UI for chatbot

In [None]:
def llm_correctness(query, model_answer):
    """
    Using LLM as a judge to evaluate the correctness of the generated answer.
    """
    messages = [
            {
                "role": "system",
                "content": "You are a strict evaluator. Reply Yes if the Generated answer seems is correct to you else No. Just output Yes or No"
            },
            {
                "role": "user",
                "content": f"Question:\n{query}\nGenerated Answer:\n{model_answer}"
            }
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(base_model.device)

    outputs = base_model.generate(**inputs, max_new_tokens=10, temperature=0.01)
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

    result = response.strip()

    match_result = re.search(r"\b(Yes|No)\b", result, re.IGNORECASE)
    if match_result:
        return match_result.group(1).capitalize()
    else:
        return "No"

In [2]:
import gradio as gr

def chatbot_response(system_type, question):
    if system_type == "RAG Chatbot":
        if validate_query(question):  # question is a string now
            rag_ans, rag_time = rag_system(question)
            rag_correct = llm_correctness(question, rag_ans)

            return rag_ans, f"{rag_time:.2f} sec", rag_correct
        else:
            return 'The query is harmful, offensive, illegal, or irrelevant to finance domain', "", ""

    elif system_type == "Fine-Tuned Chatbot":
        if validate_query(question):
            ft_ans, ft_time = finetuned_system(question)
            ft_correct = llm_correctness(question, ft_ans)

            return ft_ans, f"{ft_time:.2f} sec", ft_correct
        else:
            return 'The query is harmful, offensive, illegal, or irrelevant to finance domain', "", ""
    else:
        return "Invalid system type selected.", "", ""


# Gradio UI
with gr.Blocks(title="Financial QA Comparison") as demo:
    gr.Markdown("## Financial Statement QA Comparison")
    gr.Markdown("Ask a question based on last two years of a company's financial statements.")

    with gr.Row():
        system_toggle = gr.Radio(
            choices=["RAG Chatbot", "Fine-Tuned Chatbot"],
            label="Choose System",
            value="RAG Chatbot"
        )

    question_input = gr.Textbox(lines=2, label="Enter your question here")

    with gr.Row():
        output = gr.Textbox(label="Answer")
    with gr.Row():
        time_output = gr.Textbox(label="Response Time")
        correctness_output = gr.Textbox(label="Correctness")

    submit_btn = gr.Button("Submit")

    submit_btn.click(
        fn=chatbot_response,
        inputs=[system_toggle, question_input],
        outputs=[output, time_output, correctness_output]
    )

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3661b7b8e8eaebc104.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


