<a href="https://colab.research.google.com/github/joodk1/CV-For-FS/blob/main/Financial_Stataements_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Financial Statements Classifier - PDF Processing Tool

This tool classifies each page of a financial statement PDF using our trained EfficientNet-B2 model.

Just **add the path for the PDF you want to process below**, and run the rest of the notebook to get the results!

In [28]:
PDF_PATH = 'FileTest.pdf'

---
# Installing Packages & Importing Libraries

In [1]:
!pip install -q pymupdf torch torchvision timm albumentations opencv-python-headless

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
import os
import cv2
import timm
import json
import fitz  # PyMuPDF
import torch
import numpy as np
import pandas as pd
from pathlib import Path
import albumentations as A
from albumentations.pytorch import ToTensorV2
import warnings
warnings.filterwarnings('ignore')

# Device
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {DEVICE}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Device: cpu


---
# Loading Model and Configurations

In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
MODEL_PATH = '/content/drive/MyDrive/Sukuk AI Assessment/FS_Classification/efficientnet_b2/best_model.pth'
CONFIG_PATH = '/content/drive/MyDrive/Sukuk AI Assessment/FS_Classification/config.json'

In [9]:
# Load configurations
if os.path.exists(CONFIG_PATH):
    with open(CONFIG_PATH) as f:
        config = json.load(f)
    CLASS_NAMES = config['class_names']
    INPUT_SIZE = config['model_configs']['efficientnet_b2']['input_size']
    MEAN = config['model_configs']['efficientnet_b2']['mean']
    STD = config['model_configs']['efficientnet_b2']['std']
else:
    CLASS_NAMES = ["Independent Auditor's Report", "Financial Sheets", "Notes (Tabular)", "Notes (Text)", "Other Pages"]
    INPUT_SIZE = 260
    MEAN = [0.485, 0.456, 0.406]
    STD = [0.229, 0.224, 0.225]

NUM_CLASSES = len(CLASS_NAMES)

print(f"Input size: {INPUT_SIZE}x{INPUT_SIZE}")
print(f"Classes: {NUM_CLASSES}")

Input size: 260x260
Classes: 5


In [25]:
# Load the trained model
model = timm.create_model('efficientnet_b2', pretrained=False, num_classes=NUM_CLASSES)
checkpoint = torch.load(MODEL_PATH, map_location=DEVICE)
model = model.to(DEVICE)

In [26]:
# Create preprocessing transform
transform = A.Compose([
    A.LongestMaxSize(max_size=INPUT_SIZE),
    A.PadIfNeeded(
        min_height=INPUT_SIZE,
        min_width=INPUT_SIZE,
        border_mode=cv2.BORDER_CONSTANT,
        value=(255, 255, 255)
    ),
    A.CenterCrop(height=INPUT_SIZE, width=INPUT_SIZE),
    A.Normalize(mean=MEAN, std=STD),
    ToTensorV2()
])

In [29]:
# Extracting Pages from PDF
doc = fitz.open(PDF_PATH)
images = []

for page_num in range(len(doc)):
    page = doc[page_num]

    # Render at 150 DPI
    mat = fitz.Matrix(150/72, 150/72)
    pix = page.get_pixmap(matrix=mat)

    # Convert to RGB numpy array
    img = np.frombuffer(pix.samples, dtype=np.uint8)
    img = img.reshape(pix.height, pix.width, pix.n)

    if pix.n == 4:
        img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)

    images.append(img)

doc.close()

print(f"Extracted {len(images)} pages from {os.path.basename(PDF_PATH)}")

Extracted 36 pages from FileTest.pdf


---
# Processing Your PDF

In [20]:
print(f"\nClassifying {len(images)} pages\n")
print(f"{'Page':<6} {'Class':<35} {'Confidence':<12}")

results = []

for page_num, image in enumerate(images, start=1):
    # Preprocess
    transformed = transform(image=image)
    img_tensor = transformed['image'].unsqueeze(0).to(DEVICE)

    # Predict
    with torch.no_grad():
        outputs = model(img_tensor)
        probs = torch.softmax(outputs, dim=1)
        confidence, predicted = torch.max(probs, 1)

    pred_class = predicted.item()
    conf_score = confidence.item()
    class_name = CLASS_NAMES[pred_class]

    # Store result
    results.append({'page': page_num, 'class': class_name, 'confidence': conf_score})

    # Print
    print(f"{page_num:<6} {class_name:<35} {conf_score*100:>5.2f}%")


Classifying 36 pages

Page   Class                               Confidence  
1      Other Pages                         100.00%
2      Other Pages                         100.00%
3      Independent Auditor's Report        100.00%
4      Independent Auditor's Report        100.00%
5      Financial Sheets                    99.95%
6      Financial Sheets                    100.00%
7      Financial Sheets                    98.04%
8      Financial Sheets                    99.63%
9      Notes (Text)                        98.96%
10     Notes (Text)                        99.97%
11     Notes (Text)                        98.95%
12     Notes (Text)                        99.46%
13     Notes (Text)                        92.47%
14     Notes (Tabular)                     99.75%
15     Notes (Text)                        96.26%
16     Notes (Text)                        99.01%
17     Notes (Text)                        99.98%
18     Notes (Text)                        99.47%
19     Notes (Te

In [30]:
# Results Summary
class_counts = {}
for r in results:
    class_name = r['class']
    class_counts[class_name] = class_counts.get(class_name, 0) + 1

avg_conf = np.mean([r['confidence'] for r in results])

print("Summary of Results")
print(f"\nPDF: {os.path.basename(PDF_PATH)}")
print(f"Total Pages: {len(results)}")
print(f"Average Confidence: {avg_conf*100:.2f}%")

print(f"\nClass Distribution:")
for class_name in CLASS_NAMES:
    count = class_counts.get(class_name, 0)
    if count > 0:
        pct = (count / len(results)) * 100
        print(f"  {class_name:30s}: {count:3d} pages ({pct:5.1f}%)")

Summary of Results

PDF: FileTest.pdf
Total Pages: 36
Average Confidence: 96.72%

Class Distribution:
  Independent Auditor's Report  :   2 pages (  5.6%)
  Financial Sheets              :   4 pages ( 11.1%)
  Notes (Tabular)               :  15 pages ( 41.7%)
  Notes (Text)                  :  13 pages ( 36.1%)
  Other Pages                   :   2 pages (  5.6%)


In [23]:
# Find all Auditor Report pages
auditor_pages = [r['page'] for r in results if r['class'] == "Independent Auditor's Report"]
print(f"Auditor Report pages: {auditor_pages}")

# Find all Financial Sheets
financial_pages = [r['page'] for r in results if r['class'] == "Financial Sheets"]
print(f"Financial Sheet pages: {financial_pages}")

# Find pages with low confidence (<80%)
low_conf = [r for r in results if r['confidence'] < 0.80]
if low_conf:
    print(f"\nPages with low confidence (<80%):")
    for r in low_conf:
        print(f"  Page {r['page']}: {r['class']} ({r['confidence']*100:.2f}%)")
else:
    print(f"\nAll pages classified with high confidence (>80%)!")

Auditor Report pages: [3, 4]
Financial Sheet pages: [5, 6, 7, 8]

Pages with low confidence (<80%):
  Page 35: Notes (Tabular) (66.71%)
  Page 36: Notes (Text) (43.85%)
