In [1]:
# ---------------------------
# Install dependencies (first-time setup)
# ---------------------------

#!pip install pdfplumber pandas tabulate transformers torch tqdm evaluate

try:
    import pdfplumber, pandas, re, tabulate, transformers, torch
except ImportError:
    import os
    os.system("pip install pdfplumber pandas tabulate transformers torch")
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

    %env CUDA_LAUNCH_BLOCKING=1

In [2]:
# ---------------------------
# Imports
# ---------------------------

import pdfplumber
import pandas as pd
import re
from transformers import pipeline, AutoTokenizer
import tabulate
from tqdm import tqdm
import torch
import evaluate

In [3]:
# ============================================================
# Setup device
# ============================================================
device = 0 if torch.cuda.is_available() else -1
print("Using device:", "GPU" if device == 0 else "CPU")

Using device: GPU


In [4]:
# ============================================================
# Tokenizer for safe truncation
# ============================================================
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

def safe_encode(text, max_tokens=1024):
    """Ensure input fits within model limits."""
    if not text or not text.strip():
        return None
    tokens = tokenizer.encode(text, truncation=True, max_length=max_tokens)
    return tokenizer.decode(tokens, skip_special_tokens=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# ---------------------------
# Utility functions
# ---------------------------

def clean_text(txt: str) -> str:
    txt = str(txt)
    txt = re.sub(r'\n+', ' ', txt)              # remove newlines
    txt = re.sub(r'\s+', ' ', txt)              # normalize spaces
    txt = re.sub(r'Page \d+ of \d+', '', txt)   # remove page numbers
    txt = re.sub(r'(Integrated Annual Report \d{4}-\d{2})', '', txt)  # headers
    txt = re.sub(r'-+\s*$', '', txt)            # footers
    return txt.strip()

In [6]:
def extract_columns(page):
    """Handle 2-column layouts, fallback to single column if empty."""
    width = page.width
    left_bbox = (0, 0, width / 2, page.height)
    right_bbox = (width / 2, 0, width, page.height)

    left_text = page.within_bbox(left_bbox).extract_text()
    right_text = page.within_bbox(right_bbox).extract_text()

    if not left_text and not right_text:
        return page.extract_text() or ""
    return (left_text or "") + "\n" + (right_text or "")

In [7]:
def extract_tables(pdf_path):
    """Extract tables from PDF and convert to pandas DataFrames."""
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_tables = page.extract_tables()
            for table in page_tables:
                if table:
                    # first row is header
                    df = pd.DataFrame(table[1:], columns=table[0])
                    tables.append(df)
    return tables

In [8]:
def preprocess_pdf(pdf_path):
    texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            col_text = extract_columns(page)
            col_text = clean_text(col_text)
            if len(col_text) > 100:  # skip very short/empty
                texts.append(col_text)
    return texts

In [9]:
def preprocess_tables(tables):
    """Convert tables to markdown-like text for summarization."""
    tables_text = []
    for df in tables:
        tab_txt = tabulate.tabulate(df, headers="keys", tablefmt="pipe")
        tables_text.append(tab_txt)
    return tables_text

In [10]:
def chunk_texts(texts, chunk_size=3000):
    """Split long texts into smaller chunks."""
    chunks = []
    for txt in texts:
        for i in range(0, len(txt), chunk_size):
            chunk = txt[i:i+chunk_size]
            if len(chunk) > 50:
                chunks.append(chunk)
    return chunks

In [11]:
# ============================================================
# 6. Safe summarization function (fixes CUDA crash)
# ============================================================
def summarize_with_safe_params(text, summarizer, default_max=120, default_min=20):
    safe_text = safe_encode(text, max_tokens=1024)
    if not safe_text:
        return ""  # skip empty

    input_len = len(safe_text.split())

    # dynamic max_length
    if input_len < 50:
        max_len = max(15, int(input_len * 0.6))
    elif input_len < 200:
        max_len = max(30, int(input_len * 0.7))
    else:
        max_len = default_max

    # ensure min_length < max_length
    min_len = max(5, min(default_min, max_len - 5))

    return summarizer(
        safe_text,
        max_length=max_len,
        min_length=min_len,
        truncation=True
    )[0]["summary_text"]

def batch_summarize(chunks, summarizer):
    summaries = []
    for i in tqdm(range(len(chunks))):
        summaries.append(summarize_with_safe_params(chunks[i], summarizer))
    return summaries


# ============================================================
# 7. Improved Table Summarization
# ============================================================
def narrate_table(df, max_rows=5):
    """Convert a dataframe into natural language narration for first few rows."""
    narrations = []
    for i, row in df.head(max_rows).iterrows():
        parts = []
        for col in df.columns:
            val = str(row[col]).strip()
            if val and val.lower() != "nan":
                parts.append(f"{col}: {val}")
        if parts:
            narrations.append(f"Row {i+1}: " + ", ".join(parts))
    return " ".join(narrations)

def table_descriptive_summary(df):
    """Generate simple numeric stats for numeric columns."""
    summary_lines = []
    for col in df.columns:
        try:
            numeric = pd.to_numeric(df[col], errors="coerce").dropna()
            if not numeric.empty:
                summary_lines.append(
                    f"For column '{col}', min={numeric.min()}, max={numeric.max()}, mean={numeric.mean():.2f}."
                )
        except Exception:
            continue
    return " ".join(summary_lines)

def summarize_tables(tables, summarizer):
    """Summarize tables by converting to narration + numeric stats before BART."""
    summaries = []
    for df in tables:
        narr_text = narrate_table(df, max_rows=5)
        stats_text = table_descriptive_summary(df)
        combined_text = (narr_text + " " + stats_text).strip()

        if not combined_text:
            summaries.append("Table skipped (empty or invalid).")
            continue

        summary = summarize_with_safe_params(
            combined_text,
            summarizer,
            default_max=120,
            default_min=30
        )
        summaries.append(summary)
    return summaries


In [12]:
# ============================================================
# 7. Multi-stage summarization
# ============================================================
def multi_stage_summary(chunk_summaries, summarizer, group_size=20):
    # Stage 1: Group ~20 chunk summaries → section summaries
    groups = []
    for i in range(0, len(chunk_summaries), group_size):
        groups.append(" ".join(chunk_summaries[i:i+group_size]))

    section_summaries = []
    for g in groups:
        res = summarize_with_safe_params(g, summarizer, default_max=400, default_min=150)
        section_summaries.append(res)

    # Stage 2: Merge all section summaries into final doc
    final_summary = " ".join(section_summaries)
    return final_summary, section_summaries


In [13]:
# ---------------------------
# Main execution
# ---------------------------

pdf_path = "/content/Annual report HDFC.pdf"  # replace with your actual file path

print("Extracting tables...")
tables = extract_tables(pdf_path)

print("Extracting text...")
texts = preprocess_pdf(pdf_path)

print("Loading summarizer model...")
summarizer = pipeline(
    "summarization",
    #model="facebook/bart-large-cnn"
    model="sshleifer/distilbart-cnn-12-6"
    #model="google/pegasus-xsum"
    #model="google/long-t5-local-base"
    ,
    device=device
)

print("Chunk-level summaries...")
chunks = chunk_texts(texts, 1500)
chunk_summaries = batch_summarize(chunks, summarizer)

print("Table summaries...")
#table_txts = preprocess_tables(tables)
table_summaries = summarize_tables(tables, summarizer)

print("Multi-stage final summary...")
final_summary_long, section_summaries = multi_stage_summary(chunk_summaries, summarizer)

# ============================================================
# 9. Save outputs
# ============================================================
pd.DataFrame({"chunk_summary": chunk_summaries}).to_csv("chunk_summaries_bart.csv", index=False)
pd.DataFrame({"table_summary": table_summaries}).to_csv("table_summaries_bart.csv", index=False)
pd.DataFrame({"section_summary": section_summaries}).to_csv("section_summaries.csv", index=False)

with open("final_summary_long.txt", "w", encoding="utf-8") as f:
    f.write(final_summary_long)

print("✅ Outputs saved: chunk_summaries_bart.csv, table_summaries_bart.csv, section_summaries.csv, final_summary_long.txt")

Extracting tables...
Extracting text...
Loading summarizer model...


Device set to use cuda:0


Chunk-level summaries...


  1%|          | 10/1381 [00:07<17:11,  1.33it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
 10%|█         | 144/1381 [01:22<12:39,  1.63it/s]Your max_length is set to 15, but your input_length is only 13. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
 14%|█▎        | 188/1381 [01:45<12:13,  1.63it/s]Your max_length is set to 15, but your input_length is only 10. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)
 15%|█▌        | 214/1381 [01:59<09:58,  1.95it/s]Your max_length is set to 15, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

Table summaries...


Your max_length is set to 15, but your input_length is only 13. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 15, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Your max_length is set to 15, but your input_length is only 13. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 15, but your input_length is only 13. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_leng

Multi-stage final summary...
✅ Outputs saved: chunk_summaries_bart.csv, table_summaries_bart.csv, section_summaries.csv, final_summary_long.txt


In [14]:
# ============================================================
# Evaluation
# ============================================================
def compression_ratio(original_texts, summaries):
    orig_len = sum(len(t.split()) for t in original_texts)
    sum_len = sum(len(s.split()) for s in summaries)
    return round(sum_len / orig_len, 3) if orig_len > 0 else 0

def avg_summary_length(summaries):
    return round(sum(len(s.split()) for s in summaries) / len(summaries), 2)

print("\n--- Evaluation Results ---")
print("Compression ratio:", compression_ratio(texts, chunk_summaries))
print("Avg chunk summary length:", avg_summary_length(chunk_summaries))


--- Evaluation Results ---
Compression ratio: 0.215
Avg chunk summary length: 40.15
