<a href="https://colab.research.google.com/github/ldsbalu/Balu-Portfolio/blob/main/Research_Paper_Digest_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the third-party packages this notebook imports below (quiet mode keeps the log short).
!pip install --quiet pypdf transformers sentencepiece torch tiktoken



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/329.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m327.7/329.5 kB[0m [31m11.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.5/329.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
#imports
import re
import io
from dataclasses import dataclass
from typing import Dict, List

from pypdf import PdfReader
from google.colab import files

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import tiktoken

In [3]:
# Summarization checkpoint loaded from the Hugging Face Hub.
MODEL_NAME = "facebook/bart-large-cnn"  # ~1k token max input

# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and seq2seq model for MODEL_NAME; the model is moved to `device`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [4]:
def extract_pdf_text_from_bytes(pdf_bytes) -> str:
    """Extract the text of every page of a PDF and join the pages with newlines.

    Args:
        pdf_bytes: Raw PDF content (``bytes``, ``bytearray`` or ``memoryview``),
            or anything that is handed to ``PdfReader`` unchanged, such as the
            ``io.BytesIO`` object the notebook cells pass in.

    Returns:
        The per-page texts joined by a newline. A page whose extraction yields
        a falsy value contributes an empty string.
    """
    # Raw byte buffers are wrapped in an in-memory stream first, so the function
    # also works when called with actual bytes, as its name suggests.
    if isinstance(pdf_bytes, (bytes, bytearray, memoryview)):
        pdf_bytes = io.BytesIO(pdf_bytes)

    reader = PdfReader(pdf_bytes)
    # `or ""` keeps a falsy extraction result from breaking the join.
    return "\n".join((page.extract_text() or "") for page in reader.pages)

# Regex fragments for the section headings that split_into_sections recognizes.
# The fragments are OR-ed together into one heading pattern and are matched
# without regard to the heading's letter case, so each one is written in lowercase.
SECTION_PATTERNS = [
    r"\babstract\b",
    r"\bintroduction\b",
    r"\brelated work\b",
    r"\bbackground\b",
    r"\bmethods?\b",
    r"\bmaterials and methods\b",
    r"\bexperiments?\b",
    r"\bresults?\b",
    r"\bdiscussion\b",
    r"\bconclusion[s]?\b",
    r"\blimitations\b",
    r"\bfuture work\b",
]

In [5]:
def split_into_sections(text: str) -> Dict[str, str]:
    """Split paper text into sections keyed by a normalized heading title.

    A heading is a line that consists only of an optional leading number
    (e.g. ``"2."``) followed by one of ``SECTION_PATTERNS``, in any letter case.
    Each section's body runs from the end of its heading to the start of the
    next heading (or the end of the text) and is stripped of surrounding
    whitespace.

    Args:
        text: Full text of the paper.

    Returns:
        ``{"paper": text}`` when no heading is found. Otherwise a dict mapping
        title-cased headings (e.g. ``"Related Work"``) to their bodies. When the
        same heading occurs more than once, the bodies are concatenated with a
        blank line between them. Text before the first heading is not included.
    """
    # Match against the original text with IGNORECASE instead of a lowercased
    # copy: str.lower() can change the length of some Unicode strings, which
    # would make the match offsets point at the wrong place in `text`.
    heading_re = re.compile(
        r"^(?:\d+\.?\s+)?(" + "|".join(SECTION_PATTERNS) + r")\s*$",
        re.IGNORECASE | re.MULTILINE,
    )
    matches = list(heading_re.finditer(text))
    if not matches:
        return {"paper": text}

    sections: Dict[str, str] = {}
    for i, m in enumerate(matches):
        start = m.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        body = text[start:end].strip()
        # capitalize() also lowercases the rest of each word, so "RELATED WORK"
        # and "related work" both normalize to "Related Work".
        norm_title = " ".join(w.capitalize() for w in m.group(1).split())

        if norm_title in sections:
            # A repeated heading extends the existing section rather than
            # silently replacing what was collected earlier.
            if body:
                sections[norm_title] = (sections[norm_title] + "\n\n" + body).strip()
        else:
            sections[norm_title] = body
    return sections

In [6]:
# tiktoken encoding used in this notebook to count, slice and decode tokens when
# budgeting text length; it is a separate object from the Hugging Face `tokenizer`
# that prepares the model inputs.
enc = tiktoken.get_encoding("cl100k_base")

def chunk_text(text: str, max_tokens: int = 800) -> List[str]:
    """Cut ``text`` into consecutive pieces of at most ``max_tokens`` tokens.

    The text is encoded with the module-level ``enc``, the token list is sliced
    into back-to-back windows of ``max_tokens`` tokens, and every window is
    decoded back into a string. Text that encodes to no tokens yields ``[]``.
    """
    token_ids = enc.encode(text)
    window_starts = range(0, len(token_ids), max_tokens)
    return [enc.decode(token_ids[begin : begin + max_tokens]) for begin in window_starts]

In [7]:
def summarize_chunk_local(text: str, max_input_tokens: int = 800, max_summary_tokens: int = 160) -> str:
    """Summarize one piece of text with the locally loaded seq2seq model.

    Args:
        text: Text to summarize. If it encodes to more than ``max_input_tokens``
            tokens under ``enc``, only the first ``max_input_tokens`` are kept.
        max_input_tokens: Input budget measured with ``enc``.
        max_summary_tokens: Passed to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # First trim by the enc token count...
    budget_ids = enc.encode(text)
    if len(budget_ids) > max_input_tokens:
        text = enc.decode(budget_ids[:max_input_tokens])

    # ...then let the model tokenizer enforce its own 1024-token truncation.
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    encoded = encoded.to(device)

    generation_kwargs = dict(
        max_length=max_summary_tokens,
        num_beams=4,
        early_stopping=True,
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)

    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

In [8]:
def multi_step_summary(text: str, section_name: str, goal: str) -> str:
    """
    Multi-pass summarization:
      1) chunk long text
      2) summarize each chunk
      3) join the chunk summaries; if the result still spans several chunks,
         repeat steps 1-2 on it
      4) summarize the final concatenation of chunk summaries

    Every model call is prefixed with ``"{section_name}: {goal}"`` followed by
    a blank line.

    Args:
        text: Text to summarize.
        section_name: Label placed at the start of every prompt.
        goal: Instruction placed after the label in every prompt.

    Returns:
        The summary string, or ``""`` when ``text`` is empty or whitespace-only
        (the model is not called in that case).
    """
    # With nothing to summarize, the model would only see the prompt prefix and
    # produce unrelated text, so return early.
    if not text or not text.strip():
        return ""

    prompt_prefix = f"{section_name}: {goal}\n\n"
    chunks = chunk_text(text, max_tokens=800)
    if len(chunks) == 1:
        return summarize_chunk_local(prompt_prefix + chunks[0])

    while True:
        partial_summaries = [summarize_chunk_local(prompt_prefix + ch) for ch in chunks]
        combined = "\n".join(partial_summaries)

        # If the joined summaries still exceed one chunk, the final call would
        # truncate them and drop the later chunks' content, so reduce again.
        # The chunk count must strictly shrink each round; otherwise fall back
        # to a single final pass, which guarantees the loop terminates.
        next_chunks = chunk_text(combined, max_tokens=800)
        if len(next_chunks) <= 1 or len(next_chunks) >= len(chunks):
            return summarize_chunk_local(prompt_prefix + combined)
        chunks = next_chunks

In [9]:
@dataclass
class PaperDigest:
    """Result bundle returned by generate_paper_digest_from_text."""
    # Summary text per detected section, keyed by section title
    # (the output of summarize_sections).
    section_summaries: Dict[str, str]
    # Labelled summary returned by extract_contributions.
    contributions: str
    # Labelled summary returned by explain_method.
    method: str
    # Labelled summary returned by extract_limitations.
    limitations: str

In [10]:
def summarize_sections(sections: Dict[str, str]) -> Dict[str, str]:
    """Summarize every section and return the summaries under the same keys.

    Each section body goes through ``multi_step_summary`` with its title as the
    section name and one fixed digest goal.
    """
    digest_goal = "summarize this section for a research digest"
    return {
        title: multi_step_summary(body, section_name=title, goal=digest_goal)
        for title, body in sections.items()
    }

In [11]:
def summarize_focus_text(text: str, label: str) -> str:
    """Summarize ``text`` with a goal built from ``label``.

    ``label`` is used both as the section name and inside the goal sentence
    handed to ``multi_step_summary``.
    """
    return multi_step_summary(
        text,
        section_name=label,
        goal=f"summarize the {label} of this paper",
    )

In [12]:
def explain_method(full_text: str, sections: Dict[str, str]) -> str:
    """Summarize the paper's methods and experiments.

    Bodies of all sections whose title contains "method", "experiment" or
    "model" (case-insensitive) are joined with blank lines and summarized.
    When no such section has any non-blank content, the whole paper text is
    summarized instead.

    Args:
        full_text: Full paper text, used as the fallback input.
        sections: Mapping of section title to section body.

    Returns:
        A fixed label line followed by the summary.
    """
    keywords = ("method", "experiment", "model")
    # Blank bodies are skipped: a matching but empty section must not block the
    # full-text fallback by contributing a whitespace-only string.
    method_parts = [
        body
        for title, body in sections.items()
        if body and body.strip() and any(word in title.lower() for word in keywords)
    ]
    method_text = "\n\n".join(method_parts) if method_parts else full_text
    summary = summarize_focus_text(method_text, label="methods and experiments")
    return "Method and experimental setup (summary):\n" + summary


In [13]:
def extract_limitations(full_text: str, sections: Dict[str, str]) -> str:
    """Summarize the paper's limitations and future work.

    Bodies of all sections whose title contains "limitation", "discussion",
    "conclusion" or "future work" (case-insensitive) are joined with blank
    lines and summarized. When no such section has any non-blank content, the
    last 3000 ``enc`` tokens of the paper are summarized instead.

    Args:
        full_text: Full paper text, used for the fallback tail.
        sections: Mapping of section title to section body.

    Returns:
        A fixed label line followed by the summary.
    """
    # "future work" is included so a dedicated Future Work section feeds the
    # "limitations and future work" summary it is named after.
    keywords = ("limitation", "discussion", "conclusion", "future work")
    # Blank bodies are skipped so an empty matching section cannot suppress the
    # tail-of-paper fallback.
    limit_parts = [
        body
        for title, body in sections.items()
        if body and body.strip() and any(word in title.lower() for word in keywords)
    ]

    if limit_parts:
        limit_text = "\n\n".join(limit_parts)
    else:
        # back part of the paper is more likely to contain limitations
        tail_ids = enc.encode(full_text)[-3000:]  # last ~3k tokens
        limit_text = enc.decode(tail_ids)

    summary = summarize_focus_text(limit_text, label="limitations and future work")
    return "Limitations and future work (summary):\n" + summary


In [14]:
def extract_contributions(full_text: str, sections: Dict[str, str]) -> str:
    """Summarize the paper's main contributions.

    Bodies of the sections titled Abstract, Introduction, Conclusion or
    Conclusions (case-insensitive, exact title match) are joined with blank
    lines and summarized. When none of them has any non-blank content, the
    whole paper text is summarized instead.

    Args:
        full_text: Full paper text, used as the fallback input.
        sections: Mapping of section title to section body.

    Returns:
        A fixed label line followed by the summary.
    """
    focus_titles = {"abstract", "introduction", "conclusion", "conclusions"}
    # Blank bodies are skipped so that, for example, an empty Abstract does not
    # prevent the fallback to the full text.
    focus_parts = [
        body
        for title, body in sections.items()
        if title.lower() in focus_titles and body and body.strip()
    ]
    base = "\n\n".join(focus_parts) if focus_parts else full_text
    rough = summarize_focus_text(base, label="main contributions")

    # Light post-processing: prepend a label; detailed bulleting can be done manually
    return "Main contributions (summary):\n" + rough


In [15]:
def generate_paper_digest_from_text(full_text: str) -> PaperDigest:
    """Build a complete PaperDigest from the raw text of a paper.

    The text is split into sections once. The per-section summaries, the
    contributions, the method explanation and the limitations are then
    produced, in that order, from the same section mapping.
    """
    detected_sections = split_into_sections(full_text)
    # Keyword arguments are evaluated top to bottom, so the summarization calls
    # run in the order listed here.
    return PaperDigest(
        section_summaries=summarize_sections(detected_sections),
        contributions=extract_contributions(full_text, detected_sections),
        method=explain_method(full_text, detected_sections),
        limitations=extract_limitations(full_text, detected_sections),
    )


In [16]:
# Ask for a PDF through the Colab upload widget; `uploaded` maps file names to contents.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload (presumably what a cancelled dialog
    # produces; confirm against Colab) surfaces as a bare StopIteration from next().
    raise ValueError("No file was uploaded; re-run this cell and choose a PDF.")

# Only the first uploaded file is processed.
pdf_filename = next(iter(uploaded.keys()))
print("Uploaded file:", pdf_filename)

# Wrap the uploaded content in a file-like object for the PDF reader.
pdf_bytes = io.BytesIO(uploaded[pdf_filename])
full_text = extract_pdf_text_from_bytes(pdf_bytes)

digest = generate_paper_digest_from_text(full_text)

Saving jennings.pdf to jennings.pdf
Uploaded file: jennings.pdf


In [17]:
# Show the per-section summaries first, each under its own "##" heading.
print("=== SECTION SUMMARIES ===")
for section_title, section_summary in digest.section_summaries.items():
    print(f"\n## {section_title}\n", section_summary, sep="\n")

# Then the three focused summaries, each under its banner.
focused_parts = (
    ("\n=== CONTRIBUTIONS ===\n", digest.contributions),
    ("\n=== METHOD EXPLANATION ===\n", digest.method),
    ("\n=== LIMITATIONS ===\n", digest.limitations),
)
for banner, body in focused_parts:
    print(banner)
    print(body)

=== SECTION SUMMARIES ===

## Conclusions

This paper introduced the concept of creative autonomy, which requires that a system be able to evaluate its creations without consulting others. Theories of cognitive dissonance are discussed in the next section. The final section of the book is titled ‘The Psychology of Persuasion’

=== CONTRIBUTIONS ===

Main contributions (summary):
This paper introduced the concept of creative autonomy, which requires that a system be able to evaluate its creations without consulting others. A notation was developed to denote evaluations drawn from the integration of knowledge about critics’ standards. In a single critic society, it is unlikely that the system’s standards would be more than distorted agglom-related.

=== METHOD EXPLANATION ===

Method and experimental setup (summary):
The paper is based on a 2005 workshop on Computational Creativity. It is published by the AAAI, which is a member of the Association for Computing Machinery (ACM) The paper 

In [18]:
# Ask for another PDF through the Colab upload widget; `uploaded` maps file names to contents.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload (presumably what a cancelled dialog
    # produces; confirm against Colab) surfaces as a bare StopIteration from next().
    raise ValueError("No file was uploaded; re-run this cell and choose a PDF.")

# Only the first uploaded file is processed.
pdf_filename = next(iter(uploaded.keys()))
print("Uploaded file:", pdf_filename)

# Wrap the uploaded content in a file-like object for the PDF reader.
pdf_bytes = io.BytesIO(uploaded[pdf_filename])
full_text = extract_pdf_text_from_bytes(pdf_bytes)

# Replaces the digest from the previous upload.
digest = generate_paper_digest_from_text(full_text)

Saving jordanous_eval.pdf to jordanous_eval.pdf
Uploaded file: jordanous_eval.pdf


In [19]:
# Show the per-section summaries first, each under its own "##" heading.
print("=== SECTION SUMMARIES ===")
for section_title, section_summary in digest.section_summaries.items():
    print(f"\n## {section_title}\n", section_summary, sep="\n")

# Then the three focused summaries, each under its banner.
focused_parts = (
    ("\n=== CONTRIBUTIONS ===\n", digest.contributions),
    ("\n=== METHOD EXPLANATION ===\n", digest.method),
    ("\n=== LIMITATIONS ===\n", digest.limitations),
)
for banner, body in focused_parts:
    print(banner)
    print(body)

=== SECTION SUMMARIES ===

## Abstract

Computational creativity research has produced many creative systems. But there is a distinct lack of evaluation of the creativity of these systems. This paper proposes a standard but impossible approach to evaluation of creativity. It argues that this approach should be taken up as standard practice in creativity research.

## Introduction

Only a third of systems presented as creative were actually evaluated on how creative they are. Less than a quarter of systems made any practical use of creativity evaluation methodologies. Four jazz improvisation systems were evaluated to see which were more creative. The results show that the V oyager system (Lewis 2000) can in general be considered most creative.

=== CONTRIBUTIONS ===

Main contributions (summary):
Survey of 75 journal and conference papers on computational creativity. Only a third of systems presented as creative were actually evaluated on how creative they are. Less than a quarter of sy