In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from typing import Literal

class ResearchTextSimplifier:
    """Rewrite research text for a chosen audience with a seq2seq model.

    The input text is wrapped in an audience-specific instruction, tokenized,
    and passed to ``model.generate`` with beam search; the decoded best beam
    is returned.

    NOTE(review): the default checkpoint (BART-large-CNN) is a summarization
    model; whether it actually follows the instruction prefix should be
    verified against real outputs.
    """

    # Instruction templates keyed by audience; ``{text}`` is replaced by the
    # caller's input. Built once at class level instead of on every call.
    _PROMPTS = {
        "researchers": (
            "Simplify this academic text for fellow researchers while "
            "preserving all technical accuracy and key terms: {text}"
        ),
        "students": (
            "Explain this research content clearly for undergraduate "
            "students while keeping important concepts: {text}"
        ),
        "public": (
            "Describe this scientific finding in simple terms that "
            "a general audience could understand: {text}"
        ),
    }

    def __init__(self, model_name: str = "facebook/bart-large-cnn"):
        """
        Load the tokenizer and model and set default generation parameters.

        Args:
            model_name: Pretrained model identifier (default: BART-large-CNN)
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)

        # Default beam-search parameters; simplify() works on a copy so these
        # are never mutated by a call.
        self.generation_config = {
            "max_length": 512,
            "min_length": 64,
            "num_beams": 4,
            "length_penalty": 2.0,
            "early_stopping": True,
            "no_repeat_ngram_size": 3,
        }

    def simplify(
        self,
        text: str,
        audience: Literal["researchers", "students", "public"] = "students"
    ) -> str:
        """
        Simplify research text for a specific audience.

        Args:
            text: Input research text
            audience: Target audience level
                      ("researchers", "students", or "public")

        Returns:
            Simplified version of the input text, or an empty string when
            ``text`` is empty or contains only whitespace.

        Raises:
            KeyError: If ``audience`` is not one of the supported values.
        """
        # Build the prompt first so an unsupported audience fails before any
        # tokenization or model work is done.
        prompt = self._create_audience_prompt(text, audience)

        # With nothing to simplify the model could only generate invented
        # content, so skip generation entirely.
        if not text.strip():
            return ""

        inputs = self.tokenizer(
            prompt,
            max_length=1024,
            truncation=True,
            return_tensors="pt"
        ).to(self.device)

        # Per-call copy of the defaults, adjusted by audience.
        # NOTE: in Hugging Face beam search a larger positive length_penalty
        # favours *longer* outputs and a smaller one favours shorter outputs;
        # it does not directly control how strongly the text is simplified.
        config = self.generation_config.copy()
        if audience == "researchers":
            config["length_penalty"] = 1.0  # weakest preference for long output
        elif audience == "public":
            config["length_penalty"] = 3.0  # strongest preference for long output

        # min_length forces generation to continue until that many tokens are
        # produced. Cap it at the prompt's token count so short inputs are not
        # padded out with made-up sentences.
        prompt_tokens = inputs["input_ids"].shape[1]
        config["min_length"] = min(config["min_length"], prompt_tokens)

        outputs = self.model.generate(
            inputs["input_ids"],
            # Pass the mask explicitly rather than letting generate() infer it.
            attention_mask=inputs["attention_mask"],
            **config
        )

        # outputs[0] is the first (only) returned sequence for the single input.
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def _create_audience_prompt(self, text: str, audience: str) -> str:
        """Return the instruction for ``audience`` with ``text`` appended.

        Raises:
            KeyError: If ``audience`` has no template in ``_PROMPTS``.
        """
        return self._PROMPTS[audience].format(text=text)

# Example usage
if __name__ == "__main__":
    # Demo: run one sample paragraph through every supported audience level.
    simplifier = ResearchTextSimplifier()

    research_paragraph = """
    The photocatalytic activity of the semiconductor nanocomposites was significantly
    enhanced due to the synergistic effect between the constituent materials, which
    facilitated efficient charge separation and suppressed electron-hole recombination,
    as evidenced by photoluminescence spectroscopy and transient absorption measurements.
    """

    print("=== Original Research Text ===")
    print(research_paragraph.strip())

    # (heading shown to the user, audience key passed to simplify)
    demo_runs = (
        ("Researchers", "researchers"),
        ("Students", "students"),
        ("General Public", "public"),
    )
    for heading, audience_key in demo_runs:
        print(f"\n=== Simplified for {heading} ===")
        print(simplifier.simplify(research_paragraph, audience_key))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

=== Original Research Text ===
The photocatalytic activity of the semiconductor nanocomposites was significantly
    enhanced due to the synergistic effect between the constituent materials, which
    facilitated efficient charge separation and suppressed electron-hole recombination,
    as evidenced by photoluminescence spectroscopy and transient absorption measurements.

=== Simplified for Researchers ===
The photocatalytic activity of the semiconductor nanocomposites was significantly enhanced due to the synergistic effect between the constituent materials. The semiconductor materials facilitated efficient charge separation and suppressed electron-hole recombination, as evidenced by photoluminescence spectroscopy and transient absorption measurements. The material was developed at the University of California, San Diego.

=== Simplified for Students ===
The photocatalytic activity of the semiconductor nanocomposites was significantly enhanced due to the synergistic effect between th