##Summarisation data cleaning

In [18]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("ccdv/arxiv-summarization")

# Check available splits
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

section/train-00000-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

section/train-00001-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

section/train-00002-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

section/train-00003-of-00015.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

section/train-00004-of-00015.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

section/train-00005-of-00015.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

section/train-00006-of-00015.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

section/train-00007-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

section/train-00008-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

section/train-00009-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

section/train-00010-of-00015.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

section/train-00011-of-00015.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

section/train-00012-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

section/train-00013-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

section/train-00014-of-00015.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

section/validation-00000-of-00001.parque(…):   0%|          | 0.00/105M [00:00<?, ?B/s]

section/test-00000-of-00001.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/203037 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6436 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6440 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6440
    })
})


In [19]:
sample = dataset["train"][0]
print("Title-like text from article:\n", sample['article'][:500], "...\n")
print("Abstract:\n", sample['abstract'])

Title-like text from article:
 additive models @xcite provide an important family of models for semiparametric regression or classification . some reasons for the success of additive models are their increased flexibility when compared to linear or generalized linear models and their increased interpretability when compared to fully nonparametric models . 
 it is well - known that good estimators in additive models are in general less prone to the curse of high dimensionality than good estimators in fully nonparametric models ...

Abstract:
 additive models play an important role in semiparametric statistics . 
 this paper gives learning rates for regularized kernel based methods for additive models . 
 these learning rates compare favourably in particular in high dimensions to recent results on optimal learning rates for purely nonparametric regularized kernel based quantile regression using the gaussian radial basis function kernel , provided the assumption of an additive model is va

In [20]:
train_subset = dataset['train'].shuffle(seed=42).select(range(20000))   # 20k training samples
val_subset = dataset['validation'].shuffle(seed=42).select(range(1000)) # 1k validation samples
test_subset = dataset['test'].shuffle(seed=42).select(range(1000))      # 1k test samples

print(f"Training samples: {len(train_subset)}")
print(f"Validation samples: {len(val_subset)}")
print(f"Test samples: {len(test_subset)}")

Training samples: 20000
Validation samples: 1000
Test samples: 1000


In [21]:
from datasets import load_dataset
from transformers import AutoTokenizer
import re
def clean_text(text):
    text = re.sub(r"@\w+", "", text)           # remove citations like @xcite
    text = re.sub(r"\\[a-zA-Z]+{.*?}", "", text)  # remove LaTeX commands
    text = re.sub(r"\$.*?\$", "", text)        # remove math formulas
    text = re.sub(r"\s+", " ", text)           # normalize whitespace
    return text.strip()

def is_long_enough(text, min_words=200):
    return len(text.split()) >= min_words

def preprocess_example(example):
    article = clean_text(example['article'])
    abstract = clean_text(example['abstract'])

    if not article or not abstract:
        return {"article": "", "abstract": ""}  # empty strings
    if not is_long_enough(article):
        return {"article": "", "abstract": ""}

    return {"article": article, "abstract": abstract}

train_clean = train_subset.map(preprocess_example)
val_clean = val_subset.map(preprocess_example)
test_clean = test_subset.map(preprocess_example)

model_name = "google/pegasus-arxiv"  # Best for scientific summarization
tokenizer = AutoTokenizer.from_pretrained(model_name)

max_input_length = 512
max_output_length = 128

def tokenize_example(example):
    inputs = tokenizer(
        example["article"],
        max_length=max_input_length,
        padding="max_length",
        truncation=True
    )
    targets = tokenizer(
        example["abstract"],
        max_length=max_output_length,
        padding="max_length",
        truncation=True
    )
    inputs["labels"] = targets["input_ids"]
    return inputs

train_tokenized = train_clean.map(tokenize_example, batched=True)
val_tokenized = val_clean.map(tokenize_example, batched=True)
test_tokenized = test_clean.map(tokenize_example, batched=True)

for i in range(3):
    print(f"\n--- Article {i+1} ---")
    print("Article snippet:", train_clean[i]['article'][:300], "...")
    print("Abstract snippet:", train_clean[i]['abstract'][:200], "...")

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


--- Article 1 ---
Article snippet: arp 220 is the nearest ( 77 mpc ) example of an ultraluminous infrared galaxy ( ulirg ) that supports star formation at extreme levels . it contains two nuclei separated by 350 pc , both surrounded by massive discs of dense molecular gas ( e.g. , * ? ? ? * ; * ? ? ? * ; * ? ? ? * ; * ? ? ? * ; * ? ? ...
Abstract snippet: the cores of arp 220 , the closest ultraluminous infrared starburst galaxy , provide an opportunity to study interactions of cosmic rays under extreme conditions . in this paper , we model the populat ...

--- Article 2 ---
Article snippet: this study was supported by the danish national research foundation through the center for models of life and by physics of geological processes , a center of excellence at the university of oslo . email data from the university of oslo were collected with the help and support of ingar vindenes and  ...
Abstract snippet: in communication networks structure and dynamics are tightly coupled . the st

In [22]:
import os
from datasets import DatasetDict
tokenized_data = DatasetDict({
    "train": train_tokenized,
    "validation": val_tokenized,
    "test": test_tokenized
})


In [5]:
save_path = "/content/drive/MyDrive/pegasus_arxiv_dataset"


In [24]:
tokenized_data["train"].save_to_disk(os.path.join(save_path, "train"))
tokenized_data["validation"].save_to_disk(os.path.join(save_path, "validation"))
tokenized_data["test"].save_to_disk(os.path.join(save_path, "test"))


Saving the dataset (0/2 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [25]:
from datasets import DatasetDict

DatasetDict({
    "train": tokenized_data["train"],
    "validation": tokenized_data["validation"],
    "test": tokenized_data["test"]
}).save_to_disk(save_path)

Saving the dataset (0/2 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [6]:
from datasets import load_from_disk

tokenized_data = load_from_disk(save_path)

# Access splits
train_dataset = tokenized_data["train"]
val_dataset = tokenized_data["validation"]
test_dataset = tokenized_data["test"]


##Model Training

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/pegasus-arxiv"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-arxiv and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch
os.environ["WANDB_DISABLED"] = "true"
output_dir = "/content/drive/MyDrive/pegasus_arxiv_finetuned"
os.makedirs(output_dir, exist_ok=True)
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding='longest',  # dynamic padding for efficiency
    return_tensors="pt"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./pegasus_arxiv_finetuned",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=3e-5,
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    logging_dir="./logs",
    fp16=torch.cuda.is_available()
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"✅ Fine-tuned model and tokenizer saved to Drive at: {output_dir}")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Step,Training Loss
500,2.8775
1000,2.3955
1500,2.0258




Step,Training Loss
500,2.8775
1000,2.3955
1500,2.0258
2000,1.9805
2500,1.9667
3000,1.9449
3500,1.9332


✅ Fine-tuned model and tokenizer saved to Drive at: /content/drive/MyDrive/pegasus_arxiv_finetuned


In [3]:
import os

model_path = "/content/drive/MyDrive/pegasus_arxiv_finetuned"
print(os.listdir(model_path))

['training_args.bin', 'tokenizer_config.json', 'special_tokens_map.json', 'tokenizer.json', 'config.json', 'generation_config.json', 'spiece.model', 'model.safetensors']


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
drive_path = "/content/drive/MyDrive/pegasus_arxiv_finetuned"
old_name = os.path.join(drive_path, "model-001.safetensors")
new_name = os.path.join(drive_path, "model.safetensors")

if os.path.exists(old_name):
    os.rename(old_name, new_name)
    print("Renamed successfully!")
else:
    print("File not found!")

Renamed successfully!


In [29]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_path = "/content/drive/MyDrive/pegasus_arxiv_finetuned"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Test summarization on a sample
sample_text = test_clean[0]["article"]  # Pick 1 article

inputs = tokenizer(sample_text, return_tensors="pt", max_length=512, truncation=True)
summary_ids = model.generate(
    inputs["input_ids"],
    max_length=128,
    num_beams=5,
    length_penalty=0.9
)
print("\nOriginal Article:\n", sample_text[:600], "...")
print("\nGenerated Summary:\n", tokenizer.decode(summary_ids[0], skip_special_tokens=True))



Original Article:
 access to large data sets on human activities and interactions has long been limited by the difficulty and cost of gathering such information . recently , the ever increasing availability of digital traces of human actions is widely enabling the representation and the analysis of massive amounts of information on human behavior . the representation of this information in terms of complex networks has led to many research efforts because of the naturally interlinked nature of these new data sources . tracing human behavior in a variety of contexts has become possible at very different spatial a ...

Generated Summary:
 the increasing availability of digital traces of human activities and interactions has led to many research efforts because of the naturally interlinked nature of these new data sources . recent technological advances further support mining real - world interactions by means of mobile devices and wearable sensors , opening up new avenues for gathering 

In [7]:
pip install wikipedia


Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=87564d9c5542e87fe3a408b58aff55881dddfe239460d3925ffb51172055dd0b
  Stored in directory: /root/.cache/pip/wheels/63/47/7c/a9688349aa74d228ce0a9023229c6c0ac52ca2a40fe87679b8
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import wikipedia
import torch
model_path = "/content/drive/MyDrive/pegasus_arxiv_finetuned"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

def get_topic_text(topic, sentences=20):
    try:
        wikipedia.set_lang("en")
        page = wikipedia.page(topic)
        summary = wikipedia.summary(topic, sentences=sentences)
        return summary
    except wikipedia.exceptions.DisambiguationError as e:
        return f"Topic too broad. Suggestions: {e.options[:5]}"
    except Exception as e:
        return f"Error fetching topic: {str(e)}"

# Summarization function with chunking
def summarize_text(text, max_chunk_tokens=500, max_summary_length=150):
    words = text.split()
    chunks = [" ".join(words[i:i + max_chunk_tokens]) for i in range(0, len(words), max_chunk_tokens)]

    summaries = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding="longest")
        with torch.no_grad():
            summary_ids = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_summary_length,
                min_length=50,
                num_beams=4,
                length_penalty=1.0,
                early_stopping=True,
                no_repeat_ngram_size=3
            )
        summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary_text)

    return " ".join(summaries)

# Full pipeline for user topic
def summarize_topic(topic):
    text = get_topic_text(topic, sentences=30)
    if "Error" in text or "Suggestions" in text:
        return text
    summary = summarize_text(text)
    return summary

In [10]:
user_topic = input("Enter your topic: ")
final_summary = summarize_topic(user_topic)
print("\n=== SUMMARY ===")
print(final_summary)

Enter your topic: Blockchain

=== SUMMARY ===
blockchain is a distributed ledger with growing lists of records that are securely linked via cryptographic hashblocks , where each transaction is represented by a tree containing data from the previous block , and each block contains data from subsequent blocks that are changed without altering them .
