In [1]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset

# Load small XSum dataset
dataset = load_dataset("xsum", split="train[:10000]").train_test_split(test_size=0.1)

# Load model and tokenizer
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Determine if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Preprocessing function
def preprocess_function(examples):
    inputs = tokenizer(examples["document"], padding="max_length", truncation=True, max_length=256)
    targets = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=64)
    inputs["labels"] = targets["input_ids"]
    return inputs

# Tokenize dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training arguments (evaluation removed as requested)
training_args = TrainingArguments(
    output_dir="./bart-xsum-small",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    save_total_limit=1,
    fp16=True,  # Use 16-bit precision for GPUs
    logging_steps=10,
    report_to="none",
    save_steps=100,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()

# Function to generate summary of a given length
def generate_summary(input_text, max_length=256):  # Increased max length
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        min_length=int(max_length * 0.75),  # Ensures a minimum summary length
        num_beams=8,
        early_stopping=True,
        length_penalty=1.2,
        no_repeat_ngram_size=3,
        repetition_penalty=2.0,
        temperature=0.9,
        top_k=50,
        top_p=0.95
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary




  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 9000/9000 [00:29<00:00, 302.81 examples/s]
Map: 100%|██████████| 1000/1000 [00:03<00:00, 289.61 examples/s]
  trainer = Trainer(
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss
10,9.2532
20,6.63
30,4.7232
40,3.6297
50,2.8781
60,2.4289
70,1.8275
80,1.5048
90,1.3361
100,1.3131




In [3]:
# Save the fine-tuned model and tokenizer
save_directory = "./bart-xsum-finetuned"

# Save model
model.save_pretrained(save_directory)

# Save tokenizer
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to '{save_directory}'")


Model and tokenizer saved to './bart-xsum-finetuned'


In [5]:
# Take user input
input_text = input("Please enter the text you want to summarize: ")

# Generate a longer summary
summary = generate_summary(input_text, max_length=64)

# Print summary
print("\nGenerated Summary (long):", summary)


Generated Summary (long): Advances in artificial intelligence are revolutionising the way we think about business, and how it can help us improve our lives by more than half a million dollars (Â£1.5bn; £1.8bn) over the next decade.
