In [2]:
from datasets import load_dataset

samsum_train_dataset = load_dataset("csv", data_files={"train": "/kaggle/input/samsum-dataset-text-summarization/samsum-test.csv"})
samsum_test_dataset = load_dataset("csv", data_files={"test": "/kaggle/input/samsum-dataset-text-summarization/samsum-test.csv"})
samsum_validate_dataset = load_dataset("csv", data_files={"validation": "/kaggle/input/samsum-dataset-text-summarization/samsum-validation.csv"})



Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [3]:
samsum_train_dataset["train"][1]

{'id': '13729565',
 'dialogue': "Eric: MACHINE!\r\nRob: That's so gr8!\r\nEric: I know! And shows how Americans see Russian ;)\r\nRob: And it's really funny!\r\nEric: I know! I especially like the train part!\r\nRob: Hahaha! No one talks to the machine like that!\r\nEric: Is this his only stand-up?\r\nRob: Idk. I'll check.\r\nEric: Sure.\r\nRob: Turns out no! There are some of his stand-ups on youtube.\r\nEric: Gr8! I'll watch them now!\r\nRob: Me too!\r\nEric: MACHINE!\r\nRob: MACHINE!\r\nEric: TTYL?\r\nRob: Sure :)",
 'summary': 'Eric and Rob are going to watch a stand-up on youtube.'}

In [4]:
print(samsum_test_dataset.keys())

dict_keys(['test'])


In [5]:
from transformers import pipeline

text_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [6]:
text_summarizer(samsum_train_dataset["train"][1]["dialogue"], max_length=20, min_length=10, do_sample= False )

[{'summary_text': 'Rob: Is this his only stand-up? Eric: Sure. Rob: Id'}]

In [7]:
# Fine tune the SamSUM model to improve the summarize performance
# Add the BART tokenizer and model
from transformers import BartForConditionalGeneration, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

In [8]:
def preprocessData(records, tokenizer, max_length_preprocess=128):
    sources = records["dialogue"]
    targets = records["summary"]

    input_encoding = tokenizer(sources, max_length=max_length_preprocess, padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        output_encoding = tokenizer(targets, max_length=max_length_preprocess, padding="max_length", truncation=True)

    # Return as lists to ensure compatibility with DataLoader
    return {
        "input_ids": input_encoding["input_ids"],
        "attention_mask": input_encoding["attention_mask"],
        "labels": output_encoding["input_ids"],
    }


In [9]:
pip install evaluate


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [10]:
#Apply the preprocessing task
train_data = samsum_train_dataset['train'].map(
    lambda x: preprocessData(x, tokenizer),
    batched=True
)
test_data = samsum_test_dataset['test'].map(
    lambda x: preprocessData(x, tokenizer),
    batched=True
)
validate_data = samsum_validate_dataset["validation"].map(
    lambda x: preprocessData(x, tokenizer),
    batched=True
)
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
validate_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/819 [00:00<?, ? examples/s]



Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [11]:
#Convert the preprocessed datasets to PyTorch DataLoader
from transformers import BartForConditionalGeneration, AutoTokenizer, AdamW
from torch.utils.data import DataLoader
import evaluate
import torch

def create_dataloader(dataset, batch_size):
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=lambda x: {
            "input_ids": torch.stack([item["input_ids"] for item in x]),
            "attention_mask": torch.stack([item["attention_mask"] for item in x]),
            "labels": torch.stack([item["labels"] for item in x]),
        },
    )


In [12]:
batch_size = 128
train_dataloader = create_dataloader(train_data, batch_size)
validate_dataloader = create_dataloader(validate_data, batch_size)
test_dataloader = create_dataloader(test_data, batch_size)

In [None]:
#Fine-tune the model
from transformers import AdamW

# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    print(f"Epoch {epoch + 1}/{epochs} - Loss: {total_loss / len(train_dataloader)}")

# Save the fine-tuned model
model.save_pretrained("./finetuned_bart_samsum")
tokenizer.save_pretrained("./finetuned_bart_samsum")

