In [None]:
!pip install transformers datasets evaluate rouge_score py7zr

### Login to HuggingFace

In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Load Dataset

In [2]:
from datasets import load_dataset, DatasetDict, Dataset

# Load dataset from the hub
dataset = load_dataset("samsum")

modified_data = DatasetDict({
      "train":Dataset.from_dict(dataset["train"][:2000]),
      "test":Dataset.from_dict(dataset["test"][:])
                 })

print(f"Train dataset size: {len(modified_data['train'])}")
print(f"Test dataset size: {len(modified_data['test'])}")

Downloading and preparing dataset samsum/samsum (download: 2.81 MiB, generated: 10.04 MiB, post-processed: Unknown size, total: 12.85 MiB) to /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/3f7dba43be72ab10ca66a2e0f8547b3590e96c2bd9f2cbb1f6bb1ec1f1488ba6...


Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Dataset samsum downloaded and prepared to /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/3f7dba43be72ab10ca66a2e0f8547b3590e96c2bd9f2cbb1f6bb1ec1f1488ba6. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Train dataset size: 2000
Test dataset size: 819


In [3]:
modified_data["train"][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}

### Load Tokenizer

In [4]:
from transformers import AutoTokenizer

checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

### Define Preprocess Function

In [5]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

### Preprocess Data

In [6]:
tokenized_data = modified_data.map(preprocess_function,batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### Define Data Collator

In [7]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

2024-02-18 16:21:31.834412: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-18 16:21:31.834517: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-18 16:21:31.956925: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Exception ignored in: <function _xla_gc_callback at 0x7fbbf72fdea0>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/jax/_src/lib/__init__.py", line 97, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


### Load Rouge Metrics

In [8]:
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [9]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

### Initialize model

In [10]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### Define Training Arguments and start Finetuning

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir="T5-small-summarization",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.116498,0.338,0.1186,0.2811,0.2813,16.7595
2,No log,2.021013,0.3612,0.1338,0.2982,0.2985,16.5592
3,No log,1.983838,0.3652,0.1384,0.3034,0.304,16.1197
4,No log,1.962277,0.3715,0.142,0.3077,0.3079,16.2308
5,No log,1.951277,0.3727,0.1441,0.308,0.3084,16.1453
6,No log,1.941851,0.375,0.1438,0.309,0.3093,16.2234
7,No log,1.937559,0.3748,0.144,0.3102,0.3104,16.1465
8,2.245200,1.932376,0.3754,0.1451,0.3098,0.3099,16.1893
9,2.245200,1.930173,0.3769,0.1459,0.3112,0.3113,16.1966
10,2.245200,1.92943,0.3772,0.1453,0.3105,0.3106,16.1832




TrainOutput(global_step=630, training_loss=2.2192771790519594, metrics={'train_runtime': 652.2761, 'train_samples_per_second': 30.662, 'train_steps_per_second': 0.966, 'total_flos': 2419847968849920.0, 'train_loss': 2.2192771790519594, 'epoch': 10.0})

### Push To huggingface Hub

In [12]:
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1708273308.6a6946c5f35f.109.0:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/likhith231/T5-small-summarization/commit/7481be01bf76991cf17bcf1e91caa65e2988dced', commit_message='End of training', commit_description='', oid='7481be01bf76991cf17bcf1e91caa65e2988dced', pr_url=None, pr_revision=None, pr_num=None)

### Inference

In [13]:
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."


In [16]:
from transformers import pipeline

summarizer = pipeline("summarization", model="likhith231/T5-small-summarization")
summarizer(text)

config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Your max_length is set to 200, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


[{'summary_text': "the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history."}]

In [17]:
text1= "The history of India is a tapestry woven with the threads of ancient civilizations, remarkable empires, and diverse cultures. From the sophisticated urban planning of the Indus Valley Civilization to the grandeur of the Maurya Empire under Ashoka's rule, India's past is a saga of innovation, conquest, and enlightenment. The Gupta Dynasty ushered in a golden age of art, science, and literature, while the Mughal Empire left an indelible mark with its architectural marvels like the Taj Mahal. The struggle for independence led by Mahatma Gandhi against British colonial rule culminated in 1947, marking the birth of modern India as a sovereign nation. Today, India stands as a vibrant mosaic of tradition and modernity, shaped by the rich tapestry of its historical legacy."
summarizer(text1)

Your max_length is set to 200, but your input_length is only 195. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=97)


[{'summary_text': "India's history is a tapestry woven with the threads of ancient civilizations, remarkable empires, and diverse cultures. The Gupta Dynasty ushered in a golden age of art, science, and literature, while the Mughal Empire left an indelible mark with its architectural marvels like Taj Mahal."}]

In [18]:
text2= "The history of Germany is a riveting tale of triumphs, setbacks, and resilience that has shaped the course of Europe and the world. From the legendary battles of the Germanic tribes against the Roman Empire to the formation of the Holy Roman Empire under Charlemagne's reign, Germany's early history is marked by a complex tapestry of kingdoms and principalities. The Renaissance and Reformation periods brought profound cultural and religious transformations, with figures like Martin Luther sparking movements that reverberated across Europe.The rise of Prussia in the 18th century laid the groundwork for German unification, culminating in the formation of the German Empire under Otto von Bismarck's leadership in 1871. However, the empire's ambitions would contribute to the outbreak of two devastating world wars in the 20th century, with Germany emerging as a central player in both conflicts."
summarizer(text2)

Your max_length is set to 200, but your input_length is only 198. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=99)


[{'summary_text': "Germany's early history is marked by a complex tapestry of kingdoms and principalities. The rise of Prussia in the 18th century laid the groundwork for German unification, culminating in the formation of the German Empire under Otto von Bismarck's leadership in 1871."}]