In [None]:
!pip install transformers datasets rouge_score

from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
from rouge_score import rouge_scorer
import numpy as np


# Load the pre-trained T5-small checkpoint and its tokenizer from the Hub.
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Only the first 100 test examples are used, to keep evaluation quick.
# The xsum repository ships a loading script; trust_remote_code=True accepts
# it up front so the cell does not stop on an interactive "[y/N]" prompt.
dataset = load_dataset('xsum', split='test[:100]', trust_remote_code=True)


# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def evaluate_model(model, label="Pre-trained"):
    """Summarize every example in ``dataset`` and report mean ROUGE F1 scores.

    Uses the module-level ``dataset``, ``tokenizer`` and ``scorer``. Each
    document is prefixed with ``"summarize: "``, truncated to 512 tokens and
    decoded with 4-beam search (40-150 generated tokens, length penalty 2.0).

    Args:
        model: The seq2seq model to evaluate; must provide ``generate``.
        label: Prefix used in the printed report. Defaults to
            ``"Pre-trained"`` so existing callers print the same text, while
            other checkpoints (e.g. a fine-tuned one) can be labelled
            correctly.

    Returns:
        dict: Mean F1 per metric, keyed by ``'rouge1'``, ``'rouge2'`` and
        ``'rougeL'``. Values are ``nan`` when the dataset is empty.
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')
    f1_scores = {name: [] for name in metric_names}

    # Make sure dropout is disabled while generating, and put the model back
    # in its previous mode afterwards so a training loop is not disturbed.
    was_training = model.training
    model.eval()
    try:
        for example in dataset:
            # Keep the input ids on the same device as the model so that a
            # model living on GPU can be evaluated as well.
            inputs = tokenizer.encode(
                "summarize: " + example['document'],
                return_tensors="pt",
                max_length=512,
                truncation=True,
            ).to(model.device)
            summary_ids = model.generate(
                inputs,
                max_length=150,
                min_length=40,
                length_penalty=2.0,
                num_beams=4,
                early_stopping=True,
            )
            generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

            # RougeScorer.score takes (target, prediction) in that order.
            scores = scorer.score(example['summary'], generated_summary)
            for name in metric_names:
                f1_scores[name].append(scores[name].fmeasure)
    finally:
        model.train(was_training)

    # Guard the empty case explicitly instead of letting np.mean([]) warn.
    means = {
        name: float(np.mean(values)) if values else float('nan')
        for name, values in f1_scores.items()
    }

    print(f"{label} ROUGE-1 F1 Score: {means['rouge1']:.4f}")
    print(f"{label} ROUGE-2 F1 Score: {means['rouge2']:.4f}")
    print(f"{label} ROUGE-L F1 Score: {means['rougeL']:.4f}")
    return means

def summarize_text(text, max_length=150, min_length=40, length_penalty=2.0, num_beams=4):
    """Return a summary of ``text`` generated by the module-level ``model``.

    Args:
        text: Raw text to summarize. Surrounding whitespace is stripped and
            newlines are replaced with spaces before tokenization.
        max_length: Upper bound passed to ``model.generate``.
        min_length: Lower bound passed to ``model.generate``.
        length_penalty: Length penalty passed to ``model.generate``.
        num_beams: Number of beams passed to ``model.generate``.

    Returns:
        The decoded summary string with special tokens removed.
    """
    flattened = text.strip().replace("\n", " ")

    # The prompt is truncated to at most 512 tokens.
    token_ids = tokenizer.encode(
        "summarize: " + flattened,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )

    generation_options = dict(
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )
    generated = model.generate(token_ids, **generation_options)

    # Only the first returned sequence is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)


def summarize_from_file(input_file):
    """Read a UTF-8 text file, summarize its contents and print the result.

    Args:
        input_file: Path of the text file to summarize.
    """
    with open(input_file, 'r', encoding='utf-8') as handle:
        contents = handle.read()

    print("Summarized Text: \n", summarize_text(contents))

# Print mean ROUGE F1 scores for the model loaded above on the dataset slice.
evaluate_model(model)

# Text file to summarize (absolute path under /content).
input_file = '/content/ai.txt'

# Print the generated summary of that file.
summarize_from_file(input_file)


Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-n

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


xsum.py:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

The repository for xsum contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/xsum.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


(…)SUM-EMNLP18-Summary-Data-Original.tar.gz:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.72M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Pre-trained ROUGE-1 F1 Score: 0.1897
Pre-trained ROUGE-2 F1 Score: 0.0278
Pre-trained ROUGE-L F1 Score: 0.1331
Summarized Text: 
 artificial intelligence is one of the most transformative technologies of the modern era. it refers to the simulation of human intelligence processes by machines. these processes include learning, reasoning, problem-solving, perception, and natural language understanding. the concept of AI can be trace back to ancient myths and stories.


In [None]:
!pip uninstall -y torch torchvision torchaudio

!pip install torch torchvision torchaudio

!pip install transformers datasets

Found existing installation: torch 2.3.1+cu121
Uninstalling torch-2.3.1+cu121:
  Successfully uninstalled torch-2.3.1+cu121
Found existing installation: torchvision 0.18.1+cu121
Uninstalling torchvision-0.18.1+cu121:
  Successfully uninstalled torchvision-0.18.1+cu121
Found existing installation: torchaudio 2.3.1+cu121
Uninstalling torchaudio-2.3.1+cu121:
  Successfully uninstalled torchaudio-2.3.1+cu121
Collecting torch
  Downloading torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision
  Downloading torchvision-0.19.1-cp310-cp310-manylinux1_x86_64.whl.metadata (6.0 kB)
Collecting torchaudio
  Downloading torchaudio-2.4.1-cp310-cp310-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_



In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset

# Start fine-tuning from the pre-trained T5-small checkpoint.
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Use the first 1% of the XSum training split.
# The xsum repository ships a loading script; trust_remote_code=True accepts
# it up front so the cell does not stop on an interactive "[y/N]" prompt.
dataset = load_dataset('xsum', split='train[:1%]', trust_remote_code=True)

def preprocess_function(examples):
    """Tokenize a batch of documents and summaries for seq2seq training.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch with ``"document"`` and ``"summary"`` lists of strings.

    Returns:
        The tokenized inputs (padded/truncated to 512 tokens) with an added
        ``"labels"`` entry holding the summary token ids (padded/truncated to
        150 tokens), where padding positions are set to -100.
    """
    inputs = ["summarize: " + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    labels = tokenizer(examples["summary"], max_length=150, truncation=True, padding='max_length')

    # Padding tokens in the labels must not contribute to the loss: -100 is
    # the index the loss ignores. Leaving the pad id in place makes the model
    # train on (and the reported loss be dominated by) predicting padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the whole split; batched=True hands preprocess_function lists of examples.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

from transformers import DataCollatorForSeq2Seq
# Collator that assembles tokenized examples into training batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# One epoch with a per-device batch size of 2; outputs go to ./results, logs to ./logs.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_dir='./logs',
)

# No eval dataset is passed, so only training is run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator
)

trainer.train()

# Save model and tokenizer side by side so both can be reloaded from ./fine_tuned_t5.
model.save_pretrained("./fine_tuned_t5")
tokenizer.save_pretrained("./fine_tuned_t5")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/2040 [00:00<?, ? examples/s]

Step,Training Loss
500,1.2613
1000,0.6157


('./fine_tuned_t5/tokenizer_config.json',
 './fine_tuned_t5/special_tokens_map.json',
 './fine_tuned_t5/spiece.model',
 './fine_tuned_t5/added_tokens.json')

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the model and tokenizer from the local ./fine_tuned_t5 directory.
model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_t5")
tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_t5")

def summarize_text(text, max_length=150, min_length=40, length_penalty=2.0, num_beams=4):
    """Summarize ``text`` using the module-level ``model`` and ``tokenizer``.

    Args:
        text: Text to summarize; it is stripped and its newlines are turned
            into spaces before the ``"summarize: "`` prefix is added.
        max_length: ``max_length`` forwarded to ``model.generate``.
        min_length: ``min_length`` forwarded to ``model.generate``.
        length_penalty: ``length_penalty`` forwarded to ``model.generate``.
        num_beams: ``num_beams`` forwarded to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string, special tokens
        skipped.
    """
    single_line = text.strip().replace("\n", " ")
    prompt = "summarize: " + single_line

    # Prompts longer than 512 tokens are truncated.
    encoded_prompt = tokenizer.encode(
        prompt, return_tensors="pt", max_length=512, truncation=True
    )

    output_ids = model.generate(
        encoded_prompt,
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )

    first_sequence = output_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


def summarize_from_file(input_file):
    """Print a summary of the UTF-8 text stored in ``input_file``.

    Args:
        input_file: Path to the file whose full contents are summarized.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        document = source.read()

    result = summarize_text(document)
    print("Summarized Text: \n", result)


# Text file to summarize (absolute path under /content).
input_file = '/content/ai.txt'


# Print the summary produced by the model loaded in this cell.
summarize_from_file(input_file)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Summarized Text: 
 The term "artificial intelligence" was coined in 1956 by a British mathematician and logician. it refers to AI systems that have the ability to perform any intellectual task that a human being can do.
