<a href="https://colab.research.google.com/github/itsmetoby/AI_model_test/blob/main/flanT5_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install evaluate Pillow==10.2.0



In [2]:
!pip install pytesseract transformers datasets rouge-score nltk py7zr --upgrade



In [3]:
dataset_id = "cnn_dailymail"

In [4]:
from datasets import load_dataset

dataset = load_dataset(dataset_id, "3.0.0")
print(f"Train dataset size: {len(dataset['train'])}")
print(f"test dataset size: {len(dataset['test'])}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Train dataset size: 287113
test dataset size: 11490


In [5]:
from random import randrange

sample = dataset['train'][randrange(len(dataset["train"]))]
print(f"article: \n{sample['article']}\n---")
print(f"highlights: \n{sample['highlights']}\n---")


article: 
---
highlights: 
MP John Mann said he handed evidence of 'abuse parties' to police in 1998 .
Claims case was closed within three months on orders of 'those at the top'
Police probing reports of murders linked to paedophile ring in 1970 and 80s .
---


In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [7]:
from datasets import concatenate_datasets

tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["article"], truncation=True), batched=True, remove_columns=["article", "highlights"])
max_source_length = max([len(x) for x in tokenized_inputs["id"]])
print(f"Max source length: {max_source_length}")

tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["highlights"], truncation=True), batched=True, remove_columns=["article", "highlights"])
max_target_length = max([len(x) for x in tokenized_targets["id"]])
print(f"Max target length: {max_target_length}")

Max source length: 40


Map:   0%|          | 0/298603 [00:00<?, ? examples/s]

Max target length: 40


In [8]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
inputs = ["highlights:" + item for item in sample["article"]]
model_inputs = tokenizer(inputs, max_length=max_source_length, padding="max_length", truncation=True)
labels = tokenizer(text_target=sample["article"], max_length=max_target_length, padding="max_length", truncation=True)
print(labels)

{'input_ids': [16117, 5220, 1079, 6362, 243, 3, 88, 14014, 2084, 13, 3, 31, 9, 3465, 15, 2251, 31, 147, 16, 6260, 68, 3213, 8, 495, 47, 3168, 441, 386, 767, 57, 8288, 16093, 3, 5, 5076, 130, 1219, 3, 9, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [10]:
def preprocess_function(sample, padding="max_length"):
  inputs = ["highlights:" + item for item in sample["article"]]
  model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
  labels = tokenizer(text_target=sample["article"], max_length=max_target_length, padding=padding, truncation=True)
  if padding == "max_length":
      labels["input_ids"] = [
          [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["article", "highlights", "id"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")


Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [11]:
from transformers import AutoModelForSeq2SeqLM

model_id="google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [12]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
from transformers import DataCollatorForSeq2Seq

label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)


In [14]:
! pip install -U accelerate
! pip install -U transformers



In [15]:
import accelerate

accelerate.__version__

'0.26.1'

In [16]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hugging Face repository id
repository_id = f"{model_id.split('/')[1]}-{dataset_id}"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=5,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)


In [17]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
mode.save_pretrained("flant5_model_v1")

In [None]:
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
trainer.push_to_hub()


In [None]:
from transformers import pipeline
from random import randrange

# load model and tokenizer from huggingface hub with pipeline
summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum", device=0)

# select a random test sample
sample = dataset['test'][randrange(len(dataset["test"]))]
print(f"dialogue: \n{sample['dialogue']}\n---------------")

# summarize dialogue
res = summarizer(sample["dialogue"])

print(f"flan-t5-base summary:\n{res[0]['summary_text']}")


In [27]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

inputs = tokenizer("Please summarize each main point in great details, especially how well they perform in each esg metric they mentioned: Increased representation in leadership In calendar year 2021, 47% of open leadership** roles were filled by women globally, an increase of 10 percentage points since 2020, and we’ve had an 87% increase in women in leadership since 2014. And in 2021, we hired more Black and Hispanic/Latinx team members in the U.S. than ever before, with 13% open leadership roles filled by Black candidates and 12% filled by Hispanic/Latinx candidates. Since 2014, we’ve had an 84% increase in the number of Black employees and a 90% increase in the number of Hispanic/Latinx employees in leadership in the U.S. -––> Continue reading on page 24 More than doubled renewable energy in our supply chain As of March 2022, 213 suppliers have committed to renewable electricity for Apple production, representing the majority of Apple’s direct supplier spend. In fiscal year 2021, Apple and its suppliers brought online over 10 megawatts of renewable energy in our supply chain, doubling the amount from the prior year. -––> Continue reading on page 16 Continued to maintain pay equity Since 2017, Apple has achieved and maintained gender pay equity for our employees worldwide. In the U.S., we’ve also achieved pay equity with respect to race and ethnicity — as well as pay equity at the intersections of race and ethnicity with gender.* -––> Continue reading on page 28 Carbon neutral for corporate emissions Since April 2020, we’ve achieved carbon neutrality for our corporate emissions by sourcing 100 percent renewable electricity for Apple facilities, implementing energy efficiency initiatives, and securing carbon offsets for remaining emissions. -––> Continue reading on page 13 Protected privacy with App Tracking Transparency With iOS 14.5 in April 2021, we released App Tracking Transparency for iPad and iPhone, requiring developers to obtain a user’s permission to track them across apps or websites owned by other companies for advertising purposes. -––> Continue reading on page 49", return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))


['Apple’s leadership has increased representation in leadership roles, and we’ve had an 87%']


In [29]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xxl")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
mytext= "Please summarize each main point in great details, especially how well they perform in each esg metric they mentioned: Increased representation in leadership In calendar year 2021, 47% of open leadership** roles were filled by women globally, an increase of 10 percentage points since 2020, and we’ve had an 87% increase in women in leadership since 2014. And in 2021, we hired more Black and Hispanic/Latinx team members in the U.S. than ever before, with 13% open leadership roles filled by Black candidates and 12% filled by Hispanic/Latinx candidates. Since 2014, we’ve had an 84% increase in the number of Black employees and a 90% increase in the number of Hispanic/Latinx employees in leadership in the U.S. More than doubled renewable energy in our supply chain As of March 2022, 213 suppliers have committed to renewable electricity for Apple production, representing the majority of Apple’s direct supplier spend. In fiscal year 2021, Apple and its suppliers brought online over 10 megawatts of renewable energy in our supply chain, doubling the amount from the prior year.  Continued to maintain pay equity Since 2017, Apple has achieved and maintained gender pay equity for our employees worldwide. In the U.S., we’ve also achieved pay equity with respect to race and ethnicity — as well as pay equity at the intersections of race and ethnicity with gender. Carbon neutral for corporate emissions Since April 2020, we’ve achieved carbon neutrality for our corporate emissions by sourcing 100 percent renewable electricity for Apple facilities, implementing energy efficiency initiatives, and securing carbon offsets for remaining emissions. Protected privacy with App Tracking Transparency With iOS 14.5 in April 2021, we released App Tracking Transparency for iPad and iPhone, requiring developers to obtain a user’s permission to track them across apps or websites owned by other companies for advertising purposes"
inputs = tokenizer(mytext, return_tensors="pt")
outputs = model.generate(**inputs, min_length=0, max_new_tokens=512, length_penalty=2, num_beans=16, no_repeat_ngran_size=2, early_stopping=True)
output_text = tokenizer.batch.decode(outputs, skip_special_tokens=True)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))


Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/9.60G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/6.06G [00:00<?, ?B/s]

ChunkedEncodingError: ('Connection broken: IncompleteRead(1262905957 bytes read, 4800248283 more expected)', IncompleteRead(1262905957 bytes read, 4800248283 more expected))