In [10]:
import os
# Set before any tokenizer is created further down in the notebook.
# Presumably silences the Hugging Face tokenizers parallelism/fork warning — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings
# NOTE(review): this ignores every warning category for the rest of the session,
# including deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore") 

In [11]:
# from huggingface_hub import notebook_login

# notebook_login()

In [12]:
import datasets
from datasets import load_dataset

# The CSV is loaded as one split; it exposes the 'Content' and 'Summary' columns.
df = load_dataset('csv', data_files='data/data.csv', split='train')

# Hold out 20% of the rows for evaluation. A fixed seed keeps the partition
# identical across kernel restarts, so later cells (tokenization, the dataset
# pushed to the Hub, evaluation scores) always refer to the same rows.
dataset = df.train_test_split(test_size=.2, seed=42)
dataset

Using custom data configuration default-ddba5c8acb0ba08a
Found cached dataset csv (/home/hsong1101/.cache/huggingface/datasets/csv/default-ddba5c8acb0ba08a/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


DatasetDict({
    train: Dataset({
        features: ['Content', 'Summary'],
        num_rows: 696389
    })
    test: Dataset({
        features: ['Content', 'Summary'],
        num_rows: 174098
    })
})

In [13]:
# Report how many rows ended up in each split.
n_train = len(dataset['train'])
n_test = len(dataset['test'])
print(f"Train dataset size: {n_train}")
print(f"Test dataset size: {n_test}")

Train dataset size: 696389
Test dataset size: 174098


In [14]:
from random import randrange

# Pick one training row at a random index and show both of its text fields.
train_split = dataset['train']
sample = train_split[randrange(len(train_split))]

print(f"Content: \n{sample['Content']}\n---------------")
print(f"Summary: \n{sample['Summary']}\n---------------")

Content: 
By . Associated Press . Last updated at 5:14 AM on 2nd November 2011 . A Florida man pleaded not guilty today to hacking into the emails of stars such as Christina Aguilera, Mila Kunis and Scarlett Johansson, whose nude photos were eventually splashed on the Internet. Christopher Chaney, 35, of Jacksonville, Fla., made his first court appearance in California, where he's been indicted on 26 counts, including unauthorized access to a computer and wiretapping. Invaded: Naked pictures of Scarlett Johansson circulated the internet after they were stolen from her phone . If convicted, he faces up to 121 years in prison. U.S. Magistrate Judge Patrick Walsh denied federal prosecutors' request to remand Chaney to custody but modified his bond to $110,000, and he will wear an electronic monitoring device upon his return to Florida. A trial has been scheduled for Dec. 27. Chaney was arrested as part of a yearlong investigation of celebrity hacking that authorities dubbed 'Operation Hac

In [15]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint id on the Hugging Face Hub; the tokenizer is loaded from it here.
model_id = "google/flan-t5-base"

# Tokenizer belonging to the FLAN-T5 base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [16]:
from datasets import concatenate_datasets


def _tokenize_column(column):
    """Tokenize one text column over train + test using the tokenizer's default truncation."""
    combined = concatenate_datasets([dataset["train"], dataset["test"]])
    return combined.map(
        lambda batch: tokenizer(batch[column], truncation=True),
        batched=True,
        remove_columns=["Content", "Summary"],
    )


# Maximum total input sequence length after tokenization.
# Longer sequences will be truncated to it, shorter ones padded up to it.
tokenized_inputs = _tokenize_column("Content")
max_source_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# Maximum total sequence length for the target text after tokenization.
# Longer sequences will be truncated to it, shorter ones padded up to it.
tokenized_targets = _tokenize_column("Summary")
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")

A Jupyter Widget

Max target length: 512


In [22]:
def preprocess_function(sample, padding="max_length"):
    """Turn a batch of articles and summaries into model inputs with labels.

    Args:
        sample: batch mapping that provides the "Content" and "Summary" columns.
        padding: padding strategy forwarded unchanged to the tokenizer.

    Returns:
        The tokenizer output for the contents, extended with a "labels" entry
        holding the token ids of the summaries.
    """
    sources = list(sample["Content"])
    model_inputs = tokenizer(sources, max_length=max_source_length, padding=padding, truncation=True)

    # Summaries are passed through the `text_target` keyword argument.
    target_encoding = tokenizer(
        text_target=sample["Summary"], max_length=max_target_length, padding=padding, truncation=True
    )
    label_ids = target_encoding["input_ids"]

    # With fixed-length padding the pad positions are rewritten to -100 so
    # that they are ignored when the loss is computed.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        label_ids = [[-100 if token == pad_id else token for token in row] for row in label_ids]

    model_inputs["labels"] = label_ids
    return model_inputs

# Run the preprocessing over both splits in batches; the raw text columns are
# dropped so that only the tokenizer outputs and labels remain.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["Content", "Summary"],
)
feature_names = list(tokenized_dataset['train'].features)
print(f"Keys of tokenized dataset: {feature_names}")

A Jupyter Widget

A Jupyter Widget

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [32]:
# Upload the tokenized train and test splits to the Hub dataset repo named below.
# NOTE(review): this cell's execution count (32) is out of sequence with its
# neighbours, and the call presumably requires a prior Hub login — confirm.
tokenized_dataset.push_to_hub('hsong1101/news_summarization')

Pushing split train to the Hub.


A Jupyter Widget

Pushing split test to the Hub.


A Jupyter Widget

A Jupyter Widget

In [23]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
# Download the NLTK "punkt" package; presumably required by `sent_tokenize`,
# which postprocess_text calls — TODO confirm.
nltk.download("punkt")

# Metric: ROUGE, loaded through the `evaluate` library and used by compute_metrics.
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    """Prepare decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace and re-joined with one
    sentence per line, because rougeLSum expects a newline after each sentence.

    Args:
        preds: list of decoded prediction strings.
        labels: list of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple of lists with the reformatted strings.
    """
    def _one_sentence_per_line(texts):
        return ["\n".join(sent_tokenize(text.strip())) for text in texts]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

def compute_metrics(eval_preds):
    """Decode generated ids and reference ids, then score them with ROUGE.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also be a tuple, in which case its first element holds the ids.

    Returns:
        dict with every score returned by the ROUGE metric scaled to a
        percentage and rounded to 4 decimals, plus ``"gen_len"``: the mean
        number of non-pad tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 cannot be decoded. It appears in the labels (ignored loss positions)
    # and can also appear in the predictions when generated batches of
    # different lengths are padded together, so both are mapped back to the
    # pad token id before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    # Count only real tokens; the -100 fill was replaced above, so it no
    # longer inflates the generated length.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /home/hsong1101/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
import torch
from transformers import AutoModelForSeq2SeqLM

# huggingface hub model id
model_id = "google/flan-t5-base"

# Use the GPU when one is visible; otherwise stay on the CPU so this cell does
# not raise on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)

In [25]:
from transformers import DataCollatorForSeq2Seq

# Label positions filled with this id are the ones the loss should ignore
# (instead of the tokenizer's pad token).
label_pad_token_id = -100

# Seq2seq batch collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)

In [26]:
# Display the checkpoint id currently held in `model_id`.
model_id

'google/flan-t5-base'

In [27]:
# Name used below as the training output directory and as the Hub model id.
repository_id = "flan_t5_summarization"
repository_id

'flan_t5_summarization'

In [28]:
from huggingface_hub import create_repo, delete_repo

In [29]:
# delete_repo(repo_id=repository_id)

In [30]:
# Display `repository_id` again before it is used by the training arguments.
repository_id

'flan_t5_summarization'

In [None]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training configuration: checkpoints and logs go under `repository_id`, and
# every saved checkpoint is also pushed to the Hub repo of the same name.
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    # Boolean flag; the original passed the repository name string here, which
    # only worked because a non-empty string is truthy.
    overwrite_output_dir=True,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=10,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    # Evaluation and saving both happen once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # Pass the seq2seq collator configured earlier in the notebook; without
    # this argument it is never used and the Trainer falls back to its default
    # collator.
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)


In [None]:
# Start training with the Trainer configured in the previous cell.
# NOTE(review): this cell has no recorded output or execution count in the saved notebook.
trainer.train()

In [None]:
# Save the trainer's model to a local directory, separate from the training
# output directory.
trainer.save_model('model/flan_t5_summarizer')

In [None]:
# `Trainer.save_state()` accepts no path argument (it always writes into the
# training output directory), so calling it with a path raises TypeError.
# Write the TrainerState JSON explicitly to the intended location instead.
state_dir = 'model/flan_t5_summarizer_state'
os.makedirs(state_dir, exist_ok=True)
trainer.state.save_to_json(os.path.join(state_dir, 'trainer_state.json'))

In [None]:
# tokenizer.save_pretrained(repository_id)
# trainer.create_model_card()
# # Push the results to the hub
# trainer.push_to_hub()

In [None]:
# Draw one row at a random index from the held-out split.
test_split = dataset['test']
sample = test_split[randrange(len(test_split))]

In [None]:
# Encode a short smoke-test prompt and place it on whatever device the model
# currently lives on, rather than assuming CUDA, so inputs and weights always
# share a device.
sample_ids = tokenizer('this is just a test', return_tensors='pt').input_ids.to(model.device)

In [None]:
# Generate from the encoded prompt and show the first returned sequence as text.
output = model.generate(sample_ids)
decoded_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_text)