In [None]:
!pip install -U accelerate --user
!pip install -U transformers --user
!pip install rouge_score evaluate

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_metric
import torch
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
import nltk
import numpy as np

In [None]:
# Use the GPU when PyTorch reports one is available; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Checkpoint identifier shared by the tokenizer and the model.
model_name = 'google-t5/t5-small'

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the seq2seq model, then move it onto the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

In [None]:
# Path of the reviews CSV. This is the path the notebook actually reads;
# the variable previously held a differently-cased path and was never used.
dataset_file = './data/Reviews.csv'

# Only the review body and its summary are used, so load just those columns
# and fix their order.
data = pd.read_csv(dataset_file, usecols=['Summary', 'Text'])
data = data.loc[:, ['Summary', 'Text']]

# Drop rows missing either field. Because unused columns are not loaded,
# nulls in them no longer discard otherwise usable reviews.
data = data.dropna(subset=['Summary', 'Text'])
data = data.reset_index(drop=True)

# Lower-case both text fields with the vectorized string accessor.
data['Text'] = data['Text'].astype(str).str.lower()
data['Summary'] = data['Summary'].astype(str).str.lower()

data.head()

In [None]:
def preprocess(data, max_input_length=128, max_target_length=16, prefix='summarize: '):
    """Tokenize a batch of reviews and their summaries for seq2seq training.

    Args:
        data: Mapping with a "Text" entry (iterable of source strings) and a
            "Summary" entry (target strings).
        max_input_length: Token limit for the prefixed source text; longer
            inputs are truncated.
        max_target_length: Token limit for the target summary; longer
            summaries are truncated.
        prefix: String prepended to every source text before tokenization.

    Returns:
        The tokenizer output for the inputs, with the target token ids stored
        under the "labels" key.
    """
    inputs = [prefix + text for text in data["Text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # `text_target` tells the tokenizer these strings are targets, not inputs.
    labels = tokenizer(text_target=data["Summary"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Shuffle every row with a fixed seed so the split below is repeatable.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Fraction of rows assigned to training; the rest is held out.
train_split = 0.8
test_split = 1-train_split

# Row position where the held-out partition begins.
_split_row = int(len(data)*train_split)

train_data = data.iloc[:_split_row].reset_index(drop=True)
test_data = data.iloc[_split_row:].reset_index(drop=True)

In [None]:
# Wrap both pandas frames as `Dataset` objects (the names are rebound).
train_data, test_data = (
    Dataset.from_pandas(frame) for frame in (train_data, test_data)
)

# Tokenize each split in batches with `preprocess`.
tokenized_train, tokenized_valid = (
    split.map(preprocess, batched=True) for split in (train_data, test_data)
)

In [None]:
# Collator used by the trainer to assemble batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# ROUGE scorer consumed by `compute_metrics`.
# NOTE(review): `load_metric` is deprecated in `datasets` in favour of the
# `evaluate` package — confirm the installed `datasets` version still ships it.
metric = load_metric(path="rouge", trust_remote_code=True)

In [None]:
## code from a Huggingface notebook: https://github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb
def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: Pair ``(preds, rfs)`` of token-id arrays: generated ids
            and reference label ids. ``preds`` may also be a tuple whose
            first element is the generated ids.

    Returns:
        Dict with one entry per key returned by the metric (F-measure scaled
        to 0-100) plus "gen_len", the mean number of non-pad tokens per
        prediction. All values are rounded to 4 decimals.
    """
    preds, rfs = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a real token id. Replace it on
    # both sides so decoding works and padding is not counted as length.
    preds = np.where(preds != -100, preds, pad_id)
    rfs = np.where(rfs != -100, rfs, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_refs = tokenizer.batch_decode(rfs, skip_special_tokens=True)

    # One sentence per line before scoring.
    # NOTE(review): `sent_tokenize` needs NLTK's sentence-tokenizer data; no
    # download step is visible in this notebook — confirm it is installed.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_refs = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_refs]

    # The metric API takes `predictions=`, not `preds=`.
    scores = metric.compute(predictions=decoded_preds, references=decoded_refs, use_stemmer=True)

    # Accept both aggregate scores (`.mid.fmeasure`) and plain floats.
    metrics = {
        key: getattr(getattr(value, "mid", value), "fmeasure", value) * 100
        for key, value in scores.items()
    }

    pred_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    metrics["gen_len"] = np.mean(pred_lens)
    return {k: round(v, 4) for k, v in metrics.items()}

In [None]:
# Per-device batch size used for both training and evaluation.
batch_size = 32

args = Seq2SeqTrainingArguments(
    f"{model_name}_amazon",  # output directory for checkpoints
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=1e-2,
    save_total_limit=3,  # keep at most three checkpoints on disk
    num_train_epochs=45,
    predict_with_generate=True,  # compute_metrics decodes generated token ids
    # Mixed precision needs a CUDA device. The notebook falls back to CPU
    # when none is present, so enable fp16 only when CUDA is available
    # instead of hard-coding True.
    fp16=torch.cuda.is_available(),
    eval_strategy="epoch",  # evaluate at the end of every epoch
)

In [None]:
# Bundle the model, hyper-parameters, tokenized splits, batching and scoring
# into one trainer object.
# NOTE(review): recent transformers releases deprecate the `tokenizer`
# keyword on trainers — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the trainer configured from `args`, `tokenized_train` and
# `tokenized_valid`. As the last expression, its return value is the cell output.
trainer.train()

In [None]:
# Evaluate on the trainer's eval dataset (`tokenized_valid`). As the last
# expression, the returned metrics are the cell output.
trainer.evaluate()