In [19]:
import evals
import torch
import logging
import transformers
import sys
import os
import nltk
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from trl import SFTConfig, SFTTrainer
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from peft import LoraConfig, PeftModel

In [4]:
"""
Initialize logger.
"""
transformers.utils.logging.set_verbosity_info()
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)

In [6]:
"""
Load model and tokenizer.
"""
# set seed before initializing model
set_seed(777)

# modified model & tokenizer code (due to transformers compatibility issue)
# model_id = "/home/javen/Projects/geb-1.3b"
# model_id = "GEB-AGI/geb-1.3b"
model_id = "t5-small"

# load model
# model = AutoModel.from_pretrained(model_id, trust_remote_code=True).bfloat16() #.cuda()
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, trust_remote_code=True).bfloat16()
# model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).bfloat16()

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# print(tokenizer.special_tokens)
tokenizer.add_special_tokens({'pad_token': '<pad>'})

loading configuration file config.json from cache at /home/javen/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/config.json
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
 

0

In [23]:
"""
Load & clean WikiHow dataset.

https://huggingface.co/datasets/gursi26/wikihow-cleaned
https://github.com/mahnazkoupaee/WikiHow-Dataset
"""
def clean_dataset(dataset):
    df = pd.DataFrame(dataset)
    print(len(df))
    df = df.dropna()
    # df = df.iloc[:100]
    print(len(df))
    return Dataset.from_pandas(df)

def load_dataset():
    df = pd.read_csv("/home/javen/Projects/wikihow-cleaned/wikihow-cleaned.csv")
    # dataset = load_dataset("gursi26/wikihow-cleaned", split="train")
    dataset = Dataset.from_pandas(df)
    return dataset

# load dataset
dataset = load_dataset()
dataset = clean_dataset(dataset)

# split dataset
a = dataset.train_test_split(test_size=0.1)
b = a['test'].train_test_split(test_size=0.5)
dataset = DatasetDict({
    'train': a['train'],
    'test': b['test'],
    'valid': b['train']
})
print(dataset)

214293
213892
DatasetDict({
    train: Dataset({
        features: ['summary', 'title', 'text', '__index_level_0__'],
        num_rows: 192502
    })
    test: Dataset({
        features: ['summary', 'title', 'text', '__index_level_0__'],
        num_rows: 10695
    })
    valid: Dataset({
        features: ['summary', 'title', 'text', '__index_level_0__'],
        num_rows: 10695
    })
})


In [24]:
"""
Data tokenization & collation.
"""

# data collator
label_pad_token_id = tokenizer.pad_token_id
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)

max_input_length = 1024
max_target_length = 128

if model_id in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples['text']]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # setup the tokenizer for targets
    labels = tokenizer(text_target=examples['summary'], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# apply tokenization
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/192502 [00:00<?, ? examples/s]

Map:   0%|          | 0/10695 [00:00<?, ? examples/s]

Map:   0%|          | 0/10695 [00:00<?, ? examples/s]

In [25]:
"""
Evaluation metrics.
"""
def compute_metrics(eval_preds):
    # prepare prediction data
    labels, preds = eval_preds.label_ids, eval_preds.predictions
    labels[labels == -100] = tokenizer.pad_token_id

    # decode
    preds_decoded = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels_decoded = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # calculate rouge scores
    rouge_scores = evals.calculate_rouge(labels_decoded, preds_decoded)

    # return metrics
    return rouge_scores

In [31]:
"""
Create Trainer.
"""
batch_size = 16
training_args = Seq2SeqTrainingArguments(
    learning_rate=5e-05,
    weight_decay=0.01,
    logging_dir='./logs',
    log_level='debug',
    output_dir='./output',
    save_steps=10,
    save_total_limit=3,
    use_cpu=True,
    eval_strategy='steps',
    eval_steps=10,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    # greater_is_better=False,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using auto half precision backend


In [None]:
"""
Train model.
"""
training_results = trainer.train()

Currently training with a batch size of: 16
The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: title, __index_level_0__, text, summary. If title, __index_level_0__, text, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 192,502
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 12,032
  Number of trainable parameters = 60,506,624


Step,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: title, __index_level_0__, text, summary. If title, __index_level_0__, text, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10695
  Batch size = 16


In [None]:
"""
Generate summaries
"""
def summarize(model, text: str):
    # encode input text
    inputs = tokenizer(text, return_tensors="pt")
    inputs_length = len(inputs["input_ids"][0])
    print(inputs_length)

    # generate new text with model
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.1,
        )
        # print(outputs[0][inputs_length:])
        print(len(outputs[0]))

    # decode generated output text
    decoded = tokenizer.decode(
        # outputs[0][inputs_length:],
        outputs[0][inputs_length:],
        skip_special_tokens=True
    )
    return decoded

def generate_summaries(model, dataset, tokenizer, num_samples=5):
    summaries = []
    for i, example in enumerate(dataset):
        if i >= num_samples:
            break
        print(i)
        prompt = example['text']
        summary = summarize(model, prompt)
        summaries.append({'text': prompt, 'summary': summary})
    return summaries

In [None]:
"""
Evaluate model.
"""
# summaries_hat = generate_summaries(model, dataset['test'], tokenizer)
# print(summaries_hat)