In [14]:
import evals
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from trl import SFTConfig, SFTTrainer
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

In [3]:
"""
Load model and tokenizer.
"""
model_id = "GEB-AGI/geb-1.3b"
model = AutoModel.from_pretrained(model_id, trust_remote_code=True).bfloat16() #.cuda()

# modified tokenizer class due to version incompatibility bug.
tokenizer = AutoTokenizer.from_pretrained("/home/javen/Projects/geb-1.3b", trust_remote_code=True)
print(tokenizer.special_tokens)
tokenizer.add_special_tokens({'pad_token': '<pad>'})
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

[2024-10-25 12:43:06,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cpu (auto detect)
{'<bos>': 1, '<eos>': 2, '<pad>': 0}


1

In [8]:
def clean_dataset(dataset):
    df = pd.DataFrame(dataset)
    print(len(df))
    df = df.dropna()
    df = df.iloc[:100]
    print(len(df))
    return Dataset.from_pandas(df)

In [9]:
"""
Load & prepare WikiHow dataset.

https://huggingface.co/datasets/gursi26/wikihow-cleaned
https://github.com/mahnazkoupaee/WikiHow-Dataset
"""
dataset = load_dataset("gursi26/wikihow-cleaned", split="train")
dataset = clean_dataset(dataset)
dataset = dataset.train_test_split(test_size=0.15)
print(dataset)

214293
100
DatasetDict({
    train: Dataset({
        features: ['summary', 'title', 'text', '__index_level_0__'],
        num_rows: 85
    })
    test: Dataset({
        features: ['summary', 'title', 'text', '__index_level_0__'],
        num_rows: 15
    })
})


In [12]:
"""
Create Trainer.
"""
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    print(eval_pred)
    predictions = np.argmax(logits, axis=-1)
    rouge_scores = evals.calculate_rouge(predictions, labels)
    return rouge_scores

training_args = SFTConfig(
    dataset_text_field="text",
    max_seq_length=512,
    output_dir='./output',
    learning_rate=5e-05,
    logging_steps=1,
    logging_dir='./logs',
    log_level='debug',
    save_steps=5,
    use_cpu=True,
    label_names=['summary'],
    max_steps=2,
    # num_train_epochs=3,
    # eval_strategy='epoch',
    eval_strategy='steps',
    load_best_model_at_end=True,
    # metric_for_best_model='eval_loss',
    greater_is_better=False,
)

trainer = SFTTrainer(
    model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Map:   0%|          | 0/85 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [13]:
"""
Train model.
"""
trainer.train()

Currently training with a batch size of: 8
***** Running training *****
  Num examples = 85
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1
  Number of trainable parameters = 1,348,044,800


Step,Training Loss,Validation Loss
1,3.0469,No log



***** Running Evaluation *****
  Num examples = 15
  Batch size = 8
Saving model checkpoint to ./output/checkpoint-1
Configuration saved in ./output/checkpoint-1/config.json
Configuration saved in ./output/checkpoint-1/generation_config.json
Model weights saved in ./output/checkpoint-1/model.safetensors
tokenizer config file saved in ./output/checkpoint-1/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-1/special_tokens_map.json


KeyError: "The `metric_for_best_model` training argument is set to 'eval_loss', which is not found in the evaluation metrics. The available evaluation metrics are: []. Consider changing the `metric_for_best_model` via the TrainingArguments."