In [61]:
import evals
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from trl import SFTConfig, SFTTrainer

In [26]:
model_id = "GEB-AGI/geb-1.3b"
model = AutoModel.from_pretrained(model_id, trust_remote_code=True).bfloat16() #.cuda()

tokenizer = AutoTokenizer.from_pretrained("/home/javen/Projects/geb-1.3b", trust_remote_code=True)
# tokenizer.add_special_tokens({'pad_token': '[pad]'})

In [68]:
def clean_dataset(dataset):
    df = pd.DataFrame(dataset)
    print(len(df))
    df = df.dropna()
    print(len(df))
    return Dataset.from_pandas(df)

In [69]:
"""
Load & prepare WikiHow dataset.

https://huggingface.co/datasets/gursi26/wikihow-cleaned
https://github.com/mahnazkoupaee/WikiHow-Dataset
"""
dataset = load_dataset("gursi26/wikihow-cleaned", split="train")
dataset = clean_dataset(dataset)
dataset = dataset.train_test_split(test_size=0.15)
print(dataset)

214293
213892
DatasetDict({
    train: Dataset({
        features: ['summary', 'title', 'text', '__index_level_0__'],
        num_rows: 181808
    })
    test: Dataset({
        features: ['summary', 'title', 'text', '__index_level_0__'],
        num_rows: 32084
    })
})


In [70]:
"""
Create Trainer.
"""
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return calculate_bleu(predictions, labels)

training_args = SFTConfig(
    dataset_text_field="text",
    max_seq_length=512,
    output_dir="/tmp",
)

trainer = SFTTrainer(
    model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Map:   0%|          | 0/181808 [00:00<?, ? examples/s]

Map:   0%|          | 0/32084 [00:00<?, ? examples/s]



In [None]:
"""
Train model.
"""
trainer.train()