In [11]:
import evals
import torch
import logging
import transformers
import sys
import os
import nltk
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from trl import SFTConfig, SFTTrainer
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from peft import LoraConfig, PeftModel

In [2]:
"""
Initialize logger.
"""
transformers.utils.logging.set_verbosity_info()
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)

In [3]:
"""
Load model and tokenizer.
"""
# set seed before initializing model
set_seed(777)

# modified model & tokenizer code (due to transformers compatibility issue)
# model_id = "/home/javen/Projects/geb-1.3b"
model_id = "GEB-AGI/geb-1.3b"

# load model
model = AutoModel.from_pretrained(model_id, trust_remote_code=True).bfloat16() #.cuda()
# model = AutoModelForSeq2SeqLM.from_pretrained(model_id, trust_remote_code=True).bfloat16()
# model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).bfloat16()

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
print(tokenizer.special_tokens)
tokenizer.add_special_tokens({'pad_token': '<pad>'})

config.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

loading configuration file config.json from cache at /home/javen/.cache/huggingface/hub/models--GEB-AGI--geb-1.3b/snapshots/3612b088ae491042c6f7d590e6313463b11f00c9/config.json


configuration_geblm.py:   0%|          | 0.00/2.24k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/GEB-AGI/geb-1.3b:
- configuration_geblm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
loading configuration file config.json from cache at /home/javen/.cache/huggingface/hub/models--GEB-AGI--geb-1.3b/snapshots/3612b088ae491042c6f7d590e6313463b11f00c9/config.json
Model config GEBConfig {
  "_name_or_path": "GEB-AGI/geb-1.3b",
  "add_bias_linear": false,
  "apply_query_key_layer_scaling": false,
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "GEBForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_softmax_in_fp32": false,
  "auto_map": {
    "AutoConfig": "GEB-AGI/geb-1.3b--configuration_geblm.GEBConfig",
    "AutoModel": "GEB-AGI/geb-1.3b--modeling_geb.GEBForCausalLM"
  },
  "bias_dropout_fusion": true,
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
 

modeling_geb.py:   0%|          | 0.00/50.4k [00:00<?, ?B/s]

[2024-10-25 17:48:51,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cpu (auto detect)


A new version of the following files was downloaded from https://huggingface.co/GEB-AGI/geb-1.3b:
- modeling_geb.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/2.70G [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /home/javen/.cache/huggingface/hub/models--GEB-AGI--geb-1.3b/snapshots/3612b088ae491042c6f7d590e6313463b11f00c9/pytorch_model.bin
Generate config GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 512,
  "pad_token_id": 2,
  "repetition_penalty": 1.15,
  "temperature": 0.3,
  "top_k": 5,
  "top_p": 0.5
}

All model checkpoint weights were used when initializing GEBForCausalLM.

All the weights of GEBForCausalLM were initialized from the model checkpoint at GEB-AGI/geb-1.3b.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GEBForCausalLM for predictions without further training.
Generation config file not found, using a generation config created from the model config.


tokenizer_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenization_geb.py:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/GEB-AGI/geb-1.3b:
- tokenization_geb.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


GEBtokenizer.model:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

loading file GEBtokenizer.model from cache at /home/javen/.cache/huggingface/hub/models--GEB-AGI--geb-1.3b/snapshots/3612b088ae491042c6f7d590e6313463b11f00c9/GEBtokenizer.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home/javen/.cache/huggingface/hub/models--GEB-AGI--geb-1.3b/snapshots/3612b088ae491042c6f7d590e6313463b11f00c9/tokenizer_config.json
loading file tokenizer.json from cache at None


{'<bos>': 1, '<eos>': 2, '<pad>': 0}


1

In [14]:
"""
Load & prepare WikiHow dataset.

https://huggingface.co/datasets/gursi26/wikihow-cleaned
https://github.com/mahnazkoupaee/WikiHow-Dataset
"""
def clean_dataset(dataset):
    df = pd.DataFrame(dataset)
    print(len(df))
    df = df.dropna()
    df = df.iloc[:100]
    print(len(df))
    return Dataset.from_pandas(df)

def load_dataset():
    df = pd.read_csv("/home/javen/Projects/wikihowAll.csv")
    # dataset = load_dataset("gursi26/wikihow-cleaned", split="train")
    dataset = Dataset.from_pandas(df)
    return dataset

# load dataset
dataset = load_dataset()
dataset = clean_dataset(dataset)
dataset = dataset.train_test_split(test_size=0.15)
print(dataset)

# data collator
"""
label_pad_token_id = tokenizer.pad_token_id
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    # pad_to_multiple_of=8 if training_args.fp16 else None,
    pad_to_multiple_of=8,
)
"""

215365
100
DatasetDict({
    train: Dataset({
        features: ['headline', 'title', 'text', '__index_level_0__'],
        num_rows: 85
    })
    test: Dataset({
        features: ['headline', 'title', 'text', '__index_level_0__'],
        num_rows: 15
    })
})


'\nlabel_pad_token_id = tokenizer.pad_token_id\ndata_collator = DataCollatorForSeq2Seq(\n    tokenizer,\n    model=model,\n    label_pad_token_id=label_pad_token_id,\n    # pad_to_multiple_of=8 if training_args.fp16 else None,\n    pad_to_multiple_of=8,\n)\n'

In [17]:
"""
Evaluation metrics.
"""
def compute_metrics(eval_preds):
    # prepare prediction data
    labels, preds = eval_preds.label_ids, eval_preds.predictions
    labels[labels == -100] = tokenizer.pad_token_id

    # decode
    preds_decoded = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels_decoded = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # calculate rouge scores
    rouge_scores = eval.calculate_rouge(labels_decoded, preds_decoded)

    # return metrics
    return rouge_scores

In [13]:
"""
Create Trainer.
"""

lora_alpha = 32
lora_dropout = 0.05
lora_r = 16
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

training_args = SFTConfig(
    dataset_text_field="text",
    max_seq_length=512,
    output_dir='./output',
    learning_rate=5e-05,
    logging_steps=1,
    logging_dir='./logs',
    log_level='debug',
    save_steps=5,
    use_cpu=True,
    label_names=['summary'],
    max_steps=2,
    # num_train_epochs=3,
    # eval_strategy='epoch',
    eval_strategy='steps',
    load_best_model_at_end=True,
    # metric_for_best_model='eval_loss',
    greater_is_better=False,
    fp16=True,
    lr_scheduler_type="cosine",
    # optim="paged_adamw_32bit",
)

trainer = SFTTrainer(
    model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # data_collator=data_collator,
)

using `logging_steps` to initialize `eval_steps` to 1
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Map:   0%|          | 0/85 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend


In [None]:
"""
Train model.
"""
training_results = trainer.train()

In [6]:
"""
Generate summaries
"""
def summarize(model, text: str):
    # encode input text
    inputs = tokenizer(text, return_tensors="pt")
    inputs_length = len(inputs["input_ids"][0])
    print(inputs_length)

    # generate new text with model
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.1,
        )
        # print(outputs[0][inputs_length:])
        print(len(outputs[0]))

    # decode generated output text
    decoded = tokenizer.decode(
        # outputs[0][inputs_length:],
        outputs[0][inputs_length:],
        skip_special_tokens=True
    )
    return decoded

def generate_summaries(model, dataset, tokenizer, num_samples=5):
    summaries = []
    for i, example in enumerate(dataset):
        if i >= num_samples:
            break
        print(i)
        prompt = example['text']
        summary = summarize(model, prompt)
        summaries.append({'text': prompt, 'summary': summary})
    return summaries

In [None]:
"""
Evaluate model.
"""
summaries_hat = generate_summaries(model, dataset['test'], tokenizer)
print(summaries_hat)