In [1]:
from random import randrange
import torch, re
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel, TaskType
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    set_seed,
    pipeline,
    TrainerCallback,
    AutoModelForSequenceClassification,
)
from transformers.trainer_callback import TrainerControl, TrainerState
from trl import SFTTrainer
from huggingface_hub import login
import os, gc, time, evaluate
import numpy as np
import warnings
from sklearn.metrics import f1_score
import logging
# Module-level logger; the root logger is configured to emit INFO and above.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

def clear_memory():
    """Run Python garbage collection, then empty PyTorch's CUDA cache.

    The collector runs first so that tensors which just became unreachable
    are freed before the cache is emptied.
    """
    gc.collect()
    torch.cuda.empty_cache()


# Start the cell from a clean slate.
clear_memory()

# Weights & Biases run metadata. The key must be exactly
# "WANDB_NOTEBOOK_NAME": it previously carried a trailing space, which is
# consistent with the "Failed to detect the name of this notebook" error
# in the run log.
os.environ["WANDB_NOTEBOOK_NAME"] = "CH-Phi3m4k"
os.environ["WANDB_PROJECT"] = "CH-Phi3m4k"

# SECURITY: never commit access tokens. A write-scoped Hugging Face token used
# to be hard-coded on this line; that token must be revoked. Provide the token
# out-of-band instead, e.g. `export HF_HUB_TOKEN=...` before starting Jupyter.
hf_token = os.getenv("HF_HUB_TOKEN") or os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # No token in the environment: let huggingface_hub ask for one.
    logger.warning("HF_HUB_TOKEN is not set; falling back to interactive Hugging Face login.")
    login()
# Local directory that receives checkpoints and the final saved artefacts.
local_model_dir = "./CH-Phi3m4k"

# Pre-trained checkpoint on the Hugging Face Hub that gets fine-tuned.
# 'model_id' and 'model_name' are two names for the same identifier.
model_id = "microsoft/Phi-3-mini-4k-instruct"
model_name = model_id
new_model = "CH-Phi3m4k"
hf_model_repo = f"nguyenkhanh87/{new_model}"
dataset_name = "casehold/casehold"

# Place the whole model on GPU 0.
device_map = {"": 0}

# bitsandbytes quantisation settings.
use_4bit = True
use_double_quant = True
bnb_4bit_quant_type = "nf4"
bnb_4bit_compute_dtype = "bfloat16"

# LoRA hyper-parameters and the projection layers that receive adapters.
lora_r = 16
lora_alpha = 16
lora_dropout = 0.05
target_modules = [
    'k_proj',
    'q_proj',
    'v_proj',
    'o_proj',
    "gate_proj",
    "down_proj",
    "up_proj",
]

set_seed(1234)

# Toggle between a quick smoke-test configuration (True) and the full run.
isSetupParams = False

# Both configurations use the same dataset, epoch count and eval batch size.
dataset = load_dataset("lex_glue", "case_hold")
num_train_epochs = 10
eval_batch_size = 1

if isSetupParams:
    # Smoke test: tiny batches, evaluate/save every 20 steps, log every step.
    patience, train_batch_size = 1, 1
    evalsteps = save_step = 20
    logsteps = 1
else:
    # Full run: evaluate, log and save every 200 steps.
    patience, train_batch_size = 5, 8
    evalsteps = logsteps = save_step = 200

# Initial tokenizer with right-side padding. `tokenizer` is rebound by a
# second AutoTokenizer.from_pretrained call later in the file.
tokenizer_id = model_id
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
tokenizer.padding_side = 'right'

def create_message_column(row):
    """Build a user/assistant chat pair for one dataset row.

    Args:
        row: Mapping providing 'context', 'endings' (iterable of candidate
            options) and 'label'.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}`` where the user turn
        holds the prompt with the enumerated options and the assistant turn
        holds ``row['label']`` rendered as a string.
    """
    options = "".join(f"{idx}. {ending}\n" for idx, ending in enumerate(row['endings']))
    prompt = f"Choose best option to fill <HOLDING> base on this context, answer only with the number of option(for example: 0):\n{row['context']}\n Options to choose:\n {options}"
    return {
        "messages": [
            {"content": prompt, "role": "user"},
            {"content": f"{row['label']}", "role": "assistant"},
        ]
    }

def create_message_column_val(row):
    """Build the user-only chat prompt for one dataset row.

    Same prompt as ``create_message_column`` but without an assistant turn.

    Args:
        row: Mapping providing 'context' and 'endings' (iterable of options).

    Returns:
        ``{"messages": [user_turn]}``.
    """
    options = "".join(f"{idx}. {ending}\n" for idx, ending in enumerate(row['endings']))
    prompt = f"Choose best option to fill <HOLDING> base on this context, answer only with the number of option(for example: 0):\n{row['context']}\n Options to choose:\n {options}"
    return {"messages": [{"content": prompt, "role": "user"}]}

def format_dataset_chatml(row):
    """Render ``row["messages"]`` to text with the tokenizer's chat template.

    Uses the module-level ``tokenizer``. No generation prompt is appended and
    the result is returned as a string rather than token ids.

    Returns:
        ``{"text": rendered_chat}``.
    """
    rendered = tokenizer.apply_chat_template(
        row["messages"],
        add_generation_prompt=False,
        tokenize=False,
    )
    return {"text": rendered}

def format_func(example):
    """Format one dataset example as a complete Phi-3 style training text.

    Args:
        example: Mapping providing 'context', 'endings' and 'label'.

    Returns:
        Dict with the formatted prompt-plus-answer under 'text', the label
        under 'labels', and the original 'endings' and 'context' passed
        through unchanged.
    """
    options = "".join(f"{idx}. {ending}\n" for idx, ending in enumerate(example['endings']))
    formatted = f"<|user|>\n Choose the best option to fill <HOLDING> based on the context below, answer only with the number of option (for example: 0):\n{example['context']}\nOptions to choose:\n{options}<|end|>\n<|assistant|>\n{example['label']}<|end|>"
    return {
        "text": formatted,
        "labels": example["label"],
        "endings": example["endings"],
        "context": example["context"],
    }

# Cap the split sizes. The smoke-test configuration keeps the first 200
# training rows and the first 200 validation rows; the full run keeps every
# training row and the first 500 validation rows.
if isSetupParams:
    dataset['train'] = dataset['train'].select(range(200))
n_validation = 200 if isSetupParams else 500
dataset['validation'] = dataset['validation'].select(range(n_validation))

# Attach the formatted 'text' column (plus 'labels') to both splits.
dataset_chatml = dataset['train'].map(format_func)
dataset_val = dataset['validation'].map(format_func)

import importlib.util

# Prefer bfloat16 on GPUs that support it, otherwise fall back to float16.
# CUDA availability is checked first so that a machine without a GPU takes
# the fallback instead of querying a device that is not there.
bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
compute_dtype = torch.bfloat16 if bf16_supported else torch.float16

# FlashAttention-2 lives in the optional `flash_attn` package. bf16 support
# alone does not mean it is installed, so only request it when the package
# can be found; otherwise use PyTorch's built-in SDPA attention.
flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
if bf16_supported and flash_attn_installed:
    attn_implementation = 'flash_attention_2'
else:
    attn_implementation = 'sdpa'

# Load the tokenizer used for training and evaluation. This rebinds the
# `tokenizer` name that was assigned earlier in the file.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, add_eos_token=True, use_fast=True)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# Pad on the left (the earlier tokenizer instance padded on the right).
tokenizer.padding_side = 'left'
# Register the <HOLDING> placeholder from the prompts as a tokenizer token.
# NOTE(review): nothing in this section resizes the model's embeddings or
# marks them as trainable for the new token — confirm that is intended.
tokenizer.add_tokens(["<HOLDING>"])

# bitsandbytes quantisation settings (same values as assigned near the top of
# the file).
use_4bit = True
use_double_quant = True
bnb_4bit_quant_type = "nf4"
# Not passed to the config below; the dtype actually used is `compute_dtype`.
bnb_4bit_compute_dtype = "bfloat16"

bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_double_quant,
    load_in_4bit=use_4bit,
)

# Load the quantised base model with the dtype and attention backend chosen
# above, on the device given by `device_map`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map=device_map,
    attn_implementation=attn_implementation,
)

# Prepare the quantised model for k-bit (LoRA) training.
model = prepare_model_for_kbit_training(model)

# ROUGE metric handle kept at module level.
rouge = evaluate.load("rouge")

class CustomEarlyStoppingCallback(TrainerCallback):
    """Stop training when the evaluation micro-F1 stops improving.

    After every evaluation the callback runs the model over the evaluation
    dataloader, parses the predicted and the reference option index out of
    the decoded text and computes a micro-averaged F1. Training is stopped
    once the score has been below ``best_f1 - threshold`` for ``patience``
    consecutive evaluations.

    Predictions are the per-position argmax of a single forward pass over the
    full evaluation sequence; no autoregressive generation is performed.

    Decoding relies on the module-level ``tokenizer``.

    Args:
        patience: Number of consecutive non-improving evaluations tolerated.
        threshold: How far below the best score a result may fall and still
            reset the patience counter.
    """

    # Chat-markup tokens removed from decoded text before parsing an answer.
    _MARKUP_TOKENS = ("<|end|>", "<|endoftext|>")
    # Label value that marks positions to ignore; it cannot be decoded.
    _IGNORE_INDEX = -100

    def __init__(self, patience=5, threshold=0.0):
        self.patience = patience
        self.threshold = threshold
        self.best_f1 = 0
        self.patience_counter = 0
        self.rouge = evaluate.load("rouge")

    def on_train_begin(self, args, state, control, **kwargs):
        """Reset the best score and the patience counter for a new run."""
        self.patience_counter = 0
        self.best_f1 = 0

    @classmethod
    def _assistant_reply(cls, text):
        """Return the text after the last ``<|assistant|>`` tag, markup removed."""
        reply = text.strip().split("<|assistant|>")[-1].strip()
        for token in cls._MARKUP_TOKENS:
            reply = reply.replace(token, "")
        return reply.strip()

    def get_answer(self, predict, label):
        """Extract ``(predicted_option, reference_option)`` from decoded text.

        Args:
            predict: Decoded model output for one example.
            label: Decoded reference sequence (prompt plus gold answer).

        Returns:
            A pair of ints. The prediction is ``-1`` when no option could be
            recovered from ``predict``.

        Raises:
            IndexError, ValueError: If the reference answer does not start
                with a digit (a malformed label, same as before).
        """
        answer = self._assistant_reply(predict)
        label_val = self._assistant_reply(label)
        expected = int(label_val[0])

        # An empty reply used to raise IndexError on answer[0]; it is simply
        # a wrong prediction.
        if not answer:
            return -1, expected

        # isdecimal() only accepts characters that int() can parse
        # (isdigit() also accepts e.g. superscripts, which int() rejects).
        if answer[0].isdecimal():
            return int(answer[0]), expected

        # The model answered with text instead of a number: compare it with
        # the text of the correct option taken from the prompt.
        prompt_part = label.strip().split("<|assistant|>")[0].strip()
        option_chunks = prompt_part.split(f"{label_val[0]}.")
        if len(option_chunks) < 2:
            # Option marker not present in the prompt; nothing to compare.
            return -1, expected
        label_txt = option_chunks[1].strip().split("\n")[0].strip()
        scores = self.rouge.compute(predictions=[answer], references=[label_txt])
        # Counted as correct only when the returned scores sum to 4
        # (presumably four ROUGE variants, each equal to 1.0 — TODO confirm).
        if sum(scores.values()) == 4:
            return expected, expected
        return -1, expected

    def on_evaluate(self, args, state, control, **kwargs):
        """Compute micro-F1 on the eval dataloader and update early stopping.

        Reads ``eval_dataloader``, ``model`` and (if present) ``metrics`` from
        ``kwargs``. Sets ``control.should_training_stop`` when patience is
        exhausted.

        Returns:
            The (possibly updated) ``control`` object.
        """
        eval_dataloader = kwargs.get('eval_dataloader')
        model = kwargs.get('model')

        all_predictions = []
        all_labels = []

        model.eval()
        for batch in eval_dataloader:
            inputs = batch['input_ids'].to(args.device)
            labels = batch['labels'].to(args.device)
            # Ignored positions (-100) are not valid token ids; map them to
            # the pad token so batch_decode does not fail on them.
            labels = labels.masked_fill(labels == self._IGNORE_INDEX, tokenizer.pad_token_id)
            # Forward the attention mask when the collator provides one so
            # padded positions are not attended to.
            attention_mask = batch.get('attention_mask')
            if attention_mask is not None:
                attention_mask = attention_mask.to(args.device)

            with torch.no_grad():
                outputs = model(inputs, attention_mask=attention_mask)
            predictions = tokenizer.batch_decode(outputs.logits.argmax(dim=-1))
            references = tokenizer.batch_decode(labels)

            for prediction, reference in zip(predictions, references):
                pred, refs = self.get_answer(prediction, reference)
                all_predictions.append(pred)
                all_labels.append(refs)

        if not all_labels:
            # Nothing was evaluated; leave the early-stopping state untouched.
            return control

        f1 = float(f1_score(y_true=all_labels, y_pred=all_predictions, average='micro'))
        state.eval_loss = f1
        # Publish the score under "f1" in the metrics dict handed to this
        # callback, so code that looks up metric_for_best_model="f1" in that
        # dict can find it.
        # NOTE(review): relies on the Trainer passing this same dict object on
        # to checkpoint saving — confirm for the installed transformers version.
        metrics = kwargs.get('metrics')
        if metrics is not None:
            metrics["f1"] = f1

        print(f"Eval f1: {f1}")

        # state.epoch is fractional, so this is true from the first
        # optimisation step on, not only after a full epoch.
        # NOTE(review): use `>= 1` if a whole warm-up epoch was intended.
        if state.epoch > 0:
            if f1 >= self.best_f1 - self.threshold:
                # Never lower the recorded best when a result is merely
                # within the tolerance band.
                self.best_f1 = max(self.best_f1, f1)
                self.patience_counter = 0
            else:
                self.patience_counter += 1

            if self.patience_counter >= self.patience:
                control.should_training_stop = True
                print(f"Early stopping triggered with best f1 is {self.best_f1}!")
                print(f"Total steps: {state.global_step}")
        return control

# Names used when writing checkpoints: the checkpoint directory prefix and
# the file that stores the serialized trainer state.
PREFIX_CHECKPOINT_DIR = "checkpoint"
TRAINER_STATE_NAME = "trainer_state.json"

# Early-stopping callback using the patience chosen for this run.
early_stopping = CustomEarlyStoppingCallback(patience=patience)

class CustomSFTTrainer(SFTTrainer):
    """SFTTrainer whose checkpoint saving looks up the best-model metric by its raw name."""

    def _save_checkpoint(self, model, trial=None, metrics=None):
        """Save a checkpoint and update best-model tracking.

        Per the original author's note, this is a copy of the Trainer
        implementation with two lines removed so that
        ``args.metric_for_best_model`` is looked up in ``metrics`` without an
        ``eval_`` prefix. If the metric name is not a key of ``metrics`` the
        best-model state is left unchanged.

        Args:
            model: Accepted for signature compatibility; not referenced in
                this body.
            trial: Hyper-parameter search trial, forwarded to
                ``_get_output_dir``.
            metrics: Dict of evaluation metrics, or ``None``.
        """
        # Copy from Trainer but remove 2 lines to avoid eval_ prefix
        checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
        if self.hp_search_backend is None and trial is None:
            self.store_flos()
        # Checkpoints go to <run_dir>/checkpoint-<global_step>.
        run_dir = self._get_output_dir(trial=trial)
        output_dir = os.path.join(run_dir, checkpoint_folder)
        self.save_model(output_dir, _internal_call=True)
        if not self.args.save_only_model:
            # Also persist optimizer/scheduler and RNG state alongside the model.
            self._save_optimizer_and_scheduler(output_dir)
            self._save_rng_state(output_dir)
        if metrics is not None and self.args.metric_for_best_model is not None:
            metric_to_check = self.args.metric_for_best_model
            # Only update the best-model state when the metric is present.
            if metric_to_check in metrics:
                metric_value = metrics[metric_to_check]
                # Direction of "better" follows args.greater_is_better.
                operator = np.greater if self.args.greater_is_better else np.less
                if (
                    self.state.best_metric is None
                    or self.state.best_model_checkpoint is None
                    or operator(metric_value, self.state.best_metric)
                ):
                    self.state.best_metric = metric_value
                    self.state.best_model_checkpoint = output_dir
        if self.args.should_save:
            # Store the control state, then write the trainer state JSON into the checkpoint.
            self.state.stateful_callbacks["TrainerControl"] = self.control.state()
            self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
        if self.args.push_to_hub:
            self._push_from_checkpoint(output_dir)
        if self.args.should_save:
            # Rotation is called last, after the best checkpoint has been recorded.
            self._rotate_checkpoints(use_mtime=False, output_dir=run_dir)

# Minute-resolution timestamp used as the run name.
run_name = time.strftime("%Y-%m-%d-%H-%M")

# Mixed precision: bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

args = TrainingArguments(
    # Output and bookkeeping.
    output_dir=local_model_dir,
    run_name=run_name,
    seed=342,
    log_level="debug",
    # None does not disable reporting: the run log shows W&B logging enabled.
    report_to=None,
    # Optimisation.
    optim="adamw_torch",
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    gradient_accumulation_steps=4,
    bf16=use_bf16,
    fp16=not use_bf16,
    # Evaluation, logging and checkpoint cadence. Evaluation still runs with
    # do_eval=False, as the run log shows, because eval_strategy is "steps".
    do_eval=False,
    eval_strategy="steps",
    eval_steps=evalsteps,
    per_device_eval_batch_size=eval_batch_size,
    logging_steps=logsteps,
    save_strategy="steps",
    save_steps=save_step,
    # Best-model selection.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# LoRA adapter configuration for causal language modelling.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=target_modules,
)

# Supervised fine-tuning trainer: trains on the formatted 'text' column and
# evaluates on the validation subset, with the early-stopping callback.
trainer = CustomSFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    peft_config=peft_config,
    train_dataset=dataset_chatml,
    eval_dataset=dataset_val,
    dataset_text_field="text",
    max_seq_length=1024,
    callbacks=[early_stopping],
)

trainer.train()
if not isSetupParams:
    # Persist the trained model and tokenizer locally.
    trainer.save_model(local_model_dir)
    tokenizer.save_pretrained(local_model_dir)
    model.save_pretrained(local_model_dir)

    # Trainer.push_to_hub takes a commit message as its first argument, not a
    # repository id: the target repository comes from args.hub_model_id. Set
    # it explicitly so the trainer pushes to the same repository as the
    # tokenizer and model below, and let the commit message use its default.
    trainer.args.hub_model_id = hf_model_repo
    trainer.push_to_hub()
    tokenizer.push_to_hub(hf_model_repo)
    model.push_to_hub(hf_model_repo)

# Drop the large objects, then reclaim host and GPU memory.
del model
del tokenizer
del trainer
clear_memory()



  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/nhk/.cache/huggingface/token
Login successful


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.68s/it]
Map: 100%|██████████| 45000/45000 [00:06<00:00, 7341.06 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 2922.40 examples/s]
Using auto half precision backend
Currently training with a batch size of: 8
***** Running training *****
  Num examples = 45,000
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 14,060
  Number of trainable parameters = 8,912,896
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manual



Step,Training Loss,Validation Loss
200,2.0053,1.892748
400,1.6481,1.606505
600,1.5653,1.586402
800,1.5516,1.57621
1000,1.537,1.568918
1200,1.5318,1.563681
1400,1.5249,1.559466
1600,1.5208,1.554966
1800,1.5234,1.551101
2000,1.5099,1.54801


***** Running Evaluation *****
  Num examples = 500
  Batch size = 1


Eval f1: 0.722


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-200
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32

Eval f1: 0.732


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-400
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32

Eval f1: 0.746


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-600
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32

Eval f1: 0.758


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-800
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32

Eval f1: 0.768


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-1000
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 3

Eval f1: 0.772


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-1200
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 3

Eval f1: 0.774


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-1400
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 3

Eval f1: 0.768


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-1600
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 3

Eval f1: 0.778


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-1800
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 3

Eval f1: 0.774


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-2000
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 3

Eval f1: 0.774


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-2200
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 3

Eval f1: 0.776


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-2400
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 3

Eval f1: 0.778


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-2600
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 3

Eval f1: 0.77


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-2800
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 3

Eval f1: 0.768


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-3000
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 3

Eval f1: 0.768


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-3200
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 3

Eval f1: 0.772


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-3400
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 3

Eval f1: 0.772
Early stopping triggered with best f1 is 0.778!
Total steps: 3600


Saving model checkpoint to ./CH-Phi3m4k/checkpoint-3600
loading configuration file config.json from cache at /home/nhk/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/c1358f8a35e6d2af81890deffbbfa575b978c62f/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 3