# Finetune LLaMA2 and MPT on NVIDIA GPU

## 1. Prerequisites

### 1.1 Setup Environment

In [None]:
!pip install intel-extension-for-transformers torch datasets

### 1.2 Prepare Dataset

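Download the Alpaca dataset from [here](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json), then set `alpaca_data_path` in the next cell to the location of the downloaded `alpaca_data.json` file.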

In [None]:
import os

# Locations used by every fine-tuning section below.
# Each value can be overridden through the environment variable named in the
# lookup, so the notebook can be run end to end without editing this cell.
# The fallbacks are the original values; "/path/to/alpaca_data.json" is a
# placeholder and must point at the downloaded Alpaca JSON file.
alpaca_data_path = os.environ.get("ALPACA_DATA_PATH", "/path/to/alpaca_data.json")
llama2_model_name_or_path = os.environ.get("LLAMA2_MODEL_NAME_OR_PATH", "meta-llama/Llama-2-7b-hf")
mpt_model_name_or_path = os.environ.get("MPT_MODEL_NAME_OR_PATH", "mosaicml/mpt-7b")

## 2. Finetune LLaMA2 on NVIDIA GPU with LoRA

### 2.1 Setup Finetuning Config

In [2]:
from transformers import TrainingArguments
from intel_extension_for_transformers.neural_chat.config import (
    ModelArguments,
    DataArguments,
    FinetuningArguments,
    TextGenerationFinetuningConfig,
)

# --- LLaMA2 + LoRA: assemble the four argument groups, then bundle them. ---

# Base checkpoint to adapt; the fast tokenizer is switched off for this model.
model_args = ModelArguments(
    use_fast_tokenizer=False,
    model_name_or_path=llama2_model_name_or_path,
)

# Training corpus: the Alpaca JSON file, with dataset concatenation enabled.
data_args = DataArguments(
    dataset_concatenation=True,
    train_file=alpaca_data_path,
)

# Adapter options: lora_all_linear on, lm-eval requested after training.
finetune_args = FinetuningArguments(
    do_lm_eval=True,
    lora_all_linear=True,
)

# Hugging Face trainer options.
training_args = TrainingArguments(
    # What to run.
    do_train=True,
    do_eval=True,
    # Schedule: batch 4 per device x 2 accumulation steps, 3 epochs, bf16.
    num_train_epochs=3,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    bf16=True,
    # Output location and checkpointing.
    output_dir="./llama_peft_finetuned_model",
    overwrite_output_dir=True,
    save_strategy="no",
    save_total_limit=2,
    # Logging verbosity.
    log_level="info",
)

# Single config object handed to finetune_model in the next cell.
finetune_cfg = TextGenerationFinetuningConfig(
    model_args=model_args,
    data_args=data_args,
    training_args=training_args,
    finetune_args=finetune_args,
)

  from .autonotebook import tqdm as notebook_tqdm


Package 'habana_frameworks.torch.hpu' is not installed.


### 2.2 Finetuning

In [3]:
# Launch the LLaMA2 LoRA run described by `finetune_cfg` (built in section 2.1).
from intel_extension_for_transformers.neural_chat.chatbot import finetune_model
finetune_model(finetune_cfg)

distributed training: True, 16-bits training: True
[2023-08-28 21:57:21,419] [    INFO] finetuning.py:101 - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=2,
gradient_checkpointi

trainable params: 19,988,480 || all params: 6,758,404,096 || trainable%: 0.2957573965106688


[INFO|trainer.py:1686] 2023-08-28 21:58:53,236 >> ***** Running training *****
[INFO|trainer.py:1687] 2023-08-28 21:58:53,237 >>   Num examples = 12,390
[INFO|trainer.py:1688] 2023-08-28 21:58:53,238 >>   Num Epochs = 3
[INFO|trainer.py:1689] 2023-08-28 21:58:53,238 >>   Instantaneous batch size per device = 4
[INFO|trainer.py:1692] 2023-08-28 21:58:53,239 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1693] 2023-08-28 21:58:53,239 >>   Gradient Accumulation steps = 2
[INFO|trainer.py:1694] 2023-08-28 21:58:53,239 >>   Total optimization steps = 4,647
[INFO|trainer.py:1695] 2023-08-28 21:58:53,249 >>   Number of trainable parameters = 19,988,480


Step,Training Loss
500,1.1515
1000,1.1212
1500,1.1132
2000,1.0758
2500,1.0716
3000,1.0752
3500,1.0448
4000,1.0314
4500,1.0353


[INFO|trainer.py:1934] 2023-08-28 23:37:43,349 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|configuration_utils.py:710] 2023-08-28 23:37:43,535 >> loading configuration file /models/Llama-2-7b-hf/config.json
[INFO|configuration_utils.py:768] 2023-08-28 23:37:43,536 >> Model config LlamaConfig {
  "_name_or_path": "/models/Llama-2-7b-hf/",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.31.0",
  "use_cache": true,
  "vocab_size": 32000
}

[INFO|tokenization_utils_base.py

Running loglikelihood requests


100%|██████████| 5882/5882 [06:23<00:00, 15.32it/s]
[2023-08-28 23:44:16,990] [    INFO] finetuning.py:554 - {'results': {'truthfulqa_mc': {'mc1': 0.31701346389228885, 'mc1_stderr': 0.016289203374403382, 'mc2': 0.4622660487303061, 'mc2_stderr': 0.015048114647057709}}, 'versions': {'truthfulqa_mc': 1}}


|    Task     |Version|Metric|Value |   |Stderr|
|-------------|------:|------|-----:|---|-----:|
|truthfulqa_mc|      1|mc1   |0.3170|±  |0.0163|
|             |       |mc2   |0.4623|±  |0.0150|



## 3. Finetune MPT on NVIDIA GPU with LoRA

### 3.1 Setup Finetuning Config

In [4]:
from transformers import TrainingArguments
from intel_extension_for_transformers.neural_chat.config import (
    ModelArguments,
    DataArguments,
    FinetuningArguments,
    TextGenerationFinetuningConfig,
)

# --- MPT + LoRA: assemble the four argument groups, then bundle them. ---

# Base checkpoint to adapt; trust_remote_code is enabled for this model.
model_args = ModelArguments(
    trust_remote_code=True,
    model_name_or_path=mpt_model_name_or_path,
)

# Training corpus: the Alpaca JSON file, with dataset concatenation enabled.
data_args = DataArguments(
    dataset_concatenation=True,
    train_file=alpaca_data_path,
)

# Adapter options: lora_all_linear on, lm-eval requested after training.
finetune_args = FinetuningArguments(
    do_lm_eval=True,
    lora_all_linear=True,
)

# Hugging Face trainer options.
training_args = TrainingArguments(
    # What to run.
    do_train=True,
    do_eval=True,
    # Schedule: batch 4 per device x 2 accumulation steps, 3 epochs, bf16.
    num_train_epochs=3,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    bf16=True,
    # Output location and checkpointing.
    output_dir="./mpt_peft_finetuned_model",
    overwrite_output_dir=True,
    save_strategy="no",
    save_total_limit=2,
    # Logging verbosity.
    log_level="info",
)

# Single config object handed to finetune_model in the next cell.
finetune_cfg = TextGenerationFinetuningConfig(
    model_args=model_args,
    data_args=data_args,
    training_args=training_args,
    finetune_args=finetune_args,
)

[INFO|training_args.py:1299] 2023-08-28 23:46:58,375 >> Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
[INFO|training_args.py:1713] 2023-08-28 23:46:58,377 >> PyTorch: setting up devices
[INFO|training_args.py:1439] 2023-08-28 23:46:58,377 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### 3.2 Finetuning

In [5]:
# Launch the MPT LoRA run described by `finetune_cfg` (built in section 3.1).
from intel_extension_for_transformers.neural_chat.chatbot import finetune_model
finetune_model(finetune_cfg)

distributed training: True, 16-bits training: True
[2023-08-28 23:47:04,823] [    INFO] finetuning.py:101 - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=2,
gradient_checkpointi

You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.50s/it]
[INFO|modeling_utils.py:3329] 2023-08-28 23:47:15,869 >> All model checkpoint weights were used when initializing MPTForCausalLM.

[INFO|modeling_utils.py:3337] 2023-08-28 23:47:15,870 >> All the weights of MPTForCausalLM were initialized from the model checkpoint at /models/mpt-7b.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MPTForCausalLM for predictions without further training.
[INFO|configuration_utils.py:559] 2023-08-28 23:47:15,873 >> loading configuration file /models/mpt-7b/generation_config.json
[INFO|configuration_utils.py:599] 2023-08-28 23:47:15,874 >> Generate config GenerationConfig {
  "_from_model_config": true,
  "transformers_version": "4.31.0",
  "use_cache": false
}

Map:   0%|          | 0/52002 [00:00<?, ? examples/s][2023-08-28 23:47:17,729] [    INFO] arrow_dataset.py:3325 - Caching processed dataset at /models/huggingface/datasets/js

trainable params: 16,777,216 || all params: 6,666,063,872 || trainable%: 0.2516809967943853


[INFO|trainer.py:1686] 2023-08-28 23:48:45,740 >> ***** Running training *****
[INFO|trainer.py:1687] 2023-08-28 23:48:45,741 >>   Num examples = 10,743
[INFO|trainer.py:1688] 2023-08-28 23:48:45,741 >>   Num Epochs = 3
[INFO|trainer.py:1689] 2023-08-28 23:48:45,741 >>   Instantaneous batch size per device = 4
[INFO|trainer.py:1692] 2023-08-28 23:48:45,742 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1693] 2023-08-28 23:48:45,742 >>   Gradient Accumulation steps = 2
[INFO|trainer.py:1694] 2023-08-28 23:48:45,742 >>   Total optimization steps = 4,029
[INFO|trainer.py:1695] 2023-08-28 23:48:45,745 >>   Number of trainable parameters = 16,777,216


Step,Training Loss
500,1.3222
1000,1.2887
1500,1.284
2000,1.2462
2500,1.2358
3000,1.2152
3500,1.2028
4000,1.2116


[INFO|trainer.py:1934] 2023-08-29 00:54:47,851 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|configuration_utils.py:710] 2023-08-29 00:54:47,936 >> loading configuration file /models/mpt-7b/config.json
[INFO|configuration_utils.py:710] 2023-08-29 00:54:47,939 >> loading configuration file /models/mpt-7b/config.json
[INFO|configuration_utils.py:768] 2023-08-29 00:54:47,940 >> Model config MPTConfig {
  "_name_or_path": "/models/mpt-7b",
  "architectures": [
    "MPTForCausalLM"
  ],
  "attn_config": {
    "alibi": true,
    "alibi_bias_max": 8,
    "attn_impl": "torch",
    "attn_pdrop": 0,
    "attn_type": "multihead_attention",
    "attn_uses_sequence_id": false,
    "clip_qkv": null,
    "prefix_lm": false,
    "qk_ln": false,
    "softmax_scale": null
  },
  "auto_map": {
    "AutoConfig": "configuration_mpt.MPTConfig",
    "AutoModelForCausalLM": "modeling_mpt.MPTForCausalLM"
  },
  "d_model": 4096,
  "emb_pdrop": 0,
  "embedding_fra

Running loglikelihood requests


100%|██████████| 5882/5882 [03:02<00:00, 32.22it/s]
[2023-08-29 00:57:58,494] [    INFO] finetuning.py:554 - {'results': {'truthfulqa_mc': {'mc1': 0.25091799265605874, 'mc1_stderr': 0.015176985027707689, 'mc2': 0.36732950033142087, 'mc2_stderr': 0.01428485016453199}}, 'versions': {'truthfulqa_mc': 1}}


|    Task     |Version|Metric|Value |   |Stderr|
|-------------|------:|------|-----:|---|-----:|
|truthfulqa_mc|      1|mc1   |0.2509|±  |0.0152|
|             |       |mc2   |0.3673|±  |0.0143|



## 4. Finetune LLaMA2 on NVIDIA GPU with QLoRA

### 4.1 Setup Finetuning Config

In [6]:
from transformers import TrainingArguments
from intel_extension_for_transformers.neural_chat.config import (
    ModelArguments,
    DataArguments,
    FinetuningArguments,
    TextGenerationFinetuningConfig,
)

# --- LLaMA2 + QLoRA: same recipe as the LoRA run, with qlora=True. ---

# Base checkpoint to adapt; the fast tokenizer is switched off for this model.
model_args = ModelArguments(
    model_name_or_path=llama2_model_name_or_path,
    use_fast_tokenizer=False,
)

# Training corpus: the Alpaca JSON file, with dataset concatenation enabled.
data_args = DataArguments(
    train_file=alpaca_data_path,
    dataset_concatenation=True,
)

training_args = TrainingArguments(
    # Dedicated directory for the QLoRA run. The LoRA run in section 2 uses
    # "./llama_peft_finetuned_model" with overwrite_output_dir=True, so sharing
    # that path would let the two runs write over each other's output.
    output_dir="./llama_qlora_finetuned_model",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    num_train_epochs=3,
    save_strategy="no",
    log_level="info",
    save_total_limit=2,
    bf16=True,
)

# Adapter options: lora_all_linear on, lm-eval requested, QLoRA enabled.
finetune_args = FinetuningArguments(
    lora_all_linear=True,
    do_lm_eval=True,
    qlora=True,
)

# Single config object handed to finetune_model in the next cell.
finetune_cfg = TextGenerationFinetuningConfig(
    model_args=model_args,
    data_args=data_args,
    training_args=training_args,
    finetune_args=finetune_args,
)

[INFO|training_args.py:1299] 2023-08-29 01:38:40,537 >> Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
[INFO|training_args.py:1713] 2023-08-29 01:38:40,538 >> PyTorch: setting up devices
[INFO|training_args.py:1439] 2023-08-29 01:38:40,539 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### 4.2 Finetuning

In [7]:
# Launch the LLaMA2 QLoRA run described by `finetune_cfg` (built in section 4.1).
from intel_extension_for_transformers.neural_chat.chatbot import finetune_model
finetune_model(finetune_cfg)

distributed training: True, 16-bits training: True
[2023-08-29 01:38:46,979] [    INFO] finetuning.py:101 - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=2,
gradient_checkpointi

trainable params: 19,988,480 || all params: 3,520,401,408 || trainable%: 0.5677897967708119


[INFO|trainer.py:1686] 2023-08-29 01:40:31,905 >> ***** Running training *****
[INFO|trainer.py:1687] 2023-08-29 01:40:31,906 >>   Num examples = 12,390
[INFO|trainer.py:1688] 2023-08-29 01:40:31,906 >>   Num Epochs = 3
[INFO|trainer.py:1689] 2023-08-29 01:40:31,906 >>   Instantaneous batch size per device = 4
[INFO|trainer.py:1692] 2023-08-29 01:40:31,906 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1693] 2023-08-29 01:40:31,907 >>   Gradient Accumulation steps = 2
[INFO|trainer.py:1694] 2023-08-29 01:40:31,907 >>   Total optimization steps = 4,647
[INFO|trainer.py:1695] 2023-08-29 01:40:31,912 >>   Number of trainable parameters = 19,988,480


Step,Training Loss
500,1.1636
1000,1.1297
1500,1.1211
2000,1.0822
2500,1.0775
3000,1.0813
3500,1.0496
4000,1.0364
4500,1.04


[INFO|trainer.py:1934] 2023-08-29 03:21:28,703 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|configuration_utils.py:710] 2023-08-29 03:21:28,817 >> loading configuration file /models/Llama-2-7b-hf/config.json
[INFO|configuration_utils.py:768] 2023-08-29 03:21:28,818 >> Model config LlamaConfig {
  "_name_or_path": "/models/Llama-2-7b-hf/",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.31.0",
  "use_cache": true,
  "vocab_size": 32000
}

[INFO|tokenization_utils_base.py

Running loglikelihood requests


100%|██████████| 5882/5882 [04:10<00:00, 23.52it/s]
[2023-08-29 03:25:49,385] [    INFO] finetuning.py:554 - {'results': {'truthfulqa_mc': {'mc1': 0.31456548347613217, 'mc1_stderr': 0.016255241993179178, 'mc2': 0.4529007721356952, 'mc2_stderr': 0.01493900384942977}}, 'versions': {'truthfulqa_mc': 1}}


|    Task     |Version|Metric|Value |   |Stderr|
|-------------|------:|------|-----:|---|-----:|
|truthfulqa_mc|      1|mc1   |0.3146|±  |0.0163|
|             |       |mc2   |0.4529|±  |0.0149|



## 5. Finetune MPT on NVIDIA GPU with QLoRA

### 5.1 Setup Finetuning Config

In [8]:
from transformers import TrainingArguments
from intel_extension_for_transformers.neural_chat.config import (
    ModelArguments,
    DataArguments,
    FinetuningArguments,
    TextGenerationFinetuningConfig,
)

# --- MPT + QLoRA: same recipe as the LoRA run, with qlora=True. ---

# Base checkpoint to adapt; trust_remote_code is enabled for this model.
model_args = ModelArguments(
    model_name_or_path=mpt_model_name_or_path,
    trust_remote_code=True,
)

# Training corpus: the Alpaca JSON file, with dataset concatenation enabled.
data_args = DataArguments(
    train_file=alpaca_data_path,
    dataset_concatenation=True,
)

training_args = TrainingArguments(
    # Dedicated directory for the QLoRA run. The LoRA run in section 3 uses
    # "./mpt_peft_finetuned_model" with overwrite_output_dir=True, so sharing
    # that path would let the two runs write over each other's output.
    output_dir="./mpt_qlora_finetuned_model",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    num_train_epochs=3,
    save_strategy="no",
    log_level="info",
    save_total_limit=2,
    bf16=True,
)

# Adapter options: lora_all_linear on, lm-eval requested, QLoRA enabled.
finetune_args = FinetuningArguments(
    lora_all_linear=True,
    do_lm_eval=True,
    qlora=True,
)

# Single config object handed to finetune_model in the next cell.
finetune_cfg = TextGenerationFinetuningConfig(
    model_args=model_args,
    data_args=data_args,
    training_args=training_args,
    finetune_args=finetune_args,
)

[INFO|training_args.py:1299] 2023-08-29 03:29:46,914 >> Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
[INFO|training_args.py:1713] 2023-08-29 03:29:46,916 >> PyTorch: setting up devices
[INFO|training_args.py:1439] 2023-08-29 03:29:46,918 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### 5.2 Finetuning

In [9]:
# Launch the MPT QLoRA run described by `finetune_cfg` (built in section 5.1).
from intel_extension_for_transformers.neural_chat.chatbot import finetune_model
finetune_model(finetune_cfg)

distributed training: True, 16-bits training: True
[2023-08-29 03:29:52,604] [    INFO] finetuning.py:101 - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=2,
gradient_checkpointi

You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.50s/it]
[INFO|modeling_utils.py:3329] 2023-08-29 03:30:07,939 >> All model checkpoint weights were used when initializing MPTForCausalLM.

[INFO|modeling_utils.py:3337] 2023-08-29 03:30:07,939 >> All the weights of MPTForCausalLM were initialized from the model checkpoint at /models/mpt-7b.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MPTForCausalLM for predictions without further training.
[INFO|configuration_utils.py:559] 2023-08-29 03:30:07,942 >> loading configuration file /models/mpt-7b/generation_config.json
[INFO|configuration_utils.py:599] 2023-08-29 03:30:07,943 >> Generate config GenerationConfig {
  "_from_model_config": true,
  "transformers_version": "4.31.0",
  "use_cache": false
}

Map:   0%|          | 0/52002 [00:00<?, ? examples/s][2023-08-29 03:30:09,754] [    INFO] arrow_dataset.py:3325 - Caching processed dataset at /models/huggingface/datasets/js

trainable params: 16,777,216 || all params: 3,444,838,400 || trainable%: 0.4870247614517999


[INFO|trainer.py:1686] 2023-08-29 03:31:04,599 >> ***** Running training *****
[INFO|trainer.py:1687] 2023-08-29 03:31:04,600 >>   Num examples = 10,743
[INFO|trainer.py:1688] 2023-08-29 03:31:04,601 >>   Num Epochs = 3
[INFO|trainer.py:1689] 2023-08-29 03:31:04,601 >>   Instantaneous batch size per device = 4
[INFO|trainer.py:1692] 2023-08-29 03:31:04,602 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1693] 2023-08-29 03:31:04,602 >>   Gradient Accumulation steps = 2
[INFO|trainer.py:1694] 2023-08-29 03:31:04,603 >>   Total optimization steps = 4,029
[INFO|trainer.py:1695] 2023-08-29 03:31:04,607 >>   Number of trainable parameters = 16,777,216


Step,Training Loss
500,1.3356
1000,1.2987
1500,1.2928
2000,1.2538
2500,1.2439
3000,1.2225
3500,1.2099
4000,1.2181


[INFO|trainer.py:1934] 2023-08-29 04:45:45,421 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|configuration_utils.py:710] 2023-08-29 04:45:45,507 >> loading configuration file /models/mpt-7b/config.json
[INFO|configuration_utils.py:710] 2023-08-29 04:45:45,509 >> loading configuration file /models/mpt-7b/config.json
[INFO|configuration_utils.py:768] 2023-08-29 04:45:45,510 >> Model config MPTConfig {
  "_name_or_path": "/models/mpt-7b",
  "architectures": [
    "MPTForCausalLM"
  ],
  "attn_config": {
    "alibi": true,
    "alibi_bias_max": 8,
    "attn_impl": "torch",
    "attn_pdrop": 0,
    "attn_type": "multihead_attention",
    "attn_uses_sequence_id": false,
    "clip_qkv": null,
    "prefix_lm": false,
    "qk_ln": false,
    "softmax_scale": null
  },
  "auto_map": {
    "AutoConfig": "configuration_mpt.MPTConfig",
    "AutoModelForCausalLM": "modeling_mpt.MPTForCausalLM"
  },
  "d_model": 4096,
  "emb_pdrop": 0,
  "embedding_fra

Running loglikelihood requests


100%|██████████| 5882/5882 [04:13<00:00, 23.18it/s]
[2023-08-29 04:50:04,639] [    INFO] finetuning.py:554 - {'results': {'truthfulqa_mc': {'mc1': 0.24479804161566707, 'mc1_stderr': 0.01505186948671501, 'mc2': 0.36567230051304445, 'mc2_stderr': 0.014372749953340895}}, 'versions': {'truthfulqa_mc': 1}}


|    Task     |Version|Metric|Value |   |Stderr|
|-------------|------:|------|-----:|---|-----:|
|truthfulqa_mc|      1|mc1   |0.2448|±  |0.0151|
|             |       |mc2   |0.3657|±  |0.0144|

