## Import Lib

In [1]:
!nvidia-smi

Mon Apr 15 10:07:13 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.86.10              Driver Version: 535.86.10    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A40                     On  | 00000000:C1:00.0 Off |                    0 |
|  0%   28C    P8              29W / 300W |      4MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                         

In [2]:
import os
import torch
from datasets import load_metric, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

## Dataset

In [11]:
dataset_name = "mlabonne/guanaco-llama2-1k"
dataset = load_dataset(dataset_name, split="train")
print(type(dataset))

<class 'datasets.arrow_dataset.Dataset'>


## Quantization Config

In [3]:
# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Your GPU supports bfloat16: accelerate training with bf16=True


## Load Model

In [4]:
device_map = {"": 0}
model_name = "NousResearch/Llama-2-7b-chat-hf"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1
print(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )


## Load Token

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

## LoRA Config

In [6]:
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

## Train Config

In [15]:
output_dir = "./results"
num_train_epochs = 1

#set bf16 to True with an A100
fp16 = False
bf16 = True

per_device_train_batch_size = 4

per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

max_grad_norm = 0.3

#AdamW
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

optim = "paged_adamw_32bit"

lr_scheduler_type = "constant"

warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

save_steps = 25

logging_steps = 25

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
#     max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

## SFT Config

In [16]:
max_seq_length = None
packing = False

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


## Train Model(with Lora)

In [17]:
trainer.train()



Step,Training Loss
25,1.3479
50,1.6173
75,1.2085
100,1.4377
125,1.177
150,1.3614
175,1.1719
200,1.4594
225,1.1546
250,1.5303




TrainOutput(global_step=250, training_loss=1.3466120834350586, metrics={'train_runtime': 457.9452, 'train_samples_per_second': 2.184, 'train_steps_per_second': 0.546, 'total_flos': 8755214190673920.0, 'train_loss': 1.3466120834350586, 'epoch': 1.0})

In [18]:
new_model = "llama-2-7b-miniguanaco"
trainer.model.save_pretrained(new_model)

## (Inference) Load base_model + Adapter(LoRA)

In [19]:
# Reload model in FP16 and merge it with LoRA weights
model_lora = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

model_lora = PeftModel.from_pretrained(model_lora, new_model)
model_lora = model_lora.merge_and_unload()
print(id(model_lora))

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

46917617896560


In [22]:
base_model= AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
print(id(base_model))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

46917814564512


In [20]:
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "你是一位宋词学者，帮助我学习一些宋词相关的知识。"
pipe = pipeline(task="text-generation", model=model_lora, tokenizer=tokenizer, max_length=500)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] 你是一位宋词学者，帮助我学习一些宋词相关的知识。 [/INST] 宋词是中国古代文学中的一种词语，具有很高的文学价值和历史价值。学习宋词可以帮助我们更好地理解中国古代文学的核心思想和文化传承。

宋词的概念包括：

1. 词语的意义：宋词的意义是指宋词的核心意义，它是宋词的基础和核心。
2. 词语的用法：宋词的用法是指宋词的用法和表达方式，它是宋词的表达形式。
3. 词语的推广：宋词的推广是指宋词的推广和应用，它是宋词的扩展和发展。

学习宋词的知识可以帮助我们更好地理解中国古代文学的核心思想和文化传承，同时也可以帮助我们更好地理解中国古代文学的语言和文学风格。

宋词的知识包括：

1. 宋词的概念和用法：学习宋词的概念和用法，可以帮助我们更好地理解宋词的意义


In [23]:
prompt = "你是一位宋词学者，帮助我学习一些宋词相关的知识。"
pipe = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=500)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] 你是一位宋词学者，帮助我学习一些宋词相关的知识。 [/INST]  Ah, a fellow seeker of knowledge! 😊 As a Song Dynasty scholar, I'd be delighted to share some insights and tips on learning about Song Dynasty-related knowledge.

1. Start with the basics: Begin by familiarizing yourself with the major historical events, cultural practices, and literary works of the Song Dynasty (960-1279 AD). This will provide a solid foundation for your studies.
2. Read the classics: The Song Dynasty was a period of great literary achievement, and many of the works from this era are still studied and admired today. Some of the most famous works include "The Classic of Poetry," "The Book of Odes," and "The Water Margin."
3. Explore the philosophical and political context: The Song Dynasty was marked by a complex web of philosophical and political movements, including Confucianism, Taoism, and Buddhism. Understanding these movements and their impact on society will help you appreciate the cultural context of the time.
4. Stud

In [24]:
# print(model_lora)

# print("--------")
# print(base_model)
# for param_tensor in model_lora.state_dict():
#     print(param_tensor, "\t", model_lora.state_dict()[param_tensor].size())
for param_tensor in model_lora.state_dict():
  if param_tensor == "model.layers.0.self_attn.q_proj.weight":
    # Access first 5 elements (modify as needed)
    first_five_values = model_lora.state_dict()[param_tensor][:5]
    print(param_tensor, ":\n", first_five_values)
    break

for param_tensor in base_model.state_dict():
  if param_tensor == "model.layers.0.self_attn.q_proj.weight":
    # Access first 5 elements (modify as needed)
    first_five_values = base_model.state_dict()[param_tensor][:5]
    print(param_tensor, ":\n", first_five_values)
    break  

model.layers.0.self_attn.q_proj.weight :
 tensor([[-0.0056, -0.0145, -0.0024,  ...,  0.0043,  0.0014, -0.0036],
        [ 0.0145, -0.0042,  0.0028,  ..., -0.0091, -0.0112,  0.0071],
        [-0.0140,  0.0120,  0.0005,  ...,  0.0059,  0.0185, -0.0028],
        [-0.0066,  0.0082,  0.0006,  ...,  0.0043,  0.0093, -0.0092],
        [-0.0089,  0.0151,  0.0013,  ...,  0.0029,  0.0179, -0.0004]],
       device='cuda:0', dtype=torch.float16)
model.layers.0.self_attn.q_proj.weight :
 tensor([[-0.0060, -0.0146, -0.0021,  ...,  0.0042,  0.0018, -0.0035],
        [ 0.0142, -0.0043,  0.0032,  ..., -0.0092, -0.0108,  0.0073],
        [-0.0137,  0.0121,  0.0002,  ...,  0.0061,  0.0181, -0.0030],
        [-0.0064,  0.0083,  0.0003,  ...,  0.0044,  0.0090, -0.0093],
        [-0.0086,  0.0153,  0.0010,  ...,  0.0031,  0.0176, -0.0004]],
       device='cuda:0', dtype=torch.float16)


## Boolq

In [84]:
boolq_dataset = load_dataset("super_glue", "boolq")

boolq_dataset_training=boolq_dataset["train"]
boolq_dataset_validation=boolq_dataset["validation"]
def concatenate_question_passage(example):
#     answer="True" if example["label"]==1 else "False"
    return {'combine': f"<QUESTION>{example['question']} </QUESTION> <PASSAGE> {example['passage']} </PASSAGE>{example['label']}"}

boolq_dataset_training = boolq_dataset_training.map(concatenate_question_passage)
print(boolq_dataset_training[0]["combine"])
# for doc in validation_data:
#     passage = doc["passage"]
#     question = doc["question"]
#     gold_label = doc["label"]
# boolq_dataset_training=list(boolq_dataset_training)
# print(boolq_dataset_training[0])

<QUESTION>do iran and afghanistan speak the same language </QUESTION> <PASSAGE> Persian language -- Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet. </PASSAGE>1


In [85]:
training_arguments = TrainingArguments(
    output_dir="./boolq_results",
    num_train_epochs=4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    #A100 True
    bf16=True,
    max_grad_norm=0.3,
 #   max_steps=max_steps,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)


trainer_boolq = SFTTrainer(
    model=model,
    train_dataset=boolq_dataset_training,
    peft_config=peft_config,
    dataset_text_field="combine",
#     dataset_text_field=["question", "passage"],
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [86]:
trainer_boolq.train()



{'loss': 1.7508, 'learning_rate': 0.0002, 'epoch': 0.01}




{'loss': 1.4188, 'learning_rate': 0.0002, 'epoch': 0.02}




{'loss': 1.4385, 'learning_rate': 0.0002, 'epoch': 0.03}




{'loss': 1.198, 'learning_rate': 0.0002, 'epoch': 0.04}




{'loss': 1.4071, 'learning_rate': 0.0002, 'epoch': 0.05}




{'loss': 1.1876, 'learning_rate': 0.0002, 'epoch': 0.06}




{'loss': 1.4587, 'learning_rate': 0.0002, 'epoch': 0.07}




{'loss': 1.2911, 'learning_rate': 0.0002, 'epoch': 0.08}




{'loss': 1.4408, 'learning_rate': 0.0002, 'epoch': 0.1}




{'loss': 1.253, 'learning_rate': 0.0002, 'epoch': 0.11}




{'loss': 1.3907, 'learning_rate': 0.0002, 'epoch': 0.12}




{'loss': 1.172, 'learning_rate': 0.0002, 'epoch': 0.13}




{'loss': 1.4348, 'learning_rate': 0.0002, 'epoch': 0.14}




{'loss': 1.244, 'learning_rate': 0.0002, 'epoch': 0.15}




{'loss': 1.4514, 'learning_rate': 0.0002, 'epoch': 0.16}




{'loss': 1.3392, 'learning_rate': 0.0002, 'epoch': 0.17}




{'loss': 1.4112, 'learning_rate': 0.0002, 'epoch': 0.18}




{'loss': 1.2235, 'learning_rate': 0.0002, 'epoch': 0.19}




{'loss': 1.2851, 'learning_rate': 0.0002, 'epoch': 0.2}




{'loss': 1.327, 'learning_rate': 0.0002, 'epoch': 0.21}




{'loss': 1.4013, 'learning_rate': 0.0002, 'epoch': 0.22}




{'loss': 1.223, 'learning_rate': 0.0002, 'epoch': 0.23}




{'loss': 1.3976, 'learning_rate': 0.0002, 'epoch': 0.24}




{'loss': 1.1822, 'learning_rate': 0.0002, 'epoch': 0.25}




{'loss': 1.3747, 'learning_rate': 0.0002, 'epoch': 0.27}




{'loss': 1.1842, 'learning_rate': 0.0002, 'epoch': 0.28}




{'loss': 1.3578, 'learning_rate': 0.0002, 'epoch': 0.29}




{'loss': 1.2311, 'learning_rate': 0.0002, 'epoch': 0.3}




{'loss': 1.3462, 'learning_rate': 0.0002, 'epoch': 0.31}




{'loss': 1.2394, 'learning_rate': 0.0002, 'epoch': 0.32}




{'loss': 1.3875, 'learning_rate': 0.0002, 'epoch': 0.33}




{'loss': 1.269, 'learning_rate': 0.0002, 'epoch': 0.34}




{'loss': 1.2726, 'learning_rate': 0.0002, 'epoch': 0.35}




{'loss': 1.2233, 'learning_rate': 0.0002, 'epoch': 0.36}




{'loss': 1.3673, 'learning_rate': 0.0002, 'epoch': 0.37}




{'loss': 1.2589, 'learning_rate': 0.0002, 'epoch': 0.38}




{'loss': 1.3605, 'learning_rate': 0.0002, 'epoch': 0.39}




{'loss': 1.1909, 'learning_rate': 0.0002, 'epoch': 0.4}




{'loss': 1.4179, 'learning_rate': 0.0002, 'epoch': 0.41}




{'loss': 1.2315, 'learning_rate': 0.0002, 'epoch': 0.42}




{'loss': 1.3736, 'learning_rate': 0.0002, 'epoch': 0.43}




{'loss': 1.1164, 'learning_rate': 0.0002, 'epoch': 0.45}




{'loss': 1.3652, 'learning_rate': 0.0002, 'epoch': 0.46}




{'loss': 1.2219, 'learning_rate': 0.0002, 'epoch': 0.47}




{'loss': 1.3047, 'learning_rate': 0.0002, 'epoch': 0.48}




{'loss': 1.1635, 'learning_rate': 0.0002, 'epoch': 0.49}




{'loss': 1.3421, 'learning_rate': 0.0002, 'epoch': 0.5}




{'loss': 1.2071, 'learning_rate': 0.0002, 'epoch': 0.51}




{'loss': 1.3949, 'learning_rate': 0.0002, 'epoch': 0.52}




{'loss': 1.165, 'learning_rate': 0.0002, 'epoch': 0.53}




{'loss': 1.3207, 'learning_rate': 0.0002, 'epoch': 0.54}




{'loss': 1.2135, 'learning_rate': 0.0002, 'epoch': 0.55}




{'loss': 1.3746, 'learning_rate': 0.0002, 'epoch': 0.56}




{'loss': 1.2132, 'learning_rate': 0.0002, 'epoch': 0.57}




{'loss': 1.4696, 'learning_rate': 0.0002, 'epoch': 0.58}




{'loss': 1.1875, 'learning_rate': 0.0002, 'epoch': 0.59}




{'loss': 1.3453, 'learning_rate': 0.0002, 'epoch': 0.6}




{'loss': 1.2096, 'learning_rate': 0.0002, 'epoch': 0.62}




{'loss': 1.3358, 'learning_rate': 0.0002, 'epoch': 0.63}




{'loss': 1.1606, 'learning_rate': 0.0002, 'epoch': 0.64}




{'loss': 1.4001, 'learning_rate': 0.0002, 'epoch': 0.65}




{'loss': 1.2254, 'learning_rate': 0.0002, 'epoch': 0.66}




{'loss': 1.3052, 'learning_rate': 0.0002, 'epoch': 0.67}




{'loss': 1.2234, 'learning_rate': 0.0002, 'epoch': 0.68}




{'loss': 1.3363, 'learning_rate': 0.0002, 'epoch': 0.69}




{'loss': 1.2742, 'learning_rate': 0.0002, 'epoch': 0.7}




{'loss': 1.3138, 'learning_rate': 0.0002, 'epoch': 0.71}




{'loss': 1.2295, 'learning_rate': 0.0002, 'epoch': 0.72}




{'loss': 1.3394, 'learning_rate': 0.0002, 'epoch': 0.73}




{'loss': 1.1382, 'learning_rate': 0.0002, 'epoch': 0.74}




{'loss': 1.3492, 'learning_rate': 0.0002, 'epoch': 0.75}




{'loss': 1.0757, 'learning_rate': 0.0002, 'epoch': 0.76}




{'loss': 1.3605, 'learning_rate': 0.0002, 'epoch': 0.77}




{'loss': 1.1317, 'learning_rate': 0.0002, 'epoch': 0.78}




{'loss': 1.3197, 'learning_rate': 0.0002, 'epoch': 0.8}




{'loss': 1.2001, 'learning_rate': 0.0002, 'epoch': 0.81}




{'loss': 1.3444, 'learning_rate': 0.0002, 'epoch': 0.82}




{'loss': 1.2295, 'learning_rate': 0.0002, 'epoch': 0.83}




{'loss': 1.295, 'learning_rate': 0.0002, 'epoch': 0.84}




{'loss': 1.0816, 'learning_rate': 0.0002, 'epoch': 0.85}




{'loss': 1.3808, 'learning_rate': 0.0002, 'epoch': 0.86}




{'loss': 1.1398, 'learning_rate': 0.0002, 'epoch': 0.87}




{'loss': 1.3655, 'learning_rate': 0.0002, 'epoch': 0.88}




{'loss': 1.152, 'learning_rate': 0.0002, 'epoch': 0.89}




{'loss': 1.3549, 'learning_rate': 0.0002, 'epoch': 0.9}




{'loss': 1.1398, 'learning_rate': 0.0002, 'epoch': 0.91}




{'loss': 1.2869, 'learning_rate': 0.0002, 'epoch': 0.92}




{'loss': 1.1758, 'learning_rate': 0.0002, 'epoch': 0.93}




{'loss': 1.2364, 'learning_rate': 0.0002, 'epoch': 0.94}




{'loss': 1.1373, 'learning_rate': 0.0002, 'epoch': 0.95}




{'loss': 1.359, 'learning_rate': 0.0002, 'epoch': 0.97}




{'loss': 1.1538, 'learning_rate': 0.0002, 'epoch': 0.98}




{'loss': 1.2989, 'learning_rate': 0.0002, 'epoch': 0.99}




{'loss': 1.2018, 'learning_rate': 0.0002, 'epoch': 1.0}




{'loss': 1.3483, 'learning_rate': 0.0002, 'epoch': 1.01}




{'loss': 1.1802, 'learning_rate': 0.0002, 'epoch': 1.02}




{'loss': 1.2006, 'learning_rate': 0.0002, 'epoch': 1.03}




{'loss': 1.0284, 'learning_rate': 0.0002, 'epoch': 1.04}




{'loss': 1.1793, 'learning_rate': 0.0002, 'epoch': 1.05}




{'loss': 1.1457, 'learning_rate': 0.0002, 'epoch': 1.06}




{'loss': 1.1541, 'learning_rate': 0.0002, 'epoch': 1.07}




{'loss': 1.1244, 'learning_rate': 0.0002, 'epoch': 1.08}




{'loss': 1.208, 'learning_rate': 0.0002, 'epoch': 1.09}




{'loss': 1.1605, 'learning_rate': 0.0002, 'epoch': 1.1}




{'loss': 1.2844, 'learning_rate': 0.0002, 'epoch': 1.11}




{'loss': 1.0564, 'learning_rate': 0.0002, 'epoch': 1.12}




{'loss': 1.3079, 'learning_rate': 0.0002, 'epoch': 1.13}




{'loss': 1.1006, 'learning_rate': 0.0002, 'epoch': 1.15}




{'loss': 1.249, 'learning_rate': 0.0002, 'epoch': 1.16}




{'loss': 1.0413, 'learning_rate': 0.0002, 'epoch': 1.17}




{'loss': 1.1723, 'learning_rate': 0.0002, 'epoch': 1.18}




{'loss': 1.0798, 'learning_rate': 0.0002, 'epoch': 1.19}




{'loss': 1.1679, 'learning_rate': 0.0002, 'epoch': 1.2}




{'loss': 1.1034, 'learning_rate': 0.0002, 'epoch': 1.21}




{'loss': 1.1891, 'learning_rate': 0.0002, 'epoch': 1.22}




{'loss': 1.0757, 'learning_rate': 0.0002, 'epoch': 1.23}




{'loss': 1.1479, 'learning_rate': 0.0002, 'epoch': 1.24}




{'loss': 1.1506, 'learning_rate': 0.0002, 'epoch': 1.25}




{'loss': 1.1924, 'learning_rate': 0.0002, 'epoch': 1.26}




{'loss': 1.0981, 'learning_rate': 0.0002, 'epoch': 1.27}




{'loss': 1.1418, 'learning_rate': 0.0002, 'epoch': 1.28}




{'loss': 1.0772, 'learning_rate': 0.0002, 'epoch': 1.29}




{'loss': 1.1849, 'learning_rate': 0.0002, 'epoch': 1.3}




{'loss': 0.9974, 'learning_rate': 0.0002, 'epoch': 1.32}




{'loss': 1.2312, 'learning_rate': 0.0002, 'epoch': 1.33}




{'loss': 1.0452, 'learning_rate': 0.0002, 'epoch': 1.34}




{'loss': 1.1679, 'learning_rate': 0.0002, 'epoch': 1.35}




{'loss': 1.0857, 'learning_rate': 0.0002, 'epoch': 1.36}




{'loss': 1.2497, 'learning_rate': 0.0002, 'epoch': 1.37}




{'loss': 1.0921, 'learning_rate': 0.0002, 'epoch': 1.38}




{'loss': 1.2481, 'learning_rate': 0.0002, 'epoch': 1.39}




{'loss': 1.0696, 'learning_rate': 0.0002, 'epoch': 1.4}




{'loss': 1.1754, 'learning_rate': 0.0002, 'epoch': 1.41}




{'loss': 1.0822, 'learning_rate': 0.0002, 'epoch': 1.42}




{'loss': 1.2025, 'learning_rate': 0.0002, 'epoch': 1.43}




{'loss': 0.9652, 'learning_rate': 0.0002, 'epoch': 1.44}




{'loss': 1.1401, 'learning_rate': 0.0002, 'epoch': 1.45}




{'loss': 1.099, 'learning_rate': 0.0002, 'epoch': 1.46}




{'loss': 1.2102, 'learning_rate': 0.0002, 'epoch': 1.47}




{'loss': 1.1089, 'learning_rate': 0.0002, 'epoch': 1.48}




{'loss': 1.2085, 'learning_rate': 0.0002, 'epoch': 1.5}




{'loss': 1.1508, 'learning_rate': 0.0002, 'epoch': 1.51}




{'loss': 1.1532, 'learning_rate': 0.0002, 'epoch': 1.52}




{'loss': 1.0338, 'learning_rate': 0.0002, 'epoch': 1.53}




{'loss': 1.1975, 'learning_rate': 0.0002, 'epoch': 1.54}




{'loss': 1.0882, 'learning_rate': 0.0002, 'epoch': 1.55}




{'loss': 1.1932, 'learning_rate': 0.0002, 'epoch': 1.56}




{'loss': 1.0076, 'learning_rate': 0.0002, 'epoch': 1.57}




{'loss': 1.1722, 'learning_rate': 0.0002, 'epoch': 1.58}




{'loss': 1.1511, 'learning_rate': 0.0002, 'epoch': 1.59}




{'loss': 1.1484, 'learning_rate': 0.0002, 'epoch': 1.6}




{'loss': 1.1202, 'learning_rate': 0.0002, 'epoch': 1.61}




{'loss': 1.1054, 'learning_rate': 0.0002, 'epoch': 1.62}




{'loss': 1.0553, 'learning_rate': 0.0002, 'epoch': 1.63}




{'loss': 1.222, 'learning_rate': 0.0002, 'epoch': 1.64}




{'loss': 1.0411, 'learning_rate': 0.0002, 'epoch': 1.65}




{'loss': 1.2492, 'learning_rate': 0.0002, 'epoch': 1.67}




{'loss': 1.1189, 'learning_rate': 0.0002, 'epoch': 1.68}




{'loss': 1.2517, 'learning_rate': 0.0002, 'epoch': 1.69}




{'loss': 1.0979, 'learning_rate': 0.0002, 'epoch': 1.7}




{'loss': 1.2534, 'learning_rate': 0.0002, 'epoch': 1.71}




{'loss': 1.0719, 'learning_rate': 0.0002, 'epoch': 1.72}




{'loss': 1.1444, 'learning_rate': 0.0002, 'epoch': 1.73}




{'loss': 1.001, 'learning_rate': 0.0002, 'epoch': 1.74}




{'loss': 1.1286, 'learning_rate': 0.0002, 'epoch': 1.75}




{'loss': 1.1009, 'learning_rate': 0.0002, 'epoch': 1.76}




{'loss': 1.1372, 'learning_rate': 0.0002, 'epoch': 1.77}




{'loss': 0.9291, 'learning_rate': 0.0002, 'epoch': 1.78}




{'loss': 1.2308, 'learning_rate': 0.0002, 'epoch': 1.79}




{'loss': 1.0287, 'learning_rate': 0.0002, 'epoch': 1.8}




{'loss': 1.1995, 'learning_rate': 0.0002, 'epoch': 1.81}




{'loss': 1.0981, 'learning_rate': 0.0002, 'epoch': 1.82}




{'loss': 1.2006, 'learning_rate': 0.0002, 'epoch': 1.83}




{'loss': 1.0696, 'learning_rate': 0.0002, 'epoch': 1.85}




{'loss': 1.2163, 'learning_rate': 0.0002, 'epoch': 1.86}




{'loss': 1.0035, 'learning_rate': 0.0002, 'epoch': 1.87}




{'loss': 1.1204, 'learning_rate': 0.0002, 'epoch': 1.88}




{'loss': 1.123, 'learning_rate': 0.0002, 'epoch': 1.89}




{'loss': 1.204, 'learning_rate': 0.0002, 'epoch': 1.9}




{'loss': 1.068, 'learning_rate': 0.0002, 'epoch': 1.91}




{'loss': 1.1614, 'learning_rate': 0.0002, 'epoch': 1.92}




{'loss': 1.0529, 'learning_rate': 0.0002, 'epoch': 1.93}




{'loss': 1.1451, 'learning_rate': 0.0002, 'epoch': 1.94}




{'loss': 1.0948, 'learning_rate': 0.0002, 'epoch': 1.95}




{'loss': 1.1979, 'learning_rate': 0.0002, 'epoch': 1.96}




{'loss': 0.9516, 'learning_rate': 0.0002, 'epoch': 1.97}




{'loss': 1.1801, 'learning_rate': 0.0002, 'epoch': 1.98}




{'loss': 1.0537, 'learning_rate': 0.0002, 'epoch': 1.99}




{'loss': 1.0936, 'learning_rate': 0.0002, 'epoch': 2.0}




{'loss': 1.035, 'learning_rate': 0.0002, 'epoch': 2.02}




{'loss': 0.9951, 'learning_rate': 0.0002, 'epoch': 2.03}




{'loss': 0.96, 'learning_rate': 0.0002, 'epoch': 2.04}




{'loss': 0.9433, 'learning_rate': 0.0002, 'epoch': 2.05}




{'loss': 0.8931, 'learning_rate': 0.0002, 'epoch': 2.06}




{'loss': 0.9647, 'learning_rate': 0.0002, 'epoch': 2.07}




{'loss': 0.9687, 'learning_rate': 0.0002, 'epoch': 2.08}




{'loss': 1.0093, 'learning_rate': 0.0002, 'epoch': 2.09}




{'loss': 0.9023, 'learning_rate': 0.0002, 'epoch': 2.1}




{'loss': 0.9807, 'learning_rate': 0.0002, 'epoch': 2.11}




{'loss': 0.9754, 'learning_rate': 0.0002, 'epoch': 2.12}




{'loss': 1.0024, 'learning_rate': 0.0002, 'epoch': 2.13}




{'loss': 0.8727, 'learning_rate': 0.0002, 'epoch': 2.14}




{'loss': 0.9711, 'learning_rate': 0.0002, 'epoch': 2.15}




{'loss': 0.9708, 'learning_rate': 0.0002, 'epoch': 2.16}




{'loss': 1.0275, 'learning_rate': 0.0002, 'epoch': 2.17}




{'loss': 1.0057, 'learning_rate': 0.0002, 'epoch': 2.18}




{'loss': 1.045, 'learning_rate': 0.0002, 'epoch': 2.2}




{'loss': 1.0269, 'learning_rate': 0.0002, 'epoch': 2.21}




{'loss': 1.1293, 'learning_rate': 0.0002, 'epoch': 2.22}




{'loss': 0.9267, 'learning_rate': 0.0002, 'epoch': 2.23}




{'loss': 0.9753, 'learning_rate': 0.0002, 'epoch': 2.24}




{'loss': 0.9237, 'learning_rate': 0.0002, 'epoch': 2.25}




{'loss': 1.0855, 'learning_rate': 0.0002, 'epoch': 2.26}




{'loss': 0.9254, 'learning_rate': 0.0002, 'epoch': 2.27}




{'loss': 1.0056, 'learning_rate': 0.0002, 'epoch': 2.28}




{'loss': 0.9838, 'learning_rate': 0.0002, 'epoch': 2.29}




{'loss': 0.9694, 'learning_rate': 0.0002, 'epoch': 2.3}




{'loss': 0.9518, 'learning_rate': 0.0002, 'epoch': 2.31}




{'loss': 0.953, 'learning_rate': 0.0002, 'epoch': 2.32}




{'loss': 0.9508, 'learning_rate': 0.0002, 'epoch': 2.33}




{'loss': 0.9833, 'learning_rate': 0.0002, 'epoch': 2.34}




{'loss': 0.8942, 'learning_rate': 0.0002, 'epoch': 2.35}




{'loss': 0.9968, 'learning_rate': 0.0002, 'epoch': 2.37}




{'loss': 0.9212, 'learning_rate': 0.0002, 'epoch': 2.38}




{'loss': 0.9888, 'learning_rate': 0.0002, 'epoch': 2.39}




{'loss': 0.9343, 'learning_rate': 0.0002, 'epoch': 2.4}




{'loss': 0.8929, 'learning_rate': 0.0002, 'epoch': 2.41}




{'loss': 0.949, 'learning_rate': 0.0002, 'epoch': 2.42}




{'loss': 0.9483, 'learning_rate': 0.0002, 'epoch': 2.43}




{'loss': 1.0371, 'learning_rate': 0.0002, 'epoch': 2.44}




{'loss': 0.9528, 'learning_rate': 0.0002, 'epoch': 2.45}




{'loss': 0.9733, 'learning_rate': 0.0002, 'epoch': 2.46}




{'loss': 1.0257, 'learning_rate': 0.0002, 'epoch': 2.47}




{'loss': 0.9628, 'learning_rate': 0.0002, 'epoch': 2.48}




{'loss': 0.9736, 'learning_rate': 0.0002, 'epoch': 2.49}




{'loss': 0.9417, 'learning_rate': 0.0002, 'epoch': 2.5}




{'loss': 1.01, 'learning_rate': 0.0002, 'epoch': 2.51}




{'loss': 0.9624, 'learning_rate': 0.0002, 'epoch': 2.52}




{'loss': 1.0162, 'learning_rate': 0.0002, 'epoch': 2.54}




{'loss': 1.0302, 'learning_rate': 0.0002, 'epoch': 2.55}




{'loss': 1.0139, 'learning_rate': 0.0002, 'epoch': 2.56}




{'loss': 1.0302, 'learning_rate': 0.0002, 'epoch': 2.57}




{'loss': 1.0964, 'learning_rate': 0.0002, 'epoch': 2.58}




{'loss': 0.9744, 'learning_rate': 0.0002, 'epoch': 2.59}




{'loss': 0.9834, 'learning_rate': 0.0002, 'epoch': 2.6}




{'loss': 0.954, 'learning_rate': 0.0002, 'epoch': 2.61}




{'loss': 1.0151, 'learning_rate': 0.0002, 'epoch': 2.62}




{'loss': 0.9065, 'learning_rate': 0.0002, 'epoch': 2.63}




{'loss': 0.9755, 'learning_rate': 0.0002, 'epoch': 2.64}




{'loss': 0.8857, 'learning_rate': 0.0002, 'epoch': 2.65}




{'loss': 0.951, 'learning_rate': 0.0002, 'epoch': 2.66}




{'loss': 0.9437, 'learning_rate': 0.0002, 'epoch': 2.67}




{'loss': 1.0434, 'learning_rate': 0.0002, 'epoch': 2.68}




{'loss': 1.0229, 'learning_rate': 0.0002, 'epoch': 2.69}




{'loss': 0.9576, 'learning_rate': 0.0002, 'epoch': 2.7}




{'loss': 0.9512, 'learning_rate': 0.0002, 'epoch': 2.72}




{'loss': 0.9279, 'learning_rate': 0.0002, 'epoch': 2.73}




{'loss': 0.9706, 'learning_rate': 0.0002, 'epoch': 2.74}




{'loss': 1.0056, 'learning_rate': 0.0002, 'epoch': 2.75}




{'loss': 0.9574, 'learning_rate': 0.0002, 'epoch': 2.76}




{'loss': 1.0028, 'learning_rate': 0.0002, 'epoch': 2.77}




{'loss': 0.9196, 'learning_rate': 0.0002, 'epoch': 2.78}




{'loss': 0.9411, 'learning_rate': 0.0002, 'epoch': 2.79}




{'loss': 0.8961, 'learning_rate': 0.0002, 'epoch': 2.8}




{'loss': 0.9538, 'learning_rate': 0.0002, 'epoch': 2.81}




{'loss': 0.8965, 'learning_rate': 0.0002, 'epoch': 2.82}




{'loss': 0.95, 'learning_rate': 0.0002, 'epoch': 2.83}




{'loss': 0.8743, 'learning_rate': 0.0002, 'epoch': 2.84}




{'loss': 1.0438, 'learning_rate': 0.0002, 'epoch': 2.85}




{'loss': 0.8615, 'learning_rate': 0.0002, 'epoch': 2.86}




{'loss': 1.0405, 'learning_rate': 0.0002, 'epoch': 2.87}




{'loss': 0.9617, 'learning_rate': 0.0002, 'epoch': 2.89}




{'loss': 0.9777, 'learning_rate': 0.0002, 'epoch': 2.9}




{'loss': 1.0277, 'learning_rate': 0.0002, 'epoch': 2.91}




{'loss': 0.9742, 'learning_rate': 0.0002, 'epoch': 2.92}




{'loss': 0.8978, 'learning_rate': 0.0002, 'epoch': 2.93}




{'loss': 0.9452, 'learning_rate': 0.0002, 'epoch': 2.94}




{'loss': 0.9816, 'learning_rate': 0.0002, 'epoch': 2.95}




{'loss': 0.9185, 'learning_rate': 0.0002, 'epoch': 2.96}




{'loss': 1.008, 'learning_rate': 0.0002, 'epoch': 2.97}




{'loss': 0.9218, 'learning_rate': 0.0002, 'epoch': 2.98}




{'loss': 0.84, 'learning_rate': 0.0002, 'epoch': 2.99}




{'loss': 0.8937, 'learning_rate': 0.0002, 'epoch': 3.0}




{'loss': 0.8362, 'learning_rate': 0.0002, 'epoch': 3.01}




{'loss': 0.6728, 'learning_rate': 0.0002, 'epoch': 3.02}




{'loss': 0.8629, 'learning_rate': 0.0002, 'epoch': 3.03}




{'loss': 0.7715, 'learning_rate': 0.0002, 'epoch': 3.04}




{'loss': 0.8747, 'learning_rate': 0.0002, 'epoch': 3.05}




{'loss': 0.7823, 'learning_rate': 0.0002, 'epoch': 3.07}




{'loss': 0.8093, 'learning_rate': 0.0002, 'epoch': 3.08}




{'loss': 0.7966, 'learning_rate': 0.0002, 'epoch': 3.09}




{'loss': 0.8, 'learning_rate': 0.0002, 'epoch': 3.1}




{'loss': 0.8326, 'learning_rate': 0.0002, 'epoch': 3.11}




{'loss': 0.8374, 'learning_rate': 0.0002, 'epoch': 3.12}




{'loss': 0.7896, 'learning_rate': 0.0002, 'epoch': 3.13}




{'loss': 0.8187, 'learning_rate': 0.0002, 'epoch': 3.14}




{'loss': 0.7786, 'learning_rate': 0.0002, 'epoch': 3.15}




{'loss': 0.8573, 'learning_rate': 0.0002, 'epoch': 3.16}




{'loss': 0.7897, 'learning_rate': 0.0002, 'epoch': 3.17}




{'loss': 0.9068, 'learning_rate': 0.0002, 'epoch': 3.18}




{'loss': 0.7584, 'learning_rate': 0.0002, 'epoch': 3.19}




{'loss': 0.9123, 'learning_rate': 0.0002, 'epoch': 3.2}




{'loss': 0.7966, 'learning_rate': 0.0002, 'epoch': 3.21}




{'loss': 0.8433, 'learning_rate': 0.0002, 'epoch': 3.22}




{'loss': 0.7298, 'learning_rate': 0.0002, 'epoch': 3.24}




{'loss': 0.9071, 'learning_rate': 0.0002, 'epoch': 3.25}




{'loss': 0.7852, 'learning_rate': 0.0002, 'epoch': 3.26}




{'loss': 0.8348, 'learning_rate': 0.0002, 'epoch': 3.27}




{'loss': 0.7786, 'learning_rate': 0.0002, 'epoch': 3.28}




{'loss': 0.8667, 'learning_rate': 0.0002, 'epoch': 3.29}




{'loss': 0.7397, 'learning_rate': 0.0002, 'epoch': 3.3}




{'loss': 0.858, 'learning_rate': 0.0002, 'epoch': 3.31}




{'loss': 0.7909, 'learning_rate': 0.0002, 'epoch': 3.32}




{'loss': 0.82, 'learning_rate': 0.0002, 'epoch': 3.33}




{'loss': 0.7862, 'learning_rate': 0.0002, 'epoch': 3.34}




{'loss': 0.8949, 'learning_rate': 0.0002, 'epoch': 3.35}




{'loss': 0.7937, 'learning_rate': 0.0002, 'epoch': 3.36}




{'loss': 0.8853, 'learning_rate': 0.0002, 'epoch': 3.37}




{'loss': 0.8145, 'learning_rate': 0.0002, 'epoch': 3.38}




{'loss': 0.9312, 'learning_rate': 0.0002, 'epoch': 3.39}




{'loss': 0.7961, 'learning_rate': 0.0002, 'epoch': 3.4}




{'loss': 0.7822, 'learning_rate': 0.0002, 'epoch': 3.42}




{'loss': 0.7059, 'learning_rate': 0.0002, 'epoch': 3.43}




{'loss': 0.8933, 'learning_rate': 0.0002, 'epoch': 3.44}




{'loss': 0.7915, 'learning_rate': 0.0002, 'epoch': 3.45}




{'loss': 0.885, 'learning_rate': 0.0002, 'epoch': 3.46}




{'loss': 0.754, 'learning_rate': 0.0002, 'epoch': 3.47}




{'loss': 0.9126, 'learning_rate': 0.0002, 'epoch': 3.48}




{'loss': 0.7608, 'learning_rate': 0.0002, 'epoch': 3.49}




{'loss': 0.8596, 'learning_rate': 0.0002, 'epoch': 3.5}




{'loss': 0.7055, 'learning_rate': 0.0002, 'epoch': 3.51}




{'loss': 0.8999, 'learning_rate': 0.0002, 'epoch': 3.52}




{'loss': 0.778, 'learning_rate': 0.0002, 'epoch': 3.53}




{'loss': 0.7902, 'learning_rate': 0.0002, 'epoch': 3.54}




{'loss': 0.8277, 'learning_rate': 0.0002, 'epoch': 3.55}




{'loss': 0.8398, 'learning_rate': 0.0002, 'epoch': 3.56}




{'loss': 0.7532, 'learning_rate': 0.0002, 'epoch': 3.57}




{'loss': 0.8451, 'learning_rate': 0.0002, 'epoch': 3.59}




{'loss': 0.8075, 'learning_rate': 0.0002, 'epoch': 3.6}




{'loss': 0.8636, 'learning_rate': 0.0002, 'epoch': 3.61}




{'loss': 0.8012, 'learning_rate': 0.0002, 'epoch': 3.62}




{'loss': 0.7993, 'learning_rate': 0.0002, 'epoch': 3.63}




{'loss': 0.7192, 'learning_rate': 0.0002, 'epoch': 3.64}




{'loss': 0.8414, 'learning_rate': 0.0002, 'epoch': 3.65}




{'loss': 0.7904, 'learning_rate': 0.0002, 'epoch': 3.66}




{'loss': 0.8623, 'learning_rate': 0.0002, 'epoch': 3.67}




{'loss': 0.7581, 'learning_rate': 0.0002, 'epoch': 3.68}




{'loss': 0.8867, 'learning_rate': 0.0002, 'epoch': 3.69}




{'loss': 0.7903, 'learning_rate': 0.0002, 'epoch': 3.7}




{'loss': 0.8501, 'learning_rate': 0.0002, 'epoch': 3.71}




{'loss': 0.8549, 'learning_rate': 0.0002, 'epoch': 3.72}




{'loss': 0.8516, 'learning_rate': 0.0002, 'epoch': 3.73}




{'loss': 0.793, 'learning_rate': 0.0002, 'epoch': 3.74}




{'loss': 0.9203, 'learning_rate': 0.0002, 'epoch': 3.75}




{'loss': 0.7867, 'learning_rate': 0.0002, 'epoch': 3.77}




{'loss': 0.839, 'learning_rate': 0.0002, 'epoch': 3.78}




{'loss': 0.7529, 'learning_rate': 0.0002, 'epoch': 3.79}




{'loss': 0.8359, 'learning_rate': 0.0002, 'epoch': 3.8}




{'loss': 0.8242, 'learning_rate': 0.0002, 'epoch': 3.81}




{'loss': 0.9656, 'learning_rate': 0.0002, 'epoch': 3.82}




{'loss': 0.7352, 'learning_rate': 0.0002, 'epoch': 3.83}




{'loss': 0.9407, 'learning_rate': 0.0002, 'epoch': 3.84}




{'loss': 0.839, 'learning_rate': 0.0002, 'epoch': 3.85}




{'loss': 0.8807, 'learning_rate': 0.0002, 'epoch': 3.86}




{'loss': 0.7941, 'learning_rate': 0.0002, 'epoch': 3.87}




{'loss': 0.8505, 'learning_rate': 0.0002, 'epoch': 3.88}




{'loss': 0.7494, 'learning_rate': 0.0002, 'epoch': 3.89}




{'loss': 0.8972, 'learning_rate': 0.0002, 'epoch': 3.9}




{'loss': 0.8157, 'learning_rate': 0.0002, 'epoch': 3.91}




{'loss': 0.9518, 'learning_rate': 0.0002, 'epoch': 3.92}




{'loss': 0.7676, 'learning_rate': 0.0002, 'epoch': 3.94}




{'loss': 0.9218, 'learning_rate': 0.0002, 'epoch': 3.95}




{'loss': 0.7558, 'learning_rate': 0.0002, 'epoch': 3.96}




{'loss': 0.7977, 'learning_rate': 0.0002, 'epoch': 3.97}




{'loss': 0.7678, 'learning_rate': 0.0002, 'epoch': 3.98}




{'loss': 0.8469, 'learning_rate': 0.0002, 'epoch': 3.99}




{'loss': 0.7109, 'learning_rate': 0.0002, 'epoch': 4.0}




{'train_runtime': 6260.8206, 'train_samples_per_second': 6.023, 'train_steps_per_second': 1.506, 'train_loss': 1.0528914769119664, 'epoch': 4.0}


TrainOutput(global_step=9428, training_loss=1.0528914769119664, metrics={'train_runtime': 6260.8206, 'train_samples_per_second': 6.023, 'train_steps_per_second': 1.506, 'train_loss': 1.0528914769119664, 'epoch': 4.0})

In [87]:
new_model = "llama-2-7b-boolq"
trainer_boolq.model.save_pretrained(new_model)

In [88]:
# Reload model in FP16 and merge it with LoRA weights
new_model = "llama-2-7b-boolq"
model_lora_boolq = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

model_lora_boolq = PeftModel.from_pretrained(model_lora_boolq, new_model)
model_lora_boolq = model_lora_boolq.merge_and_unload()
print(id(model_lora_boolq))

# Reload tokenizer to save it
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

46922335917680


In [89]:
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = f"<QUESTION>is windows movie maker part of windows essentials</QUESTION> <PASSAGE> Windows Movie Maker -- Windows Movie Maker (formerly known as Windows Live Movie Maker in Windows 7) is a discontinued video editing software by Microsoft. It is a part of Windows Essentials software suite and offers the ability to create and edit videos as well as to publish them on OneDrive, Facebook, Vimeo, YouTube, and Flickr.</PASSAGE>"

pipe = pipeline(task="text-generation", model=model_lora_boolq, tokenizer=tokenizer, max_length=200)
# result = pipe(prompt)
result = pipe(prompt, max_length=1, num_return_sequences=1, do_sample=True)
generated_answer = result[0]['generated_text'][-1]
print(generated_answer)
print(result[0]['generated_text'])



1
<QUESTION>is windows movie maker part of windows essentials</QUESTION> <PASSAGE> Windows Movie Maker -- Windows Movie Maker (formerly known as Windows Live Movie Maker in Windows 7) is a discontinued video editing software by Microsoft. It is a part of Windows Essentials software suite and offers the ability to create and edit videos as well as to publish them on OneDrive, Facebook, Vimeo, YouTube, and Flickr.</PASSAGE>1


In [90]:
boolq_dataset_validation=boolq_dataset["validation"]
def concatenate_question_passage(example):
#     answer="True" if example["label"]==1 else "False"
    return {'combine': f"<QUESTION>{example['question']} </QUESTION> <PASSAGE> {example['passage']} </PASSAGE>"}

boolq_dataset_validation = boolq_dataset_validation.map(concatenate_question_passage)
print(boolq_dataset_validation[0])

{'question': 'does ethanol take more energy make that produces', 'passage': "Ethanol fuel -- All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separa

In [91]:
correct = 0
total = len(boolq_dataset_validation)
pipe = pipeline(task="text-generation", model=model_lora_boolq, tokenizer=tokenizer, max_length=200)


for example in boolq_dataset_validation:
    prompt=example['combine']
    result = pipe(prompt, max_length=1, num_return_sequences=1, do_sample=True)
    generated_answer = result[0]['generated_text'][-1]
    if int(generated_answer) == example['label']:
        correct += 1

accuracy = correct / total
print(f"Accuracy: {accuracy}")



Accuracy: 0.7128440366972477


## RTE

In [94]:
rte_dataset = load_dataset("super_glue", "rte")

rte_dataset_training=rte_dataset["train"]
print(rte_dataset_training)
rte_dataset_validation=boolq_dataset["validation"]
def concatenate_question_passage(example):
#     answer="True" if example["label"]==1 else "False"
    return {'combine': f"<PREMISE>{example['premise']} </PREMISE> <HYPOTHESIS> {example['hypothesis']} </HYPOTHESIS>{example['label']}"}

rte_dataset_training = rte_dataset_training.map(concatenate_question_passage)
print(rte_dataset_training[0]["combine"])

Dataset({
    features: ['premise', 'hypothesis', 'idx', 'label'],
    num_rows: 2490
})


Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

<PREMISE>No Weapons of Mass Destruction Found in Iraq Yet. </PREMISE> <HYPOTHESIS> Weapons of Mass Destruction Found in Iraq. </HYPOTHESIS>1


In [96]:
training_arguments = TrainingArguments(
    output_dir="./rte_results",
    num_train_epochs=4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    # change the lr
    learning_rate=2e-5,
    weight_decay=0.001,
    fp16=False,
    #A100 True
    bf16=True,
    max_grad_norm=0.3,
 #   max_steps=max_steps,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)


trainer_rte = SFTTrainer(
    model=model,
    train_dataset=rte_dataset_training,
    peft_config=peft_config,
    dataset_text_field="combine",
#     dataset_text_field=["question", "passage"],
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)



Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [97]:
trainer_rte.train()
new_model = "llama-2-7b-rte"
trainer_rte.model.save_pretrained(new_model)



{'loss': 2.4682, 'learning_rate': 2e-05, 'epoch': 0.04}




{'loss': 2.4438, 'learning_rate': 2e-05, 'epoch': 0.08}




{'loss': 2.0747, 'learning_rate': 2e-05, 'epoch': 0.12}




{'loss': 1.9197, 'learning_rate': 2e-05, 'epoch': 0.16}




{'loss': 1.8237, 'learning_rate': 2e-05, 'epoch': 0.2}




{'loss': 1.6237, 'learning_rate': 2e-05, 'epoch': 0.24}




{'loss': 1.7244, 'learning_rate': 2e-05, 'epoch': 0.28}




{'loss': 1.5505, 'learning_rate': 2e-05, 'epoch': 0.32}




{'loss': 1.6944, 'learning_rate': 2e-05, 'epoch': 0.36}




{'loss': 1.4403, 'learning_rate': 2e-05, 'epoch': 0.4}




{'loss': 1.6739, 'learning_rate': 2e-05, 'epoch': 0.44}




{'loss': 1.4274, 'learning_rate': 2e-05, 'epoch': 0.48}




{'loss': 1.6403, 'learning_rate': 2e-05, 'epoch': 0.52}




{'loss': 1.3541, 'learning_rate': 2e-05, 'epoch': 0.56}




{'loss': 1.6377, 'learning_rate': 2e-05, 'epoch': 0.6}




{'loss': 1.2752, 'learning_rate': 2e-05, 'epoch': 0.64}




{'loss': 1.7058, 'learning_rate': 2e-05, 'epoch': 0.68}




{'loss': 1.3246, 'learning_rate': 2e-05, 'epoch': 0.72}




{'loss': 1.5646, 'learning_rate': 2e-05, 'epoch': 0.76}




{'loss': 1.3224, 'learning_rate': 2e-05, 'epoch': 0.8}




{'loss': 1.5764, 'learning_rate': 2e-05, 'epoch': 0.84}




{'loss': 1.2658, 'learning_rate': 2e-05, 'epoch': 0.88}




{'loss': 1.5032, 'learning_rate': 2e-05, 'epoch': 0.92}




{'loss': 1.2736, 'learning_rate': 2e-05, 'epoch': 0.96}




{'loss': 1.4598, 'learning_rate': 2e-05, 'epoch': 1.0}




{'loss': 1.5105, 'learning_rate': 2e-05, 'epoch': 1.04}




{'loss': 1.3255, 'learning_rate': 2e-05, 'epoch': 1.08}




{'loss': 1.4859, 'learning_rate': 2e-05, 'epoch': 1.12}




{'loss': 1.3336, 'learning_rate': 2e-05, 'epoch': 1.16}




{'loss': 1.5203, 'learning_rate': 2e-05, 'epoch': 1.2}




{'loss': 1.2204, 'learning_rate': 2e-05, 'epoch': 1.24}




{'loss': 1.5551, 'learning_rate': 2e-05, 'epoch': 1.28}




{'loss': 1.347, 'learning_rate': 2e-05, 'epoch': 1.32}




{'loss': 1.5099, 'learning_rate': 2e-05, 'epoch': 1.36}




{'loss': 1.3462, 'learning_rate': 2e-05, 'epoch': 1.4}




{'loss': 1.5818, 'learning_rate': 2e-05, 'epoch': 1.44}




{'loss': 1.3098, 'learning_rate': 2e-05, 'epoch': 1.48}




{'loss': 1.5571, 'learning_rate': 2e-05, 'epoch': 1.52}




{'loss': 1.3366, 'learning_rate': 2e-05, 'epoch': 1.57}




{'loss': 1.5225, 'learning_rate': 2e-05, 'epoch': 1.61}




{'loss': 1.3225, 'learning_rate': 2e-05, 'epoch': 1.65}




{'loss': 1.5451, 'learning_rate': 2e-05, 'epoch': 1.69}




{'loss': 1.2907, 'learning_rate': 2e-05, 'epoch': 1.73}




{'loss': 1.4968, 'learning_rate': 2e-05, 'epoch': 1.77}




{'loss': 1.3187, 'learning_rate': 2e-05, 'epoch': 1.81}




{'loss': 1.5228, 'learning_rate': 2e-05, 'epoch': 1.85}




{'loss': 1.2939, 'learning_rate': 2e-05, 'epoch': 1.89}




{'loss': 1.5595, 'learning_rate': 2e-05, 'epoch': 1.93}




{'loss': 1.2955, 'learning_rate': 2e-05, 'epoch': 1.97}




{'loss': 1.4159, 'learning_rate': 2e-05, 'epoch': 2.01}




{'loss': 1.4802, 'learning_rate': 2e-05, 'epoch': 2.05}




{'loss': 1.284, 'learning_rate': 2e-05, 'epoch': 2.09}




{'loss': 1.4781, 'learning_rate': 2e-05, 'epoch': 2.13}




{'loss': 1.3101, 'learning_rate': 2e-05, 'epoch': 2.17}




{'loss': 1.4887, 'learning_rate': 2e-05, 'epoch': 2.21}




{'loss': 1.3148, 'learning_rate': 2e-05, 'epoch': 2.25}




{'loss': 1.5088, 'learning_rate': 2e-05, 'epoch': 2.29}




{'loss': 1.3555, 'learning_rate': 2e-05, 'epoch': 2.33}




{'loss': 1.4945, 'learning_rate': 2e-05, 'epoch': 2.37}




{'loss': 1.306, 'learning_rate': 2e-05, 'epoch': 2.41}




{'loss': 1.5177, 'learning_rate': 2e-05, 'epoch': 2.45}




{'loss': 1.3626, 'learning_rate': 2e-05, 'epoch': 2.49}




{'loss': 1.4477, 'learning_rate': 2e-05, 'epoch': 2.53}




{'loss': 1.3359, 'learning_rate': 2e-05, 'epoch': 2.57}




{'loss': 1.4889, 'learning_rate': 2e-05, 'epoch': 2.61}




{'loss': 1.324, 'learning_rate': 2e-05, 'epoch': 2.65}




{'loss': 1.4955, 'learning_rate': 2e-05, 'epoch': 2.69}




{'loss': 1.2955, 'learning_rate': 2e-05, 'epoch': 2.73}




{'loss': 1.4732, 'learning_rate': 2e-05, 'epoch': 2.77}




{'loss': 1.2649, 'learning_rate': 2e-05, 'epoch': 2.81}




{'loss': 1.4642, 'learning_rate': 2e-05, 'epoch': 2.85}




{'loss': 1.309, 'learning_rate': 2e-05, 'epoch': 2.89}




{'loss': 1.4259, 'learning_rate': 2e-05, 'epoch': 2.93}




{'loss': 1.3305, 'learning_rate': 2e-05, 'epoch': 2.97}




{'loss': 1.3827, 'learning_rate': 2e-05, 'epoch': 3.01}




{'loss': 1.4461, 'learning_rate': 2e-05, 'epoch': 3.05}




{'loss': 1.285, 'learning_rate': 2e-05, 'epoch': 3.09}




{'loss': 1.4788, 'learning_rate': 2e-05, 'epoch': 3.13}




{'loss': 1.3986, 'learning_rate': 2e-05, 'epoch': 3.17}




{'loss': 1.4007, 'learning_rate': 2e-05, 'epoch': 3.21}




{'loss': 1.3126, 'learning_rate': 2e-05, 'epoch': 3.25}




{'loss': 1.3847, 'learning_rate': 2e-05, 'epoch': 3.29}




{'loss': 1.3071, 'learning_rate': 2e-05, 'epoch': 3.33}




{'loss': 1.4726, 'learning_rate': 2e-05, 'epoch': 3.37}




{'loss': 1.3398, 'learning_rate': 2e-05, 'epoch': 3.41}




{'loss': 1.3703, 'learning_rate': 2e-05, 'epoch': 3.45}




{'loss': 1.2987, 'learning_rate': 2e-05, 'epoch': 3.49}




{'loss': 1.4828, 'learning_rate': 2e-05, 'epoch': 3.53}




{'loss': 1.2915, 'learning_rate': 2e-05, 'epoch': 3.57}




{'loss': 1.4017, 'learning_rate': 2e-05, 'epoch': 3.61}




{'loss': 1.3499, 'learning_rate': 2e-05, 'epoch': 3.65}




{'loss': 1.4636, 'learning_rate': 2e-05, 'epoch': 3.69}




{'loss': 1.3336, 'learning_rate': 2e-05, 'epoch': 3.73}




{'loss': 1.4363, 'learning_rate': 2e-05, 'epoch': 3.77}




{'loss': 1.3094, 'learning_rate': 2e-05, 'epoch': 3.81}




{'loss': 1.4494, 'learning_rate': 2e-05, 'epoch': 3.85}




{'loss': 1.3037, 'learning_rate': 2e-05, 'epoch': 3.89}




{'loss': 1.4296, 'learning_rate': 2e-05, 'epoch': 3.93}




{'loss': 1.3087, 'learning_rate': 2e-05, 'epoch': 3.97}




{'train_runtime': 1347.4381, 'train_samples_per_second': 7.392, 'train_steps_per_second': 1.849, 'train_loss': 1.4564326540424966, 'epoch': 4.0}


In [102]:
# Reload model in FP16 and merge it with LoRA weights
model_lora_rte = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

model_lora_rte = PeftModel.from_pretrained(model_lora_rte, new_model)
model_lora_rte = model_lora_rte.merge_and_unload()
print(id(model_lora_rte))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

46922336938592


In [103]:
rte_dataset_validation=rte_dataset["validation"]
def concatenate_question_passage(example):
#     answer="True" if example["label"]==1 else "False"
    return {'combine': f"<PREMISE>{example['premise']} </PREMISE> <HYPOTHESIS> {example['hypothesis']} </HYPOTHESIS>"}

rte_dataset_validation = rte_dataset_validation.map(concatenate_question_passage)
print(rte_dataset_validation[0])


{'premise': 'Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation.', 'hypothesis': 'Christopher Reeve had an accident.', 'idx': 0, 'label': 1, 'combine': '<PREMISE>Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation. </PREMISE> <HYPOTHESIS> Christopher Reeve had an accident. </HYPOTHESIS>'}


In [107]:
correct = 0
total = len(rte_dataset_validation)
pipe = pipeline(task="text-generation", model=model_lora_rte, tokenizer=tokenizer, max_length=200)


for example in rte_dataset_validation:
    prompt=example['combine']
    result = pipe(prompt, max_length=1, num_return_sequences=1, do_sample=True)
#     print(result[0]['generated_text'])
    generated_answer = result[0]['generated_text'][-1]
#     print(generated_answer)
    if int(generated_answer) == example['label']:
        correct += 1

accuracy = correct / total
print(f"Accuracy: {accuracy}")



Accuracy: 0.7148014440433214


## WSC

In [41]:
wsc_dataset = load_dataset("super_glue", "wsc")

wsc_dataset_training=wsc_dataset["train"]
print(wsc_dataset_training)
wsc_dataset_validation=wsc_dataset["validation"]
def concatenate_question_passage(example):
#     answer="True" if example["label"]==1 else "False"
    return {'combine': f"<QUESTION>The SPAN1_TEXT <{example['span1_text']}> which index is SPAN1_INDEX <{example['span1_index']}> and SPAN2_TEXT <{example['span2_text']}> which index is SPAN2_INDEX <{example['span2_index']}> are provided for the same target in TEXT <{example['text']}>?</QUESTION>{example['label']}"}

wsc_dataset_training = wsc_dataset_training.map(concatenate_question_passage)
print(wsc_dataset_training[0]["combine"])

Dataset({
    features: ['text', 'span1_index', 'span2_index', 'span1_text', 'span2_text', 'idx', 'label'],
    num_rows: 554
})


Map:   0%|          | 0/554 [00:00<?, ? examples/s]

<QUESTION>The SPAN1_TEXT <Mark> which index is SPAN1_INDEX <0> and SPAN2_TEXT <He> which index is SPAN2_INDEX <13> are provided for the same target in TEXT <Mark told Pete many lies about himself, which Pete included in his book. He should have been more skeptical.>?</QUESTION>0


In [42]:
training_arguments = TrainingArguments(
    output_dir="./wsc_results",
    num_train_epochs=8,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    # checkpoint save step
    save_steps=250,
    logging_steps=25,
    # change the lr
    learning_rate=2e-5,
    weight_decay=0.001,
    fp16=False,
    #A100 True
    bf16=True,
    max_grad_norm=0.3,
 #   max_steps=max_steps,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)


trainer_wsc = SFTTrainer(
    model=model,
    train_dataset=wsc_dataset_training,
    peft_config=peft_config,
    dataset_text_field="combine",
#     dataset_text_field=["question", "passage"],
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Map:   0%|          | 0/554 [00:00<?, ? examples/s]

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [43]:
trainer_wsc.train()
new_model = "llama-2-7b-wsc"
trainer_wsc.model.save_pretrained(new_model)



Step,Training Loss
25,3.5138
50,3.0703
75,2.5412
100,1.886
125,1.5757
150,1.3493
175,1.1137
200,1.1195
225,1.143
250,0.9894




In [44]:
# Reload model in FP16 and merge it with LoRA weights
new_model = "llama-2-7b-wsc"
model_lora_wsc = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

model_lora_wsc = PeftModel.from_pretrained(model_lora_wsc, new_model)
model_lora_wsc = model_lora_wsc.merge_and_unload()
print(id(model_lora_wsc))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

46925531490528


In [45]:
wsc_dataset_validation=wsc_dataset["validation"]
def concatenate_question_passage(example):
#     answer="True" if example["label"]==1 else "False"
     return {'combine': f"<QUESTION>The SPAN1_TEXT <{example['span1_text']}> which index is SPAN1_INDEX <{example['span1_index']}> and SPAN2_TEXT <{example['span2_text']}> which index is SPAN2_INDEX <{example['span2_index']}> are provided for the same target in TEXT <{example['text']}>?</QUESTION>"}
wsc_dataset_validation = wsc_dataset_validation.map(concatenate_question_passage)
print(wsc_dataset_validation[0])

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

{'text': 'Bernard , who had not told the government official that he was less than 21 when he filed for a homestead claim, did not consider that he had done anything dishonest. Still, anyone who knew that he was 19 years old could take his claim away from him .', 'span1_index': 32, 'span2_index': 47, 'span1_text': 'anyone', 'span2_text': 'him', 'idx': 0, 'label': 0, 'combine': '<QUESTION>The SPAN1_TEXT <anyone> which index is SPAN1_INDEX <32> and SPAN2_TEXT <him> which index is SPAN2_INDEX <47> are provided for the same target in TEXT <Bernard , who had not told the government official that he was less than 21 when he filed for a homestead claim, did not consider that he had done anything dishonest. Still, anyone who knew that he was 19 years old could take his claim away from him .>?</QUESTION>'}


In [46]:
correct = 0
total = len(wsc_dataset_validation)
pipe = pipeline(task="text-generation", model=model_lora_wsc, tokenizer=tokenizer, max_length=200)

notNumber=[]


for example in wsc_dataset_validation:
    prompt=example['combine']
    result = pipe(prompt, max_length=1, num_return_sequences=1, do_sample=True)
#     print(result[0]['generated_text'])
    generated_answer = result[0]['generated_text'][-1]
#     print(generated_answer)
    if generated_answer=="0" or generated_answer=="1":
        if int(generated_answer) == example['label']:
            correct += 1
    else:
        notNumber.append(generated_answer)

accuracy = correct / total
print(f"Accuracy: {accuracy}")
print(notNumber)

Input length of input_ids is 127, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 125, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 95, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 108, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 83, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 80, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 78, but `max_length` is set to 1. This can lead to unexpected behavi

Input length of input_ids is 75, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 73, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 83, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 130, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 75, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 119, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 90, but `max_length` is set to 1. This can lead to unexpected behavio

Accuracy: 0.5
['2']
