<a href="https://colab.research.google.com/github/chrishayuk/opl-train/blob/main/Fine_Tune_OPL_in_Llama2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# # install dependencies

# # we use the latest version of transformers, peft, and accelerate
# !pip install -q accelerate peft transformers

# # install bitsandbytes for quantization
# !pip install -q bitsandbytes

# # install trl for the SFT library
# !pip install -q trl

# # we need sentencepiece for the llama2 slow tokenizer
# !pip install sentencepiece

# # we need einops, which is used by falcon-7b, llama-2, etc.
# # einops (Einstein operations) makes tensor operations more readable
# !pip install -q -U einops

# # we need to install datasets for our training dataset
# !pip install -q datasets


# ==================================================
# !pip install protobuf

# !pip install -q ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
# !jupyter nbextension enable --py widgetsnbextension

# pip install -q tensorboardX

## Settings
The following cell configures the settings used to fine-tune the model.

In [2]:
# --- Inputs (Hugging Face hub identifiers) ---
# Base checkpoint that gets fine-tuned
model_name = "NousResearch/Llama-2-7b-chat-hf"
# Instruction dataset used for training
# TODO: swap in the real dataset before the final fine-tuning run
dataset_name = "isma77777/data-version2"

# --- Training schedule ---
# How many full passes over the training data to run
num_train_epochs = 5

# --- Outputs ---
# Directory that receives model predictions and checkpoints
output_dir = "./results"
# Name given to the fine-tuned model
new_model = "llama-2-7b-test"

## Download the base model
The following will download the base model, in this case the llama-2-7b-chat-hf model.

In [3]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
    logging,
)

# Quantization settings for loading the base model: 4-bit NF4 weights,
# float16 compute dtype, double quantization switched off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load the base model from the hub with the quantization settings above
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # use the gpu: map the whole model to device 0
    device_map={"": 0},
)

# don't use the cache while training
model.config.use_cache = False

# NOTE(review): presumably selects the plain (non tensor-parallel) code path
# of the Llama implementation -- confirm against the transformers docs.
model.config.pretraining_tp = 1

# Load the slow tokenizer that belongs to the base model (llama2).
# trust_remote_code is deliberately NOT enabled: the model load above does not
# enable it either, and turning it on would allow Python code shipped in the
# hub repository to run locally.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Reuse the end-of-sequence token for padding, and pad on the right
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



# Train the Model
The following section is about taking your dataset and then finetuning the model

## Load Dataset
The following code loads your dataset so that the model can be fine-tuned on it.

In [4]:
from datasets import load_dataset

# Fetch the "train" split of the dataset named in the settings cell
dataset = load_dataset(path=dataset_name, split="train")

## Fine Tune the Model
The following section will take your dataset, and fine tune the model with it.

In [5]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# LoRA adapter hyperparameters handed to the trainer
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,               # rank of the low-rank update matrices
    lora_alpha=16,      # LoRA scaling factor
    lora_dropout=0.1,   # dropout applied inside the LoRA layers
    bias="none",        # bias terms are not trained
)

# Hyperparameters for the training loop, grouped by concern
training_arguments = TrainingArguments(
    # -- output, checkpointing and logging --
    output_dir=output_dir,
    save_steps=25,                      # a checkpoint is written every 25 steps
    logging_steps=2,                    # metrics are logged every 2 steps
    report_to="tensorboard",

    # -- run length --
    num_train_epochs=num_train_epochs,  # epoch count comes from the settings cell
    max_steps=-1,                       # must stay -1, otherwise it overrides the epochs

    # -- batching --
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,      # gradients from 2 batches are accumulated per update
    group_by_length=True,               # batch samples of similar length together

    # -- optimizer and learning-rate schedule --
    optim="paged_adamw_32bit",          # paged 32-bit AdamW
    learning_rate=2e-4,
    lr_scheduler_type="cosine",         # cosine learning-rate schedule
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,                  # gradient clipping threshold

    # -- precision --
    # Both mixed-precision modes are off; enable at most one of them on
    # hardware that supports it.
    fp16=False,
    bf16=False,
)

# Supervised fine-tuning trainer wiring together model, data and configuration
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,                # the llama tokenizer loaded earlier
    train_dataset=dataset,
    dataset_text_field="text",          # dataset column holding the training text
    peft_config=peft_config,            # the LoRA configuration defined above
    args=training_arguments,
    # NOTE(review): None leaves the limit to SFTTrainer's own default, which
    # is presumably a finite cap rather than "unlimited" -- confirm for the
    # installed TRL version.
    max_seq_length=None,
    packing=False,                      # examples are not packed together
)

# Run the fine-tuning
trainer.train()

# Persist the trained model under the configured name
trainer.model.save_pretrained(new_model)



  0%|          | 0/9695 [00:00<?, ?it/s]

{'loss': 1.173, 'grad_norm': 0.11466719955205917, 'learning_rate': 1.3745704467353952e-06, 'epoch': 0.0}
{'loss': 1.2677, 'grad_norm': 0.08691975474357605, 'learning_rate': 2.7491408934707903e-06, 'epoch': 0.0}
{'loss': 1.4166, 'grad_norm': 0.10017478466033936, 'learning_rate': 4.123711340206186e-06, 'epoch': 0.0}
{'loss': 1.2465, 'grad_norm': 0.1125314012169838, 'learning_rate': 5.498281786941581e-06, 'epoch': 0.0}
{'loss': 1.1107, 'grad_norm': 0.08278153091669083, 'learning_rate': 6.872852233676977e-06, 'epoch': 0.01}
{'loss': 1.2682, 'grad_norm': 0.07678315788507462, 'learning_rate': 8.247422680412371e-06, 'epoch': 0.01}
{'loss': 1.5926, 'grad_norm': 0.1144038662314415, 'learning_rate': 9.621993127147768e-06, 'epoch': 0.01}
{'loss': 1.3504, 'grad_norm': 0.11382138729095459, 'learning_rate': 1.0996563573883161e-05, 'epoch': 0.01}
{'loss': 1.2404, 'grad_norm': 0.1098538264632225, 'learning_rate': 1.2371134020618558e-05, 'epoch': 0.01}
{'loss': 1.5416, 'grad_norm': 0.12706245481967926,

Checkpoint destination directory ./results/checkpoint-25 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.6752, 'grad_norm': 0.1406734734773636, 'learning_rate': 1.7869415807560138e-05, 'epoch': 0.01}
{'loss': 1.6335, 'grad_norm': 0.15578381717205048, 'learning_rate': 1.9243986254295536e-05, 'epoch': 0.01}
{'loss': 1.8391, 'grad_norm': 0.19295527040958405, 'learning_rate': 2.0618556701030927e-05, 'epoch': 0.02}
{'loss': 1.9789, 'grad_norm': 0.2170020490884781, 'learning_rate': 2.1993127147766322e-05, 'epoch': 0.02}
{'loss': 1.9645, 'grad_norm': 0.24052855372428894, 'learning_rate': 2.336769759450172e-05, 'epoch': 0.02}
{'loss': 1.9646, 'grad_norm': 0.24929283559322357, 'learning_rate': 2.4742268041237116e-05, 'epoch': 0.02}
{'loss': 2.23, 'grad_norm': 0.30085790157318115, 'learning_rate': 2.611683848797251e-05, 'epoch': 0.02}
{'loss': 2.2003, 'grad_norm': 0.34330621361732483, 'learning_rate': 2.749140893470791e-05, 'epoch': 0.02}
{'loss': 2.7785, 'grad_norm': 0.3460121750831604, 'learning_rate': 2.8865979381443297e-05, 'epoch': 0.02}
{'loss': 2.7345, 'grad_norm': 0.4377850890159

Checkpoint destination directory ./results/checkpoint-50 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 3.4169, 'grad_norm': 0.9983898401260376, 'learning_rate': 3.4364261168384884e-05, 'epoch': 0.03}
{'loss': 1.0694, 'grad_norm': 0.09618440270423889, 'learning_rate': 3.5738831615120275e-05, 'epoch': 0.03}
{'loss': 1.3535, 'grad_norm': 0.14627259969711304, 'learning_rate': 3.7113402061855674e-05, 'epoch': 0.03}
{'loss': 1.4979, 'grad_norm': 0.14732496440410614, 'learning_rate': 3.848797250859107e-05, 'epoch': 0.03}
{'loss': 1.0029, 'grad_norm': 0.15293966233730316, 'learning_rate': 3.9862542955326463e-05, 'epoch': 0.03}
{'loss': 1.1929, 'grad_norm': 0.11002285778522491, 'learning_rate': 4.1237113402061855e-05, 'epoch': 0.03}
{'loss': 1.2443, 'grad_norm': 0.1404205560684204, 'learning_rate': 4.261168384879725e-05, 'epoch': 0.03}
{'loss': 1.3178, 'grad_norm': 0.13668106496334076, 'learning_rate': 4.3986254295532645e-05, 'epoch': 0.03}
{'loss': 1.0284, 'grad_norm': 0.12038912624120712, 'learning_rate': 4.536082474226804e-05, 'epoch': 0.03}
{'loss': 1.3059, 'grad_norm': 0.1484517306

Checkpoint destination directory ./results/checkpoint-75 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.4727, 'grad_norm': 0.19700419902801514, 'learning_rate': 5.223367697594502e-05, 'epoch': 0.04}
{'loss': 1.3917, 'grad_norm': 0.19336706399917603, 'learning_rate': 5.360824742268041e-05, 'epoch': 0.04}
{'loss': 1.1488, 'grad_norm': 0.16991639137268066, 'learning_rate': 5.498281786941582e-05, 'epoch': 0.04}
{'loss': 1.1904, 'grad_norm': 0.19337327778339386, 'learning_rate': 5.63573883161512e-05, 'epoch': 0.04}
{'loss': 1.5009, 'grad_norm': 0.21982167661190033, 'learning_rate': 5.7731958762886594e-05, 'epoch': 0.04}
{'loss': 1.4658, 'grad_norm': 0.22590377926826477, 'learning_rate': 5.9106529209622e-05, 'epoch': 0.04}
{'loss': 1.5182, 'grad_norm': 0.25576135516166687, 'learning_rate': 6.0481099656357384e-05, 'epoch': 0.05}
{'loss': 1.4668, 'grad_norm': 0.3114008605480194, 'learning_rate': 6.185567010309279e-05, 'epoch': 0.05}
{'loss': 1.4207, 'grad_norm': 0.2811540961265564, 'learning_rate': 6.323024054982817e-05, 'epoch': 0.05}
{'loss': 1.4642, 'grad_norm': 0.2920514941215515,

Checkpoint destination directory ./results/checkpoint-100 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.44, 'grad_norm': 0.8122512102127075, 'learning_rate': 6.872852233676977e-05, 'epoch': 0.05}
{'loss': 0.8482, 'grad_norm': 0.1502983570098877, 'learning_rate': 7.010309278350515e-05, 'epoch': 0.05}
{'loss': 1.0659, 'grad_norm': 0.14991948008537292, 'learning_rate': 7.147766323024055e-05, 'epoch': 0.05}
{'loss': 1.0612, 'grad_norm': 0.1758836954832077, 'learning_rate': 7.285223367697595e-05, 'epoch': 0.05}
{'loss': 1.132, 'grad_norm': 0.18518078327178955, 'learning_rate': 7.422680412371135e-05, 'epoch': 0.06}
{'loss': 0.9701, 'grad_norm': 0.17479142546653748, 'learning_rate': 7.560137457044673e-05, 'epoch': 0.06}
{'loss': 1.0233, 'grad_norm': 0.17168481647968292, 'learning_rate': 7.697594501718214e-05, 'epoch': 0.06}
{'loss': 1.0285, 'grad_norm': 0.15857139229774475, 'learning_rate': 7.835051546391753e-05, 'epoch': 0.06}
{'loss': 0.9182, 'grad_norm': 0.2063567191362381, 'learning_rate': 7.972508591065293e-05, 'epoch': 0.06}
{'loss': 0.8106, 'grad_norm': 0.17404617369174957, 'l

Checkpoint destination directory ./results/checkpoint-125 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.991, 'grad_norm': 0.22029311954975128, 'learning_rate': 8.65979381443299e-05, 'epoch': 0.06}
{'loss': 1.079, 'grad_norm': 0.20873276889324188, 'learning_rate': 8.797250859106529e-05, 'epoch': 0.07}
{'loss': 1.2397, 'grad_norm': 0.25804221630096436, 'learning_rate': 8.93470790378007e-05, 'epoch': 0.07}
{'loss': 1.3461, 'grad_norm': 0.3097119629383087, 'learning_rate': 9.072164948453609e-05, 'epoch': 0.07}
{'loss': 0.9804, 'grad_norm': 0.2599831223487854, 'learning_rate': 9.209621993127147e-05, 'epoch': 0.07}
{'loss': 1.1332, 'grad_norm': 0.4296266436576843, 'learning_rate': 9.347079037800688e-05, 'epoch': 0.07}
{'loss': 1.1026, 'grad_norm': 0.3787997364997864, 'learning_rate': 9.484536082474227e-05, 'epoch': 0.07}
{'loss': 1.1479, 'grad_norm': 0.48667794466018677, 'learning_rate': 9.621993127147767e-05, 'epoch': 0.07}
{'loss': 1.323, 'grad_norm': 0.5714349746704102, 'learning_rate': 9.759450171821306e-05, 'epoch': 0.07}
{'loss': 1.232, 'grad_norm': 0.5201795101165771, 'learni

Checkpoint destination directory ./results/checkpoint-150 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.6994, 'grad_norm': 1.0638090372085571, 'learning_rate': 0.00010309278350515463, 'epoch': 0.08}
{'loss': 1.1145, 'grad_norm': 0.2652125358581543, 'learning_rate': 0.00010446735395189004, 'epoch': 0.08}
{'loss': 1.1136, 'grad_norm': 0.27511534094810486, 'learning_rate': 0.00010584192439862544, 'epoch': 0.08}
{'loss': 0.8731, 'grad_norm': 0.21733634173870087, 'learning_rate': 0.00010721649484536083, 'epoch': 0.08}
{'loss': 0.7098, 'grad_norm': 0.2989443242549896, 'learning_rate': 0.00010859106529209621, 'epoch': 0.08}
{'loss': 1.1813, 'grad_norm': 0.18196628987789154, 'learning_rate': 0.00010996563573883164, 'epoch': 0.08}
{'loss': 0.9856, 'grad_norm': 0.21002943813800812, 'learning_rate': 0.00011134020618556702, 'epoch': 0.08}
{'loss': 0.9763, 'grad_norm': 0.1911860555410385, 'learning_rate': 0.0001127147766323024, 'epoch': 0.08}
{'loss': 0.8897, 'grad_norm': 0.167632594704628, 'learning_rate': 0.0001140893470790378, 'epoch': 0.09}
{'loss': 0.8627, 'grad_norm': 0.1943248659372

Checkpoint destination directory ./results/checkpoint-175 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.9409, 'grad_norm': 0.3193104565143585, 'learning_rate': 0.00012096219931271477, 'epoch': 0.09}
{'loss': 0.8404, 'grad_norm': 0.2683906555175781, 'learning_rate': 0.00012233676975945018, 'epoch': 0.09}
{'loss': 1.0525, 'grad_norm': 0.34589919447898865, 'learning_rate': 0.00012371134020618558, 'epoch': 0.09}
{'loss': 1.0758, 'grad_norm': 0.37419867515563965, 'learning_rate': 0.00012508591065292098, 'epoch': 0.09}
{'loss': 1.0359, 'grad_norm': 0.44883793592453003, 'learning_rate': 0.00012646048109965635, 'epoch': 0.09}
{'loss': 1.0979, 'grad_norm': 0.5287548303604126, 'learning_rate': 0.00012783505154639175, 'epoch': 0.1}
{'loss': 0.9798, 'grad_norm': 0.45727911591529846, 'learning_rate': 0.00012920962199312717, 'epoch': 0.1}
{'loss': 1.0377, 'grad_norm': 0.47985732555389404, 'learning_rate': 0.00013058419243986254, 'epoch': 0.1}
{'loss': 1.2393, 'grad_norm': 0.7222021222114563, 'learning_rate': 0.00013195876288659794, 'epoch': 0.1}
{'loss': 1.0506, 'grad_norm': 0.6314517855644

Checkpoint destination directory ./results/checkpoint-200 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.7199, 'grad_norm': 1.1666303873062134, 'learning_rate': 0.00013745704467353953, 'epoch': 0.1}
{'loss': 0.6948, 'grad_norm': 0.22928482294082642, 'learning_rate': 0.0001388316151202749, 'epoch': 0.1}
{'loss': 0.7672, 'grad_norm': 0.2312883734703064, 'learning_rate': 0.0001402061855670103, 'epoch': 0.11}
{'loss': 0.9396, 'grad_norm': 0.23767895996570587, 'learning_rate': 0.00014158075601374573, 'epoch': 0.11}
{'loss': 0.8203, 'grad_norm': 0.2235749214887619, 'learning_rate': 0.0001429553264604811, 'epoch': 0.11}
{'loss': 1.1191, 'grad_norm': 0.17977482080459595, 'learning_rate': 0.0001443298969072165, 'epoch': 0.11}
{'loss': 1.0041, 'grad_norm': 0.21287758648395538, 'learning_rate': 0.0001457044673539519, 'epoch': 0.11}
{'loss': 0.9703, 'grad_norm': 0.18468086421489716, 'learning_rate': 0.0001470790378006873, 'epoch': 0.11}
{'loss': 0.7507, 'grad_norm': 0.2251548171043396, 'learning_rate': 0.0001484536082474227, 'epoch': 0.11}
{'loss': 0.9863, 'grad_norm': 0.1676843911409378, 

Checkpoint destination directory ./results/checkpoint-225 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.7371, 'grad_norm': 0.2989744544029236, 'learning_rate': 0.00015532646048109966, 'epoch': 0.12}
{'loss': 0.6939, 'grad_norm': 0.29985272884368896, 'learning_rate': 0.00015670103092783506, 'epoch': 0.12}
{'loss': 0.9375, 'grad_norm': 0.3646528422832489, 'learning_rate': 0.00015807560137457046, 'epoch': 0.12}
{'loss': 0.7979, 'grad_norm': 0.336564838886261, 'learning_rate': 0.00015945017182130585, 'epoch': 0.12}
{'loss': 0.8322, 'grad_norm': 0.7817770838737488, 'learning_rate': 0.00016082474226804125, 'epoch': 0.12}
{'loss': 0.9045, 'grad_norm': 0.4273616671562195, 'learning_rate': 0.00016219931271477665, 'epoch': 0.12}
{'loss': 0.8657, 'grad_norm': 0.43802377581596375, 'learning_rate': 0.00016357388316151202, 'epoch': 0.12}
{'loss': 1.1381, 'grad_norm': 0.4674416184425354, 'learning_rate': 0.00016494845360824742, 'epoch': 0.12}
{'loss': 0.9917, 'grad_norm': 0.5373005270957947, 'learning_rate': 0.00016632302405498285, 'epoch': 0.12}
{'loss': 1.0672, 'grad_norm': 0.5026329755783

Checkpoint destination directory ./results/checkpoint-250 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.3449, 'grad_norm': 0.7998446822166443, 'learning_rate': 0.00017182130584192438, 'epoch': 0.13}
{'loss': 0.8382, 'grad_norm': 0.2525317668914795, 'learning_rate': 0.0001731958762886598, 'epoch': 0.13}
{'loss': 0.743, 'grad_norm': 0.26074525713920593, 'learning_rate': 0.0001745704467353952, 'epoch': 0.13}
{'loss': 0.7305, 'grad_norm': 0.26911497116088867, 'learning_rate': 0.00017594501718213058, 'epoch': 0.13}
{'loss': 0.9221, 'grad_norm': 0.24538946151733398, 'learning_rate': 0.00017731958762886598, 'epoch': 0.13}
{'loss': 0.7849, 'grad_norm': 0.23610828816890717, 'learning_rate': 0.0001786941580756014, 'epoch': 0.13}
{'loss': 0.8384, 'grad_norm': 0.3334355056285858, 'learning_rate': 0.00018006872852233677, 'epoch': 0.14}
{'loss': 0.7649, 'grad_norm': 0.19942304491996765, 'learning_rate': 0.00018144329896907217, 'epoch': 0.14}
{'loss': 0.8322, 'grad_norm': 0.23526477813720703, 'learning_rate': 0.00018281786941580757, 'epoch': 0.14}
{'loss': 0.8034, 'grad_norm': 0.263734757900

Checkpoint destination directory ./results/checkpoint-275 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.9014, 'grad_norm': 0.35396578907966614, 'learning_rate': 0.00018969072164948454, 'epoch': 0.14}
{'loss': 0.8473, 'grad_norm': 0.34337249398231506, 'learning_rate': 0.00019106529209621996, 'epoch': 0.14}
{'loss': 0.8162, 'grad_norm': 0.35923126339912415, 'learning_rate': 0.00019243986254295533, 'epoch': 0.14}
{'loss': 0.7957, 'grad_norm': 0.33741119503974915, 'learning_rate': 0.00019381443298969073, 'epoch': 0.15}
{'loss': 0.7928, 'grad_norm': 0.5285106897354126, 'learning_rate': 0.00019518900343642613, 'epoch': 0.15}
{'loss': 0.8639, 'grad_norm': 0.4216401278972626, 'learning_rate': 0.0001965635738831615, 'epoch': 0.15}
{'loss': 0.7289, 'grad_norm': 0.38201335072517395, 'learning_rate': 0.00019793814432989693, 'epoch': 0.15}
{'loss': 1.1302, 'grad_norm': 0.5160490274429321, 'learning_rate': 0.00019931271477663232, 'epoch': 0.15}
{'loss': 1.0564, 'grad_norm': 0.4803110361099243, 'learning_rate': 0.0001999999944198676, 'epoch': 0.15}
{'loss': 1.0195, 'grad_norm': 0.49974167346

Checkpoint destination directory ./results/checkpoint-300 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.0177, 'grad_norm': 0.5465145111083984, 'learning_rate': 0.00019999954800961064, 'epoch': 0.15}
{'loss': 0.9379, 'grad_norm': 0.27525556087493896, 'learning_rate': 0.00019999932480473128, 'epoch': 0.16}
{'loss': 0.8047, 'grad_norm': 0.20949940383434296, 'learning_rate': 0.00019999905695909526, 'epoch': 0.16}
{'loss': 0.7513, 'grad_norm': 0.22965282201766968, 'learning_rate': 0.00019999874447282213, 'epoch': 0.16}
{'loss': 0.8627, 'grad_norm': 0.2392594963312149, 'learning_rate': 0.0001999983873460514, 'epoch': 0.16}
{'loss': 0.7669, 'grad_norm': 0.2206195592880249, 'learning_rate': 0.0001999979855789425, 'epoch': 0.16}
{'loss': 0.6318, 'grad_norm': 0.2191508561372757, 'learning_rate': 0.0001999975391716748, 'epoch': 0.16}
{'loss': 0.6586, 'grad_norm': 0.18339410424232483, 'learning_rate': 0.00019999704812444753, 'epoch': 0.16}
{'loss': 0.6874, 'grad_norm': 0.23970453441143036, 'learning_rate': 0.00019999651243747996, 'epoch': 0.16}
{'loss': 0.777, 'grad_norm': 0.2588314116001

Checkpoint destination directory ./results/checkpoint-325 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.8093, 'grad_norm': 0.2665958106517792, 'learning_rate': 0.00019999316441560435, 'epoch': 0.17}
{'loss': 0.561, 'grad_norm': 0.30525660514831543, 'learning_rate': 0.0001999923608959141, 'epoch': 0.17}
{'loss': 0.7042, 'grad_norm': 0.30260923504829407, 'learning_rate': 0.00019999151273857597, 'epoch': 0.17}
{'loss': 0.858, 'grad_norm': 0.3911426067352295, 'learning_rate': 0.00019999061994396854, 'epoch': 0.17}
{'loss': 0.7705, 'grad_norm': 0.3490581810474396, 'learning_rate': 0.00019998968251249043, 'epoch': 0.17}
{'loss': 0.6966, 'grad_norm': 0.32266321778297424, 'learning_rate': 0.00019998870044456006, 'epoch': 0.17}
{'loss': 0.8942, 'grad_norm': 0.4339015781879425, 'learning_rate': 0.00019998767374061584, 'epoch': 0.17}
{'loss': 1.0675, 'grad_norm': 0.5309640765190125, 'learning_rate': 0.00019998660240111614, 'epoch': 0.18}
{'loss': 0.9424, 'grad_norm': 0.5678768157958984, 'learning_rate': 0.00019998548642653917, 'epoch': 0.18}
{'loss': 0.9421, 'grad_norm': 0.67437416315078

Checkpoint destination directory ./results/checkpoint-350 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.3191, 'grad_norm': 1.0552988052368164, 'learning_rate': 0.00019998057618772136, 'epoch': 0.18}
{'loss': 0.9463, 'grad_norm': 0.2942209839820862, 'learning_rate': 0.0001999792370456294, 'epoch': 0.18}
{'loss': 0.9505, 'grad_norm': 0.3492533266544342, 'learning_rate': 0.00019997785327174818, 'epoch': 0.18}
{'loss': 0.9408, 'grad_norm': 0.25641068816185, 'learning_rate': 0.0001999764248666954, 'epoch': 0.18}
{'loss': 0.8601, 'grad_norm': 0.22711396217346191, 'learning_rate': 0.00019997495183110876, 'epoch': 0.18}
{'loss': 0.776, 'grad_norm': 0.33447879552841187, 'learning_rate': 0.00019997343416564578, 'epoch': 0.19}
{'loss': 0.7431, 'grad_norm': 0.22488555312156677, 'learning_rate': 0.00019997187187098398, 'epoch': 0.19}
{'loss': 0.8907, 'grad_norm': 0.20901921391487122, 'learning_rate': 0.00019997026494782084, 'epoch': 0.19}
{'loss': 0.6263, 'grad_norm': 0.2221997231245041, 'learning_rate': 0.00019996861339687363, 'epoch': 0.19}
{'loss': 0.7366, 'grad_norm': 0.220238998532295

Checkpoint destination directory ./results/checkpoint-375 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.6238, 'grad_norm': 0.2378620058298111, 'learning_rate': 0.0001999596862518788, 'epoch': 0.19}
{'loss': 0.7266, 'grad_norm': 0.28363966941833496, 'learning_rate': 0.0001999577669504073, 'epoch': 0.19}
{'loss': 0.7859, 'grad_norm': 0.3450051546096802, 'learning_rate': 0.000199955803026731, 'epoch': 0.2}
{'loss': 0.7698, 'grad_norm': 0.28946027159690857, 'learning_rate': 0.00019995379448172664, 'epoch': 0.2}
{'loss': 0.7145, 'grad_norm': 0.32047542929649353, 'learning_rate': 0.00019995174131629082, 'epoch': 0.2}
{'loss': 0.9278, 'grad_norm': 0.3410698175430298, 'learning_rate': 0.0001999496435313401, 'epoch': 0.2}
{'loss': 0.9616, 'grad_norm': 0.3873465061187744, 'learning_rate': 0.000199947501127811, 'epoch': 0.2}
{'loss': 0.9273, 'grad_norm': 0.3142256438732147, 'learning_rate': 0.00019994531410665986, 'epoch': 0.2}
{'loss': 0.7737, 'grad_norm': 0.388705313205719, 'learning_rate': 0.00019994308246886296, 'epoch': 0.2}
{'loss': 1.2378, 'grad_norm': 0.5116364359855652, 'learnin

Checkpoint destination directory ./results/checkpoint-400 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.0584, 'grad_norm': 0.8152020573616028, 'learning_rate': 0.00019993370977144164, 'epoch': 0.21}
{'loss': 0.9509, 'grad_norm': 0.25121060013771057, 'learning_rate': 0.0001999312550657579, 'epoch': 0.21}
{'loss': 0.7894, 'grad_norm': 0.26799511909484863, 'learning_rate': 0.00019992875574970459, 'epoch': 0.21}
{'loss': 0.7942, 'grad_norm': 0.22338934242725372, 'learning_rate': 0.00019992621182439736, 'epoch': 0.21}
{'loss': 0.819, 'grad_norm': 0.29835212230682373, 'learning_rate': 0.00019992362329097188, 'epoch': 0.21}
{'loss': 0.8158, 'grad_norm': 0.2431223839521408, 'learning_rate': 0.00019992099015058373, 'epoch': 0.21}
{'loss': 0.8964, 'grad_norm': 0.25140488147735596, 'learning_rate': 0.0001999183124044083, 'epoch': 0.21}
{'loss': 0.7619, 'grad_norm': 0.2865869104862213, 'learning_rate': 0.000199915590053641, 'epoch': 0.21}
{'loss': 0.803, 'grad_norm': 0.20169083774089813, 'learning_rate': 0.0001999128230994971, 'epoch': 0.21}
{'loss': 0.8226, 'grad_norm': 0.215828031301498

Checkpoint destination directory ./results/checkpoint-425 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.6905, 'grad_norm': 0.26288360357284546, 'learning_rate': 0.00019989831932205773, 'epoch': 0.22}
{'loss': 0.6447, 'grad_norm': 0.23246289789676666, 'learning_rate': 0.00019989528477429037, 'epoch': 0.22}
{'loss': 0.6311, 'grad_norm': 0.2272995561361313, 'learning_rate': 0.00019989220563221087, 'epoch': 0.22}
{'loss': 0.8985, 'grad_norm': 0.3820868730545044, 'learning_rate': 0.00019988908189719386, 'epoch': 0.22}
{'loss': 0.6825, 'grad_norm': 0.283962607383728, 'learning_rate': 0.00019988591357063382, 'epoch': 0.22}
{'loss': 0.7367, 'grad_norm': 0.3770027458667755, 'learning_rate': 0.00019988270065394504, 'epoch': 0.22}
{'loss': 0.7671, 'grad_norm': 0.42556288838386536, 'learning_rate': 0.0001998794431485619, 'epoch': 0.23}
{'loss': 0.4965, 'grad_norm': 0.34724414348602295, 'learning_rate': 0.0001998761410559385, 'epoch': 0.23}
{'loss': 0.846, 'grad_norm': 0.4331812560558319, 'learning_rate': 0.000199872794377549, 'epoch': 0.23}
{'loss': 1.087, 'grad_norm': 0.5173531770706177,

Checkpoint destination directory ./results/checkpoint-450 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.0843, 'grad_norm': 0.7579184174537659, 'learning_rate': 0.00019985896183650795, 'epoch': 0.23}
{'loss': 0.7916, 'grad_norm': 0.291567325592041, 'learning_rate': 0.0001998553922520957, 'epoch': 0.23}
{'loss': 0.8634, 'grad_norm': 0.2863255441188812, 'learning_rate': 0.00019985177809117978, 'epoch': 0.23}
{'loss': 0.9204, 'grad_norm': 0.18022820353507996, 'learning_rate': 0.00019984811935537362, 'epoch': 0.24}
{'loss': 0.7597, 'grad_norm': 0.28790196776390076, 'learning_rate': 0.00019984441604631051, 'epoch': 0.24}
{'loss': 0.676, 'grad_norm': 0.18084613978862762, 'learning_rate': 0.00019984066816564367, 'epoch': 0.24}
{'loss': 0.6027, 'grad_norm': 0.21294420957565308, 'learning_rate': 0.00019983687571504613, 'epoch': 0.24}
{'loss': 0.6845, 'grad_norm': 0.21172545850276947, 'learning_rate': 0.00019983303869621099, 'epoch': 0.24}
{'loss': 0.7351, 'grad_norm': 0.21259805560112, 'learning_rate': 0.000199829157110851, 'epoch': 0.24}
{'loss': 0.5881, 'grad_norm': 0.264635294675827,

Checkpoint destination directory ./results/checkpoint-475 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.6875, 'grad_norm': 0.25819262862205505, 'learning_rate': 0.00019980908074752294, 'epoch': 0.25}
{'loss': 0.7283, 'grad_norm': 0.29079437255859375, 'learning_rate': 0.00019980493180009893, 'epoch': 0.25}
{'loss': 0.6165, 'grad_norm': 0.2976260483264923, 'learning_rate': 0.00019980073829869733, 'epoch': 0.25}
{'loss': 0.7672, 'grad_norm': 0.2764320373535156, 'learning_rate': 0.00019979650024519016, 'epoch': 0.25}
{'loss': 0.7102, 'grad_norm': 0.31462156772613525, 'learning_rate': 0.0001997922176414694, 'epoch': 0.25}
{'loss': 0.5482, 'grad_norm': 0.39489394426345825, 'learning_rate': 0.0001997878904894468, 'epoch': 0.25}
{'loss': 0.786, 'grad_norm': 0.41061756014823914, 'learning_rate': 0.00019978351879105404, 'epoch': 0.25}
{'loss': 0.821, 'grad_norm': 0.5291564464569092, 'learning_rate': 0.00019977910254824274, 'epoch': 0.25}
{'loss': 0.9303, 'grad_norm': 0.5238330960273743, 'learning_rate': 0.00019977464176298426, 'epoch': 0.25}
{'loss': 1.0117, 'grad_norm': 0.5014023780822

Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.174, 'grad_norm': 0.9685277342796326, 'learning_rate': 0.00019975635323760439, 'epoch': 0.26}
{'loss': 0.8587, 'grad_norm': 0.246406689286232, 'learning_rate': 0.00019975166977037812, 'epoch': 0.26}
{'loss': 0.6398, 'grad_norm': 0.3683134913444519, 'learning_rate': 0.00019974694177295104, 'epoch': 0.26}
{'loss': 0.8268, 'grad_norm': 0.2462824136018753, 'learning_rate': 0.0001997421692474337, 'epoch': 0.26}
{'loss': 0.7389, 'grad_norm': 0.2795371115207672, 'learning_rate': 0.00019973735219595674, 'epoch': 0.26}
{'loss': 0.661, 'grad_norm': 0.21210704743862152, 'learning_rate': 0.00019973249062067038, 'epoch': 0.26}
{'loss': 0.689, 'grad_norm': 0.2309054583311081, 'learning_rate': 0.000199727584523745, 'epoch': 0.26}
{'loss': 0.6684, 'grad_norm': 0.2304580956697464, 'learning_rate': 0.00019972263390737068, 'epoch': 0.27}
{'loss': 0.7722, 'grad_norm': 0.2735075056552887, 'learning_rate': 0.00019971763877375747, 'epoch': 0.27}
{'loss': 0.6155, 'grad_norm': 0.21209587156772614, '

Checkpoint destination directory ./results/checkpoint-525 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.5734, 'grad_norm': 0.2730228006839752, 'learning_rate': 0.00019969199542584884, 'epoch': 0.27}
{'loss': 0.5993, 'grad_norm': 0.2814737856388092, 'learning_rate': 0.00019968673323632506, 'epoch': 0.27}
{'loss': 0.6805, 'grad_norm': 0.312138170003891, 'learning_rate': 0.00019968142654558884, 'epoch': 0.27}
{'loss': 0.7341, 'grad_norm': 0.38791245222091675, 'learning_rate': 0.00019967607535600906, 'epoch': 0.27}
{'loss': 0.7568, 'grad_norm': 0.35458889603614807, 'learning_rate': 0.00019967067966997462, 'epoch': 0.28}
{'loss': 0.7586, 'grad_norm': 0.2977774739265442, 'learning_rate': 0.00019966523948989416, 'epoch': 0.28}
{'loss': 0.9183, 'grad_norm': 0.4709359109401703, 'learning_rate': 0.00019965975481819626, 'epoch': 0.28}
{'loss': 0.9431, 'grad_norm': 0.4570365846157074, 'learning_rate': 0.0001996542256573293, 'epoch': 0.28}
{'loss': 0.9833, 'grad_norm': 0.6421048045158386, 'learning_rate': 0.00019964865200976166, 'epoch': 0.28}
{'loss': 0.9809, 'grad_norm': 0.61764383316040

Checkpoint destination directory ./results/checkpoint-550 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.8578, 'grad_norm': 0.5741883516311646, 'learning_rate': 0.00019962591260254398, 'epoch': 0.28}
{'loss': 0.6672, 'grad_norm': 0.25659117102622986, 'learning_rate': 0.00019962011655919173, 'epoch': 0.28}
{'loss': 0.6857, 'grad_norm': 0.2761494219303131, 'learning_rate': 0.00019961427604436538, 'epoch': 0.29}
{'loss': 0.797, 'grad_norm': 0.22160370647907257, 'learning_rate': 0.00019960839106067217, 'epoch': 0.29}
{'loss': 0.8526, 'grad_norm': 0.22558589279651642, 'learning_rate': 0.0001996024616107393, 'epoch': 0.29}
{'loss': 0.515, 'grad_norm': 0.19389371573925018, 'learning_rate': 0.0001995964876972137, 'epoch': 0.29}
{'loss': 0.7837, 'grad_norm': 0.21002008020877838, 'learning_rate': 0.00019959046932276216, 'epoch': 0.29}
{'loss': 0.9045, 'grad_norm': 0.24694450199604034, 'learning_rate': 0.00019958440649007139, 'epoch': 0.29}
{'loss': 0.988, 'grad_norm': 0.11729037016630173, 'learning_rate': 0.00019957829920184785, 'epoch': 0.29}
{'loss': 0.7801, 'grad_norm': 0.281300246715

Checkpoint destination directory ./results/checkpoint-575 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.7649, 'grad_norm': 0.27293357253074646, 'learning_rate': 0.00019954709602385622, 'epoch': 0.3}
{'loss': 0.7356, 'grad_norm': 0.3432512879371643, 'learning_rate': 0.0001995407220603843, 'epoch': 0.3}
{'loss': 0.7325, 'grad_norm': 0.28457921743392944, 'learning_rate': 0.00019953430366088083, 'epoch': 0.3}
{'loss': 0.7172, 'grad_norm': 0.30260834097862244, 'learning_rate': 0.00019952784082821108, 'epoch': 0.3}
{'loss': 0.6462, 'grad_norm': 0.32527509331703186, 'learning_rate': 0.00019952133356526005, 'epoch': 0.3}
{'loss': 0.625, 'grad_norm': 0.29756423830986023, 'learning_rate': 0.00019951478187493272, 'epoch': 0.3}
{'loss': 0.7325, 'grad_norm': 0.3440108299255371, 'learning_rate': 0.00019950818576015386, 'epoch': 0.3}
{'loss': 0.728, 'grad_norm': 0.3112783133983612, 'learning_rate': 0.00019950154522386797, 'epoch': 0.3}
{'loss': 0.6914, 'grad_norm': 0.4941835105419159, 'learning_rate': 0.0001994948602690395, 'epoch': 0.31}
{'loss': 0.9324, 'grad_norm': 0.48641905188560486, 'l

Checkpoint destination directory ./results/checkpoint-600 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.3176, 'grad_norm': 1.0058257579803467, 'learning_rate': 0.00019946767632428175, 'epoch': 0.31}
{'loss': 0.653, 'grad_norm': 0.30133959650993347, 'learning_rate': 0.00019946076932190037, 'epoch': 0.31}
{'loss': 0.5818, 'grad_norm': 0.19045822322368622, 'learning_rate': 0.00019945381791917918, 'epoch': 0.31}
{'loss': 0.8262, 'grad_norm': 0.21753919124603271, 'learning_rate': 0.0001994468221192214, 'epoch': 0.31}
{'loss': 0.7947, 'grad_norm': 0.2542659044265747, 'learning_rate': 0.00019943978192514997, 'epoch': 0.31}
{'loss': 0.8848, 'grad_norm': 0.21983858942985535, 'learning_rate': 0.00019943269734010773, 'epoch': 0.31}
{'loss': 0.6882, 'grad_norm': 0.25944364070892334, 'learning_rate': 0.00019942556836725735, 'epoch': 0.32}
{'loss': 0.9351, 'grad_norm': 0.22762036323547363, 'learning_rate': 0.00019941839500978124, 'epoch': 0.32}
{'loss': 1.0241, 'grad_norm': 0.24753814935684204, 'learning_rate': 0.00019941117727088168, 'epoch': 0.32}
{'loss': 0.6649, 'grad_norm': 0.244053110

Checkpoint destination directory ./results/checkpoint-625 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.6343, 'grad_norm': 0.2731969952583313, 'learning_rate': 0.00019937442296849795, 'epoch': 0.32}
{'loss': 0.5884, 'grad_norm': 0.30229827761650085, 'learning_rate': 0.00019936693900941458, 'epoch': 0.32}
{'loss': 0.5859, 'grad_norm': 0.27217191457748413, 'learning_rate': 0.0001993594106918782, 'epoch': 0.32}
{'loss': 0.6607, 'grad_norm': 0.3018663227558136, 'learning_rate': 0.00019935183801924963, 'epoch': 0.33}
{'loss': 0.6624, 'grad_norm': 0.47438332438468933, 'learning_rate': 0.00019934422099490934, 'epoch': 0.33}
{'loss': 0.5891, 'grad_norm': 0.36804643273353577, 'learning_rate': 0.00019933655962225765, 'epoch': 0.33}
{'loss': 0.8542, 'grad_norm': 0.5357491374015808, 'learning_rate': 0.0001993288539047147, 'epoch': 0.33}
{'loss': 0.8931, 'grad_norm': 0.4655322730541229, 'learning_rate': 0.00019932110384572038, 'epoch': 0.33}
{'loss': 0.9204, 'grad_norm': 0.4593242108821869, 'learning_rate': 0.0001993133094487344, 'epoch': 0.33}
{'loss': 1.0024, 'grad_norm': 0.4959926903247

Checkpoint destination directory ./results/checkpoint-650 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.9948, 'grad_norm': 0.9380754232406616, 'learning_rate': 0.00019928168855076091, 'epoch': 0.34}
{'loss': 0.649, 'grad_norm': 0.28710323572158813, 'learning_rate': 0.000199273672516405, 'epoch': 0.34}
{'loss': 0.7733, 'grad_norm': 0.30169379711151123, 'learning_rate': 0.0001992656121652313, 'epoch': 0.34}
{'loss': 0.701, 'grad_norm': 0.25515982508659363, 'learning_rate': 0.000199257507500838, 'epoch': 0.34}
{'loss': 0.828, 'grad_norm': 0.31538915634155273, 'learning_rate': 0.00019924935852684314, 'epoch': 0.34}
{'loss': 0.7278, 'grad_norm': 0.25702640414237976, 'learning_rate': 0.0001992411652468845, 'epoch': 0.34}
{'loss': 0.5185, 'grad_norm': 0.22769856452941895, 'learning_rate': 0.00019923292766461966, 'epoch': 0.34}
{'loss': 0.5853, 'grad_norm': 0.28306788206100464, 'learning_rate': 0.00019922464578372596, 'epoch': 0.34}
{'loss': 0.5768, 'grad_norm': 0.28866222500801086, 'learning_rate': 0.00019921631960790047, 'epoch': 0.34}
{'loss': 0.7163, 'grad_norm': 0.313433378934860

Checkpoint destination directory ./results/checkpoint-675 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.5746, 'grad_norm': 0.404828816652298, 'learning_rate': 0.00019917402443558007, 'epoch': 0.35}
{'loss': 0.7132, 'grad_norm': 0.2936416566371918, 'learning_rate': 0.0001991654325689108, 'epoch': 0.35}
{'loss': 0.6508, 'grad_norm': 0.3727380335330963, 'learning_rate': 0.00019915679643374318, 'epoch': 0.35}
{'loss': 0.7742, 'grad_norm': 0.4506922662258148, 'learning_rate': 0.00019914811603393247, 'epoch': 0.35}
{'loss': 0.7013, 'grad_norm': 0.4316878616809845, 'learning_rate': 0.00019913939137335373, 'epoch': 0.35}
{'loss': 0.914, 'grad_norm': 0.5087308883666992, 'learning_rate': 0.00019913062245590168, 'epoch': 0.35}
{'loss': 0.8033, 'grad_norm': 0.43724021315574646, 'learning_rate': 0.0001991218092854909, 'epoch': 0.35}
{'loss': 0.8844, 'grad_norm': 0.47798293828964233, 'learning_rate': 0.00019911295186605566, 'epoch': 0.36}
{'loss': 1.0228, 'grad_norm': 0.5111427903175354, 'learning_rate': 0.00019910405020155003, 'epoch': 0.36}
{'loss': 0.8098, 'grad_norm': 0.4638864398002624

Checkpoint destination directory ./results/checkpoint-700 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.0099, 'grad_norm': 0.9952471256256104, 'learning_rate': 0.00019906800117259558, 'epoch': 0.36}
{'loss': 0.9285, 'grad_norm': 0.2091754525899887, 'learning_rate': 0.00019905887834273985, 'epoch': 0.36}
{'loss': 0.8116, 'grad_norm': 0.21706891059875488, 'learning_rate': 0.00019904971129195272, 'epoch': 0.36}
{'loss': 0.5979, 'grad_norm': 0.23740936815738678, 'learning_rate': 0.0001990405000243264, 'epoch': 0.36}
{'loss': 0.8488, 'grad_norm': 0.31260740756988525, 'learning_rate': 0.00019903124454397297, 'epoch': 0.37}
{'loss': 0.7167, 'grad_norm': 0.19822320342063904, 'learning_rate': 0.00019902194485502415, 'epoch': 0.37}
{'loss': 0.7248, 'grad_norm': 0.25358161330223083, 'learning_rate': 0.0001990126009616314, 'epoch': 0.37}
{'loss': 0.6984, 'grad_norm': 0.1963653564453125, 'learning_rate': 0.00019900321286796596, 'epoch': 0.37}
{'loss': 0.6172, 'grad_norm': 0.2972865402698517, 'learning_rate': 0.00019899378057821877, 'epoch': 0.37}
{'loss': 0.5254, 'grad_norm': 0.22575345635

Checkpoint destination directory ./results/checkpoint-725 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.6452, 'grad_norm': 0.3756614327430725, 'learning_rate': 0.0001989459563363205, 'epoch': 0.37}
{'loss': 0.7035, 'grad_norm': 0.4629502594470978, 'learning_rate': 0.00019893625895919733, 'epoch': 0.38}
{'loss': 0.6279, 'grad_norm': 0.3122808337211609, 'learning_rate': 0.00019892651741588137, 'epoch': 0.38}
{'loss': 0.7001, 'grad_norm': 0.34367045760154724, 'learning_rate': 0.00019891673171072127, 'epoch': 0.38}
{'loss': 0.4463, 'grad_norm': 0.37202176451683044, 'learning_rate': 0.00019890690184808555, 'epoch': 0.38}
{'loss': 0.7193, 'grad_norm': 0.3718208968639374, 'learning_rate': 0.0001988970278323623, 'epoch': 0.38}
{'loss': 0.7573, 'grad_norm': 0.4412391781806946, 'learning_rate': 0.00019888710966795947, 'epoch': 0.38}
{'loss': 0.9034, 'grad_norm': 0.49345463514328003, 'learning_rate': 0.00019887714735930453, 'epoch': 0.38}
{'loss': 0.6507, 'grad_norm': 0.5210151076316833, 'learning_rate': 0.00019886714091084485, 'epoch': 0.38}
{'loss': 0.865, 'grad_norm': 0.64271402359008

Checkpoint destination directory ./results/checkpoint-750 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.1782, 'grad_norm': 0.8546478152275085, 'learning_rate': 0.00019882667380859349, 'epoch': 0.39}
{'loss': 0.6423, 'grad_norm': 0.2803995609283447, 'learning_rate': 0.00019881644672850867, 'epoch': 0.39}
{'loss': 0.8086, 'grad_norm': 0.4288199841976166, 'learning_rate': 0.0001988061755357165, 'epoch': 0.39}
{'loss': 0.4974, 'grad_norm': 0.1671379804611206, 'learning_rate': 0.00019879586023480213, 'epoch': 0.39}
{'loss': 0.8017, 'grad_norm': 0.17818798124790192, 'learning_rate': 0.00019878550083037038, 'epoch': 0.39}
{'loss': 0.717, 'grad_norm': 0.2686959505081177, 'learning_rate': 0.00019877509732704586, 'epoch': 0.39}
{'loss': 0.7376, 'grad_norm': 0.21130098402500153, 'learning_rate': 0.00019876464972947276, 'epoch': 0.39}
{'loss': 0.7148, 'grad_norm': 0.24252519011497498, 'learning_rate': 0.00019875415804231504, 'epoch': 0.39}
{'loss': 0.696, 'grad_norm': 0.24347320199012756, 'learning_rate': 0.00019874362227025624, 'epoch': 0.39}
{'loss': 0.7645, 'grad_norm': 0.2264049053192

Checkpoint destination directory ./results/checkpoint-775 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.4929, 'grad_norm': 0.34293273091316223, 'learning_rate': 0.00019869028230175013, 'epoch': 0.4}
{'loss': 0.6151, 'grad_norm': 0.25331559777259827, 'learning_rate': 0.00019867948211974262, 'epoch': 0.4}
{'loss': 0.5996, 'grad_norm': 0.3461962044239044, 'learning_rate': 0.00019866863788617017, 'epoch': 0.4}
{'loss': 0.7598, 'grad_norm': 0.27482739090919495, 'learning_rate': 0.00019865774960587377, 'epoch': 0.4}
{'loss': 0.6287, 'grad_norm': 0.3044281005859375, 'learning_rate': 0.0001986468172837141, 'epoch': 0.4}
{'loss': 0.5884, 'grad_norm': 0.3554573059082031, 'learning_rate': 0.0001986358409245714, 'epoch': 0.41}
{'loss': 0.5156, 'grad_norm': 0.39855819940567017, 'learning_rate': 0.00019862482053334567, 'epoch': 0.41}
{'loss': 0.6652, 'grad_norm': 0.37743711471557617, 'learning_rate': 0.00019861375611495655, 'epoch': 0.41}
{'loss': 0.8522, 'grad_norm': 0.5154813528060913, 'learning_rate': 0.00019860264767434325, 'epoch': 0.41}
{'loss': 0.7596, 'grad_norm': 0.5487068891525269

Checkpoint destination directory ./results/checkpoint-800 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.0832, 'grad_norm': 1.0267060995101929, 'learning_rate': 0.00019855777378912203, 'epoch': 0.41}
{'loss': 0.8168, 'grad_norm': 0.16058959066867828, 'learning_rate': 0.0001985464453121649, 'epoch': 0.41}
{'loss': 0.6604, 'grad_norm': 0.2758268713951111, 'learning_rate': 0.00019853507284303188, 'epoch': 0.41}
{'loss': 0.827, 'grad_norm': 0.39596712589263916, 'learning_rate': 0.00019852365638679977, 'epoch': 0.42}
{'loss': 0.4008, 'grad_norm': 0.1947191059589386, 'learning_rate': 0.00019851219594856502, 'epoch': 0.42}
{'loss': 0.9067, 'grad_norm': 0.2745696008205414, 'learning_rate': 0.00019850069153344363, 'epoch': 0.42}
{'loss': 0.7898, 'grad_norm': 0.25951090455055237, 'learning_rate': 0.00019848914314657133, 'epoch': 0.42}
{'loss': 0.6439, 'grad_norm': 0.23082460463047028, 'learning_rate': 0.00019847755079310342, 'epoch': 0.42}
{'loss': 0.8069, 'grad_norm': 0.2399863451719284, 'learning_rate': 0.0001984659144782149, 'epoch': 0.42}
{'loss': 0.7054, 'grad_norm': 0.2567037940025

Checkpoint destination directory ./results/checkpoint-825 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.4638, 'grad_norm': 0.31153538823127747, 'learning_rate': 0.00019840707366495944, 'epoch': 0.43}
{'loss': 0.4154, 'grad_norm': 0.31172674894332886, 'learning_rate': 0.0001983951736913198, 'epoch': 0.43}
{'loss': 0.5594, 'grad_norm': 0.3465496897697449, 'learning_rate': 0.00019838322979303353, 'epoch': 0.43}
{'loss': 0.7429, 'grad_norm': 0.3877885937690735, 'learning_rate': 0.00019837124197543253, 'epoch': 0.43}
{'loss': 0.6103, 'grad_norm': 0.3715866804122925, 'learning_rate': 0.00019835921024386824, 'epoch': 0.43}
{'loss': 0.7704, 'grad_norm': 0.40052375197410583, 'learning_rate': 0.00019834713460371178, 'epoch': 0.43}
{'loss': 0.854, 'grad_norm': 0.3958892524242401, 'learning_rate': 0.00019833501506035387, 'epoch': 0.43}
{'loss': 0.7436, 'grad_norm': 0.4491722583770752, 'learning_rate': 0.00019832285161920478, 'epoch': 0.43}
{'loss': 0.8357, 'grad_norm': 0.37821826338768005, 'learning_rate': 0.00019831064428569437, 'epoch': 0.43}
{'loss': 0.682, 'grad_norm': 0.5709822177886

Checkpoint destination directory ./results/checkpoint-850 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.9022, 'grad_norm': 0.6487171649932861, 'learning_rate': 0.00019826137613732333, 'epoch': 0.44}
{'loss': 0.7225, 'grad_norm': 0.39548686146736145, 'learning_rate': 0.0001982489494241405, 'epoch': 0.44}
{'loss': 0.7413, 'grad_norm': 0.22753594815731049, 'learning_rate': 0.0001982364788515871, 'epoch': 0.44}
{'loss': 0.667, 'grad_norm': 0.3149030804634094, 'learning_rate': 0.0001982239644252301, 'epoch': 0.44}
{'loss': 0.6933, 'grad_norm': 0.17515483498573303, 'learning_rate': 0.0001982114061506561, 'epoch': 0.44}
{'loss': 0.6455, 'grad_norm': 0.21400631964206696, 'learning_rate': 0.00019819880403347127, 'epoch': 0.44}
{'loss': 0.57, 'grad_norm': 0.22578123211860657, 'learning_rate': 0.00019818615807930127, 'epoch': 0.44}
{'loss': 0.6018, 'grad_norm': 0.27964967489242554, 'learning_rate': 0.00019817346829379141, 'epoch': 0.45}
{'loss': 0.7652, 'grad_norm': 0.26734986901283264, 'learning_rate': 0.00019816073468260657, 'epoch': 0.45}
{'loss': 0.8539, 'grad_norm': 0.23918825387954

Checkpoint destination directory ./results/checkpoint-875 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.6963, 'grad_norm': 0.25139331817626953, 'learning_rate': 0.00019809640944119664, 'epoch': 0.45}
{'loss': 0.5556, 'grad_norm': 0.29058897495269775, 'learning_rate': 0.00019808341299601919, 'epoch': 0.45}
{'loss': 0.553, 'grad_norm': 0.31147006154060364, 'learning_rate': 0.00019807037276536838, 'epoch': 0.45}
{'loss': 0.5551, 'grad_norm': 0.38611865043640137, 'learning_rate': 0.00019805728875506548, 'epoch': 0.45}
{'loss': 0.6804, 'grad_norm': 0.3063066601753235, 'learning_rate': 0.00019804416097095136, 'epoch': 0.46}
{'loss': 0.628, 'grad_norm': 0.43884336948394775, 'learning_rate': 0.0001980309894188864, 'epoch': 0.46}
{'loss': 1.0507, 'grad_norm': 0.34741663932800293, 'learning_rate': 0.00019801777410475055, 'epoch': 0.46}
{'loss': 0.6629, 'grad_norm': 0.3958677053451538, 'learning_rate': 0.0001980045150344432, 'epoch': 0.46}
{'loss': 0.9032, 'grad_norm': 0.34454458951950073, 'learning_rate': 0.00019799121221388338, 'epoch': 0.46}
{'loss': 0.7889, 'grad_norm': 0.48938244581

Checkpoint destination directory ./results/checkpoint-900 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.3415, 'grad_norm': 0.7038065195083618, 'learning_rate': 0.0001979375635481827, 'epoch': 0.46}
{'loss': 0.9815, 'grad_norm': 0.2043527066707611, 'learning_rate': 0.00019792404206582883, 'epoch': 0.47}
{'loss': 0.9879, 'grad_norm': 0.16947944462299347, 'learning_rate': 0.00019791047686914645, 'epoch': 0.47}
{'loss': 0.8169, 'grad_norm': 0.5049649477005005, 'learning_rate': 0.00019789686796419125, 'epoch': 0.47}
{'loss': 0.9585, 'grad_norm': 0.1596957892179489, 'learning_rate': 0.00019788321535703834, 'epoch': 0.47}
{'loss': 0.6925, 'grad_norm': 0.2587966024875641, 'learning_rate': 0.00019786951905378244, 'epoch': 0.47}
{'loss': 0.7674, 'grad_norm': 0.26416999101638794, 'learning_rate': 0.0001978557790605377, 'epoch': 0.47}
{'loss': 0.5811, 'grad_norm': 0.23020897805690765, 'learning_rate': 0.0001978419953834378, 'epoch': 0.47}
{'loss': 0.5921, 'grad_norm': 0.26216545701026917, 'learning_rate': 0.0001978281680286359, 'epoch': 0.47}
{'loss': 0.6143, 'grad_norm': 0.27741923928260

Checkpoint destination directory ./results/checkpoint-925 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.5662, 'grad_norm': 0.268219530582428, 'learning_rate': 0.00019775837630582266, 'epoch': 0.48}
{'loss': 0.6439, 'grad_norm': 0.3614184558391571, 'learning_rate': 0.00019774428701511734, 'epoch': 0.48}
{'loss': 0.6035, 'grad_norm': 0.3100531995296478, 'learning_rate': 0.00019773015409032805, 'epoch': 0.48}
{'loss': 0.7946, 'grad_norm': 0.27295583486557007, 'learning_rate': 0.00019771597753776395, 'epoch': 0.48}
{'loss': 0.5473, 'grad_norm': 0.4462517201900482, 'learning_rate': 0.00019770175736375358, 'epoch': 0.48}
{'loss': 0.4665, 'grad_norm': 0.37601613998413086, 'learning_rate': 0.00019768749357464493, 'epoch': 0.48}
{'loss': 0.7105, 'grad_norm': 0.3236527442932129, 'learning_rate': 0.00019767318617680554, 'epoch': 0.48}
{'loss': 0.8024, 'grad_norm': 0.4720814824104309, 'learning_rate': 0.0001976588351766224, 'epoch': 0.48}
{'loss': 0.8327, 'grad_norm': 0.4985937774181366, 'learning_rate': 0.00019764444058050193, 'epoch': 0.49}
{'loss': 0.8443, 'grad_norm': 0.46524110436439

Checkpoint destination directory ./results/checkpoint-950 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 1.0356, 'grad_norm': 1.3305107355117798, 'learning_rate': 0.0001975864263654566, 'epoch': 0.49}
{'loss': 0.7353, 'grad_norm': 0.19739165902137756, 'learning_rate': 0.0001975718138864271, 'epoch': 0.49}
{'loss': 0.8394, 'grad_norm': 0.3823391795158386, 'learning_rate': 0.00019755715785030754, 'epoch': 0.49}
{'loss': 0.5209, 'grad_norm': 0.2174692004919052, 'learning_rate': 0.00019754245826364043, 'epoch': 0.49}
{'loss': 0.6693, 'grad_norm': 0.23924972116947174, 'learning_rate': 0.0001975277151329879, 'epoch': 0.49}
{'loss': 0.6855, 'grad_norm': 0.2242138534784317, 'learning_rate': 0.00019751292846493138, 'epoch': 0.49}
{'loss': 0.7057, 'grad_norm': 0.2608100175857544, 'learning_rate': 0.00019749809826607187, 'epoch': 0.5}
{'loss': 0.7382, 'grad_norm': 0.317317932844162, 'learning_rate': 0.0001974832245430297, 'epoch': 0.5}
{'loss': 0.6876, 'grad_norm': 0.245153546333313, 'learning_rate': 0.00019746830730244463, 'epoch': 0.5}
{'loss': 0.4626, 'grad_norm': 0.17401063442230225, 'l



{'loss': 0.4178, 'grad_norm': 0.5726067423820496, 'learning_rate': 7.837349531182536e-06, 'epoch': 4.38}
{'loss': 0.3986, 'grad_norm': 0.5874629616737366, 'learning_rate': 7.81144106977766e-06, 'epoch': 4.38}
{'loss': 0.4157, 'grad_norm': 0.7348241806030273, 'learning_rate': 7.78557376232093e-06, 'epoch': 4.39}
{'loss': 0.4586, 'grad_norm': 1.0093730688095093, 'learning_rate': 7.759747620359758e-06, 'epoch': 4.39}
{'loss': 0.3393, 'grad_norm': 0.2546720504760742, 'learning_rate': 7.73396265542322e-06, 'epoch': 4.39}
{'loss': 0.2296, 'grad_norm': 0.19211259484291077, 'learning_rate': 7.708218879022022e-06, 'epoch': 4.39}
{'loss': 0.2485, 'grad_norm': 0.22691130638122559, 'learning_rate': 7.682516302648423e-06, 'epoch': 4.39}
{'loss': 0.3844, 'grad_norm': 0.31191185116767883, 'learning_rate': 7.656854937776326e-06, 'epoch': 4.39}
{'loss': 0.3834, 'grad_norm': 0.3476581275463104, 'learning_rate': 7.631234795861265e-06, 'epoch': 4.39}
{'loss': 0.2902, 'grad_norm': 0.2884020209312439, 'lear

# Run the Model
The following cell runs the fine-tuned model on a sample prompt to check the result of fine-tuning.

In [6]:
# Raise the transformers log level to CRITICAL so warnings stay out of the cell output
logging.set_verbosity(logging.CRITICAL)

# Sample instruction used to smoke-test the fine-tuned model
prompt = "Write a code code scenario of a data encryption program using SHA-256 algorithm"

# Build a text-generation pipeline around the in-memory model and tokenizer.
# NOTE: max_length=200 bounds prompt + completion together, so long answers are cut off.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)

# Wrap the instruction in [INST] ... [/INST] tags and print the first generated sequence
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]["generated_text"])



<s>[INST] Write a code code scenario of a data encryption program using SHA-256 algorithm [/INST] #define DEBUG 0\nint main(int argc, char **argv)\n{\n#if DEBUG\n    printf("debug enabled\\n");\n#endif\n    unsigned int i;\n    unsigned int j;\n    unsigned int k;\n    unsigned int num_blocks;\n    unsigned int num_bytes;\n    unsigned int num_key_bits;\n    unsigned int num_block_key_bits;\n    unsigned int num_key_bits;\n    unsigned int num_block_key_bits;\n    unsigned int num_key_bits_mod;\n    unsigned int num_block_key_bits_mod;\n    unsigned int num_key_bits_mod;\n    unsigned int num_block_key_bits_mod;\n    unsigned int


In [7]:
# %load_ext tensorboard
# %tensorboard --logdir results/runs

In [8]:
# Empty VRAM
import gc

# Drop the notebook's references to every object that keeps GPU tensors alive.
del model
del pipe
del trainer

# First pass: collect reference cycles so the underlying tensors are destroyed.
gc.collect()
# Dropping references only returns memory to PyTorch's caching allocator; this
# call releases the now-unused cached CUDA blocks so the VRAM is really freed
# before the FP16 base model is loaded (safe no-op when CUDA is not initialised).
torch.cuda.empty_cache()
# Second pass; left as the last expression so the cell still shows the count of
# unreachable objects found.
gc.collect()

0

In [9]:
# Load the base checkpoint again in FP16, placing the whole model on device 0
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": 0},
    low_cpu_mem_usage=True,
    return_dict=True,
)

# Load the LoRA adapter identified by `new_model` on top of the base model,
# then fold the adapter weights into it and drop the PEFT wrapper
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Fresh tokenizer to go with the merged model: right-side padding, EOS used as pad token
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [10]:
# !huggingface-cli login

# model.push_to_hub(new_model, use_temp_dir=False)
# tokenizer.push_to_hub(new_model, use_temp_dir=False)