# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: 
* Model: 
* Evaluation approach: 
* Fine-tuning dataset: 

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [39]:
#pip install transformers datasets evaluate accelerate


#clear CUDA cache
import torch
import gc
torch.cuda.empty_cache()
gc.collect()


# Set the device as GPU, MPS, or CPU according to availability
import torch
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
    
    
#device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda:0


In [40]:
import sys
import pprint
sys.displayhook = pprint.pprint

In [41]:
from datasets import load_dataset, load_dataset_builder,get_dataset_split_names,get_dataset_config_names

dataset_name = "dair-ai/emotion"

dataset = load_dataset(dataset_name)



splits = ["train", "test"]
ds = {split: ds for split, ds in zip(splits, load_dataset(dataset_name, split=splits))}

# Thin out the dataset to make it run faster for this example
for split in splits:
    ds[split] = ds[split].shuffle(seed=42).select(range(1000))


ds_builder = load_dataset_builder(dataset_name)
print(ds_builder.info.description)
print(ds_builder.info.features)

get_dataset_split_names(dataset_name)

configs = get_dataset_config_names(dataset_name)

print(configs)

ds


{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}
['split', 'unsplit']


{'train': Dataset({
     features: ['text', 'label'],
     num_rows: 1000
 }),
 'test': Dataset({
     features: ['text', 'label'],
     num_rows: 1000
 })}

In [42]:
ds['test'][1]

{'text': 'i feel so thrilled to have three such distinguished individuals such as yourselves here',
 'label': 1}

In [43]:


num_classes = dataset['train'].features['label'].num_classes
id2label = {id:dataset['train'].features['label'].int2str(id) for id in range(num_classes)}
label2id = {label:id for (id,label) in id2label.items()}
print(ds_builder.info.features)
print(id2label)
print(label2id)
print(len(id2label))

{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}
{0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}
6


In [44]:
from transformers import AutoTokenizer

foundation_model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(foundation_model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})




def preprocess_function(examples):
    return tokenizer(examples["text"],padding="max_length", truncation=True)

tokenized_data = dataset.map(preprocess_function, batched=True)

tokenized_ds = {}
for split in splits:
    tokenized_ds[split] = ds[split].map(preprocess_function, batched=True)


# Show the first example of the tokenized training set
print(tokenized_ds["train"][0]["input_ids"])



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

[101, 2096, 9670, 1999, 1996, 2406, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [45]:
from transformers import AutoModelForSequenceClassification



model = AutoModelForSequenceClassification.from_pretrained(
    foundation_model_name, 
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
)


for param in model.base_model.parameters():
    param.requires_grad = False
    
    
print(model.classifier)

model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Linear(in_features=768, out_features=6, bias=True)


In [46]:
print(model)



DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [47]:
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

""" import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()} """

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)

    # Calculate precision, recall, and F1-score
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    

batch_size = 1

# The HuggingFace Trainer class handles the training and eval loop for PyTorch for us.
# Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./data/sentiment_analysis",
        per_device_eval_batch_size=batch_size,
        label_names=["labels"],
    ),
    #train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

#trainer.train()

In [48]:
# Show the performance of the model on the test set
# What do you think the evaluation accuracy will be?
trainer.evaluate()



  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.817934513092041,
 'eval_accuracy': 0.032,
 'eval_precision': 0.1443512145748988,
 'eval_recall': 0.032,
 'eval_f1': 0.007531017078270091,
 'eval_runtime': 42.6607,
 'eval_samples_per_second': 23.441,
 'eval_steps_per_second': 11.72}

In [49]:
import pandas as pd
import numpy as np

df = pd.DataFrame(tokenized_ds["test"])
df = df[["text", "label"]]

# Replace <br /> tags in the text with spaces
df["text"] = df["text"].str.replace("<br />", " ")

# Add the model predictions to the dataframe
predictions = trainer.predict(tokenized_ds["test"])
df["predicted_label"] = np.argmax(predictions[0], axis=1)

df.head(10)



  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,text,label,predicted_label
0,i was feeling really troubled and down over what my dad said,0,5
1,i feel so thrilled to have three such distinguished individuals such as yourselves here,1,5
2,i feel is that the most likeable characters aren t important enough to the plot,1,5
3,i tune out the rest of the world and focus on the rhythm of the needles and the softness of the yarn and for that time i feel my most peaceful,1,5
4,i sit here writing this i feel unhappy inside,0,5
5,im feeling and if ive liked being pregnant,2,5
6,im very hurt and i feel unimportant,0,5
7,i used to be able to hang around talk with the cashier when i was putting away my money now i feel rushed and stressed if i take a second to fumble with the coins and put them in my purse,3,5
8,i don t have the feeling of divine vibrations,1,5
9,i vented my feelings towards the pathetic excuse of a communicat,0,5


In [50]:
# Show full cell output
pd.set_option("display.max_colwidth", None)

df[df["label"] != df["predicted_label"]].head(10)

Unnamed: 0,text,label,predicted_label
0,i was feeling really troubled and down over what my dad said,0,5
1,i feel so thrilled to have three such distinguished individuals such as yourselves here,1,5
2,i feel is that the most likeable characters aren t important enough to the plot,1,5
3,i tune out the rest of the world and focus on the rhythm of the needles and the softness of the yarn and for that time i feel my most peaceful,1,5
4,i sit here writing this i feel unhappy inside,0,5
5,im feeling and if ive liked being pregnant,2,5
6,im very hurt and i feel unimportant,0,5
7,i used to be able to hang around talk with the cashier when i was putting away my money now i feel rushed and stressed if i take a second to fumble with the coins and put them in my purse,3,5
8,i don t have the feeling of divine vibrations,1,5
9,i vented my feelings towards the pathetic excuse of a communicat,0,5


In [51]:
# Show full cell output
pd.set_option("display.max_colwidth", None)

df[df["label"] == df["predicted_label"]].head(10)

Unnamed: 0,text,label,predicted_label
51,i feel a strange gratitude for the hated israeli occupation of sinai that lasted from to for actually recognizing the importance of sinais history,5,5
78,im feeling sentimental or in need of reassurance,0,0
134,i go through my day feeling your movements and am amazed that something so miraculous is happening in my body its like a special secret only you and i have,5,5
179,i was so uncomfortable and feeling weird feelings but wasn t sure if they were contractions since i never really felt contractions with jared until they jacked me up with pitocin,5,5
184,i feel shame in a strange way,5,5
193,i feel like i should not be surprised at this development,5,5
217,i received the blanket i was absolutely amazed on how fluffy it is and extremely soft i really didnt think it was going to feel that amazing,5,5
218,i feel more amazed and more thankful for having e in our lives,5,5
271,i feel shocked that you d stoup to destinys child b,5,5
293,i got home and told peter how i was feeling he wasnt shocked at all by what i was telling him,5,5


In [52]:
#clear CUDA cache
import torch
import gc
torch.cuda.empty_cache()
gc.collect()

34

## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [53]:
import peft
from peft import LoraConfig, get_peft_model, AutoPeftModel,PeftConfig

lora_config = LoraConfig(
    r=5, 
    lora_alpha=1, # the alpha parameter for the lora loss.
    target_modules= ['classifier'], # the modules that are to be trained.
    #target_modules="all-linear",
    lora_dropout=0.05, # the dropout rate for the lora loss.
    bias="lora_only", # the bias for the lora loss.
)

In [54]:
import torch
from transformers import Conv1D

def get_specific_layer_names(model):
    # Create a list to store the layer names
    layer_names = []
    
    # Recursively visit all modules and submodules
    for name, module in model.named_modules():
        # Check if the module is an instance of the specified layers
        if isinstance(module, (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv2d, Conv1D)):
            # model name parsing 

            layer_names.append('.'.join(name.split('.')[4:]).split('.')[0])
    
    return layer_names

list(set(get_specific_layer_names(model)))

['', 'attention', 'ffn']

In [55]:

pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print (pytorch_total_params)

peft_model = get_peft_model(model, lora_config)
peft_model = peft_model.to(device)

print(peft_model.print_trainable_parameters())

print(peft_model)



595206
trainable params: 3,876 || all params: 66,961,956 || trainable%: 0.005788361379407734
None
PeftModel(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): Linear(in_features=768, out_features=768, bias=True)
                (k_lin): Linear(in_features=768, out_features=768, bias=True)
                (v_lin): Linear(in_features=768, out_features=768, bias=True)
                (out_lin): Linear(in

In [56]:
#Create a directory to contain the Model
import os
working_dir = './'

output_directory = os.path.join(working_dir, "./data/sentiment_analysis")

In [57]:

#Creating the TrainingArgs

from transformers import TrainingArguments, Trainer, DataCollatorWithPadding



trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="./data/sentiment_analysis",
        learning_rate=2e-2, 
        # Reduce the batch size if you don't have enough memory
        #auto_find_batch_size=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=100,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        do_eval=True,
        do_predict=True,
        metric_for_best_model="accuracy",
        label_names=["labels"],
        #load_best_model_at_end=True,
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)


trainer.train()

  0%|          | 0/50000 [00:00<?, ?it/s]



{'loss': 1.6554, 'grad_norm': 0.9739303588867188, 'learning_rate': 0.0198, 'epoch': 1.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.5280498266220093, 'eval_accuracy': 0.374, 'eval_precision': 0.3911916241062309, 'eval_recall': 0.374, 'eval_f1': 0.278370263019168, 'eval_runtime': 41.6402, 'eval_samples_per_second': 24.015, 'eval_steps_per_second': 12.008, 'epoch': 1.0}




{'loss': 1.579, 'grad_norm': 1.2943836450576782, 'learning_rate': 0.0196, 'epoch': 2.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.4025729894638062, 'eval_accuracy': 0.474, 'eval_precision': 0.30319551885653584, 'eval_recall': 0.474, 'eval_f1': 0.36667561700751355, 'eval_runtime': 42.2933, 'eval_samples_per_second': 23.644, 'eval_steps_per_second': 11.822, 'epoch': 2.0}




{'loss': 1.5417, 'grad_norm': 1.220957636833191, 'learning_rate': 0.0194, 'epoch': 3.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3975383043289185, 'eval_accuracy': 0.469, 'eval_precision': 0.4105251165323891, 'eval_recall': 0.469, 'eval_f1': 0.3843274118547589, 'eval_runtime': 42.3793, 'eval_samples_per_second': 23.596, 'eval_steps_per_second': 11.798, 'epoch': 3.0}




{'loss': 1.519, 'grad_norm': 0.729088544845581, 'learning_rate': 0.0192, 'epoch': 4.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3605320453643799, 'eval_accuracy': 0.491, 'eval_precision': 0.4050585173218452, 'eval_recall': 0.491, 'eval_f1': 0.3845873887171917, 'eval_runtime': 42.5904, 'eval_samples_per_second': 23.479, 'eval_steps_per_second': 11.74, 'epoch': 4.0}




{'loss': 1.5362, 'grad_norm': 1.036426067352295, 'learning_rate': 0.019, 'epoch': 5.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3840442895889282, 'eval_accuracy': 0.491, 'eval_precision': 0.31503747870528104, 'eval_recall': 0.491, 'eval_f1': 0.38346135306553913, 'eval_runtime': 42.5178, 'eval_samples_per_second': 23.52, 'eval_steps_per_second': 11.76, 'epoch': 5.0}




{'loss': 1.4939, 'grad_norm': 0.7340205311775208, 'learning_rate': 0.0188, 'epoch': 6.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.345322847366333, 'eval_accuracy': 0.502, 'eval_precision': 0.4227602332222751, 'eval_recall': 0.502, 'eval_f1': 0.4054283362877535, 'eval_runtime': 42.2162, 'eval_samples_per_second': 23.688, 'eval_steps_per_second': 11.844, 'epoch': 6.0}




{'loss': 1.4831, 'grad_norm': 0.7629669904708862, 'learning_rate': 0.018600000000000002, 'epoch': 7.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.402084469795227, 'eval_accuracy': 0.459, 'eval_precision': 0.3667976658941616, 'eval_recall': 0.459, 'eval_f1': 0.3711240457775262, 'eval_runtime': 42.9329, 'eval_samples_per_second': 23.292, 'eval_steps_per_second': 11.646, 'epoch': 7.0}




{'loss': 1.471, 'grad_norm': 1.0684525966644287, 'learning_rate': 0.0184, 'epoch': 8.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3948912620544434, 'eval_accuracy': 0.472, 'eval_precision': 0.4295942572623662, 'eval_recall': 0.472, 'eval_f1': 0.4171899737087818, 'eval_runtime': 42.148, 'eval_samples_per_second': 23.726, 'eval_steps_per_second': 11.863, 'epoch': 8.0}




{'loss': 1.4774, 'grad_norm': 1.052148699760437, 'learning_rate': 0.0182, 'epoch': 9.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3677936792373657, 'eval_accuracy': 0.508, 'eval_precision': 0.38472902814818455, 'eval_recall': 0.508, 'eval_f1': 0.39963324117020416, 'eval_runtime': 42.3913, 'eval_samples_per_second': 23.59, 'eval_steps_per_second': 11.795, 'epoch': 9.0}




{'loss': 1.462, 'grad_norm': 0.8972395658493042, 'learning_rate': 0.018000000000000002, 'epoch': 10.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.336026906967163, 'eval_accuracy': 0.51, 'eval_precision': 0.4596119791666667, 'eval_recall': 0.51, 'eval_f1': 0.4181017287141297, 'eval_runtime': 42.793, 'eval_samples_per_second': 23.368, 'eval_steps_per_second': 11.684, 'epoch': 10.0}




{'loss': 1.4437, 'grad_norm': 1.6635032892227173, 'learning_rate': 0.0178, 'epoch': 11.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3481215238571167, 'eval_accuracy': 0.513, 'eval_precision': 0.4653810914901029, 'eval_recall': 0.513, 'eval_f1': 0.44681151128649355, 'eval_runtime': 42.7161, 'eval_samples_per_second': 23.41, 'eval_steps_per_second': 11.705, 'epoch': 11.0}




{'loss': 1.4714, 'grad_norm': 0.8653721213340759, 'learning_rate': 0.0176, 'epoch': 12.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3681607246398926, 'eval_accuracy': 0.495, 'eval_precision': 0.4618588934841464, 'eval_recall': 0.495, 'eval_f1': 0.4604157787403706, 'eval_runtime': 42.0307, 'eval_samples_per_second': 23.792, 'eval_steps_per_second': 11.896, 'epoch': 12.0}




{'loss': 1.4871, 'grad_norm': 1.4076159000396729, 'learning_rate': 0.0174, 'epoch': 13.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3467607498168945, 'eval_accuracy': 0.484, 'eval_precision': 0.3344808040420241, 'eval_recall': 0.484, 'eval_f1': 0.3871311428195254, 'eval_runtime': 42.3261, 'eval_samples_per_second': 23.626, 'eval_steps_per_second': 11.813, 'epoch': 13.0}




{'loss': 1.4527, 'grad_norm': 0.9630712866783142, 'learning_rate': 0.0172, 'epoch': 14.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.411891222000122, 'eval_accuracy': 0.456, 'eval_precision': 0.41290104534050576, 'eval_recall': 0.456, 'eval_f1': 0.3744796790062702, 'eval_runtime': 42.7523, 'eval_samples_per_second': 23.391, 'eval_steps_per_second': 11.695, 'epoch': 14.0}




{'loss': 1.4588, 'grad_norm': 0.43865686655044556, 'learning_rate': 0.017, 'epoch': 15.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3199951648712158, 'eval_accuracy': 0.518, 'eval_precision': 0.4313785610152524, 'eval_recall': 0.518, 'eval_f1': 0.41664658719468944, 'eval_runtime': 42.1652, 'eval_samples_per_second': 23.716, 'eval_steps_per_second': 11.858, 'epoch': 15.0}




{'loss': 1.4317, 'grad_norm': 1.2252612113952637, 'learning_rate': 0.0168, 'epoch': 16.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.4115294218063354, 'eval_accuracy': 0.458, 'eval_precision': 0.5341775152684025, 'eval_recall': 0.458, 'eval_f1': 0.432388481823458, 'eval_runtime': 42.6541, 'eval_samples_per_second': 23.444, 'eval_steps_per_second': 11.722, 'epoch': 16.0}




{'loss': 1.4279, 'grad_norm': 1.697898507118225, 'learning_rate': 0.0166, 'epoch': 17.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3606996536254883, 'eval_accuracy': 0.507, 'eval_precision': 0.5484795029482027, 'eval_recall': 0.507, 'eval_f1': 0.4454644683523994, 'eval_runtime': 42.5536, 'eval_samples_per_second': 23.5, 'eval_steps_per_second': 11.75, 'epoch': 17.0}




{'loss': 1.4385, 'grad_norm': 1.7038325071334839, 'learning_rate': 0.016399999999999998, 'epoch': 18.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.3845219612121582, 'eval_accuracy': 0.487, 'eval_precision': 0.46269620675571366, 'eval_recall': 0.487, 'eval_f1': 0.45334282549118604, 'eval_runtime': 42.5621, 'eval_samples_per_second': 23.495, 'eval_steps_per_second': 11.748, 'epoch': 18.0}




{'loss': 1.449, 'grad_norm': 1.3838067054748535, 'learning_rate': 0.016200000000000003, 'epoch': 19.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3474403619766235, 'eval_accuracy': 0.518, 'eval_precision': 0.5167955757172507, 'eval_recall': 0.518, 'eval_f1': 0.4412085909585015, 'eval_runtime': 42.5878, 'eval_samples_per_second': 23.481, 'eval_steps_per_second': 11.74, 'epoch': 19.0}




{'loss': 1.4014, 'grad_norm': 0.9813433289527893, 'learning_rate': 0.016, 'epoch': 20.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3534431457519531, 'eval_accuracy': 0.444, 'eval_precision': 0.4231722803743836, 'eval_recall': 0.444, 'eval_f1': 0.3787348662851722, 'eval_runtime': 41.9368, 'eval_samples_per_second': 23.845, 'eval_steps_per_second': 11.923, 'epoch': 20.0}




{'loss': 1.4354, 'grad_norm': 0.7479931116104126, 'learning_rate': 0.0158, 'epoch': 21.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2963734865188599, 'eval_accuracy': 0.512, 'eval_precision': 0.42269287063118627, 'eval_recall': 0.512, 'eval_f1': 0.4187327265242792, 'eval_runtime': 42.4261, 'eval_samples_per_second': 23.57, 'eval_steps_per_second': 11.785, 'epoch': 21.0}




{'loss': 1.4079, 'grad_norm': 1.061863660812378, 'learning_rate': 0.015600000000000001, 'epoch': 22.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3657389879226685, 'eval_accuracy': 0.493, 'eval_precision': 0.5460524405072087, 'eval_recall': 0.493, 'eval_f1': 0.41760248557871604, 'eval_runtime': 41.7139, 'eval_samples_per_second': 23.973, 'eval_steps_per_second': 11.986, 'epoch': 22.0}




{'loss': 1.3913, 'grad_norm': 2.3694040775299072, 'learning_rate': 0.0154, 'epoch': 23.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3329304456710815, 'eval_accuracy': 0.505, 'eval_precision': 0.546925650232653, 'eval_recall': 0.5049999999999999, 'eval_f1': 0.4309555504359842, 'eval_runtime': 42.0607, 'eval_samples_per_second': 23.775, 'eval_steps_per_second': 11.888, 'epoch': 23.0}




{'loss': 1.4254, 'grad_norm': 1.688113808631897, 'learning_rate': 0.0152, 'epoch': 24.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3643099069595337, 'eval_accuracy': 0.48, 'eval_precision': 0.49467715748369856, 'eval_recall': 0.48, 'eval_f1': 0.3896076659931435, 'eval_runtime': 42.1145, 'eval_samples_per_second': 23.745, 'eval_steps_per_second': 11.872, 'epoch': 24.0}




{'loss': 1.4097, 'grad_norm': 1.4423322677612305, 'learning_rate': 0.015, 'epoch': 25.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.415165901184082, 'eval_accuracy': 0.422, 'eval_precision': 0.47434545146354307, 'eval_recall': 0.422, 'eval_f1': 0.40801595736920104, 'eval_runtime': 42.6271, 'eval_samples_per_second': 23.459, 'eval_steps_per_second': 11.73, 'epoch': 25.0}




{'loss': 1.4123, 'grad_norm': 0.899284839630127, 'learning_rate': 0.0148, 'epoch': 26.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3665698766708374, 'eval_accuracy': 0.459, 'eval_precision': 0.5285025064835777, 'eval_recall': 0.459, 'eval_f1': 0.406689341189497, 'eval_runtime': 42.157, 'eval_samples_per_second': 23.721, 'eval_steps_per_second': 11.86, 'epoch': 26.0}




{'loss': 1.4088, 'grad_norm': 1.260456919670105, 'learning_rate': 0.0146, 'epoch': 27.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3196289539337158, 'eval_accuracy': 0.509, 'eval_precision': 0.4344247228796754, 'eval_recall': 0.509, 'eval_f1': 0.45224625310278044, 'eval_runtime': 41.81, 'eval_samples_per_second': 23.918, 'eval_steps_per_second': 11.959, 'epoch': 27.0}




{'loss': 1.413, 'grad_norm': 1.3242790699005127, 'learning_rate': 0.0144, 'epoch': 28.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.373284935951233, 'eval_accuracy': 0.52, 'eval_precision': 0.504439013881027, 'eval_recall': 0.52, 'eval_f1': 0.4211254148002322, 'eval_runtime': 42.4237, 'eval_samples_per_second': 23.572, 'eval_steps_per_second': 11.786, 'epoch': 28.0}




{'loss': 1.3783, 'grad_norm': 0.5110235214233398, 'learning_rate': 0.014199999999999999, 'epoch': 29.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3206892013549805, 'eval_accuracy': 0.492, 'eval_precision': 0.4413894111115068, 'eval_recall': 0.492, 'eval_f1': 0.4240461388148019, 'eval_runtime': 42.4353, 'eval_samples_per_second': 23.565, 'eval_steps_per_second': 11.783, 'epoch': 29.0}




{'loss': 1.4174, 'grad_norm': 0.8179543614387512, 'learning_rate': 0.013999999999999999, 'epoch': 30.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3339436054229736, 'eval_accuracy': 0.496, 'eval_precision': 0.4728123556172998, 'eval_recall': 0.496, 'eval_f1': 0.4725990628334179, 'eval_runtime': 42.0338, 'eval_samples_per_second': 23.79, 'eval_steps_per_second': 11.895, 'epoch': 30.0}




{'loss': 1.373, 'grad_norm': 1.7143826484680176, 'learning_rate': 0.0138, 'epoch': 31.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3225067853927612, 'eval_accuracy': 0.473, 'eval_precision': 0.43612745410555065, 'eval_recall': 0.473, 'eval_f1': 0.3928097329482137, 'eval_runtime': 41.9983, 'eval_samples_per_second': 23.81, 'eval_steps_per_second': 11.905, 'epoch': 31.0}




{'loss': 1.401, 'grad_norm': 1.5072169303894043, 'learning_rate': 0.013600000000000001, 'epoch': 32.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3860446214675903, 'eval_accuracy': 0.461, 'eval_precision': 0.49471893458966004, 'eval_recall': 0.461, 'eval_f1': 0.4476804785892262, 'eval_runtime': 42.7235, 'eval_samples_per_second': 23.406, 'eval_steps_per_second': 11.703, 'epoch': 32.0}




{'loss': 1.3827, 'grad_norm': 0.8964704275131226, 'learning_rate': 0.0134, 'epoch': 33.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3190405368804932, 'eval_accuracy': 0.502, 'eval_precision': 0.5003573447656378, 'eval_recall': 0.502, 'eval_f1': 0.4124349938439281, 'eval_runtime': 42.5917, 'eval_samples_per_second': 23.479, 'eval_steps_per_second': 11.739, 'epoch': 33.0}




{'loss': 1.3786, 'grad_norm': 1.2560839653015137, 'learning_rate': 0.013200000000000002, 'epoch': 34.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2666218280792236, 'eval_accuracy': 0.526, 'eval_precision': 0.49622063183362564, 'eval_recall': 0.526, 'eval_f1': 0.441114735176612, 'eval_runtime': 41.854, 'eval_samples_per_second': 23.893, 'eval_steps_per_second': 11.946, 'epoch': 34.0}




{'loss': 1.3711, 'grad_norm': 0.8566499948501587, 'learning_rate': 0.013000000000000001, 'epoch': 35.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3536977767944336, 'eval_accuracy': 0.485, 'eval_precision': 0.46012532657138466, 'eval_recall': 0.485, 'eval_f1': 0.40286809436932225, 'eval_runtime': 42.2542, 'eval_samples_per_second': 23.666, 'eval_steps_per_second': 11.833, 'epoch': 35.0}




{'loss': 1.3689, 'grad_norm': 0.5197556018829346, 'learning_rate': 0.0128, 'epoch': 36.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2902741432189941, 'eval_accuracy': 0.506, 'eval_precision': 0.4718155347158612, 'eval_recall': 0.506, 'eval_f1': 0.4279251050583798, 'eval_runtime': 42.4013, 'eval_samples_per_second': 23.584, 'eval_steps_per_second': 11.792, 'epoch': 36.0}




{'loss': 1.3985, 'grad_norm': 1.210261583328247, 'learning_rate': 0.0126, 'epoch': 37.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3252782821655273, 'eval_accuracy': 0.488, 'eval_precision': 0.5130257325176486, 'eval_recall': 0.488, 'eval_f1': 0.41375773963501367, 'eval_runtime': 42.0081, 'eval_samples_per_second': 23.805, 'eval_steps_per_second': 11.902, 'epoch': 37.0}




{'loss': 1.3726, 'grad_norm': 1.5711548328399658, 'learning_rate': 0.0124, 'epoch': 38.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.5104185342788696, 'eval_accuracy': 0.408, 'eval_precision': 0.3907950144156132, 'eval_recall': 0.408, 'eval_f1': 0.3332532760653171, 'eval_runtime': 42.3644, 'eval_samples_per_second': 23.605, 'eval_steps_per_second': 11.802, 'epoch': 38.0}




{'loss': 1.353, 'grad_norm': 1.0447602272033691, 'learning_rate': 0.0122, 'epoch': 39.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.298731803894043, 'eval_accuracy': 0.511, 'eval_precision': 0.47854670632769586, 'eval_recall': 0.511, 'eval_f1': 0.469495118687278, 'eval_runtime': 42.222, 'eval_samples_per_second': 23.684, 'eval_steps_per_second': 11.842, 'epoch': 39.0}




{'loss': 1.3681, 'grad_norm': 1.1873297691345215, 'learning_rate': 0.012, 'epoch': 40.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.271195650100708, 'eval_accuracy': 0.53, 'eval_precision': 0.46456570679745435, 'eval_recall': 0.53, 'eval_f1': 0.45314696880527544, 'eval_runtime': 42.4956, 'eval_samples_per_second': 23.532, 'eval_steps_per_second': 11.766, 'epoch': 40.0}




{'loss': 1.3859, 'grad_norm': 0.9159631729125977, 'learning_rate': 0.0118, 'epoch': 41.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.282354712486267, 'eval_accuracy': 0.529, 'eval_precision': 0.4883658301978396, 'eval_recall': 0.529, 'eval_f1': 0.4392537887113113, 'eval_runtime': 42.0023, 'eval_samples_per_second': 23.808, 'eval_steps_per_second': 11.904, 'epoch': 41.0}




{'loss': 1.3642, 'grad_norm': 1.1500364542007446, 'learning_rate': 0.0116, 'epoch': 42.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3263123035430908, 'eval_accuracy': 0.482, 'eval_precision': 0.480823574969349, 'eval_recall': 0.482, 'eval_f1': 0.4468431329927434, 'eval_runtime': 42.6475, 'eval_samples_per_second': 23.448, 'eval_steps_per_second': 11.724, 'epoch': 42.0}




{'loss': 1.3527, 'grad_norm': 0.9345324039459229, 'learning_rate': 0.011399999999999999, 'epoch': 43.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3239017724990845, 'eval_accuracy': 0.528, 'eval_precision': 0.5461340921237964, 'eval_recall': 0.528, 'eval_f1': 0.43201331788631203, 'eval_runtime': 42.3631, 'eval_samples_per_second': 23.605, 'eval_steps_per_second': 11.803, 'epoch': 43.0}




{'loss': 1.4002, 'grad_norm': 1.2569212913513184, 'learning_rate': 0.011200000000000002, 'epoch': 44.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.4225293397903442, 'eval_accuracy': 0.446, 'eval_precision': 0.5391604701216771, 'eval_recall': 0.446, 'eval_f1': 0.41364188697560694, 'eval_runtime': 41.9298, 'eval_samples_per_second': 23.849, 'eval_steps_per_second': 11.925, 'epoch': 44.0}




{'loss': 1.3358, 'grad_norm': 1.2384216785430908, 'learning_rate': 0.011000000000000001, 'epoch': 45.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2846839427947998, 'eval_accuracy': 0.519, 'eval_precision': 0.4870310955961332, 'eval_recall': 0.519, 'eval_f1': 0.439175275641534, 'eval_runtime': 42.1262, 'eval_samples_per_second': 23.738, 'eval_steps_per_second': 11.869, 'epoch': 45.0}




{'loss': 1.3664, 'grad_norm': 1.8991115093231201, 'learning_rate': 0.0108, 'epoch': 46.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3594919443130493, 'eval_accuracy': 0.484, 'eval_precision': 0.4911592419992818, 'eval_recall': 0.484, 'eval_f1': 0.47249018701120327, 'eval_runtime': 42.3572, 'eval_samples_per_second': 23.609, 'eval_steps_per_second': 11.804, 'epoch': 46.0}




{'loss': 1.3993, 'grad_norm': 1.1933664083480835, 'learning_rate': 0.0106, 'epoch': 47.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.3744056224822998, 'eval_accuracy': 0.461, 'eval_precision': 0.4818468283065439, 'eval_recall': 0.461, 'eval_f1': 0.43783273491459396, 'eval_runtime': 42.4586, 'eval_samples_per_second': 23.552, 'eval_steps_per_second': 11.776, 'epoch': 47.0}




{'loss': 1.3501, 'grad_norm': 0.6343691945075989, 'learning_rate': 0.010400000000000001, 'epoch': 48.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2781003713607788, 'eval_accuracy': 0.499, 'eval_precision': 0.4206636759992458, 'eval_recall': 0.499, 'eval_f1': 0.4232310835343393, 'eval_runtime': 41.4618, 'eval_samples_per_second': 24.119, 'eval_steps_per_second': 12.059, 'epoch': 48.0}




{'loss': 1.345, 'grad_norm': 0.8235697150230408, 'learning_rate': 0.0102, 'epoch': 49.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3446379899978638, 'eval_accuracy': 0.482, 'eval_precision': 0.4645211970363158, 'eval_recall': 0.482, 'eval_f1': 0.41321299782466075, 'eval_runtime': 42.2087, 'eval_samples_per_second': 23.692, 'eval_steps_per_second': 11.846, 'epoch': 49.0}




{'loss': 1.3547, 'grad_norm': 1.2254270315170288, 'learning_rate': 0.01, 'epoch': 50.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.341411828994751, 'eval_accuracy': 0.475, 'eval_precision': 0.49889192147034256, 'eval_recall': 0.475, 'eval_f1': 0.4396182685549792, 'eval_runtime': 42.393, 'eval_samples_per_second': 23.589, 'eval_steps_per_second': 11.794, 'epoch': 50.0}




{'loss': 1.3846, 'grad_norm': 0.9731242060661316, 'learning_rate': 0.0098, 'epoch': 51.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3387211561203003, 'eval_accuracy': 0.519, 'eval_precision': 0.5276228715496498, 'eval_recall': 0.519, 'eval_f1': 0.44294190225456326, 'eval_runtime': 41.531, 'eval_samples_per_second': 24.078, 'eval_steps_per_second': 12.039, 'epoch': 51.0}




{'loss': 1.307, 'grad_norm': 2.1281681060791016, 'learning_rate': 0.0096, 'epoch': 52.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2811886072158813, 'eval_accuracy': 0.527, 'eval_precision': 0.4506650549248804, 'eval_recall': 0.527, 'eval_f1': 0.4586216145744976, 'eval_runtime': 41.7118, 'eval_samples_per_second': 23.974, 'eval_steps_per_second': 11.987, 'epoch': 52.0}




{'loss': 1.3294, 'grad_norm': 0.7664686441421509, 'learning_rate': 0.0094, 'epoch': 53.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.406400442123413, 'eval_accuracy': 0.436, 'eval_precision': 0.439577731132431, 'eval_recall': 0.436, 'eval_f1': 0.40714547872008217, 'eval_runtime': 41.6702, 'eval_samples_per_second': 23.998, 'eval_steps_per_second': 11.999, 'epoch': 53.0}




{'loss': 1.3498, 'grad_norm': 1.1058405637741089, 'learning_rate': 0.0092, 'epoch': 54.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2618376016616821, 'eval_accuracy': 0.533, 'eval_precision': 0.4590710271373331, 'eval_recall': 0.533, 'eval_f1': 0.4789791545121076, 'eval_runtime': 42.2402, 'eval_samples_per_second': 23.674, 'eval_steps_per_second': 11.837, 'epoch': 54.0}




{'loss': 1.3437, 'grad_norm': 2.4800119400024414, 'learning_rate': 0.009000000000000001, 'epoch': 55.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3324018716812134, 'eval_accuracy': 0.489, 'eval_precision': 0.48744862334689204, 'eval_recall': 0.489, 'eval_f1': 0.4130501661638025, 'eval_runtime': 42.3259, 'eval_samples_per_second': 23.626, 'eval_steps_per_second': 11.813, 'epoch': 55.0}




{'loss': 1.3394, 'grad_norm': 0.8749233484268188, 'learning_rate': 0.0088, 'epoch': 56.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2982304096221924, 'eval_accuracy': 0.501, 'eval_precision': 0.45270678049639146, 'eval_recall': 0.501, 'eval_f1': 0.43315592350768567, 'eval_runtime': 41.9109, 'eval_samples_per_second': 23.86, 'eval_steps_per_second': 11.93, 'epoch': 56.0}




{'loss': 1.3469, 'grad_norm': 1.4616022109985352, 'learning_rate': 0.0086, 'epoch': 57.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3551371097564697, 'eval_accuracy': 0.523, 'eval_precision': 0.5137452639104015, 'eval_recall': 0.523, 'eval_f1': 0.42818453110872357, 'eval_runtime': 42.3585, 'eval_samples_per_second': 23.608, 'eval_steps_per_second': 11.804, 'epoch': 57.0}




{'loss': 1.3207, 'grad_norm': 0.8158915638923645, 'learning_rate': 0.0084, 'epoch': 58.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2850576639175415, 'eval_accuracy': 0.512, 'eval_precision': 0.4958438375482444, 'eval_recall': 0.512, 'eval_f1': 0.44116952751702526, 'eval_runtime': 41.8192, 'eval_samples_per_second': 23.912, 'eval_steps_per_second': 11.956, 'epoch': 58.0}




{'loss': 1.3335, 'grad_norm': 1.7821038961410522, 'learning_rate': 0.008199999999999999, 'epoch': 59.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.3258591890335083, 'eval_accuracy': 0.483, 'eval_precision': 0.4496204914819841, 'eval_recall': 0.483, 'eval_f1': 0.4066619542966801, 'eval_runtime': 41.8081, 'eval_samples_per_second': 23.919, 'eval_steps_per_second': 11.959, 'epoch': 59.0}




{'loss': 1.3362, 'grad_norm': 1.0019891262054443, 'learning_rate': 0.008, 'epoch': 60.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.262588381767273, 'eval_accuracy': 0.533, 'eval_precision': 0.5002294259913985, 'eval_recall': 0.533, 'eval_f1': 0.44755010957140123, 'eval_runtime': 42.5277, 'eval_samples_per_second': 23.514, 'eval_steps_per_second': 11.757, 'epoch': 60.0}




{'loss': 1.3125, 'grad_norm': 2.598184823989868, 'learning_rate': 0.0078000000000000005, 'epoch': 61.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3117319345474243, 'eval_accuracy': 0.485, 'eval_precision': 0.43361765629034077, 'eval_recall': 0.485, 'eval_f1': 0.43423537602937756, 'eval_runtime': 43.0644, 'eval_samples_per_second': 23.221, 'eval_steps_per_second': 11.611, 'epoch': 61.0}




{'loss': 1.3302, 'grad_norm': 1.3999358415603638, 'learning_rate': 0.0076, 'epoch': 62.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.333740472793579, 'eval_accuracy': 0.511, 'eval_precision': 0.4563536815113297, 'eval_recall': 0.511, 'eval_f1': 0.42067914074128365, 'eval_runtime': 42.8645, 'eval_samples_per_second': 23.329, 'eval_steps_per_second': 11.665, 'epoch': 62.0}




{'loss': 1.3271, 'grad_norm': 1.1963245868682861, 'learning_rate': 0.0074, 'epoch': 63.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2754902839660645, 'eval_accuracy': 0.499, 'eval_precision': 0.44744773544064825, 'eval_recall': 0.499, 'eval_f1': 0.46167818319684345, 'eval_runtime': 42.72, 'eval_samples_per_second': 23.408, 'eval_steps_per_second': 11.704, 'epoch': 63.0}




{'loss': 1.3287, 'grad_norm': 2.3824527263641357, 'learning_rate': 0.0072, 'epoch': 64.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2533425092697144, 'eval_accuracy': 0.528, 'eval_precision': 0.46841168124852345, 'eval_recall': 0.528, 'eval_f1': 0.4606107973221077, 'eval_runtime': 42.9514, 'eval_samples_per_second': 23.282, 'eval_steps_per_second': 11.641, 'epoch': 64.0}




{'loss': 1.3057, 'grad_norm': 1.1626312732696533, 'learning_rate': 0.006999999999999999, 'epoch': 65.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2901650667190552, 'eval_accuracy': 0.522, 'eval_precision': 0.512128151605678, 'eval_recall': 0.522, 'eval_f1': 0.4548847388786099, 'eval_runtime': 42.5872, 'eval_samples_per_second': 23.481, 'eval_steps_per_second': 11.741, 'epoch': 65.0}




{'loss': 1.3153, 'grad_norm': 1.305396556854248, 'learning_rate': 0.0068000000000000005, 'epoch': 66.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3136125802993774, 'eval_accuracy': 0.478, 'eval_precision': 0.46305107884701735, 'eval_recall': 0.478, 'eval_f1': 0.4154224059102091, 'eval_runtime': 42.3874, 'eval_samples_per_second': 23.592, 'eval_steps_per_second': 11.796, 'epoch': 66.0}




{'loss': 1.3221, 'grad_norm': 1.2514313459396362, 'learning_rate': 0.006600000000000001, 'epoch': 67.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3957393169403076, 'eval_accuracy': 0.444, 'eval_precision': 0.5273076905887897, 'eval_recall': 0.444, 'eval_f1': 0.43697220486692606, 'eval_runtime': 42.5714, 'eval_samples_per_second': 23.49, 'eval_steps_per_second': 11.745, 'epoch': 67.0}




{'loss': 1.3211, 'grad_norm': 0.8430867791175842, 'learning_rate': 0.0064, 'epoch': 68.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.3154939413070679, 'eval_accuracy': 0.497, 'eval_precision': 0.49061796587542944, 'eval_recall': 0.497, 'eval_f1': 0.48455097386717993, 'eval_runtime': 43.2167, 'eval_samples_per_second': 23.139, 'eval_steps_per_second': 11.57, 'epoch': 68.0}




{'loss': 1.2813, 'grad_norm': 2.2102174758911133, 'learning_rate': 0.0062, 'epoch': 69.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.3564709424972534, 'eval_accuracy': 0.474, 'eval_precision': 0.5235472081255818, 'eval_recall': 0.474, 'eval_f1': 0.44144389775983595, 'eval_runtime': 42.5518, 'eval_samples_per_second': 23.501, 'eval_steps_per_second': 11.75, 'epoch': 69.0}




{'loss': 1.3266, 'grad_norm': 1.0281535387039185, 'learning_rate': 0.006, 'epoch': 70.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3174242973327637, 'eval_accuracy': 0.512, 'eval_precision': 0.5023155265723569, 'eval_recall': 0.512, 'eval_f1': 0.4650014388542949, 'eval_runtime': 42.7012, 'eval_samples_per_second': 23.419, 'eval_steps_per_second': 11.709, 'epoch': 70.0}




{'loss': 1.3263, 'grad_norm': 1.3036061525344849, 'learning_rate': 0.0058, 'epoch': 71.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.314850091934204, 'eval_accuracy': 0.497, 'eval_precision': 0.48423791727103904, 'eval_recall': 0.497, 'eval_f1': 0.4435966836704401, 'eval_runtime': 43.0157, 'eval_samples_per_second': 23.247, 'eval_steps_per_second': 11.624, 'epoch': 71.0}




{'loss': 1.3294, 'grad_norm': 0.5959565043449402, 'learning_rate': 0.005600000000000001, 'epoch': 72.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.30192232131958, 'eval_accuracy': 0.497, 'eval_precision': 0.43910030774235465, 'eval_recall': 0.497, 'eval_f1': 0.4120367791191409, 'eval_runtime': 42.6528, 'eval_samples_per_second': 23.445, 'eval_steps_per_second': 11.723, 'epoch': 72.0}




{'loss': 1.2836, 'grad_norm': 1.3124313354492188, 'learning_rate': 0.0054, 'epoch': 73.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2488903999328613, 'eval_accuracy': 0.539, 'eval_precision': 0.47056851775777847, 'eval_recall': 0.539, 'eval_f1': 0.47483497308382805, 'eval_runtime': 42.3617, 'eval_samples_per_second': 23.606, 'eval_steps_per_second': 11.803, 'epoch': 73.0}




{'loss': 1.2958, 'grad_norm': 3.0660455226898193, 'learning_rate': 0.005200000000000001, 'epoch': 74.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.265485405921936, 'eval_accuracy': 0.524, 'eval_precision': 0.4742777603731551, 'eval_recall': 0.524, 'eval_f1': 0.4617567630872274, 'eval_runtime': 42.6331, 'eval_samples_per_second': 23.456, 'eval_steps_per_second': 11.728, 'epoch': 74.0}




{'loss': 1.2757, 'grad_norm': 1.3245466947555542, 'learning_rate': 0.005, 'epoch': 75.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.3113771677017212, 'eval_accuracy': 0.492, 'eval_precision': 0.4816481879080701, 'eval_recall': 0.492, 'eval_f1': 0.4722456120781937, 'eval_runtime': 43.0779, 'eval_samples_per_second': 23.214, 'eval_steps_per_second': 11.607, 'epoch': 75.0}




{'loss': 1.304, 'grad_norm': 1.2504878044128418, 'learning_rate': 0.0048, 'epoch': 76.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2847967147827148, 'eval_accuracy': 0.521, 'eval_precision': 0.4687077787436814, 'eval_recall': 0.521, 'eval_f1': 0.4380592778381996, 'eval_runtime': 42.7654, 'eval_samples_per_second': 23.383, 'eval_steps_per_second': 11.692, 'epoch': 76.0}




{'loss': 1.2646, 'grad_norm': 0.9478985071182251, 'learning_rate': 0.0046, 'epoch': 77.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2955272197723389, 'eval_accuracy': 0.524, 'eval_precision': 0.4949683558540967, 'eval_recall': 0.524, 'eval_f1': 0.44303877191157787, 'eval_runtime': 42.6171, 'eval_samples_per_second': 23.465, 'eval_steps_per_second': 11.732, 'epoch': 77.0}




{'loss': 1.2965, 'grad_norm': 2.495465040206909, 'learning_rate': 0.0044, 'epoch': 78.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2698835134506226, 'eval_accuracy': 0.539, 'eval_precision': 0.5349952166989796, 'eval_recall': 0.539, 'eval_f1': 0.45952272881036776, 'eval_runtime': 42.8016, 'eval_samples_per_second': 23.364, 'eval_steps_per_second': 11.682, 'epoch': 78.0}




{'loss': 1.2888, 'grad_norm': 1.645450234413147, 'learning_rate': 0.0042, 'epoch': 79.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2875362634658813, 'eval_accuracy': 0.523, 'eval_precision': 0.4833494403494404, 'eval_recall': 0.523, 'eval_f1': 0.47676197665332215, 'eval_runtime': 41.9573, 'eval_samples_per_second': 23.834, 'eval_steps_per_second': 11.917, 'epoch': 79.0}




{'loss': 1.2664, 'grad_norm': 1.561491847038269, 'learning_rate': 0.004, 'epoch': 80.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2530517578125, 'eval_accuracy': 0.531, 'eval_precision': 0.4548849102745355, 'eval_recall': 0.531, 'eval_f1': 0.48197070475252995, 'eval_runtime': 41.9195, 'eval_samples_per_second': 23.855, 'eval_steps_per_second': 11.928, 'epoch': 80.0}




{'loss': 1.2658, 'grad_norm': 1.8453994989395142, 'learning_rate': 0.0038, 'epoch': 81.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.273872971534729, 'eval_accuracy': 0.521, 'eval_precision': 0.4796245630718946, 'eval_recall': 0.521, 'eval_f1': 0.44033147849229715, 'eval_runtime': 42.1873, 'eval_samples_per_second': 23.704, 'eval_steps_per_second': 11.852, 'epoch': 81.0}




{'loss': 1.2645, 'grad_norm': 1.870595932006836, 'learning_rate': 0.0036, 'epoch': 82.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3120895624160767, 'eval_accuracy': 0.504, 'eval_precision': 0.4731170001799916, 'eval_recall': 0.504, 'eval_f1': 0.4407100699037629, 'eval_runtime': 42.5978, 'eval_samples_per_second': 23.475, 'eval_steps_per_second': 11.738, 'epoch': 82.0}




{'loss': 1.2497, 'grad_norm': 1.4326962232589722, 'learning_rate': 0.0034000000000000002, 'epoch': 83.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.303066372871399, 'eval_accuracy': 0.496, 'eval_precision': 0.4866751203212813, 'eval_recall': 0.496, 'eval_f1': 0.43898333635675835, 'eval_runtime': 42.3773, 'eval_samples_per_second': 23.598, 'eval_steps_per_second': 11.799, 'epoch': 83.0}




{'loss': 1.2262, 'grad_norm': 0.32008615136146545, 'learning_rate': 0.0032, 'epoch': 84.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2289948463439941, 'eval_accuracy': 0.534, 'eval_precision': 0.4568755182505236, 'eval_recall': 0.534, 'eval_f1': 0.47519076362347445, 'eval_runtime': 41.8677, 'eval_samples_per_second': 23.885, 'eval_steps_per_second': 11.942, 'epoch': 84.0}




{'loss': 1.257, 'grad_norm': 1.8548264503479004, 'learning_rate': 0.003, 'epoch': 85.0}


  0%|          | 0/500 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3144923448562622, 'eval_accuracy': 0.53, 'eval_precision': 0.48921628072993356, 'eval_recall': 0.53, 'eval_f1': 0.4347251140632691, 'eval_runtime': 42.4637, 'eval_samples_per_second': 23.55, 'eval_steps_per_second': 11.775, 'epoch': 85.0}




{'loss': 1.2318, 'grad_norm': 1.109553337097168, 'learning_rate': 0.0028000000000000004, 'epoch': 86.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2794189453125, 'eval_accuracy': 0.541, 'eval_precision': 0.511913390351906, 'eval_recall': 0.541, 'eval_f1': 0.4765909783963437, 'eval_runtime': 41.7374, 'eval_samples_per_second': 23.959, 'eval_steps_per_second': 11.98, 'epoch': 86.0}




{'loss': 1.2537, 'grad_norm': 2.270636558532715, 'learning_rate': 0.0026000000000000003, 'epoch': 87.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2570111751556396, 'eval_accuracy': 0.508, 'eval_precision': 0.44843045389954267, 'eval_recall': 0.508, 'eval_f1': 0.46280066871670994, 'eval_runtime': 41.9015, 'eval_samples_per_second': 23.865, 'eval_steps_per_second': 11.933, 'epoch': 87.0}




{'loss': 1.2347, 'grad_norm': 0.5415827035903931, 'learning_rate': 0.0024, 'epoch': 88.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2383291721343994, 'eval_accuracy': 0.529, 'eval_precision': 0.46000870127998744, 'eval_recall': 0.529, 'eval_f1': 0.4705876684265827, 'eval_runtime': 41.6794, 'eval_samples_per_second': 23.993, 'eval_steps_per_second': 11.996, 'epoch': 88.0}




{'loss': 1.2185, 'grad_norm': 3.230241060256958, 'learning_rate': 0.0022, 'epoch': 89.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2310237884521484, 'eval_accuracy': 0.538, 'eval_precision': 0.4592787729441844, 'eval_recall': 0.538, 'eval_f1': 0.4869386567531804, 'eval_runtime': 42.1653, 'eval_samples_per_second': 23.716, 'eval_steps_per_second': 11.858, 'epoch': 89.0}




{'loss': 1.2232, 'grad_norm': 1.442477822303772, 'learning_rate': 0.002, 'epoch': 90.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2363276481628418, 'eval_accuracy': 0.536, 'eval_precision': 0.4601963377099988, 'eval_recall': 0.536, 'eval_f1': 0.48792782090497483, 'eval_runtime': 42.6711, 'eval_samples_per_second': 23.435, 'eval_steps_per_second': 11.718, 'epoch': 90.0}




{'loss': 1.2089, 'grad_norm': 0.794154167175293, 'learning_rate': 0.0018, 'epoch': 91.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2420133352279663, 'eval_accuracy': 0.535, 'eval_precision': 0.46244312837789375, 'eval_recall': 0.535, 'eval_f1': 0.4720880929695219, 'eval_runtime': 41.6712, 'eval_samples_per_second': 23.997, 'eval_steps_per_second': 11.999, 'epoch': 91.0}




{'loss': 1.22, 'grad_norm': 2.9114718437194824, 'learning_rate': 0.0016, 'epoch': 92.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2558207511901855, 'eval_accuracy': 0.528, 'eval_precision': 0.4766990190026197, 'eval_recall': 0.528, 'eval_f1': 0.48186792646460175, 'eval_runtime': 42.4551, 'eval_samples_per_second': 23.554, 'eval_steps_per_second': 11.777, 'epoch': 92.0}




{'loss': 1.2499, 'grad_norm': 1.8719924688339233, 'learning_rate': 0.0014000000000000002, 'epoch': 93.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2573041915893555, 'eval_accuracy': 0.527, 'eval_precision': 0.4914022059718402, 'eval_recall': 0.527, 'eval_f1': 0.49234244843901503, 'eval_runtime': 42.2602, 'eval_samples_per_second': 23.663, 'eval_steps_per_second': 11.831, 'epoch': 93.0}




{'loss': 1.2195, 'grad_norm': 1.445015788078308, 'learning_rate': 0.0012, 'epoch': 94.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2333301305770874, 'eval_accuracy': 0.542, 'eval_precision': 0.4700503174874038, 'eval_recall': 0.542, 'eval_f1': 0.48516294959318684, 'eval_runtime': 41.9185, 'eval_samples_per_second': 23.856, 'eval_steps_per_second': 11.928, 'epoch': 94.0}




{'loss': 1.2564, 'grad_norm': 1.5463624000549316, 'learning_rate': 0.001, 'epoch': 95.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2446188926696777, 'eval_accuracy': 0.523, 'eval_precision': 0.454932813748385, 'eval_recall': 0.523, 'eval_f1': 0.46937016405607196, 'eval_runtime': 41.9203, 'eval_samples_per_second': 23.855, 'eval_steps_per_second': 11.927, 'epoch': 95.0}




{'loss': 1.219, 'grad_norm': 2.5605947971343994, 'learning_rate': 0.0008, 'epoch': 96.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2467924356460571, 'eval_accuracy': 0.527, 'eval_precision': 0.4823102024724024, 'eval_recall': 0.527, 'eval_f1': 0.48049339998521223, 'eval_runtime': 42.4476, 'eval_samples_per_second': 23.558, 'eval_steps_per_second': 11.779, 'epoch': 96.0}




{'loss': 1.2479, 'grad_norm': 1.6391277313232422, 'learning_rate': 0.0006, 'epoch': 97.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2334175109863281, 'eval_accuracy': 0.537, 'eval_precision': 0.466259062757908, 'eval_recall': 0.537, 'eval_f1': 0.48483716542298383, 'eval_runtime': 42.6756, 'eval_samples_per_second': 23.433, 'eval_steps_per_second': 11.716, 'epoch': 97.0}




{'loss': 1.2352, 'grad_norm': 1.2073432207107544, 'learning_rate': 0.0004, 'epoch': 98.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2322646379470825, 'eval_accuracy': 0.533, 'eval_precision': 0.46141300366300364, 'eval_recall': 0.533, 'eval_f1': 0.478096393973432, 'eval_runtime': 42.319, 'eval_samples_per_second': 23.63, 'eval_steps_per_second': 11.815, 'epoch': 98.0}




{'loss': 1.2112, 'grad_norm': 1.9971650838851929, 'learning_rate': 0.0002, 'epoch': 99.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2340880632400513, 'eval_accuracy': 0.524, 'eval_precision': 0.4525210200280717, 'eval_recall': 0.524, 'eval_f1': 0.4698587393766238, 'eval_runtime': 42.1336, 'eval_samples_per_second': 23.734, 'eval_steps_per_second': 11.867, 'epoch': 99.0}




{'loss': 1.1942, 'grad_norm': 2.5871312618255615, 'learning_rate': 0.0, 'epoch': 100.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2333229780197144, 'eval_accuracy': 0.533, 'eval_precision': 0.46200214190909067, 'eval_recall': 0.533, 'eval_f1': 0.4768362017539027, 'eval_runtime': 42.6792, 'eval_samples_per_second': 23.431, 'eval_steps_per_second': 11.715, 'epoch': 100.0}




{'train_runtime': 11695.9555, 'train_samples_per_second': 8.55, 'train_steps_per_second': 4.275, 'train_loss': 1.35382287109375, 'epoch': 100.0}


TrainOutput(global_step=50000, training_loss=1.35382287109375, metrics={'train_runtime': 11695.9555, 'train_samples_per_second': 8.55, 'train_steps_per_second': 4.275, 'total_flos': 1.32488736768e+16, 'train_loss': 1.35382287109375, 'epoch': 100.0})

In [62]:
#Save the model.
#peft_model_path = os.path.join(output_directory, f"lora_model")

trainer.model.save_pretrained("peft_model_emotion", model_type="bert")

trainer.evaluate()




  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.2333229780197144,
 'eval_accuracy': 0.533,
 'eval_precision': 0.46200214190909067,
 'eval_recall': 0.533,
 'eval_f1': 0.4768362017539027,
 'eval_runtime': 42.426,
 'eval_samples_per_second': 23.57,
 'eval_steps_per_second': 11.785,
 'epoch': 100.0}

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [63]:
#Load the Model.

peft_config = PeftConfig.from_pretrained("peft_model_emotion")
loaded_model = AutoPeftModel.from_pretrained(
                                        "peft_model_emotion",
                                        label2id=label2id,
                                        id2label=id2label,
                                        is_trainable=False,
                                        config=peft_config)

loaded_model = loaded_model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
#from transformers import pipeline

#classifier = pipeline( "sentiment-analysis",model=loaded_model,tokenizer=tokenizer)
text = "im happy"
inputs = tokenizer(text, return_tensors="pt")

inputs = inputs.to(device)

In [65]:
import torch

with torch.no_grad():
    logits = model(**inputs).logits
    
    
predicted_class_id = logits.argmax().item()
loaded_model.config.id2label[predicted_class_id]    

'joy'