In [1]:
import torch
from datasets import Dataset

# Check if GPU is available
if torch.cuda.is_available():
    # Get the number of available GPUs
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")
    for gpu_id in range(num_gpus):
        print(f"GPU {gpu_id}: {torch.cuda.get_device_name(gpu_id)}")
else:
    print("No GPU available.")

  from .autonotebook import tqdm as notebook_tqdm


Number of available GPUs: 1
GPU 0: NVIDIA GeForce RTX 3070


In [2]:
ds = Dataset.load_from_disk("dataset/train_dataset_for_vit")

In [3]:
import pickle

# File path to the pickle file
id2label_file_path = "dataset/id2label.pkl"
label2id_file_path = "dataset/label2id.pkl"

# Load the object from the pickle file
with open(id2label_file_path, "rb") as f:
    id2label = pickle.load(f)
    
# Load the object from the pickle file
with open(label2id_file_path, "rb") as f:
    label2id = pickle.load(f)

In [4]:
dataset = ds.train_test_split(test_size=0.1, shuffle=True)
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'image_file_path', 'labels'],
        num_rows: 4088
    })
    test: Dataset({
        features: ['image', 'image_file_path', 'labels'],
        num_rows: 455
    })
})

In [5]:
from transformers import ViTImageProcessor

model_name_or_path = 'google/vit-base-patch16-224-in21k'
processor = ViTImageProcessor.from_pretrained(model_name_or_path)

In [6]:
def process_example(example):
    inputs = processor(example['image'], return_tensors='pt')
    inputs['labels'] = example['labels']
    return inputs

In [7]:
process_example(dataset['train'][0])

{'pixel_values': tensor([[[[-0.1608, -0.1686, -0.2392,  ..., -0.2000, -0.1608, -0.1843],
          [-0.2392, -0.2784, -0.1843,  ..., -0.1922, -0.1451, -0.1608],
          [-0.2314, -0.2784, -0.2627,  ..., -0.2078, -0.2235, -0.2549],
          ...,
          [-0.2627, -0.2314, -0.2235,  ..., -0.2549, -0.2471, -0.2392],
          [-0.2000, -0.2471, -0.2549,  ..., -0.2235, -0.2549, -0.2549],
          [-0.2314, -0.2471, -0.2706,  ..., -0.2000, -0.2314, -0.2471]],

         [[-0.3255, -0.3333, -0.4039,  ..., -0.3725, -0.3333, -0.3569],
          [-0.4039, -0.4431, -0.3490,  ..., -0.3647, -0.3176, -0.3333],
          [-0.3961, -0.4431, -0.4275,  ..., -0.3804, -0.3961, -0.4275],
          ...,
          [-0.4275, -0.4039, -0.4039,  ..., -0.4275, -0.4196, -0.4118],
          [-0.3647, -0.4118, -0.4275,  ..., -0.3961, -0.4275, -0.4275],
          [-0.3961, -0.4118, -0.4353,  ..., -0.3725, -0.4039, -0.4196]],

         [[-0.5765, -0.5843, -0.6549,  ..., -0.6471, -0.6078, -0.6314],
          [-0

In [8]:
def transform(example_batch):
    # Take a list of PIL images and turn them to pixel values
    inputs = processor([x for x in example_batch['image']], return_tensors='pt')

    # Don't forget to include the labels!
    inputs['labels'] = example_batch['labels']
    return inputs

In [9]:
prepared_ds = dataset.with_transform(transform)

In [10]:
prepared_ds['train'][0]['pixel_values'].shape

torch.Size([3, 224, 224])

In [11]:
def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }

In [12]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")
def compute_metrics(p):
    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [13]:
from transformers import ViTForImageClassification

model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    device_map="auto",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./vit-base-beans",
  per_device_train_batch_size=16,
  evaluation_strategy="steps",
  num_train_epochs=25,
  fp16=True,
  save_steps=100,
  eval_steps=100,
  logging_steps=40,
  learning_rate=0.002,
  save_total_limit=2,
  remove_unused_columns=False,
  push_to_hub=False,
  report_to='wandb',
  load_best_model_at_end=True,
)

In [15]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["test"], # its validation dataset
    tokenizer=processor,
)

In [16]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33minspiration601[0m ([33maniketkonkar[0m). Use [1m`wandb login --relogin`[0m to force relogin


  1%|          | 41/6400 [00:12<18:16,  5.80it/s]  

{'loss': 6.1141, 'grad_norm': 0.7669553160667419, 'learning_rate': 0.0019875, 'epoch': 0.16}


  1%|▏         | 81/6400 [00:19<17:59,  5.86it/s]

{'loss': 6.07, 'grad_norm': 0.5916109085083008, 'learning_rate': 0.001975, 'epoch': 0.31}


                                                  
  2%|▏         | 100/6400 [00:25<19:38,  5.34it/s]

{'eval_loss': 6.056920051574707, 'eval_accuracy': 0.002197802197802198, 'eval_runtime': 3.0117, 'eval_samples_per_second': 151.079, 'eval_steps_per_second': 18.926, 'epoch': 0.39}


  2%|▏         | 121/6400 [00:30<19:25,  5.39it/s]  

{'loss': 6.0501, 'grad_norm': 0.5226492285728455, 'learning_rate': 0.0019625, 'epoch': 0.47}


  3%|▎         | 161/6400 [00:38<19:11,  5.42it/s]

{'loss': 6.0306, 'grad_norm': 0.48075488209724426, 'learning_rate': 0.00195, 'epoch': 0.62}


  3%|▎         | 200/6400 [00:45<18:53,  5.47it/s]

{'loss': 6.0077, 'grad_norm': 1.251535177230835, 'learning_rate': 0.0019375, 'epoch': 0.78}


                                                  
  3%|▎         | 200/6400 [00:48<18:53,  5.47it/s]

{'eval_loss': 6.011632442474365, 'eval_accuracy': 0.006593406593406593, 'eval_runtime': 2.8006, 'eval_samples_per_second': 162.464, 'eval_steps_per_second': 20.353, 'epoch': 0.78}


  4%|▍         | 241/6400 [00:57<19:11,  5.35it/s]  

{'loss': 5.9804, 'grad_norm': 0.5090551376342773, 'learning_rate': 0.001925, 'epoch': 0.94}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  4%|▍         | 281/6400 [01:04<19:05,  5.34it/s]

{'loss': 5.9368, 'grad_norm': 0.5448741912841797, 'learning_rate': 0.0019125000000000001, 'epoch': 1.09}


                                                  
  5%|▍         | 300/6400 [01:11<18:45,  5.42it/s]

{'eval_loss': 6.051682472229004, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8036, 'eval_samples_per_second': 162.29, 'eval_steps_per_second': 20.331, 'epoch': 1.17}


  5%|▌         | 321/6400 [01:16<18:46,  5.40it/s]  

{'loss': 5.9264, 'grad_norm': 0.5370280742645264, 'learning_rate': 0.0019, 'epoch': 1.25}


  6%|▌         | 361/6400 [01:23<18:34,  5.42it/s]

{'loss': 5.922, 'grad_norm': 0.5982754826545715, 'learning_rate': 0.0018875, 'epoch': 1.41}


  6%|▋         | 400/6400 [01:30<18:21,  5.45it/s]

{'loss': 5.9237, 'grad_norm': 0.6643596887588501, 'learning_rate': 0.001875, 'epoch': 1.56}


                                                  
  6%|▋         | 400/6400 [01:33<18:21,  5.45it/s]

{'eval_loss': 6.016071319580078, 'eval_accuracy': 0.006593406593406593, 'eval_runtime': 2.8396, 'eval_samples_per_second': 160.232, 'eval_steps_per_second': 20.073, 'epoch': 1.56}


  7%|▋         | 441/6400 [01:42<18:25,  5.39it/s]  

{'loss': 5.9657, 'grad_norm': 0.5963823795318604, 'learning_rate': 0.0018625, 'epoch': 1.72}


  8%|▊         | 481/6400 [01:49<18:22,  5.37it/s]

{'loss': 5.9573, 'grad_norm': 0.7684153914451599, 'learning_rate': 0.00185, 'epoch': 1.88}


                                                  
  8%|▊         | 500/6400 [01:56<18:10,  5.41it/s]

{'eval_loss': 5.983361721038818, 'eval_accuracy': 0.006593406593406593, 'eval_runtime': 2.8556, 'eval_samples_per_second': 159.334, 'eval_steps_per_second': 19.96, 'epoch': 1.95}


  8%|▊         | 521/6400 [02:01<18:11,  5.38it/s]  

{'loss': 5.929, 'grad_norm': 0.5724313259124756, 'learning_rate': 0.0018375, 'epoch': 2.03}


  9%|▉         | 561/6400 [02:08<18:00,  5.41it/s]

{'loss': 5.9438, 'grad_norm': 0.6966441869735718, 'learning_rate': 0.001825, 'epoch': 2.19}


  9%|▉         | 600/6400 [02:15<17:58,  5.38it/s]

{'loss': 5.9233, 'grad_norm': 0.5169881582260132, 'learning_rate': 0.0018125, 'epoch': 2.34}


                                                  
  9%|▉         | 600/6400 [02:18<17:58,  5.38it/s]

{'eval_loss': 5.986555576324463, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8646, 'eval_samples_per_second': 158.833, 'eval_steps_per_second': 19.898, 'epoch': 2.34}


 10%|█         | 641/6400 [02:27<17:50,  5.38it/s]  

{'loss': 5.8982, 'grad_norm': 0.7201768755912781, 'learning_rate': 0.0018000000000000002, 'epoch': 2.5}


 11%|█         | 681/6400 [02:34<17:51,  5.34it/s]

{'loss': 5.939, 'grad_norm': 0.6571674942970276, 'learning_rate': 0.0017875, 'epoch': 2.66}


                                                  
 11%|█         | 700/6400 [02:41<17:33,  5.41it/s]

{'eval_loss': 6.00970983505249, 'eval_accuracy': 0.006593406593406593, 'eval_runtime': 2.8576, 'eval_samples_per_second': 159.222, 'eval_steps_per_second': 19.947, 'epoch': 2.73}


 11%|█▏        | 721/6400 [02:46<17:42,  5.34it/s]  

{'loss': 5.9305, 'grad_norm': 0.5550730228424072, 'learning_rate': 0.0017749999999999999, 'epoch': 2.81}


 12%|█▏        | 761/6400 [02:53<17:14,  5.45it/s]

{'loss': 5.9417, 'grad_norm': 0.5542479753494263, 'learning_rate': 0.0017625, 'epoch': 2.97}


 12%|█▎        | 800/6400 [03:00<17:08,  5.44it/s]

{'loss': 5.9215, 'grad_norm': 0.5853504538536072, 'learning_rate': 0.00175, 'epoch': 3.12}


                                                  
 12%|█▎        | 800/6400 [03:03<17:08,  5.44it/s]

{'eval_loss': 5.9814653396606445, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8256, 'eval_samples_per_second': 161.026, 'eval_steps_per_second': 20.172, 'epoch': 3.12}


 13%|█▎        | 841/6400 [03:12<17:12,  5.38it/s]  

{'loss': 5.8904, 'grad_norm': 0.8036910891532898, 'learning_rate': 0.0017375000000000001, 'epoch': 3.28}


 14%|█▍        | 881/6400 [03:19<17:02,  5.40it/s]

{'loss': 5.924, 'grad_norm': 0.579498291015625, 'learning_rate': 0.0017250000000000002, 'epoch': 3.44}


                                                  
 14%|█▍        | 900/6400 [03:26<17:03,  5.37it/s]

{'eval_loss': 5.9853105545043945, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8957, 'eval_samples_per_second': 157.132, 'eval_steps_per_second': 19.685, 'epoch': 3.52}


 14%|█▍        | 921/6400 [03:31<16:56,  5.39it/s]  

{'loss': 5.9385, 'grad_norm': 0.6393287777900696, 'learning_rate': 0.0017125, 'epoch': 3.59}


 15%|█▌        | 961/6400 [03:38<16:48,  5.39it/s]

{'loss': 5.9286, 'grad_norm': 0.4849768579006195, 'learning_rate': 0.0017, 'epoch': 3.75}


 16%|█▌        | 1000/6400 [03:45<16:36,  5.42it/s]

{'loss': 5.8992, 'grad_norm': 0.6869425177574158, 'learning_rate': 0.0016875, 'epoch': 3.91}


                                                   
 16%|█▌        | 1000/6400 [03:48<16:36,  5.42it/s]

{'eval_loss': 5.9832587242126465, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8496, 'eval_samples_per_second': 159.669, 'eval_steps_per_second': 20.003, 'epoch': 3.91}


 16%|█▋        | 1041/6400 [03:57<16:33,  5.39it/s]  

{'loss': 5.9614, 'grad_norm': 0.8490535020828247, 'learning_rate': 0.001675, 'epoch': 4.06}


 17%|█▋        | 1081/6400 [04:04<16:32,  5.36it/s]

{'loss': 5.9323, 'grad_norm': 0.702585756778717, 'learning_rate': 0.0016625000000000001, 'epoch': 4.22}


                                                   
 17%|█▋        | 1100/6400 [04:11<16:15,  5.43it/s]

{'eval_loss': 5.981473445892334, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8736, 'eval_samples_per_second': 158.335, 'eval_steps_per_second': 19.835, 'epoch': 4.3}


 18%|█▊        | 1121/6400 [04:16<16:23,  5.37it/s]  

{'loss': 5.8951, 'grad_norm': 0.6862462759017944, 'learning_rate': 0.00165, 'epoch': 4.38}


 18%|█▊        | 1161/6400 [04:23<16:17,  5.36it/s]

{'loss': 5.9062, 'grad_norm': 0.5449145436286926, 'learning_rate': 0.0016375, 'epoch': 4.53}


 19%|█▉        | 1200/6400 [04:30<15:58,  5.43it/s]

{'loss': 5.8973, 'grad_norm': 0.49669137597084045, 'learning_rate': 0.0016250000000000001, 'epoch': 4.69}


                                                   
 19%|█▉        | 1200/6400 [04:33<15:58,  5.43it/s]

{'eval_loss': 5.984683990478516, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8696, 'eval_samples_per_second': 158.556, 'eval_steps_per_second': 19.863, 'epoch': 4.69}


 19%|█▉        | 1241/6400 [04:42<15:48,  5.44it/s]  

{'loss': 5.9222, 'grad_norm': 0.6013498306274414, 'learning_rate': 0.0016125, 'epoch': 4.84}


 20%|██        | 1280/6400 [04:49<15:05,  5.66it/s]

{'loss': 5.9645, 'grad_norm': 1.3161320686340332, 'learning_rate': 0.0016, 'epoch': 5.0}


                                                   
 20%|██        | 1300/6400 [04:55<15:31,  5.48it/s]

{'eval_loss': 6.004206657409668, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8827, 'eval_samples_per_second': 157.841, 'eval_steps_per_second': 19.773, 'epoch': 5.08}


 21%|██        | 1321/6400 [05:01<15:51,  5.34it/s]  

{'loss': 5.9049, 'grad_norm': 0.4385043978691101, 'learning_rate': 0.0015875, 'epoch': 5.16}


 21%|██▏       | 1361/6400 [05:08<15:37,  5.38it/s]

{'loss': 5.9207, 'grad_norm': 0.6386650204658508, 'learning_rate': 0.001575, 'epoch': 5.31}


 22%|██▏       | 1400/6400 [05:15<15:22,  5.42it/s]

{'loss': 5.913, 'grad_norm': 0.7046363949775696, 'learning_rate': 0.0015625, 'epoch': 5.47}


                                                   
 22%|██▏       | 1400/6400 [05:18<15:22,  5.42it/s]

{'eval_loss': 5.97575569152832, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8626, 'eval_samples_per_second': 158.944, 'eval_steps_per_second': 19.912, 'epoch': 5.47}


 23%|██▎       | 1441/6400 [05:27<15:35,  5.30it/s]  

{'loss': 5.9207, 'grad_norm': 0.6742605566978455, 'learning_rate': 0.0015500000000000002, 'epoch': 5.62}


 23%|██▎       | 1481/6400 [05:34<15:15,  5.37it/s]

{'loss': 5.8929, 'grad_norm': 0.6126128435134888, 'learning_rate': 0.0015375, 'epoch': 5.78}


                                                   
 23%|██▎       | 1500/6400 [05:41<15:05,  5.41it/s]

{'eval_loss': 5.98587703704834, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8516, 'eval_samples_per_second': 159.557, 'eval_steps_per_second': 19.988, 'epoch': 5.86}


 24%|██▍       | 1521/6400 [05:46<15:11,  5.36it/s]  

{'loss': 5.9186, 'grad_norm': 0.870542049407959, 'learning_rate': 0.0015249999999999999, 'epoch': 5.94}


 24%|██▍       | 1561/6400 [05:53<14:58,  5.39it/s]

{'loss': 5.9243, 'grad_norm': 0.5965292453765869, 'learning_rate': 0.0015125, 'epoch': 6.09}


 25%|██▌       | 1600/6400 [06:00<15:00,  5.33it/s]

{'loss': 5.9428, 'grad_norm': 0.6783541440963745, 'learning_rate': 0.0015, 'epoch': 6.25}


                                                   
 25%|██▌       | 1600/6400 [06:03<15:00,  5.33it/s]

{'eval_loss': 5.979816436767578, 'eval_accuracy': 0.006593406593406593, 'eval_runtime': 2.8576, 'eval_samples_per_second': 159.222, 'eval_steps_per_second': 19.947, 'epoch': 6.25}


 26%|██▌       | 1641/6400 [06:12<14:36,  5.43it/s]  

{'loss': 5.9311, 'grad_norm': 1.078152060508728, 'learning_rate': 0.0014875, 'epoch': 6.41}


 26%|██▋       | 1681/6400 [06:19<14:36,  5.39it/s]

{'loss': 5.8623, 'grad_norm': 0.7337961792945862, 'learning_rate': 0.0014750000000000002, 'epoch': 6.56}


                                                   
 27%|██▋       | 1700/6400 [06:26<14:28,  5.41it/s]

{'eval_loss': 6.00806999206543, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8957, 'eval_samples_per_second': 157.132, 'eval_steps_per_second': 19.685, 'epoch': 6.64}


 27%|██▋       | 1721/6400 [06:31<14:28,  5.39it/s]  

{'loss': 5.8675, 'grad_norm': 0.9468205571174622, 'learning_rate': 0.0014625, 'epoch': 6.72}


 28%|██▊       | 1761/6400 [06:38<14:31,  5.33it/s]

{'loss': 5.9618, 'grad_norm': 1.1793876886367798, 'learning_rate': 0.00145, 'epoch': 6.88}


 28%|██▊       | 1800/6400 [06:45<13:56,  5.50it/s]

{'loss': 5.8898, 'grad_norm': 0.6057853102684021, 'learning_rate': 0.0014375, 'epoch': 7.03}


                                                   
 28%|██▊       | 1800/6400 [06:48<13:56,  5.50it/s]

{'eval_loss': 5.999571323394775, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8433, 'eval_samples_per_second': 160.025, 'eval_steps_per_second': 20.047, 'epoch': 7.03}


 29%|██▉       | 1841/6400 [06:57<14:14,  5.33it/s]  

{'loss': 5.8913, 'grad_norm': 0.47987163066864014, 'learning_rate': 0.001425, 'epoch': 7.19}


 29%|██▉       | 1881/6400 [07:04<13:58,  5.39it/s]

{'loss': 5.933, 'grad_norm': 0.6659724712371826, 'learning_rate': 0.0014125000000000001, 'epoch': 7.34}


                                                   
 30%|██▉       | 1900/6400 [07:11<13:51,  5.41it/s]

{'eval_loss': 5.9734368324279785, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8686, 'eval_samples_per_second': 158.611, 'eval_steps_per_second': 19.87, 'epoch': 7.42}


 30%|███       | 1921/6400 [07:16<13:53,  5.37it/s]  

{'loss': 5.9211, 'grad_norm': 0.7747485041618347, 'learning_rate': 0.0014, 'epoch': 7.5}


 31%|███       | 1961/6400 [07:23<13:41,  5.40it/s]

{'loss': 5.9016, 'grad_norm': 0.5064123868942261, 'learning_rate': 0.0013875, 'epoch': 7.66}


 31%|███▏      | 2000/6400 [07:31<13:32,  5.41it/s]

{'loss': 5.9115, 'grad_norm': 0.6133146286010742, 'learning_rate': 0.001375, 'epoch': 7.81}


                                                   
 31%|███▏      | 2000/6400 [07:33<13:32,  5.41it/s]

{'eval_loss': 5.990556240081787, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8882, 'eval_samples_per_second': 157.54, 'eval_steps_per_second': 19.736, 'epoch': 7.81}


 32%|███▏      | 2041/6400 [07:42<13:29,  5.38it/s]  

{'loss': 5.9073, 'grad_norm': 0.5220264196395874, 'learning_rate': 0.0013625, 'epoch': 7.97}


 33%|███▎      | 2081/6400 [07:50<13:35,  5.29it/s]

{'loss': 5.8818, 'grad_norm': 0.45714589953422546, 'learning_rate': 0.00135, 'epoch': 8.12}


                                                   
 33%|███▎      | 2100/6400 [07:56<13:11,  5.43it/s]

{'eval_loss': 5.9951581954956055, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8806, 'eval_samples_per_second': 157.951, 'eval_steps_per_second': 19.787, 'epoch': 8.2}


 33%|███▎      | 2121/6400 [08:01<13:52,  5.14it/s]  

{'loss': 5.8992, 'grad_norm': 0.6250827312469482, 'learning_rate': 0.0013375, 'epoch': 8.28}


 34%|███▍      | 2161/6400 [08:09<13:05,  5.39it/s]

{'loss': 5.9063, 'grad_norm': 0.5694795846939087, 'learning_rate': 0.001325, 'epoch': 8.44}


 34%|███▍      | 2200/6400 [08:16<12:54,  5.42it/s]

{'loss': 5.9231, 'grad_norm': 0.631350576877594, 'learning_rate': 0.0013125, 'epoch': 8.59}


                                                   
 34%|███▍      | 2200/6400 [08:19<12:54,  5.42it/s]

{'eval_loss': 5.985568523406982, 'eval_accuracy': 0.006593406593406593, 'eval_runtime': 2.8676, 'eval_samples_per_second': 158.667, 'eval_steps_per_second': 19.877, 'epoch': 8.59}


 35%|███▌      | 2241/6400 [08:27<12:48,  5.41it/s]  

{'loss': 5.9191, 'grad_norm': 0.8188505172729492, 'learning_rate': 0.0013000000000000002, 'epoch': 8.75}


 36%|███▌      | 2281/6400 [08:35<12:40,  5.42it/s]

{'loss': 5.907, 'grad_norm': 0.5197357535362244, 'learning_rate': 0.0012875, 'epoch': 8.91}


                                                   
 36%|███▌      | 2300/6400 [08:41<12:32,  5.45it/s]

{'eval_loss': 5.9888129234313965, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8426, 'eval_samples_per_second': 160.063, 'eval_steps_per_second': 20.052, 'epoch': 8.98}


 36%|███▋      | 2321/6400 [08:46<12:43,  5.34it/s]  

{'loss': 5.9187, 'grad_norm': 0.6499793529510498, 'learning_rate': 0.0012749999999999999, 'epoch': 9.06}


 37%|███▋      | 2361/6400 [08:54<12:47,  5.26it/s]

{'loss': 5.8326, 'grad_norm': 0.5263387560844421, 'learning_rate': 0.0012625, 'epoch': 9.22}


 38%|███▊      | 2400/6400 [09:01<12:23,  5.38it/s]

{'loss': 5.9222, 'grad_norm': 0.5418633222579956, 'learning_rate': 0.00125, 'epoch': 9.38}


                                                   
 38%|███▊      | 2400/6400 [09:04<12:23,  5.38it/s]

{'eval_loss': 5.998763084411621, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8586, 'eval_samples_per_second': 159.166, 'eval_steps_per_second': 19.94, 'epoch': 9.38}


 38%|███▊      | 2441/6400 [09:13<12:09,  5.43it/s]  

{'loss': 5.9049, 'grad_norm': 0.9309344291687012, 'learning_rate': 0.0012375, 'epoch': 9.53}


 39%|███▉      | 2481/6400 [09:20<12:09,  5.38it/s]

{'loss': 5.9322, 'grad_norm': 0.6704539060592651, 'learning_rate': 0.0012250000000000002, 'epoch': 9.69}


                                                   
 39%|███▉      | 2500/6400 [09:26<11:58,  5.43it/s]

{'eval_loss': 5.997218608856201, 'eval_accuracy': 0.006593406593406593, 'eval_runtime': 2.9247, 'eval_samples_per_second': 155.574, 'eval_steps_per_second': 19.489, 'epoch': 9.77}


 39%|███▉      | 2521/6400 [09:31<12:09,  5.32it/s]  

{'loss': 5.8874, 'grad_norm': 0.5775419473648071, 'learning_rate': 0.0012125, 'epoch': 9.84}


 40%|████      | 2561/6400 [09:39<10:09,  6.30it/s]

{'loss': 5.9453, 'grad_norm': 1.006713628768921, 'learning_rate': 0.0012, 'epoch': 10.0}


 41%|████      | 2600/6400 [09:46<10:48,  5.86it/s]

{'loss': 5.9183, 'grad_norm': 0.8641149997711182, 'learning_rate': 0.0011875, 'epoch': 10.16}


                                                   
 41%|████      | 2600/6400 [09:48<10:48,  5.86it/s]

{'eval_loss': 5.9905219078063965, 'eval_accuracy': 0.006593406593406593, 'eval_runtime': 2.7966, 'eval_samples_per_second': 162.696, 'eval_steps_per_second': 20.382, 'epoch': 10.16}


 41%|████▏     | 2641/6400 [09:57<11:38,  5.38it/s]  

{'loss': 5.9081, 'grad_norm': 0.5842429399490356, 'learning_rate': 0.001175, 'epoch': 10.31}


 42%|████▏     | 2681/6400 [10:05<11:23,  5.44it/s]

{'loss': 5.8985, 'grad_norm': 1.2105119228363037, 'learning_rate': 0.0011625000000000001, 'epoch': 10.47}


                                                   
 42%|████▏     | 2700/6400 [10:11<11:11,  5.51it/s]

{'eval_loss': 5.976931571960449, 'eval_accuracy': 0.006593406593406593, 'eval_runtime': 2.8429, 'eval_samples_per_second': 160.045, 'eval_steps_per_second': 20.05, 'epoch': 10.55}


 43%|████▎     | 2721/6400 [10:16<11:22,  5.39it/s]  

{'loss': 5.9093, 'grad_norm': 0.6996431350708008, 'learning_rate': 0.00115, 'epoch': 10.62}


 43%|████▎     | 2761/6400 [10:23<11:06,  5.46it/s]

{'loss': 5.8713, 'grad_norm': 0.6874296069145203, 'learning_rate': 0.0011375, 'epoch': 10.78}


 44%|████▍     | 2800/6400 [10:30<10:52,  5.51it/s]

{'loss': 5.8869, 'grad_norm': 0.7004806995391846, 'learning_rate': 0.0011250000000000001, 'epoch': 10.94}


                                                   
 44%|████▍     | 2800/6400 [10:33<10:52,  5.51it/s]

{'eval_loss': 5.994865894317627, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.7936, 'eval_samples_per_second': 162.871, 'eval_steps_per_second': 20.404, 'epoch': 10.94}


 44%|████▍     | 2841/6400 [10:42<10:49,  5.48it/s]  

{'loss': 5.8918, 'grad_norm': 0.8600200414657593, 'learning_rate': 0.0011125, 'epoch': 11.09}


 45%|████▌     | 2881/6400 [10:49<10:44,  5.46it/s]

{'loss': 5.9156, 'grad_norm': 0.664797306060791, 'learning_rate': 0.0011, 'epoch': 11.25}


                                                   
 45%|████▌     | 2900/6400 [10:55<10:36,  5.50it/s]

{'eval_loss': 5.992977142333984, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8146, 'eval_samples_per_second': 161.655, 'eval_steps_per_second': 20.251, 'epoch': 11.33}


 46%|████▌     | 2921/6400 [11:01<10:41,  5.42it/s]  

{'loss': 5.9156, 'grad_norm': 0.5569192171096802, 'learning_rate': 0.0010875, 'epoch': 11.41}


 46%|████▋     | 2961/6400 [11:08<10:32,  5.43it/s]

{'loss': 5.9167, 'grad_norm': 0.5856494903564453, 'learning_rate': 0.001075, 'epoch': 11.56}


 47%|████▋     | 3000/6400 [11:15<10:18,  5.49it/s]

{'loss': 5.9039, 'grad_norm': 0.6623902320861816, 'learning_rate': 0.0010625, 'epoch': 11.72}


                                                   
 47%|████▋     | 3000/6400 [11:18<10:18,  5.49it/s]

{'eval_loss': 5.983482360839844, 'eval_accuracy': 0.006593406593406593, 'eval_runtime': 2.8416, 'eval_samples_per_second': 160.119, 'eval_steps_per_second': 20.059, 'epoch': 11.72}


 48%|████▊     | 3041/6400 [11:27<10:14,  5.47it/s]  

{'loss': 5.8809, 'grad_norm': 0.604623019695282, 'learning_rate': 0.0010500000000000002, 'epoch': 11.88}


 48%|████▊     | 3081/6400 [11:34<10:01,  5.52it/s]

{'loss': 5.8961, 'grad_norm': 0.6773180365562439, 'learning_rate': 0.0010375, 'epoch': 12.03}


                                                   
 48%|████▊     | 3100/6400 [11:40<10:01,  5.49it/s]

{'eval_loss': 5.989980697631836, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8086, 'eval_samples_per_second': 162.001, 'eval_steps_per_second': 20.295, 'epoch': 12.11}


 49%|████▉     | 3121/6400 [11:46<10:01,  5.45it/s]  

{'loss': 5.8758, 'grad_norm': 0.8770876526832581, 'learning_rate': 0.0010249999999999999, 'epoch': 12.19}


 49%|████▉     | 3161/6400 [11:53<09:50,  5.49it/s]

{'loss': 5.8742, 'grad_norm': 0.7266815304756165, 'learning_rate': 0.0010125, 'epoch': 12.34}


 50%|█████     | 3200/6400 [12:00<09:43,  5.49it/s]

{'loss': 5.8778, 'grad_norm': 0.6172295212745667, 'learning_rate': 0.001, 'epoch': 12.5}


                                                   
 50%|█████     | 3200/6400 [12:03<09:43,  5.49it/s]

{'eval_loss': 5.986384391784668, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8066, 'eval_samples_per_second': 162.116, 'eval_steps_per_second': 20.309, 'epoch': 12.5}


 51%|█████     | 3241/6400 [12:12<09:37,  5.47it/s]  

{'loss': 5.9015, 'grad_norm': 0.6169325113296509, 'learning_rate': 0.0009875, 'epoch': 12.66}


 51%|█████▏    | 3281/6400 [12:19<09:29,  5.47it/s]

{'loss': 5.9193, 'grad_norm': 0.7270877957344055, 'learning_rate': 0.000975, 'epoch': 12.81}


                                                   
 52%|█████▏    | 3300/6400 [12:25<09:23,  5.50it/s]

{'eval_loss': 5.9866156578063965, 'eval_accuracy': 0.004395604395604396, 'eval_runtime': 2.8236, 'eval_samples_per_second': 161.14, 'eval_steps_per_second': 20.187, 'epoch': 12.89}
