In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
dataset = load_dataset("yainage90/fashion-pattern-images")

Resolving data files:   0%|          | 0/1900 [00:00<?, ?it/s]

In [9]:
labels = dataset["train"].features["label"].names
print("Labels:", labels)

Labels: ['argyle', 'camouflage', 'checked', 'dot', 'floral', 'geometric', 'gradient', 'graphic', 'houndstooth', 'leopard', 'lettering', 'muji', 'paisley', 'snake_skin', 'snow_flake', 'stripe', 'tropical', 'zebra', 'zigzag']


In [11]:
from sklearn.model_selection import train_test_split

# Split the train set into train and test
split = dataset["train"].train_test_split(test_size=0.2, seed=42)
trainset = split["train"]
testset = split["test"]
print("Trainset size:", len(trainset))
print("Testset size:", len(testset))


Trainset size: 1520
Testset size: 380


In [None]:
from datasets import load_dataset
from transformers import ViTForImageClassification, ViTImageProcessor, TrainingArguments, Trainer
import torch
from torchvision import transforms
from sklearn.metrics import accuracy_score, f1_score

# Load dataset and split
dataset = load_dataset("yainage90/fashion-pattern-images")
split = dataset["train"].train_test_split(test_size=0.2, seed=42)
trainset = split["train"]
testset = split["test"]

# Get label info
labels = trainset.features["label"].names
num_labels = len(labels)
id2label = {str(i): l for i, l in enumerate(labels)}
label2id = {l: i for i, l in enumerate(labels)}

# Preprocessing (ViT processor + optional augmentation)
processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")

augment = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])

def transform_examples(batch):
    # Data augmentation only on training set
    if "train" in batch["__split__"][0]:
        images = [augment(img.convert("RGB")) for img in batch["image"]]
    else:
        images = [img.convert("RGB") for img in batch["image"]]
    processed = processor(images=images, return_tensors="pt")
    # Remove batch dimension for each image
    pixel_values = [img for img in processed["pixel_values"]]
    return {
        "pixel_values": pixel_values,
        "labels": batch["label"]
    }

# Add split info for augmentation
trainset = trainset.add_column("__split__", ["train"] * len(trainset))
testset = testset.add_column("__split__", ["test"] * len(testset))

trainset = trainset.map(transform_examples, batched=True, remove_columns=trainset.column_names)
testset = testset.map(transform_examples, batched=True, remove_columns=testset.column_names)

# Model
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Data collator
def collate_fn(batch):
    pixel_values = torch.stack([x["pixel_values"] for x in batch])
    labels = torch.tensor([x["labels"] for x in batch])
    return {"pixel_values": pixel_values, "labels": labels}

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./vit-pattern",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,  # Try more epochs for better results
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=8,
    fp16=True,
    remove_unused_columns=True,
    report_to="none",
    learning_rate=3e-5,  # Lower learning rate for better fine-tuning
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=testset,
    data_collator=collate_fn,
    tokenizer=None,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# Evaluate
metrics = trainer.evaluate()
print(metrics)

Resolving data files:   0%|          | 0/1900 [00:00<?, ?it/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/950 [00:00<?, ?it/s]

{'loss': 2.9513, 'grad_norm': 1.5533137321472168, 'learning_rate': 2.974736842105263e-05, 'epoch': 0.08}
{'loss': 2.9309, 'grad_norm': 1.49404776096344, 'learning_rate': 2.9494736842105264e-05, 'epoch': 0.17}
{'loss': 2.9204, 'grad_norm': 1.6879453659057617, 'learning_rate': 2.9242105263157893e-05, 'epoch': 0.25}
{'loss': 2.8858, 'grad_norm': 1.7725534439086914, 'learning_rate': 2.8989473684210528e-05, 'epoch': 0.34}
{'loss': 2.8753, 'grad_norm': 1.8019193410873413, 'learning_rate': 2.8736842105263157e-05, 'epoch': 0.42}
{'loss': 2.8683, 'grad_norm': 1.9326295852661133, 'learning_rate': 2.8484210526315792e-05, 'epoch': 0.51}
{'loss': 2.8435, 'grad_norm': 2.039621114730835, 'learning_rate': 2.823157894736842e-05, 'epoch': 0.59}
{'loss': 2.8398, 'grad_norm': 1.8846739530563354, 'learning_rate': 2.7978947368421052e-05, 'epoch': 0.67}
{'loss': 2.8038, 'grad_norm': 2.131182909011841, 'learning_rate': 2.7726315789473684e-05, 'epoch': 0.76}
{'loss': 2.779, 'grad_norm': 1.9602160453796387, 'le

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 2.708326578140259, 'eval_accuracy': 0.3894736842105263, 'eval_f1': 0.37057082658622426, 'eval_runtime': 52.3008, 'eval_samples_per_second': 7.266, 'eval_steps_per_second': 0.459, 'epoch': 1.0}
{'loss': 2.6994, 'grad_norm': 1.9618207216262817, 'learning_rate': 2.696842105263158e-05, 'epoch': 1.01}
{'loss': 2.5667, 'grad_norm': 2.3437774181365967, 'learning_rate': 2.671578947368421e-05, 'epoch': 1.09}
{'loss': 2.5271, 'grad_norm': 2.0768721103668213, 'learning_rate': 2.6463157894736843e-05, 'epoch': 1.18}
{'loss': 2.5167, 'grad_norm': 1.9647642374038696, 'learning_rate': 2.6210526315789475e-05, 'epoch': 1.26}
{'loss': 2.4607, 'grad_norm': 1.9937245845794678, 'learning_rate': 2.5957894736842107e-05, 'epoch': 1.35}
{'loss': 2.417, 'grad_norm': 2.1634106636047363, 'learning_rate': 2.5705263157894736e-05, 'epoch': 1.43}
{'loss': 2.3732, 'grad_norm': 2.4954466819763184, 'learning_rate': 2.545263157894737e-05, 'epoch': 1.52}
{'loss': 2.3961, 'grad_norm': 2.122347593307495, 'learn

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 2.248154878616333, 'eval_accuracy': 0.6394736842105263, 'eval_f1': 0.6200860764481894, 'eval_runtime': 52.0088, 'eval_samples_per_second': 7.306, 'eval_steps_per_second': 0.461, 'epoch': 2.0}
{'loss': 2.1575, 'grad_norm': 2.396374464035034, 'learning_rate': 2.393684210526316e-05, 'epoch': 2.02}
{'loss': 1.9761, 'grad_norm': 2.0862865447998047, 'learning_rate': 2.368421052631579e-05, 'epoch': 2.11}
{'loss': 1.9847, 'grad_norm': 2.7507364749908447, 'learning_rate': 2.343157894736842e-05, 'epoch': 2.19}
{'loss': 1.9455, 'grad_norm': 2.1163206100463867, 'learning_rate': 2.3178947368421054e-05, 'epoch': 2.27}
{'loss': 1.8624, 'grad_norm': 2.4434568881988525, 'learning_rate': 2.2926315789473683e-05, 'epoch': 2.36}
{'loss': 1.885, 'grad_norm': 1.9848313331604004, 'learning_rate': 2.2673684210526318e-05, 'epoch': 2.44}
{'loss': 1.8012, 'grad_norm': 2.1632370948791504, 'learning_rate': 2.2421052631578946e-05, 'epoch': 2.53}
{'loss': 1.7402, 'grad_norm': 2.1759791374206543, 'learni

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.8865337371826172, 'eval_accuracy': 0.6631578947368421, 'eval_f1': 0.6408467194550057, 'eval_runtime': 52.3625, 'eval_samples_per_second': 7.257, 'eval_steps_per_second': 0.458, 'epoch': 3.0}
{'loss': 1.5966, 'grad_norm': 2.411165475845337, 'learning_rate': 2.0905263157894737e-05, 'epoch': 3.03}
{'loss': 1.5441, 'grad_norm': 1.8495014905929565, 'learning_rate': 2.065263157894737e-05, 'epoch': 3.12}
{'loss': 1.4552, 'grad_norm': 2.216501235961914, 'learning_rate': 2.04e-05, 'epoch': 3.2}
{'loss': 1.4368, 'grad_norm': 2.518754005432129, 'learning_rate': 2.0147368421052633e-05, 'epoch': 3.28}
{'loss': 1.4769, 'grad_norm': 2.180781126022339, 'learning_rate': 1.989473684210526e-05, 'epoch': 3.37}
{'loss': 1.4411, 'grad_norm': 2.9849891662597656, 'learning_rate': 1.9642105263157897e-05, 'epoch': 3.45}
{'loss': 1.422, 'grad_norm': 2.5317962169647217, 'learning_rate': 1.9389473684210525e-05, 'epoch': 3.54}
{'loss': 1.3629, 'grad_norm': 3.8504090309143066, 'learning_rate': 1.9136

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.6447702646255493, 'eval_accuracy': 0.6947368421052632, 'eval_f1': 0.6901231020595284, 'eval_runtime': 52.5858, 'eval_samples_per_second': 7.226, 'eval_steps_per_second': 0.456, 'epoch': 4.0}
{'loss': 1.2586, 'grad_norm': 2.147754430770874, 'learning_rate': 1.7873684210526316e-05, 'epoch': 4.04}
{'loss': 1.1348, 'grad_norm': 1.8664156198501587, 'learning_rate': 1.7621052631578948e-05, 'epoch': 4.13}
{'loss': 1.1296, 'grad_norm': 1.7795203924179077, 'learning_rate': 1.736842105263158e-05, 'epoch': 4.21}
{'loss': 1.0826, 'grad_norm': 1.922107458114624, 'learning_rate': 1.711578947368421e-05, 'epoch': 4.29}
{'loss': 1.0893, 'grad_norm': 1.8976538181304932, 'learning_rate': 1.6863157894736844e-05, 'epoch': 4.38}
{'loss': 1.0906, 'grad_norm': 1.8544960021972656, 'learning_rate': 1.6610526315789472e-05, 'epoch': 4.46}
{'loss': 1.0417, 'grad_norm': 2.1478354930877686, 'learning_rate': 1.6357894736842108e-05, 'epoch': 4.55}
{'loss': 1.0268, 'grad_norm': 2.131279230117798, 'learn

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.4546823501586914, 'eval_accuracy': 0.7473684210526316, 'eval_f1': 0.7421800978216624, 'eval_runtime': 52.4351, 'eval_samples_per_second': 7.247, 'eval_steps_per_second': 0.458, 'epoch': 5.0}
{'loss': 0.896, 'grad_norm': 1.6596311330795288, 'learning_rate': 1.4842105263157895e-05, 'epoch': 5.05}
{'loss': 0.8942, 'grad_norm': 1.5796433687210083, 'learning_rate': 1.4589473684210527e-05, 'epoch': 5.14}
{'loss': 0.8487, 'grad_norm': 1.4793360233306885, 'learning_rate': 1.4336842105263159e-05, 'epoch': 5.22}
{'loss': 0.8793, 'grad_norm': 1.932173728942871, 'learning_rate': 1.408421052631579e-05, 'epoch': 5.31}
{'loss': 0.8288, 'grad_norm': 1.4632599353790283, 'learning_rate': 1.3831578947368421e-05, 'epoch': 5.39}
{'loss': 0.8661, 'grad_norm': 2.4419636726379395, 'learning_rate': 1.3578947368421053e-05, 'epoch': 5.47}
{'loss': 0.8296, 'grad_norm': 2.045839548110962, 'learning_rate': 1.3326315789473685e-05, 'epoch': 5.56}
{'loss': 0.8183, 'grad_norm': 2.830130100250244, 'learn

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.334495186805725, 'eval_accuracy': 0.7421052631578947, 'eval_f1': 0.7380645096311316, 'eval_runtime': 50.6976, 'eval_samples_per_second': 7.495, 'eval_steps_per_second': 0.473, 'epoch': 6.0}
{'loss': 0.735, 'grad_norm': 2.524820327758789, 'learning_rate': 1.1810526315789474e-05, 'epoch': 6.06}
{'loss': 0.6792, 'grad_norm': 1.831991195678711, 'learning_rate': 1.1557894736842106e-05, 'epoch': 6.15}
{'loss': 0.7017, 'grad_norm': 1.5670113563537598, 'learning_rate': 1.1305263157894736e-05, 'epoch': 6.23}
{'loss': 0.6789, 'grad_norm': 1.7557810544967651, 'learning_rate': 1.1052631578947368e-05, 'epoch': 6.32}
{'loss': 0.6574, 'grad_norm': 1.3228693008422852, 'learning_rate': 1.08e-05, 'epoch': 6.4}
{'loss': 0.6253, 'grad_norm': 1.4633392095565796, 'learning_rate': 1.0547368421052632e-05, 'epoch': 6.48}
{'loss': 0.6013, 'grad_norm': 1.3782529830932617, 'learning_rate': 1.0294736842105264e-05, 'epoch': 6.57}
{'loss': 0.6644, 'grad_norm': 2.1547977924346924, 'learning_rate': 1.0

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.2774536609649658, 'eval_accuracy': 0.7289473684210527, 'eval_f1': 0.7283410560298298, 'eval_runtime': 51.9295, 'eval_samples_per_second': 7.318, 'eval_steps_per_second': 0.462, 'epoch': 7.0}
{'loss': 0.5936, 'grad_norm': 1.6169304847717285, 'learning_rate': 8.778947368421053e-06, 'epoch': 7.07}
{'loss': 0.5942, 'grad_norm': 1.5342814922332764, 'learning_rate': 8.526315789473685e-06, 'epoch': 7.16}
{'loss': 0.581, 'grad_norm': 1.2384439706802368, 'learning_rate': 8.273684210526317e-06, 'epoch': 7.24}
{'loss': 0.5367, 'grad_norm': 1.3140690326690674, 'learning_rate': 8.021052631578949e-06, 'epoch': 7.33}
{'loss': 0.5107, 'grad_norm': 1.3429896831512451, 'learning_rate': 7.768421052631579e-06, 'epoch': 7.41}
{'loss': 0.5428, 'grad_norm': 1.4011050462722778, 'learning_rate': 7.5157894736842115e-06, 'epoch': 7.49}
{'loss': 0.5386, 'grad_norm': 1.5545356273651123, 'learning_rate': 7.2631578947368426e-06, 'epoch': 7.58}
{'loss': 0.5149, 'grad_norm': 1.0314152240753174, 'learni

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.2159501314163208, 'eval_accuracy': 0.7368421052631579, 'eval_f1': 0.7344828353753896, 'eval_runtime': 50.4995, 'eval_samples_per_second': 7.525, 'eval_steps_per_second': 0.475, 'epoch': 8.0}
{'loss': 0.4717, 'grad_norm': 1.0261173248291016, 'learning_rate': 5.747368421052631e-06, 'epoch': 8.08}
{'loss': 0.4718, 'grad_norm': 0.9143159985542297, 'learning_rate': 5.494736842105263e-06, 'epoch': 8.17}
{'loss': 0.4926, 'grad_norm': 0.991754949092865, 'learning_rate': 5.242105263157895e-06, 'epoch': 8.25}
{'loss': 0.4705, 'grad_norm': 1.063393235206604, 'learning_rate': 4.989473684210527e-06, 'epoch': 8.34}
{'loss': 0.4888, 'grad_norm': 1.427108645439148, 'learning_rate': 4.736842105263158e-06, 'epoch': 8.42}
{'loss': 0.4695, 'grad_norm': 1.043702483177185, 'learning_rate': 4.48421052631579e-06, 'epoch': 8.51}
{'loss': 0.4945, 'grad_norm': 1.177340030670166, 'learning_rate': 4.2315789473684215e-06, 'epoch': 8.59}
{'loss': 0.4644, 'grad_norm': 1.1340672969818115, 'learning_rat

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.1916773319244385, 'eval_accuracy': 0.7315789473684211, 'eval_f1': 0.730497875346291, 'eval_runtime': 53.1814, 'eval_samples_per_second': 7.145, 'eval_steps_per_second': 0.451, 'epoch': 9.0}
{'loss': 0.4589, 'grad_norm': 1.1218756437301636, 'learning_rate': 2.968421052631579e-06, 'epoch': 9.01}
{'loss': 0.4378, 'grad_norm': 1.079724907875061, 'learning_rate': 2.715789473684211e-06, 'epoch': 9.09}
{'loss': 0.4468, 'grad_norm': 1.0606586933135986, 'learning_rate': 2.4631578947368424e-06, 'epoch': 9.18}
{'loss': 0.422, 'grad_norm': 0.9863461256027222, 'learning_rate': 2.2105263157894734e-06, 'epoch': 9.26}
{'loss': 0.435, 'grad_norm': 0.9780309796333313, 'learning_rate': 1.9578947368421052e-06, 'epoch': 9.35}
{'loss': 0.4376, 'grad_norm': 0.9087545871734619, 'learning_rate': 1.7052631578947369e-06, 'epoch': 9.43}
{'loss': 0.437, 'grad_norm': 0.8572819232940674, 'learning_rate': 1.4526315789473685e-06, 'epoch': 9.52}
{'loss': 0.442, 'grad_norm': 1.090192198753357, 'learning_

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.1873637437820435, 'eval_accuracy': 0.7342105263157894, 'eval_f1': 0.7323141919249194, 'eval_runtime': 51.102, 'eval_samples_per_second': 7.436, 'eval_steps_per_second': 0.47, 'epoch': 10.0}
{'train_runtime': 5227.7836, 'train_samples_per_second': 2.908, 'train_steps_per_second': 0.182, 'train_loss': 1.2449252640573603, 'epoch': 10.0}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.1873637437820435, 'eval_accuracy': 0.7342105263157894, 'eval_f1': 0.7323141919249194, 'eval_runtime': 51.7452, 'eval_samples_per_second': 7.344, 'eval_steps_per_second': 0.464, 'epoch': 10.0}


In [None]:
from datasets import load_dataset
from transformers import ViTForImageClassification, ViTImageProcessor, TrainingArguments, Trainer
import torch
from torchvision import transforms
from sklearn.metrics import accuracy_score, f1_score

# Load dataset and split
dataset = load_dataset("yainage90/fashion-pattern-images")
split = dataset["train"].train_test_split(test_size=0.2, seed=42)
trainset = split["train"]
testset = split["test"]

# Get label info
labels = trainset.features["label"].names
num_labels = len(labels)
id2label = {str(i): l for i, l in enumerate(labels)}
label2id = {l: i for i, l in enumerate(labels)}

# Preprocessing (ViT processor + optional augmentation)
processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")

augment = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomRotation(10),
    # Do NOT include ToTensor or Normalize here
])


def transform_examples(batch):
    # Data augmentation only on training set
    if "train" in batch["__split__"][0]:
        images = [augment(img.convert("RGB")) for img in batch["image"]]
    else:
        images = [img.convert("RGB") for img in batch["image"]]
    # The processor expects PIL Images or numpy arrays in [0, 255] range
    processed = processor(images=images, return_tensors="pt")
    # processed["pixel_values"] is a tensor of shape (batch_size, 3, 224, 224)
    return {
        "pixel_values": processed["pixel_values"],
        "labels": batch["label"]
    }
# Add split info for augmentation
trainset = trainset.add_column("__split__", ["train"] * len(trainset))
testset = testset.add_column("__split__", ["test"] * len(testset))

trainset = trainset.map(transform_examples, batched=True, remove_columns=trainset.column_names)
testset = testset.map(transform_examples, batched=True, remove_columns=testset.column_names)

# Model
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)


# Data collator
def collate_fn(batch):
    # Ensure pixel_values are tensors and stack them
    pixel_values = torch.stack([x["pixel_values"] if isinstance(x["pixel_values"], torch.Tensor) else torch.tensor(x["pixel_values"]) for x in batch])
    labels = torch.tensor([l for x in batch for l in (x["labels"] if isinstance(x["labels"], list) else [x["labels"]])])
    return {"pixel_values": pixel_values, "labels": labels}
# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./vit2-pattern",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,  # Try more epochs for better results
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=8,
    fp16=True,
    remove_unused_columns=True,
    report_to="none",
    learning_rate=3e-5,  # Lower learning rate for better fine-tuning
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=testset,
    data_collator=collate_fn,
    tokenizer=None,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# Evaluate
metrics = trainer.evaluate()
print(metrics)

Resolving data files:   0%|          | 0/1900 [00:00<?, ?it/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/950 [00:00<?, ?it/s]

{'loss': 2.9334, 'grad_norm': 1.889012098312378, 'learning_rate': 2.974736842105263e-05, 'epoch': 0.08}
{'loss': 2.9303, 'grad_norm': 1.8285527229309082, 'learning_rate': 2.9494736842105264e-05, 'epoch': 0.17}
{'loss': 2.8979, 'grad_norm': 1.72324538230896, 'learning_rate': 2.9242105263157893e-05, 'epoch': 0.25}
{'loss': 2.8921, 'grad_norm': 1.7049087285995483, 'learning_rate': 2.8989473684210528e-05, 'epoch': 0.34}
{'loss': 2.8601, 'grad_norm': 1.9082345962524414, 'learning_rate': 2.8736842105263157e-05, 'epoch': 0.42}
{'loss': 2.8629, 'grad_norm': 1.9359761476516724, 'learning_rate': 2.8484210526315792e-05, 'epoch': 0.51}
{'loss': 2.8435, 'grad_norm': 2.0235798358917236, 'learning_rate': 2.823157894736842e-05, 'epoch': 0.59}
{'loss': 2.8228, 'grad_norm': 2.0750834941864014, 'learning_rate': 2.7978947368421052e-05, 'epoch': 0.67}
{'loss': 2.7832, 'grad_norm': 2.2923481464385986, 'learning_rate': 2.7726315789473684e-05, 'epoch': 0.76}
{'loss': 2.7549, 'grad_norm': 2.029365062713623, 'l

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 2.6859734058380127, 'eval_accuracy': 0.41842105263157897, 'eval_f1': 0.3808719190907127, 'eval_runtime': 113.0618, 'eval_samples_per_second': 3.361, 'eval_steps_per_second': 0.212, 'epoch': 1.0}
{'loss': 2.6624, 'grad_norm': 1.9285041093826294, 'learning_rate': 2.696842105263158e-05, 'epoch': 1.01}
{'loss': 2.5331, 'grad_norm': 2.1956522464752197, 'learning_rate': 2.671578947368421e-05, 'epoch': 1.09}
{'loss': 2.4866, 'grad_norm': 2.08432936668396, 'learning_rate': 2.6463157894736843e-05, 'epoch': 1.18}
{'loss': 2.4587, 'grad_norm': 1.9122259616851807, 'learning_rate': 2.6210526315789475e-05, 'epoch': 1.26}
{'loss': 2.4089, 'grad_norm': 1.9667901992797852, 'learning_rate': 2.5957894736842107e-05, 'epoch': 1.35}
{'loss': 2.3896, 'grad_norm': 2.1207916736602783, 'learning_rate': 2.5705263157894736e-05, 'epoch': 1.43}
{'loss': 2.3408, 'grad_norm': 2.1696579456329346, 'learning_rate': 2.545263157894737e-05, 'epoch': 1.52}
{'loss': 2.3313, 'grad_norm': 2.1553730964660645, 'lea

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 2.2480416297912598, 'eval_accuracy': 0.6263157894736842, 'eval_f1': 0.6019023120179844, 'eval_runtime': 95.3121, 'eval_samples_per_second': 3.987, 'eval_steps_per_second': 0.252, 'epoch': 2.0}
{'loss': 2.1261, 'grad_norm': 2.538963556289673, 'learning_rate': 2.393684210526316e-05, 'epoch': 2.02}
{'loss': 1.9694, 'grad_norm': 2.384615898132324, 'learning_rate': 2.368421052631579e-05, 'epoch': 2.11}
{'loss': 1.9442, 'grad_norm': 2.289508581161499, 'learning_rate': 2.343157894736842e-05, 'epoch': 2.19}
{'loss': 1.9172, 'grad_norm': 2.1681413650512695, 'learning_rate': 2.3178947368421054e-05, 'epoch': 2.27}
{'loss': 1.877, 'grad_norm': 2.1226418018341064, 'learning_rate': 2.2926315789473683e-05, 'epoch': 2.36}
{'loss': 1.85, 'grad_norm': 1.9248988628387451, 'learning_rate': 2.2673684210526318e-05, 'epoch': 2.44}
{'loss': 1.8261, 'grad_norm': 2.5146727561950684, 'learning_rate': 2.2421052631578946e-05, 'epoch': 2.53}
{'loss': 1.7334, 'grad_norm': 2.9920055866241455, 'learning_

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.88667893409729, 'eval_accuracy': 0.6736842105263158, 'eval_f1': 0.6571778469512564, 'eval_runtime': 74.4352, 'eval_samples_per_second': 5.105, 'eval_steps_per_second': 0.322, 'epoch': 3.0}
{'loss': 1.5581, 'grad_norm': 2.4517459869384766, 'learning_rate': 2.0905263157894737e-05, 'epoch': 3.03}
{'loss': 1.5248, 'grad_norm': 1.8101624250411987, 'learning_rate': 2.065263157894737e-05, 'epoch': 3.12}
{'loss': 1.4207, 'grad_norm': 2.083425283432007, 'learning_rate': 2.04e-05, 'epoch': 3.2}
{'loss': 1.41, 'grad_norm': 2.358433961868286, 'learning_rate': 2.0147368421052633e-05, 'epoch': 3.28}
{'loss': 1.4652, 'grad_norm': 2.7642390727996826, 'learning_rate': 1.989473684210526e-05, 'epoch': 3.37}
{'loss': 1.419, 'grad_norm': 2.5002408027648926, 'learning_rate': 1.9642105263157897e-05, 'epoch': 3.45}
{'loss': 1.4183, 'grad_norm': 2.3595733642578125, 'learning_rate': 1.9389473684210525e-05, 'epoch': 3.54}
{'loss': 1.3761, 'grad_norm': 2.137265682220459, 'learning_rate': 1.9136842

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.63411545753479, 'eval_accuracy': 0.7236842105263158, 'eval_f1': 0.7143545182479067, 'eval_runtime': 128.3195, 'eval_samples_per_second': 2.961, 'eval_steps_per_second': 0.187, 'epoch': 4.0}
{'loss': 1.2452, 'grad_norm': 2.3106367588043213, 'learning_rate': 1.7873684210526316e-05, 'epoch': 4.04}
{'loss': 1.1457, 'grad_norm': 2.703010320663452, 'learning_rate': 1.7621052631578948e-05, 'epoch': 4.13}
{'loss': 1.1294, 'grad_norm': 1.9474276304244995, 'learning_rate': 1.736842105263158e-05, 'epoch': 4.21}
{'loss': 1.0698, 'grad_norm': 2.300886631011963, 'learning_rate': 1.711578947368421e-05, 'epoch': 4.29}
{'loss': 1.0751, 'grad_norm': 2.1971287727355957, 'learning_rate': 1.6863157894736844e-05, 'epoch': 4.38}
{'loss': 1.0903, 'grad_norm': 1.9363985061645508, 'learning_rate': 1.6610526315789472e-05, 'epoch': 4.46}
{'loss': 1.0226, 'grad_norm': 2.4788880348205566, 'learning_rate': 1.6357894736842108e-05, 'epoch': 4.55}
{'loss': 1.0394, 'grad_norm': 1.8380842208862305, 'learn

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.4474661350250244, 'eval_accuracy': 0.7473684210526316, 'eval_f1': 0.7409928401993879, 'eval_runtime': 52.4727, 'eval_samples_per_second': 7.242, 'eval_steps_per_second': 0.457, 'epoch': 5.0}
{'loss': 0.8983, 'grad_norm': 1.4935849905014038, 'learning_rate': 1.4842105263157895e-05, 'epoch': 5.05}
{'loss': 0.8813, 'grad_norm': 1.7877466678619385, 'learning_rate': 1.4589473684210527e-05, 'epoch': 5.14}
{'loss': 0.8462, 'grad_norm': 1.4427244663238525, 'learning_rate': 1.4336842105263159e-05, 'epoch': 5.22}
{'loss': 0.8798, 'grad_norm': 2.305615186691284, 'learning_rate': 1.408421052631579e-05, 'epoch': 5.31}
{'loss': 0.8319, 'grad_norm': 1.4197845458984375, 'learning_rate': 1.3831578947368421e-05, 'epoch': 5.39}
{'loss': 0.8614, 'grad_norm': 2.501708507537842, 'learning_rate': 1.3578947368421053e-05, 'epoch': 5.47}
{'loss': 0.8176, 'grad_norm': 1.7151557207107544, 'learning_rate': 1.3326315789473685e-05, 'epoch': 5.56}
{'loss': 0.8057, 'grad_norm': 3.1731605529785156, 'lea

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.324196219444275, 'eval_accuracy': 0.7631578947368421, 'eval_f1': 0.7607690809536142, 'eval_runtime': 54.0464, 'eval_samples_per_second': 7.031, 'eval_steps_per_second': 0.444, 'epoch': 6.0}
{'loss': 0.7114, 'grad_norm': 2.043274164199829, 'learning_rate': 1.1810526315789474e-05, 'epoch': 6.06}
{'loss': 0.6761, 'grad_norm': 1.5924066305160522, 'learning_rate': 1.1557894736842106e-05, 'epoch': 6.15}
{'loss': 0.6953, 'grad_norm': 1.6386523246765137, 'learning_rate': 1.1305263157894736e-05, 'epoch': 6.23}
{'loss': 0.6932, 'grad_norm': 1.6725308895111084, 'learning_rate': 1.1052631578947368e-05, 'epoch': 6.32}
{'loss': 0.6493, 'grad_norm': 1.3465479612350464, 'learning_rate': 1.08e-05, 'epoch': 6.4}
{'loss': 0.6227, 'grad_norm': 1.279639482498169, 'learning_rate': 1.0547368421052632e-05, 'epoch': 6.48}
{'loss': 0.5951, 'grad_norm': 1.407650351524353, 'learning_rate': 1.0294736842105264e-05, 'epoch': 6.57}
{'loss': 0.6387, 'grad_norm': 1.9260202646255493, 'learning_rate': 1.0

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.2436517477035522, 'eval_accuracy': 0.75, 'eval_f1': 0.7465532295634228, 'eval_runtime': 53.9218, 'eval_samples_per_second': 7.047, 'eval_steps_per_second': 0.445, 'epoch': 7.0}
{'loss': 0.5816, 'grad_norm': 1.4623305797576904, 'learning_rate': 8.778947368421053e-06, 'epoch': 7.07}
{'loss': 0.5936, 'grad_norm': 1.2557752132415771, 'learning_rate': 8.526315789473685e-06, 'epoch': 7.16}
{'loss': 0.5669, 'grad_norm': 1.2263545989990234, 'learning_rate': 8.273684210526317e-06, 'epoch': 7.24}
{'loss': 0.5163, 'grad_norm': 1.102920413017273, 'learning_rate': 8.021052631578949e-06, 'epoch': 7.33}
{'loss': 0.5096, 'grad_norm': 1.2629755735397339, 'learning_rate': 7.768421052631579e-06, 'epoch': 7.41}
{'loss': 0.5522, 'grad_norm': 1.1942319869995117, 'learning_rate': 7.5157894736842115e-06, 'epoch': 7.49}
{'loss': 0.5362, 'grad_norm': 1.1898623704910278, 'learning_rate': 7.2631578947368426e-06, 'epoch': 7.58}
{'loss': 0.5063, 'grad_norm': 1.2049922943115234, 'learning_rate': 7.01

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.1957699060440063, 'eval_accuracy': 0.75, 'eval_f1': 0.7470298530111844, 'eval_runtime': 50.7758, 'eval_samples_per_second': 7.484, 'eval_steps_per_second': 0.473, 'epoch': 8.0}
{'loss': 0.4649, 'grad_norm': 1.0138967037200928, 'learning_rate': 5.747368421052631e-06, 'epoch': 8.08}
{'loss': 0.4693, 'grad_norm': 0.9548399448394775, 'learning_rate': 5.494736842105263e-06, 'epoch': 8.17}
{'loss': 0.48, 'grad_norm': 1.041069746017456, 'learning_rate': 5.242105263157895e-06, 'epoch': 8.25}
{'loss': 0.4761, 'grad_norm': 1.124782919883728, 'learning_rate': 4.989473684210527e-06, 'epoch': 8.34}
{'loss': 0.474, 'grad_norm': 1.2339004278182983, 'learning_rate': 4.736842105263158e-06, 'epoch': 8.42}
{'loss': 0.4603, 'grad_norm': 1.1868741512298584, 'learning_rate': 4.48421052631579e-06, 'epoch': 8.51}
{'loss': 0.4909, 'grad_norm': 1.0428881645202637, 'learning_rate': 4.2315789473684215e-06, 'epoch': 8.59}
{'loss': 0.4503, 'grad_norm': 1.744766354560852, 'learning_rate': 3.978947368

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.1650307178497314, 'eval_accuracy': 0.7526315789473684, 'eval_f1': 0.7506950046885792, 'eval_runtime': 51.7999, 'eval_samples_per_second': 7.336, 'eval_steps_per_second': 0.463, 'epoch': 9.0}
{'loss': 0.4488, 'grad_norm': 0.986046552658081, 'learning_rate': 2.968421052631579e-06, 'epoch': 9.01}
{'loss': 0.4256, 'grad_norm': 1.0235873460769653, 'learning_rate': 2.715789473684211e-06, 'epoch': 9.09}
{'loss': 0.4639, 'grad_norm': 1.0451924800872803, 'learning_rate': 2.4631578947368424e-06, 'epoch': 9.18}
{'loss': 0.4194, 'grad_norm': 0.969524085521698, 'learning_rate': 2.2105263157894734e-06, 'epoch': 9.26}
{'loss': 0.4243, 'grad_norm': 0.9470532536506653, 'learning_rate': 1.9578947368421052e-06, 'epoch': 9.35}
{'loss': 0.4226, 'grad_norm': 2.724477529525757, 'learning_rate': 1.7052631578947369e-06, 'epoch': 9.43}
{'loss': 0.4193, 'grad_norm': 0.7828384041786194, 'learning_rate': 1.4526315789473685e-06, 'epoch': 9.52}
{'loss': 0.4427, 'grad_norm': 1.0531713962554932, 'learn

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.15370774269104, 'eval_accuracy': 0.75, 'eval_f1': 0.7489343388324992, 'eval_runtime': 57.2709, 'eval_samples_per_second': 6.635, 'eval_steps_per_second': 0.419, 'epoch': 10.0}
{'train_runtime': 5108.05, 'train_samples_per_second': 2.976, 'train_steps_per_second': 0.186, 'train_loss': 1.2321367283871298, 'epoch': 10.0}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.15370774269104, 'eval_accuracy': 0.75, 'eval_f1': 0.7489343388324992, 'eval_runtime': 56.1626, 'eval_samples_per_second': 6.766, 'eval_steps_per_second': 0.427, 'epoch': 10.0}


In [45]:
from datasets import load_dataset
from transformers import ViTForImageClassification, ViTImageProcessor, TrainingArguments, Trainer
import torch
from torchvision import transforms
from sklearn.metrics import accuracy_score, f1_score

# Load dataset and split
dataset = load_dataset("yainage90/fashion-pattern-images")
split = dataset["train"].train_test_split(test_size=0.2, seed=42)
trainset = split["train"]
testset = split["test"]

# Get label info
labels = trainset.features["label"].names
num_labels = len(labels)
id2label = {str(i): l for i, l in enumerate(labels)}
label2id = {l: i for i, l in enumerate(labels)}

# Preprocessing (ViT processor + optional augmentation)
processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")

augment = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomRotation(10),
    # Do NOT include ToTensor or Normalize here
])


def transform_examples(batch):
    # Data augmentation only on training set
    if "train" in batch["__split__"][0]:
        images = [augment(img.convert("RGB")) for img in batch["image"]]
    else:
        images = [img.convert("RGB") for img in batch["image"]]
    # The processor expects PIL Images or numpy arrays in [0, 255] range
    processed = processor(images=images, return_tensors="pt")
    # processed["pixel_values"] is a tensor of shape (batch_size, 3, 224, 224)
    return {
        "pixel_values": processed["pixel_values"],
        "labels": batch["label"]
    }
# Add split info for augmentation
trainset = trainset.add_column("__split__", ["train"] * len(trainset))
testset = testset.add_column("__split__", ["test"] * len(testset))

trainset = trainset.map(transform_examples, batched=True, remove_columns=trainset.column_names)
testset = testset.map(transform_examples, batched=True, remove_columns=testset.column_names)

# Model
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)


# Data collator
def collate_fn(batch):
    # Ensure pixel_values are tensors and stack them
    pixel_values = torch.stack([x["pixel_values"] if isinstance(x["pixel_values"], torch.Tensor) else torch.tensor(x["pixel_values"]) for x in batch])
    labels = torch.tensor([l for x in batch for l in (x["labels"] if isinstance(x["labels"], list) else [x["labels"]])])
    return {"pixel_values": pixel_values, "labels": labels}
# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./vit2-pattern",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,  # Try more epochs for better results
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=8,
    fp16=True,
    remove_unused_columns=True,
    report_to="none",
    learning_rate=3e-5,  # Lower learning rate for better fine-tuning
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=testset,
    data_collator=collate_fn,
    tokenizer=None,
    compute_metrics=compute_metrics,
)

# Train
trainer.train(resume_from_checkpoint="./vit2-pattern/checkpoint-950")

# Evaluate
metrics = trainer.evaluate()
print(metrics)

Resolving data files:   0%|          | 0/1900 [00:00<?, ?it/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)


  0%|          | 0/1425 [00:00<?, ?it/s]

  checkpoint_rng_state = torch.load(rng_file)


{'loss': 0.4204, 'grad_norm': 0.9561194181442261, 'learning_rate': 9.957894736842106e-06, 'epoch': 10.02}
{'loss': 0.4244, 'grad_norm': 0.9059658050537109, 'learning_rate': 9.789473684210527e-06, 'epoch': 10.11}
{'loss': 0.4234, 'grad_norm': 0.9929202795028687, 'learning_rate': 9.621052631578949e-06, 'epoch': 10.19}
{'loss': 0.4175, 'grad_norm': 1.1322929859161377, 'learning_rate': 9.45263157894737e-06, 'epoch': 10.27}
{'loss': 0.3923, 'grad_norm': 0.9534863829612732, 'learning_rate': 9.28421052631579e-06, 'epoch': 10.36}
{'loss': 0.3972, 'grad_norm': 0.780124306678772, 'learning_rate': 9.11578947368421e-06, 'epoch': 10.44}
{'loss': 0.4063, 'grad_norm': 1.0266274213790894, 'learning_rate': 8.947368421052632e-06, 'epoch': 10.53}
{'loss': 0.3943, 'grad_norm': 0.9619603753089905, 'learning_rate': 8.778947368421053e-06, 'epoch': 10.61}
{'loss': 0.3782, 'grad_norm': 0.8985133767127991, 'learning_rate': 8.610526315789474e-06, 'epoch': 10.69}
{'loss': 0.3768, 'grad_norm': 0.8068420886993408, 

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.1161777973175049, 'eval_accuracy': 0.7394736842105263, 'eval_f1': 0.7392406503023499, 'eval_runtime': 53.3603, 'eval_samples_per_second': 7.121, 'eval_steps_per_second': 0.45, 'epoch': 11.0}
{'loss': 0.3796, 'grad_norm': 0.7194156646728516, 'learning_rate': 7.93684210526316e-06, 'epoch': 11.03}
{'loss': 0.3637, 'grad_norm': 0.8792483806610107, 'learning_rate': 7.768421052631579e-06, 'epoch': 11.12}
{'loss': 0.3566, 'grad_norm': 0.7101445198059082, 'learning_rate': 7.600000000000001e-06, 'epoch': 11.2}
{'loss': 0.3404, 'grad_norm': 0.6447864174842834, 'learning_rate': 7.431578947368421e-06, 'epoch': 11.28}
{'loss': 0.3437, 'grad_norm': 0.7010276317596436, 'learning_rate': 7.2631578947368426e-06, 'epoch': 11.37}
{'loss': 0.3457, 'grad_norm': 0.7493733763694763, 'learning_rate': 7.094736842105264e-06, 'epoch': 11.45}
{'loss': 0.3481, 'grad_norm': 0.8023068904876709, 'learning_rate': 6.926315789473685e-06, 'epoch': 11.54}
{'loss': 0.3363, 'grad_norm': 0.8175363540649414, 'l

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.0894335508346558, 'eval_accuracy': 0.7447368421052631, 'eval_f1': 0.7436619178067236, 'eval_runtime': 50.9611, 'eval_samples_per_second': 7.457, 'eval_steps_per_second': 0.471, 'epoch': 12.0}
{'loss': 0.3189, 'grad_norm': 0.7114591002464294, 'learning_rate': 5.915789473684211e-06, 'epoch': 12.04}
{'loss': 0.323, 'grad_norm': 0.6806594729423523, 'learning_rate': 5.747368421052631e-06, 'epoch': 12.13}
{'loss': 0.3123, 'grad_norm': 0.6839451193809509, 'learning_rate': 5.578947368421052e-06, 'epoch': 12.21}
{'loss': 0.3115, 'grad_norm': 0.7367866039276123, 'learning_rate': 5.410526315789474e-06, 'epoch': 12.29}
{'loss': 0.3008, 'grad_norm': 0.6142610907554626, 'learning_rate': 5.242105263157895e-06, 'epoch': 12.38}
{'loss': 0.3128, 'grad_norm': 2.843189001083374, 'learning_rate': 5.073684210526316e-06, 'epoch': 12.46}
{'loss': 0.3129, 'grad_norm': 0.6275171637535095, 'learning_rate': 4.9052631578947365e-06, 'epoch': 12.55}
{'loss': 0.3125, 'grad_norm': 0.6973397135734558, '

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.0814205408096313, 'eval_accuracy': 0.7315789473684211, 'eval_f1': 0.7327807131760055, 'eval_runtime': 53.1242, 'eval_samples_per_second': 7.153, 'eval_steps_per_second': 0.452, 'epoch': 13.0}
{'loss': 0.3028, 'grad_norm': 0.6144954562187195, 'learning_rate': 3.894736842105263e-06, 'epoch': 13.05}
{'loss': 0.3002, 'grad_norm': 0.6317115426063538, 'learning_rate': 3.7263157894736843e-06, 'epoch': 13.14}
{'loss': 0.3018, 'grad_norm': 0.6611103415489197, 'learning_rate': 3.557894736842105e-06, 'epoch': 13.22}
{'loss': 0.2925, 'grad_norm': 0.5990911722183228, 'learning_rate': 3.3894736842105264e-06, 'epoch': 13.31}
{'loss': 0.2894, 'grad_norm': 0.6253290772438049, 'learning_rate': 3.2210526315789476e-06, 'epoch': 13.39}
{'loss': 0.2915, 'grad_norm': 0.5395194888114929, 'learning_rate': 3.0526315789473684e-06, 'epoch': 13.47}
{'loss': 0.2871, 'grad_norm': 0.6530255675315857, 'learning_rate': 2.8842105263157897e-06, 'epoch': 13.56}
{'loss': 0.2874, 'grad_norm': 0.6690309047698

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.072715401649475, 'eval_accuracy': 0.7368421052631579, 'eval_f1': 0.7375867046891705, 'eval_runtime': 77.58, 'eval_samples_per_second': 4.898, 'eval_steps_per_second': 0.309, 'epoch': 14.0}
{'loss': 0.2844, 'grad_norm': 0.6431567668914795, 'learning_rate': 1.8736842105263158e-06, 'epoch': 14.06}
{'loss': 0.2756, 'grad_norm': 0.5587624907493591, 'learning_rate': 1.7052631578947369e-06, 'epoch': 14.15}
{'loss': 0.2788, 'grad_norm': 0.5767632126808167, 'learning_rate': 1.5368421052631579e-06, 'epoch': 14.23}
{'loss': 0.2837, 'grad_norm': 0.7035180330276489, 'learning_rate': 1.3684210526315791e-06, 'epoch': 14.32}
{'loss': 0.2839, 'grad_norm': 0.6471250653266907, 'learning_rate': 1.2000000000000002e-06, 'epoch': 14.4}
{'loss': 0.2806, 'grad_norm': 0.7250261902809143, 'learning_rate': 1.031578947368421e-06, 'epoch': 14.48}
{'loss': 0.2801, 'grad_norm': 0.6492757797241211, 'learning_rate': 8.631578947368421e-07, 'epoch': 14.57}
{'loss': 0.2747, 'grad_norm': 0.7524358034133911,

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.0711021423339844, 'eval_accuracy': 0.7368421052631579, 'eval_f1': 0.7375867046891705, 'eval_runtime': 131.5153, 'eval_samples_per_second': 2.889, 'eval_steps_per_second': 0.182, 'epoch': 15.0}
{'train_runtime': 2194.7286, 'train_samples_per_second': 10.389, 'train_steps_per_second': 0.649, 'train_loss': 0.10791874935752467, 'epoch': 15.0}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 1.0711021423339844, 'eval_accuracy': 0.7368421052631579, 'eval_f1': 0.7375867046891705, 'eval_runtime': 91.4089, 'eval_samples_per_second': 4.157, 'eval_steps_per_second': 0.263, 'epoch': 15.0}


In [47]:
# Save the model
trainer.save_model("./vit2-pattern-final")
# Save the processor
processor.save_pretrained("./vit2-pattern-final")


['./vit2-pattern-final\\preprocessor_config.json']

In [None]:
image_path = ""