In [1]:
!pip install transformers datasets torch torchvision

Collecting transformers
  Using cached transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting torch
  Using cached torch-2.9.1-cp313-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting torchvision
  Using cached torchvision-0.24.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (5.9 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.20.1-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tokenizers<=0

In [2]:
from transformers import AutoImageProcessor, SwinForImageClassification
from datasets import load_dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Hub checkpoint identifier shared by the classifier and its image processor.
model_name = "microsoft/swin-tiny-patch4-window7-224"

# The two loads are independent of each other; both resolve the same checkpoint.
model = SwinForImageClassification.from_pretrained(model_name)
image_processor = AutoImageProcessor.from_pretrained(model_name)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
# Only the first eight examples of the CIFAR-10 test split are requested,
# which keeps the demo small.
dataset = load_dataset(
    "cifar10",
    split="test[:8]",
)

Generating train split: 100%|██████████| 50000/50000 [00:00<00:00, 80524.16 examples/s]
Generating test split: 100%|██████████| 10000/10000 [00:00<00:00, 208872.40 examples/s]


In [5]:
# Collect the image and the integer label of every example, in dataset order,
# into two parallel lists.
images, labels = [], []
for example in dataset:
    images.append(example["img"])
    labels.append(example["label"])

In [6]:
# Preprocess the images into PyTorch tensors ("pt"), then move the resulting
# batch to the device the model lives on.
encoded_batch = image_processor(images, return_tensors="pt")
inputs = encoded_batch.to(model.device)

In [7]:
# Put the model in evaluation mode before running inference.
model.eval()

# Gradients are not needed for a forward-only pass, so tracking is disabled
# while the model is called.
with torch.no_grad():
    outputs = model(**inputs)

# One row of raw class scores per input image.
logits = outputs.logits

In [8]:
# Index of the highest-scoring class along the last dimension, copied to the
# CPU and converted to a NumPy array for the post-processing cells.
top_class_ids = torch.argmax(logits, dim=-1)
predicted_labels = top_class_ids.cpu().numpy()

In [9]:
# CIFAR-10 defines ten classes. The classifier head is compared against that
# fixed number rather than against the distinct labels that happen to occur in
# the loaded sample: a small sample rarely contains every class, so the old
# check would flag even a correctly sized 10-class model as mismatched.
NUM_CIFAR10_CLASSES = 10
num_classes = len(model.config.id2label)
if num_classes != NUM_CIFAR10_CLASSES:
    print(
        f"Warning: model predicts {num_classes} classes but CIFAR-10 has "
        f"{NUM_CIFAR10_CLASSES}. Folding class ids with modulo "
        f"{NUM_CIFAR10_CLASSES} only keeps them in range; the resulting "
        "labels are arbitrary and must not be read as real predictions."
    )
    # Placeholder fold so the remaining cells can still index the class-name
    # list. It yields the same ids as a per-id `i % 10` lookup table, but keeps
    # predicted_labels a NumPy array in both branches and is safe to re-run.
    # NOTE(review): for meaningful results, load a checkpoint fine-tuned on
    # CIFAR-10 or supply an explicit class-id -> CIFAR-10 mapping.
    predicted_labels = predicted_labels % NUM_CIFAR10_CLASSES



In [10]:
# CIFAR-10 class names; the lookups below index this list by integer label id.
class_names = [
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]

# Translate integer ids into names for the ground truth and the predictions.
true_class_names = [class_names[class_id] for class_id in labels]
predicted_class_names = [class_names[class_id] for class_id in predicted_labels]

In [11]:
# Print one line per image, numbered from 1, pairing the true class name with
# the predicted one.
name_pairs = zip(true_class_names, predicted_class_names)
for image_number, (true_label, predicted_label) in enumerate(name_pairs, start=1):
    print(f"Image {image_number}: True Label = {true_label}, Predicted Label = {predicted_label}")

Image 1: True Label = cat, Predicted Label = dog
Image 2: True Label = ship, Predicted Label = automobile
Image 3: True Label = ship, Predicted Label = ship
Image 4: True Label = airplane, Predicted Label = bird
Image 5: True Label = frog, Predicted Label = bird
Image 6: True Label = frog, Predicted Label = ship
Image 7: True Label = automobile, Predicted Label = dog
Image 8: True Label = frog, Predicted Label = automobile
