In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from transformers import ViTForImageClassification, ViTFeatureExtractor
import os


In [3]:
! unzip /content/drive/MyDrive/Oranges/train.zip

Archive:  /content/drive/MyDrive/Oranges/train.zip
   creating: train/
   creating: train/Blood Oranges_transform/
  inflating: train/Blood Oranges_transform/1.png  
  inflating: train/Blood Oranges_transform/10.png  
  inflating: train/Blood Oranges_transform/11.png  
  inflating: train/Blood Oranges_transform/12.png  
  inflating: train/Blood Oranges_transform/13.png  
  inflating: train/Blood Oranges_transform/14.png  
  inflating: train/Blood Oranges_transform/15.png  
  inflating: train/Blood Oranges_transform/16.png  
  inflating: train/Blood Oranges_transform/17.png  
  inflating: train/Blood Oranges_transform/18.png  
  inflating: train/Blood Oranges_transform/19.png  
  inflating: train/Blood Oranges_transform/2.png  
  inflating: train/Blood Oranges_transform/20.png  
  inflating: train/Blood Oranges_transform/3.png  
  inflating: train/Blood Oranges_transform/4.png  
  inflating: train/Blood Oranges_transform/5.png  
  inflating: train/Blood Oranges_transform/6.png  
  infla

In [4]:
# Define the augmentations and transformations
transform_train = transforms.Compose([
    transforms.RandomResizedCrop(224),  # Randomly crop and resize the image
    transforms.RandomHorizontalFlip(),  # Random horizontal flip
    transforms.RandomRotation(20),  # Random rotation
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),  # Random color jitter
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),  # Random translation
    transforms.RandomErasing(),  # Random erasing for regularization
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])

transform_val = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])

# Load the dataset
full_dataset = datasets.ImageFolder(root='/content/train', transform=transform_train)

# Split into train and validation sets
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size])

# Apply transforms
train_dataset.dataset.transform = transform_train
val_dataset.dataset.transform = transform_val

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [5]:
for images, labels in train_loader:
    print(f"Image shape: {images.shape}, Labels shape: {labels.shape}")
    break

Image shape: torch.Size([8, 3, 224, 224]), Labels shape: torch.Size([8])


In [6]:
# Load the pre-trained Vision Transformer model for image classification
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224-in21k',  # Pre-trained on ImageNet21k, suitable for transfer learning
    num_labels=len(train_dataset.dataset.classes)  # Number of classes in your dataset
)

# Freeze the backbone layers for transfer learning (optional)
for param in model.vit.parameters():
    param.requires_grad = False

# Optionally, unfreeze later layers of the ViT for fine-tuning:
for param in model.vit.encoder.layer[-1].parameters():
    param.requires_grad = True

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [7]:
train_loader.dataset.dataset

Dataset ImageFolder
    Number of datapoints: 89
    Root location: /content/train
    StandardTransform
Transform: Compose(
               RandomResizedCrop(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bilinear, antialias=True)
               RandomHorizontalFlip(p=0.5)
               RandomRotation(degrees=[-20.0, 20.0], interpolation=nearest, expand=False, fill=0)
               ColorJitter(brightness=(0.8, 1.2), contrast=(0.8, 1.2), saturation=(0.8, 1.2), hue=(-0.2, 0.2))
               RandomAffine(degrees=[0.0, 0.0], translate=(0.1, 0.1))
               RandomErasing(p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0, inplace=False)
               ToTensor()
               Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
           )

In [8]:
print(f"Dataset classes: {train_loader.dataset.dataset.classes}")
print(f"Dataset length: {len(train_loader.dataset)}")

Dataset classes: ['Blood Oranges_transform', 'Navel_transform', 'Tangelo_transform', 'Tangerine_transform', 'cara cara_transform']
Dataset length: 71


In [9]:
from PIL import Image

img_path = os.path.join('train', train_loader.dataset.dataset.classes[0], os.listdir(os.path.join('train', train_loader.dataset.dataset.classes[0]))[0])
img = Image.open(img_path)
print(f"Image size: {img.size}")

Image size: (224, 224)


In [10]:
single_image, single_label = next(iter(train_loader))
print(f"Type of single_image: {type(single_image)}")
print(f"Type of single_label: {type(single_label)}")

Type of single_image: <class 'torch.Tensor'>
Type of single_label: <class 'torch.Tensor'>


In [None]:
# Define the optimizer and loss function
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=3e-5)
criterion = nn.CrossEntropyLoss()

# Training loop
def train(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(images).logits
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Validation accuracy
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images).logits
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_accuracy = 100 * correct / total
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}, Val Accuracy: {val_accuracy:.2f}%")

        model.train()  # Switch back to training mode

# Start training
train(model, train_loader, val_loader, criterion, optimizer, epochs=100)

In [13]:
torch.save(model.state_dict(), "vit_classification_model.pth")

In [18]:
import torch
from PIL import Image
from transformers import ViTForImageClassification, ViTFeatureExtractor

# Load the fine-tuned model
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224-in21k',
    num_labels=len(train_dataset.dataset.classes)
)

model.load_state_dict(torch.load("vit_classification_model.pth"))
model.eval()  # Set model to evaluation mode

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the feature extractor used during training
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

# Function to perform inference on a single image
def predict_image(image_path, model, feature_extractor, device):
    # Open the image
    image = Image.open(image_path)

    # Apply the same transformations as in validation
    inputs = feature_extractor(images=image, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Get the predicted class
    predicted_class_idx = logits.argmax(-1).item()
    predicted_class = train_dataset.dataset.classes[predicted_class_idx]

    return predicted_class




Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



In [20]:
# Example usage
image_path = "/content/train/cara cara_transform/6.png"  # Replace with your image path
predicted_class = predict_image(image_path, model, feature_extractor, device)
print(f"Predicted class: {predicted_class}")

Predicted class: cara cara_transform


In [21]:
! mv "/content/vit_classification_model.pth" "/content/drive/MyDrive/Oranges/VIT FINE TUNED MODEL"