In [12]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import ViTForImageClassification, ViTImageProcessor
from transformers import pipeline
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from tqdm import tqdm  # For progress bars
from sklearn.metrics import classification_report
import pandas as pd

In [13]:
# 1. Load the ViT Model and Processor
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=2  # Binary classification: helmet or no-helmet
)
processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# 2. Detect device (GPU/CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [15]:
# 3. Dataset Class
class HelmetDataset(Dataset):
    def __init__(self, image_paths, labels, processor):
        self.image_paths = image_paths
        self.labels = labels
        self.processor = processor

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        label = self.labels[idx]
        inputs = self.processor(images=image, return_tensors="pt")
        return inputs["pixel_values"].squeeze(0), torch.tensor(label)

In [16]:
# 4. Load CSV data
train_df = pd.read_csv('./train_labels.csv')
val_df = pd.read_csv('./valid_labels.csv')

train_image_paths = train_df['image_path'].tolist()
train_labels = train_df['label'].tolist()

val_image_paths = val_df['image_path'].tolist()
val_labels = val_df['label'].tolist()

In [17]:
# 5. Create Dataset instances
train_data = HelmetDataset(train_image_paths, train_labels, processor)
val_data = HelmetDataset(val_image_paths, val_labels, processor)

In [18]:
# 6. Create DataLoader instances
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)

In [19]:
# 7. Training Function (Consolidated)
def train_one_epoch(model, data_loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for images, labels in tqdm(data_loader, desc="Training", total=len(data_loader)):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(images)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        total_loss += loss.item()

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(data_loader)
    return avg_loss

In [20]:
# 8. Evaluation Function (Consolidated)
def evaluate(model, data_loader, device):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for images, labels in tqdm(data_loader, desc="Evaluating", total=len(data_loader)):
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model(images)
            preds = torch.argmax(outputs.logits, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    print(classification_report(all_labels, all_preds))

In [21]:
# 9. Optimizer and Loss Function
optimizer = Adam(model.parameters(), lr=5e-5)
loss_fn = CrossEntropyLoss()

In [22]:
# 10. Training Loop with Validation
num_epochs = 5
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    
    # Train the model for one epoch
    avg_train_loss = train_one_epoch(model, train_loader, optimizer, loss_fn, device)
    print(f"Training Loss: {avg_train_loss:.4f}")
        # Evaluate the model on the validation set
    evaluate(model, val_loader, device)


Epoch 1/5


Training:  12%|████████                                                             | 13/111 [08:27<1:03:42, 39.01s/it]


KeyboardInterrupt: 

In [None]:
# 11. Save the Model and Processor
model.save_pretrained("./models/helmet_vit")
processor.save_pretrained("./models/helmet_vit")

In [None]:
# 12. Inference with Pipeline (Optional - for future inference)
helmet_detector = pipeline("image-classification", model="./models/helmet_vit")
# For inference on test images
image_paths = ["./Media/riders_1.jpg", "./Media/riders_2.jpg", "./Media/riders_3.jpg"]  # Replace with your test images

results = []
for img_path in tqdm(image_paths, desc="Processing Images", ncols=100):
    result = helmet_detector(img_path)
    results.append((img_path, result))

# Print inference results
for img_path, result in results:
    print(f"Results for {img_path}: {result}")
