In [1]:
import torch
from torchvision import models, datasets, transforms
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

transform_svhn = transforms.Compose([
    transforms.Resize((64, 64)),  
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),  
   
])

In [15]:
svhn_train = datasets.SVHN(root='./data', split='train', download=True, transform=transform_svhn)
svhn_test = datasets.SVHN(root='./data', split='test', download=True, transform=transform_svhn)

train_loader = DataLoader(svhn_train, batch_size=32, shuffle=True)
test_loader = DataLoader(svhn_test, batch_size=32, shuffle=False)

Using downloaded and verified file: ./data/train_32x32.mat
Using downloaded and verified file: ./data/test_32x32.mat


In [16]:
effnet_model = models.efficientnet_b4(weights=models.EfficientNet_B4_Weights.IMAGENET1K_V1)

num_ftrs = effnet_model.classifier[1].in_features
effnet_model.classifier = nn.Sequential(
     nn.Dropout(p=0.5, inplace=True),
    nn.Linear(num_ftrs, 10),  
)

effnet_model = effnet_model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(effnet_model.parameters(), lr=0.0001)

def train_effnet(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)  
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Training Loss: {total_loss / len(loader):.4f}")

def evaluate_effnet(model, loader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print(f'EfficientNet SVHN Accuracy: {accuracy:.2f}%')

In [17]:
train_effnet(effnet_model, train_loader, optimizer, criterion, device)
evaluate_effnet(effnet_model, test_loader, device)

Training Loss: 1.7701
EfficientNet SVHN Accuracy: 60.89%


In [7]:
transform_svhn = transforms.Compose([
    transforms.Resize((384, 384)),  
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4377, 0.4438, 0.4728], std=[0.1980, 0.2010, 0.1970]),  
])
svhn_train = datasets.SVHN(root='./data', split='train', download=True, transform=transform_svhn)
svhn_test = datasets.SVHN(root='./data', split='test', download=True, transform=transform_svhn)

train_loader = DataLoader(svhn_train, batch_size=32, shuffle=True)
test_loader = DataLoader(svhn_test, batch_size=32, shuffle=False)

vit_model = models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1)

for param in vit_model.parameters():
    param.requires_grad = False
    
    
for param in vit_model.heads.head.parameters():
    param.requires_grad = True

num_ftrs = vit_model.heads.head.in_features
vit_model.heads.head = nn.Linear(num_ftrs, 10)  

vit_model = vit_model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vit_model.parameters(), lr=1e-4)


def train_vit(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)  
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Training Loss: {total_loss / len(loader):.4f}")

def evaluate_vit(model, loader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print(f'ViT SVHN Accuracy: {accuracy:.2f}%')

Using downloaded and verified file: ./data/train_32x32.mat
Using downloaded and verified file: ./data/test_32x32.mat


Downloading: "https://download.pytorch.org/models/vit_b_16_swag-9ac1b537.pth" to /root/.cache/torch/hub/checkpoints/vit_b_16_swag-9ac1b537.pth
100%|██████████| 331M/331M [00:08<00:00, 42.6MB/s] 


In [8]:
train_vit(vit_model, train_loader, optimizer, criterion, device)
evaluate_vit(vit_model, test_loader, device)

Training Loss: 1.7791
ViT SVHN Accuracy: 50.87%


In [6]:
from PIL import Image
import torch
from transformers import CLIPModel, CLIPProcessor

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
clip_model.to(device)

class_labels = [str(i) for i in range(10)]  

def evaluate_clip(model, processor, loader, class_labels):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, targets in loader:
            pil_images = [Image.fromarray((image.permute(1, 2, 0).cpu().numpy() * 255).astype('uint8')) for image in images]
            
            image_inputs = processor(images=pil_images, return_tensors="pt", padding=True).to(device)
            text_inputs = processor(text=class_labels, return_tensors="pt", padding=True).to(device)
            
            outputs = model(**image_inputs, **text_inputs)
            
            logits_per_image = outputs.logits_per_image
            probs = logits_per_image.softmax(dim=1)
            
            _, predicted = torch.max(probs, 1)
            
            correct += (predicted.cpu() == targets).sum().item()
            total += images.size(0)
    
    accuracy = 100 * correct / total
    print(f'CLIP SVHN Accuracy: {accuracy:.2f}%')

evaluate_clip(clip_model, clip_processor, test_loader, class_labels)


config.json:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



CLIP SVHN Accuracy: 24.20%
