In [1]:
import torch
from torchvision import models

model = models.efficientnet_b4(weights=models.EfficientNet_B4_Weights.DEFAULT)

model.eval()

weights = models.EfficientNet_B4_Weights.DEFAULT.get_state_dict(progress=True)

Downloading: "https://download.pytorch.org/models/efficientnet_b4_rwightman-23ab8bcd.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b4_rwightman-23ab8bcd.pth
100%|██████████| 74.5M/74.5M [00:00<00:00, 115MB/s] 


In [2]:
import torch
from torch import nn
from torchvision import models, datasets, transforms
from torch.utils.data import DataLoader
import torch.optim as optim

efficientnet_b4_model = models.efficientnet_b4(weights=models.EfficientNet_B4_Weights.DEFAULT)

efficientnet_b4_model.classifier[1] = nn.Linear(efficientnet_b4_model.classifier[1].in_features, 10)  

for param in efficientnet_b4_model.features.parameters():
    param.requires_grad = False

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

optimizer = optim.AdamW(efficientnet_b4_model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
efficientnet_b4_model.to(device)

epochs = 3
for epoch in range(epochs):
    efficientnet_b4_model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = efficientnet_b4_model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}')

efficientnet_b4_model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = efficientnet_b4_model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy on CIFAR-10: {accuracy:.2f}%')

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:11<00:00, 14631524.63it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Epoch 1, Loss: 1.6568229990896322
Epoch 2, Loss: 1.0595059520299817
Epoch 3, Loss: 0.866766551520225
Test Accuracy on CIFAR-10: 80.02%


In [3]:
import torch
from torchvision import models

vit_b16_model = models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1)

vit_b16_model.eval()

weights = models.ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1.get_state_dict(progress=True)

Downloading: "https://download.pytorch.org/models/vit_b_16_swag-9ac1b537.pth" to /root/.cache/torch/hub/checkpoints/vit_b_16_swag-9ac1b537.pth
100%|██████████| 331M/331M [00:18<00:00, 18.4MB/s] 


In [4]:
import torch
from torch import nn
from torchvision import models, datasets, transforms
from torch.utils.data import DataLoader
import torch.optim as optim

vit_b16_model = models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1)

vit_b16_model.heads.head = nn.Linear(vit_b16_model.heads.head.in_features, 10) 

for param in vit_b16_model.parameters():
    param.requires_grad = False

for param in vit_b16_model.heads.parameters():
    param.requires_grad = True

transform = transforms.Compose([
    transforms.Resize((384, 384)),  
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

optimizer = optim.AdamW(vit_b16_model.heads.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vit_b16_model.to(device)

epochs = 3
for epoch in range(epochs):
    vit_b16_model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = vit_b16_model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}')

vit_b16_model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = vit_b16_model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy on CIFAR-10: {accuracy:.2f}%')

Files already downloaded and verified
Files already downloaded and verified
Epoch 1, Loss: 0.3775843938756603
Epoch 2, Loss: 0.15415096293565217
Epoch 3, Loss: 0.13294663082379835
Test Accuracy on CIFAR-10: 95.16%


In [5]:
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
outputs = clip_model(**inputs)
logits_per_image = outputs.logits_per_image 
probs = logits_per_image.softmax(dim=1) 

config.json:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



In [6]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
clip_model.to(device)

transform = transforms.Compose([
    transforms.Resize((224, 224)),  
    transforms.ToTensor(),        
])

test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

cifar10_labels = [
    "airplane", "automobile", "bird", "cat", "deer",
    "dog", "frog", "horse", "ship", "truck"
]


clip_model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
       
        pil_images = [Image.fromarray((image.permute(1, 2, 0).cpu().numpy() * 255).astype('uint8')) for image in images]
        image_inputs = processor(images=pil_images, return_tensors="pt", padding=True).to(device)

        text_inputs = processor(text=cifar10_labels, return_tensors="pt", padding=True).to(device)

        outputs = clip_model(**image_inputs, **text_inputs)

        logits_per_image = outputs.logits_per_image

        probs = logits_per_image.softmax(dim=1)

        _, predicted = torch.max(probs, dim=1)

        correct += (predicted == labels.to(device)).sum().item()
        total += labels.size(0)
  
accuracy = 100 * correct / total
print(f'CLIP clip_model accuracy on CIFAR-10 test set: {accuracy:.2f}%')

Files already downloaded and verified
CLIP clip_model accuracy on CIFAR-10 test set: 87.30%


In [12]:
import os
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from torchvision import models

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


shape_transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.Grayscale(num_output_channels=3),  
    transforms.ToTensor(), 
    transforms.Lambda(lambda x: torch.where(x > 0.5, torch.tensor(1.0, device=x.device), torch.tensor(0.0, device=x.device))) 
])


texture_transform = transforms.Compose([
    transforms.Resize((384, 384)),  
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  
    transforms.Lambda(lambda x: x + 0.1 * torch.randn_like(x, device=x.device))  
])


color_transform = transforms.Compose([
    transforms.Resize((384, 384)),  
    transforms.Grayscale(num_output_channels=1), 
    transforms.ToTensor(), 
    transforms.Lambda(lambda x: torch.cat([x, x, x], dim=0)), 
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  
])



cifar10_train = datasets.CIFAR10(root='./data/cifar10', train=True, download=True, transform=None)
cifar10_test = datasets.CIFAR10(root='./data/cifar10', train=False, download=True, transform=None)

cifar10_data = cifar10_train + cifar10_test

def collate_fn(batch, transform):
    images, labels = zip(*batch)
    transformed_images = [transform(img) for img in images]
    return torch.stack(transformed_images), torch.tensor(labels)

shape_loader = DataLoader(cifar10_data, batch_size=32, shuffle=False, collate_fn=lambda batch: collate_fn(batch, shape_transform))
texture_loader = DataLoader(cifar10_data, batch_size=32, shuffle=False, collate_fn=lambda batch: collate_fn(batch, texture_transform))
color_loader = DataLoader(cifar10_data, batch_size=32, shuffle=False, collate_fn=lambda batch: collate_fn(batch, color_transform))

vit_b16_model = vit_b16_model.to(device)
efficientnet_b4_model = efficientnet_b4_model.to(device)

def evaluate_model(model, data_loader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in data_loader:
            images, labels = images.to(device), labels.to(device) 
            outputs = model(images)  
            _, predicted = torch.max(outputs, 1)  
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    return accuracy

shape_bias_acc_vit = evaluate_model(vit_b16_model, shape_loader, device)
texture_bias_acc_vit = evaluate_model(vit_b16_model, texture_loader, device)
color_bias_acc_vit = evaluate_model(vit_b16_model, color_loader, device)

shape_bias_acc_effnet = evaluate_model(efficientnet_b4_model, shape_loader, device)
texture_bias_acc_effnet = evaluate_model(efficientnet_b4_model, texture_loader, device)
color_bias_acc_effnet = evaluate_model(efficientnet_b4_model, color_loader, device)

print(f'ViT_B_16 Shape Bias Accuracy: {shape_bias_acc_vit:.2f}%')
print(f'ViT_B_16 Texture Bias Accuracy: {texture_bias_acc_vit:.2f}%')
print(f'ViT_B_16 Color Bias Accuracy: {color_bias_acc_vit:.2f}%')

print(f'EfficientNet_B4 Shape Bias Accuracy: {shape_bias_acc_effnet:.2f}%')
print(f'EfficientNet_B4 Texture Bias Accuracy: {texture_bias_acc_effnet:.2f}%')
print(f'EfficientNet_B4 Color Bias Accuracy: {color_bias_acc_effnet:.2f}%')


Files already downloaded and verified
Files already downloaded and verified
ViT_B_16 Shape Bias Accuracy: 24.46%
ViT_B_16 Texture Bias Accuracy: 95.88%
ViT_B_16 Color Bias Accuracy: 79.93%
EfficientNet_B4 Shape Bias Accuracy: 10.12%
EfficientNet_B4 Texture Bias Accuracy: 10.03%
EfficientNet_B4 Color Bias Accuracy: 43.23%


In [18]:
import os
import torch
from torchvision import datasets, transforms
import torchvision.transforms.functional as F
from PIL import Image
import cv2
import numpy as np
import random
from datasets import load_dataset

shape_dir = './data/shape'
texture_dir = './data/texture'
color_dir = './data/color'

os.makedirs(shape_dir, exist_ok=True)
os.makedirs(texture_dir, exist_ok=True)
os.makedirs(color_dir, exist_ok=True)

transform = transforms.Compose([transforms.Resize((384, 384)), transforms.ToTensor()])
cifar10_train = datasets.CIFAR10(root='./data/cifar10', train=True, download=True, transform=transform)
cifar10_test = datasets.CIFAR10(root='./data/cifar10', train=False, download=True, transform=transform)

cifar10_data = cifar10_train + cifar10_test

def edge_transform(img):
    img = np.array(img) 
    img_gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) 
    edges = cv2.Canny(img_gray, threshold1=100, threshold2=200)  
    return Image.fromarray(edges)  

shape_transform = transforms.Compose([
    transforms.Lambda(lambda img: edge_transform(img)),  
    transforms.ToTensor() 
])

texture_dataset = load_dataset("cansa/Describable-Textures-Dataset-DTD", split="train")

def get_texture_images(dataset):
    textures = []
    for item in dataset:
        img = item['image'] if isinstance(item['image'], Image.Image) else Image.open(item['image']).convert('RGB')
        textures.append(transforms.Resize((384, 384))(img))  
    return textures

texture_images = get_texture_images(texture_dataset)

def blend_with_texture(img, texture):
    alpha = 0.5  
    img_tensor = transforms.ToTensor()(img) if not isinstance(img, torch.Tensor) else img
    texture_tensor = transforms.ToTensor()(texture) if not isinstance(texture, torch.Tensor) else texture
    
    if img_tensor.size() != texture_tensor.size():
        texture_tensor = F.resize(texture_tensor, img_tensor.size()[1:]) 

    blended_img = (alpha * img_tensor) + ((1 - alpha) * texture_tensor)
    return blended_img

def apply_texture_transform(img):
    texture_img = random.choice(texture_images)  
    return blend_with_texture(img, texture_img)  

texture_transform = transforms.Compose([
    transforms.ToTensor(), 
    transforms.Lambda(lambda x: apply_texture_transform(x)),  
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  
])

color_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3), 
    transforms.Resize((224, 224)), 
    transforms.ToTensor(),  
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  
])

def save_images_limited(dataset, transform, save_dir, limit=800):
    os.makedirs(save_dir, exist_ok=True)
    for i, (img, label) in enumerate(dataset):
        if i >= limit:  
            break
        img_pil = transforms.ToPILImage()(img) 
        img_transformed = transform(img_pil) 
        img_pil_transformed = transforms.ToPILImage()(img_transformed)  
        class_dir = os.path.join(save_dir, str(label))
        os.makedirs(class_dir, exist_ok=True)
        img_pil_transformed.save(os.path.join(class_dir, f"{i}.png"))

print("Saving shape bias dataset with edge detection method...")
save_images_limited(cifar10_data, shape_transform, shape_dir)

print("Saving texture bias dataset with DTD textures...")
save_images_limited(cifar10_data, texture_transform, texture_dir)

print("Saving color bias dataset...")
save_images_limited(cifar10_data, color_transform, color_dir)

print("Datasets saved successfully!")

Files already downloaded and verified
Files already downloaded and verified


Repo card metadata block was not found. Setting CardData to empty.


Resolving data files:   0%|          | 0/5640 [00:00<?, ?it/s]

Saving shape bias dataset with edge detection method...
Saving texture bias dataset with DTD textures...
Saving color bias dataset...
Datasets saved successfully!


In [20]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.optim as optim
from torchvision import models
import numpy as np
import random
import cv2
from PIL import Image

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def edge_transform(img):
    img = np.array(img)  
    img_gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) 
    edges = cv2.Canny(img_gray, threshold1=100, threshold2=200)  
    edges_rgb = cv2.cvtColor(edges, cv2.COLOR_GRAY2RGB)  
    return Image.fromarray(edges_rgb) 

shape_transform = transforms.Compose([
    transforms.Lambda(lambda img: edge_transform(img)), 
    transforms.Resize((224, 224)), 
    transforms.ToTensor(),  
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  
])

texture_dataset_dir = './data/texture'
texture_dataset = datasets.ImageFolder(root=texture_dataset_dir, transform=transforms.ToTensor())

texture_images = [texture_dataset[i][0] for i in range(len(texture_dataset))]

def apply_texture_transform(img):
    texture_img = random.choice(texture_images)  
    texture_resized = transforms.Resize((224, 224))(texture_img) 
    return blend_with_texture(img, texture_resized)

texture_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Lambda(lambda x: apply_texture_transform(x)), 
    transforms.Resize((224, 224)),  
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  
])

color_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3), 
    transforms.Resize((224, 224)), 
    transforms.ToTensor(),  
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  
])

shape_dataset = datasets.ImageFolder(root='./data/shape', transform=shape_transform)
texture_dataset = datasets.ImageFolder(root='./data/texture', transform=texture_transform)
color_dataset = datasets.ImageFolder(root='./data/color', transform=color_transform)

batch_size = 32
shape_loader = DataLoader(shape_dataset, batch_size=batch_size, shuffle=True)
texture_loader = DataLoader(texture_dataset, batch_size=batch_size, shuffle=True)
color_loader = DataLoader(color_dataset, batch_size=batch_size, shuffle=True)

model = models.vit_b_16(pretrained=True).to(device)
model.eval()  

def evaluate_bias(data_loader):
    total = 0
    correct = 0
    with torch.no_grad():
        for images, labels in data_loader:
            images = images.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels.to(device)).sum().item()
    return correct / total

shape_bias_accuracy = evaluate_bias(shape_loader)
print(f'Shape Bias Accuracy: {shape_bias_accuracy:.4f}')

texture_bias_accuracy = evaluate_bias(texture_loader)
print(f'Texture Bias Accuracy: {texture_bias_accuracy:.4f}')

color_bias_accuracy = evaluate_bias(color_loader)
print(f'Color Bias Accuracy: {color_bias_accuracy:.4f}')


Shape Bias Accuracy: 0.0000


RecursionError: maximum recursion depth exceeded while calling a Python object

In [16]:
transform = transforms.Compose([
  transforms.Resize((384, 384)), 
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

cifar10_test = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
cifar_loader = DataLoader(cifar10_test, batch_size=32, shuffle=False)

cifar10_labels = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck'
]


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

efficient_net = efficientnet_b4_model
vit_model = vit_b16_model

clip_model = clip_model
clip_processor = processor

Files already downloaded and verified


In [17]:
import numpy as np

def evaluate_model(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    return accuracy

def evaluate_clip_model(clip_model, dataloader, device, processor):
    clip_model.eval()  
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)  
            def tensor_to_pil(image_tensor):
                image_tensor = image_tensor.permute(1, 2, 0).cpu().numpy() 
                image_tensor = (image_tensor * 255).astype(np.uint8)  
                return Image.fromarray(image_tensor)

            pil_images = [tensor_to_pil(image) for image in images]

            image_inputs = processor(images=pil_images, return_tensors="pt", padding=True).to(device)

            text_inputs = processor(text=[f"a photo of a {cifar10_labels[label.item()]}" for label in labels],
                                    return_tensors="pt", padding=True).to(device)

            outputs = clip_model(**image_inputs, **text_inputs)

            logits_per_image = outputs.logits_per_image 
            predicted = logits_per_image.argmax(dim=1) 

            total += labels.size(0)
            correct += (predicted == labels).sum().item() 

    accuracy = 100 * correct / total
    return accuracy




In [18]:
original_acc_effnet = evaluate_model(efficient_net, cifar_loader, device)
original_acc_vit = evaluate_model(vit_model, cifar_loader, device)
original_acc_clip = evaluate_clip_model(clip_model, cifar_loader, device, clip_processor)

print("Original Accuracy:")
print(f"EfficientNet: {original_acc_effnet:.2f}%")
print(f"ViT: {original_acc_vit:.2f}%")
print(f"CLIP-ViT: {original_acc_clip:.2f}%")

Original Accuracy:
EfficientNet: 62.55%
ViT: 95.33%
CLIP-ViT: 6.71%


In [20]:

def add_local_noise(img, noise_level, device):
    noisy_img = img.clone().to(device)  
    noise = noise_level * torch.randn((3, 8, 8)).to(device)  
    noisy_img[:, :8, :8] = noisy_img[:, :8, :8] + noise 
    return torch.clamp(noisy_img, -1, 1)  

def evaluate_model_on_noisy_images(model, dataloader, noise_level, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device) 
            noisy_images = torch.stack([add_local_noise(img, noise_level, device) for img in images])
            outputs = model(noisy_images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    return accuracy

def evaluate_clip_on_noisy_images(clip_model, dataloader, device, processor, noise_level=0.5):
    clip_model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in dataloader:
            noisy_images = torch.stack([add_local_noise(img, noise_level, device) for img in images]).to(device)
            
            noisy_images_pil = [transforms.ToPILImage()(img).convert("RGB") for img in noisy_images]
            
            inputs = processor(
                text=[f"a photo of a {cifar10_test.classes[label]}" for label in labels],
                images=noisy_images_pil,
                return_tensors="pt",
                padding=True
            ).to(device)
            
            outputs = clip_model(**inputs)
            logits_per_image = outputs.logits_per_image
            
            predicted = logits_per_image.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted.to(device) == labels.to(device)).sum().item()
    
    accuracy = 100 * correct / total
    return accuracy

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  

noisy_acc_effnet = evaluate_model_on_noisy_images(efficient_net, cifar_loader, noise_level=0.5, device=device)
noisy_acc_vit = evaluate_model_on_noisy_images(vit_model, cifar_loader, noise_level=0.5, device=device)
noisy_acc_clip = evaluate_clip_on_noisy_images(clip_model, cifar_loader, device, clip_processor, noise_level=0.5)


print("\nNoisy Accuracy:")
print(f"EfficientNet: {noisy_acc_effnet:.2f}%")
print(f"ViT: {noisy_acc_vit:.2f}%")
print(f"CLIP-ViT: {noisy_acc_clip:.2f}%")


Noisy Accuracy:
EfficientNet: 49.89%
ViT: 81.08%
CLIP-ViT: 7.26%


In [22]:
def style_transfer_vgg(content_img, style_img):
    return content_img * 0.5 + style_img * 0.5

def evaluate_model_on_styled_images(model, dataloader):
    correct = 0
    total = 0
    model.eval()

    with torch.no_grad():
        for images, labels in dataloader:
            styled_images = torch.stack([style_transfer_vgg(img, img) for img in images]).to(device)
            outputs = model(styled_images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted.to(device) == labels.to(device)).sum().item()

    accuracy = 100 * correct / total
    return accuracy

def evaluate_clip_on_styled_images(clip_model, dataloader, device, processor):
    correct = 0
    total = 0
    clip_model.eval()

    with torch.no_grad():
        for images, labels in dataloader:
            styled_images = torch.stack([style_transfer_vgg(img, img) for img in images])
            styled_images_pil = [transforms.ToPILImage()(img).convert("RGB") for img in styled_images]
            inputs = processor(text=[f"a photo of a {cifar10_test.classes[i]}" for i in range(10)], images=styled_images_pil, return_tensors="pt", padding=True).to(device)
            outputs = clip_model(**inputs)
            logits_per_image = outputs.logits_per_image
            predicted = logits_per_image.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted.to(device) == labels.to(device)).sum().item()

    accuracy = 100 * correct / total
    return accuracy

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

styled_acc_effnet = evaluate_model_on_styled_images(efficient_net, cifar_loader)
styled_acc_vit = evaluate_model_on_styled_images(vit_model, cifar_loader)
styled_acc_clip = evaluate_clip_on_styled_images(clip_model, cifar_loader, device, clip_processor)


print("\nStyled Accuracy:")
print(f"EfficientNet: {styled_acc_effnet:.2f}%")
print(f"ViT: {styled_acc_vit:.2f}%")
print(f"CLIP-ViT: {styled_acc_clip:.2f}%")


Styled Accuracy:
EfficientNet: 62.55%
ViT: 95.33%
CLIP-ViT: 26.83%


In [24]:
def scramble_image(image, patch_size=8):
    c, h, w = image.shape
    patches = image.unfold(1, patch_size, patch_size).unfold(2, patch_size, patch_size)
    patches = patches.permute(1, 2, 0, 3, 4).reshape(-1, c, patch_size, patch_size)

    np.random.shuffle(patches)

    scrambled_img = patches.view(h // patch_size, w // patch_size, c, patch_size, patch_size).permute(2, 0, 3, 1, 4).reshape(c, h, w)
    return scrambled_img

def evaluate_model_on_scrambled_images(model, dataloader):
    correct = 0
    total = 0
    model.eval()

    with torch.no_grad():
        for images, labels in dataloader:
            scrambled_images = torch.stack([scramble_image(img) for img in images]).to(device)
            outputs = model(scrambled_images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted.to(device) == labels.to(device)).sum().item()

    accuracy = 100 * correct / total
    return accuracy

def evaluate_clip_on_scrambled_images(clip_model, dataloader, device, processor):
    correct = 0
    total = 0
    clip_model.eval()

    with torch.no_grad():
        for images, labels in dataloader:
            scrambled_images = torch.stack([scramble_image(img) for img in images])
            scrambled_images_pil = [transforms.ToPILImage()(img).convert("RGB") for img in scrambled_images]
            inputs = processor(text=[f"a photo of a {cifar10_test.classes[i]}" for i in range(10)], images=scrambled_images_pil, return_tensors="pt", padding=True).to(device)
            outputs = clip_model(**inputs)
            logits_per_image = outputs.logits_per_image
            predicted = logits_per_image.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted.to(device) == labels.to(device)).sum().item()

    accuracy = 100 * correct / total
    return accuracy

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  

scrambled_acc_effnet = evaluate_model_on_scrambled_images(efficient_net, cifar_loader)
scrambled_acc_vit = evaluate_model_on_scrambled_images(vit_model, cifar_loader)
scrambled_acc_clip = evaluate_clip_on_scrambled_images(clip_model, cifar_loader, device, clip_processor)


print("\nScrambled Accuracy:")
print(f"EfficientNet: {scrambled_acc_effnet:.2f}%")
print(f"ViT: {scrambled_acc_vit:.2f}%")
print(f"CLIP-ViT: {scrambled_acc_clip:.2f}%")

  np.random.shuffle(patches)



Scrambled Accuracy:
EfficientNet: 9.25%
ViT: 17.25%
CLIP-ViT: 10.95%
