Output shape: torch.Size([32, 256])


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# Define transformations
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Load CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

# Define data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)

# Define the model
class GlobalAttentionCNN(nn.Module):
    def __init__(self):
        super(GlobalAttentionCNN, self).__init__()
        # Define your CNN layers and GlobalAttention module
        
    def forward(self, x):
        # Forward pass through your CNN and GlobalAttention module
        return x

# Initialize the model
model = GlobalAttentionCNN()

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    # Validation loop
    model.eval()
    with torch.no_grad():
        total_correct = 0
        total_samples = 0
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)
        
        accuracy = total_correct / total_samples
        print(f'Epoch [{epoch+1}/{num_epochs}], Accuracy: {accuracy:.4f}')

# Evaluate the trained model on the test set
model.eval()
with torch.no_grad():
    total_correct = 0
    total_samples = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    accuracy = total_correct / total_samples
    print(f'Accuracy on the test set: {accuracy:.4f}')


Files already downloaded and verified
Files already downloaded and verified


ValueError: optimizer got an empty parameter list

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GlobalAttention(nn.Module):
    def __init__(self, patch_size, num_patches, hidden_dim):
        super(GlobalAttention, self).__init__()
        self.patch_size = patch_size
        self.num_patches = num_patches
        self.hidden_dim = hidden_dim
        
        # Linear layers to project patch embeddings
        self.patch_embedding = nn.Linear(patch_size * patch_size * 3, hidden_dim)
        self.global_embedding = nn.Linear(hidden_dim, 1)
        
    def forward(self, patches):
        # patches: (batch_size, num_patches, patch_size * patch_size * 3)
        
        # Project patch embeddings
        patch_embeddings = self.patch_embedding(patches)  # (batch_size, num_patches, hidden_dim)
        
        # Calculate attention scores
        attention_scores = self.global_embedding(torch.tanh(patch_embeddings))  # (batch_size, num_patches, 1)
        
        # Compute attention weights using softmax
        attention_weights = F.softmax(attention_scores, dim=1)  # (batch_size, num_patches, 1)
        
        # Weighted sum of patch embeddings
        global_embedding = torch.sum(patch_embeddings * attention_weights, dim=1)  # (batch_size, hidden_dim)
        
        return global_embedding

class GlobalAttentionCNN(nn.Module):
    def __init__(self, patch_size, num_patches, hidden_dim):
        super(GlobalAttentionCNN, self).__init__()
        self.patch_size = patch_size
        self.num_patches = num_patches
        self.hidden_dim = hidden_dim
        
        # Define CNN layers
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        
        # Define GlobalAttention module
        self.global_attention = GlobalAttention(patch_size, num_patches, hidden_dim)
        
        # Define fully connected layers for classification
        self.fc1 = nn.Linear(hidden_dim, 256)
        self.fc2 = nn.Linear(256, 10)  # 10 classes in CIFAR-10
    
    def forward(self, x):
        # Forward pass through CNN layers
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.conv3(x))
        x = F.max_pool2d(x, 2)
        
        # Rearrange features into patches
        batch_size, _, h, w = x.size()
        x = x.view(batch_size, h * w, -1)
        
        # Apply GlobalAttention module
        x = self.global_attention(x)
        
        # Apply fully connected layers for classification
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x

# Example usage
patch_size = 8
num_patches = 64
hidden_dim = 256

# Initialize the model
model = GlobalAttentionCNN(patch_size, num_patches, hidden_dim)


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# Define transformations
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Load CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

# Define data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)

# Define the CNN layers
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 4 * 4, 512)  # Assuming input image size is 32x32 after pooling 3 times
        self.fc2 = nn.Linear(512, 10)  # 10 classes for CIFAR-10

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(-1, 64 * 4 * 4)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define the Global Attention module
class GlobalAttention(nn.Module):
    def __init__(self, patch_size, num_patches, hidden_dim):
        super(GlobalAttention, self).__init__()
        self.patch_size = patch_size
        self.num_patches = num_patches
        self.hidden_dim = hidden_dim
        self.patch_embedding = nn.Linear(patch_size * patch_size * 3, hidden_dim)
        self.global_embedding = nn.Linear(hidden_dim, 1)

    def forward(self, patches):
        patch_embeddings = self.patch_embedding(patches)
        attention_scores = self.global_embedding(torch.tanh(patch_embeddings))
        attention_weights = F.softmax(attention_scores, dim=1)
        global_embedding = torch.sum(patch_embeddings * attention_weights, dim=1)
        return global_embedding

# Initialize the model
class GlobalAttentionCNN(nn.Module):
    def __init__(self, patch_size, num_patches, hidden_dim):
        super(GlobalAttentionCNN, self).__init__()
        self.cnn = CNN()
        self.global_attention = GlobalAttention(patch_size, num_patches, hidden_dim)

    def forward(self, x):
        x = self.cnn(x)
        x = self.global_attention(x)
        return x

# Initialize the model and send to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GlobalAttentionCNN(patch_size=16, num_patches=64, hidden_dim=256).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 200 == 199:    # Print every 200 mini-batches
            print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {running_loss / 200:.3f}')
            running_loss = 0.0

print('Finished Training')

# Evaluate the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        inputs, labels = data[0].to(device), data[1].to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy on the test set: {100 * correct / total}%')


Files already downloaded and verified
Files already downloaded and verified


RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x10 and 768x256)

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from torch.nn.functional import softmax

# Define the CNN model with attention between patches
class PatchAttentionCNN(nn.Module):
    def __init__(self):
        super(PatchAttentionCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc = nn.Linear(256 * 8 * 8, 10)  # CIFAR-10 has 10 classes

        # Attention mechanism
        self.attention = nn.MultiheadAttention(embed_dim=256, num_heads=8)

    def forward(self, x):
        # Forward pass through CNN layers
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))
        x = x.view(-1, 256 * 8 * 8)  # Flatten the output
        x = self.fc(x)  # Linear layer

        # Attention mechanism between patches
        x = x.unsqueeze(0)  # Add batch dimension
        x = x.permute(1, 0, 2)  # Reshape for attention mechanism
        x, _ = self.attention(x, x, x)  # Multihead attention
        x = x.squeeze(0)  # Remove batch dimension

        return x

# Load CIFAR-10 dataset and define transforms
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = ImageFolder(root='path/to/train/dataset', transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Initialize the model, loss function, and optimizer
model = PatchAttentionCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(train_dataset)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}')

print('Finished Training')


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'path/to/train/dataset'

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader
from torch.nn.functional import softmax

# Define the CNN model with attention between patches
class PatchAttentionCNN(nn.Module):
    def __init__(self):
        super(PatchAttentionCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc = nn.Linear(256 * 4 * 4, 10)  # CIFAR-10 has 10 classes

        # Attention mechanism
        self.attention = nn.MultiheadAttention(embed_dim=256, num_heads=8)

    def forward(self, x):
        # Forward pass through CNN layers
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))
        x = x.view(-1, 256 * 4 * 4)  # Flatten the output
        x = self.fc(x)  # Linear layer

        # Attention mechanism between patches
        x = x.unsqueeze(0)  # Add batch dimension
        x = x.permute(1, 0, 2)  # Reshape for attention mechanism
        x, _ = self.attention(x, x, x)  # Multihead attention
        x = x.squeeze(0)  # Remove batch dimension

        return x

# CIFAR-10 dataset preprocessing
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Load CIFAR-10 dataset
train_dataset = CIFAR10(root='./data', train=True, transform=transform, download=True)
test_dataset = CIFAR10(root='./data', train=False, transform=transform, download=True)

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Initialize the model, loss function, and optimizer
model = PatchAttentionCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(train_dataset)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}')

print('Finished Training')

# Evaluate the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Accuracy on the test set: {100 * accuracy:.2f}%')


In [None]:
# Gemini
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Define Patch Embedding Layer
class PatchEmbedding(nn.Module):
    def __init__(self, image_size, patch_size, num_channels, embedding_dim):
        super().__init__()
        self.patch_size = patch_size
        self.num_patches = (image_size // patch_size) ** 2
        self.projection = nn.Conv2d(num_channels, embedding_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        x = self.projection(x)  # Reshape into patches
        x = x.flatten(2).transpose(1, 2)  # Reshape into (B, num_patches, embedding_dim)
        return x

# Define Transformer Encoder Block with Positional Encoding (Optional)
class TransformerEncoderBlock(nn.Module):
    def __init__(self, dim, num_heads, mlp_dim, dropout=0.1):
        super().__init__()
        self.attention_layer = nn.MultiheadAttention(dim, num_heads, dropout=dropout)
        self.mlp_layer = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_dim, dim),
        )
        self.layernorm1 = nn.LayerNorm(dim)
        self.layernorm2 = nn.LayerNorm(dim)

    def forward(self, x, pos):  # Add positional encoding input (pos)
        attention_output, attention_weights = self.attention_layer(x, x, x)  # Self-attention
        x = x + attention_output
        x = x + self.mlp_layer(self.layernorm1(x))
        return x

# Define Vision Transformer (ViT) Model
class ViT(nn.Module):
    def __init__(self, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels=3):
        super().__init__()
        self.patch_embedding = PatchEmbedding(image_size, patch_size, channels, dim)
        self.class_token = nn.Parameter(torch.zeros(1, 1, dim))
        self.pos_encoder = nn.Embedding(self.patch_embedding.num_patches + 1, dim)  # Add 1 for class token
        self.encoder = nn.Sequential(*[
            TransformerEncoderBlock(dim, heads, mlp_dim) for _ in range(depth)
        ])
        self.classification_head = nn.Linear(dim, num_classes)

    def forward(self, x):
        x = self.patch_embedding(x)
        # Add positional encoding (consider different strategies like sine or learned)
        pos = self.pos_encoder(torch.arange(1, x.shape[1] + 1, device=x.device))  # 1 for class token
        x = torch.cat((self.class_token.expand(x.shape[0], -1, -1), x), dim=1)
        # print(x.size), pos.size())
        x = x + pos  # Add positional encoding
        x = self.encoder(x)
        x = self.classification_head(x[:, 0])  # Take the class token output
        return x

# Training Function (Example)
def train(model, device, train_loader, optimizer, criterion, epoch):
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        if (i+1) % 100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.3f}'.format(
                epoch, args.epochs, i+1, len(train_loader.dataset), loss.item()))

# Training Setup (Example)
# Training Setup (Example)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Hyperparameters (replace with your desired values)
image_size = 32  # CIFAR-10 image size
patch_size = 4  # Patch size for ViT
num_classes = 10  # Number of classes in CIFAR-10
dim = 128  # Embedding dimension
depth = 6  # Number of Transformer encoder blocks
heads = 8  # Number of heads in multi-head attention
mlp_dim = 4 * dim  # MLP dimension
learning_rate = 1e-3  # Learning rate for optimizer
epochs = 10  # Number of training epochs

# Data Loading and Augmentation
train_transform = transforms.Compose([
    transforms.RandomCrop(image_size, padding=4),  # Random cropping for augmentation
    transforms.RandomHorizontalFlip(),  # Random horizontal flip for augmentation
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2010, 0.2178, 0.2302)),  # Normalize CIFAR-10 data
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2010, 0.2178, 0.2302)),
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)  # Adjust batch size as needed
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Model Initialization
model = ViT(image_size, patch_size, num_classes, dim, depth, heads, mlp_dim).to(device)

# Optimizer and Loss Function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Training Loop
for epoch in range(epochs):
    train(model, device, train_loader, optimizer, criterion, epoch)

#  (Optional) Save the trained model
torch.save(model.state_dict(), 'vit_cifar10.pt')

#  (Optional) Evaluate model performance on test set
# ... (implement evaluation logic here)

