<a href="https://colab.research.google.com/github/jihuitong/vit-pytorch/blob/main/vit_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install vit_pytorch linformer torchvision tqdm

Collecting vit_pytorch
  Downloading vit_pytorch-1.8.9-py3-none-any.whl.metadata (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting linformer
  Downloading linformer-0.2.3-py3-none-any.whl.metadata (602 bytes)
Downloading vit_pytorch-1.8.9-py3-none-any.whl (135 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.9/135.9 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading linformer-0.2.3-py3-none-any.whl (6.2 kB)
Installing collected packages: linformer, vit_pytorch
Successfully installed linformer-0.2.3 vit_pytorch-1.8.9


In [8]:
# 导入相关库
import torch
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.optim.lr_scheduler import StepLR
from vit_pytorch import ViT
import torch.optim as optim
from tqdm.notebook import tqdm

batch_size = 64
epochs = 10
lr = 3e-5
gamma = 0.7
device = 'cuda' if torch.cuda.is_available() else 'cpu'

train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = CIFAR10(root='./data', train=True, download=True, transform=train_transforms)
test_dataset = CIFAR10(root='./data', train=False, download=True, transform=test_transforms)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Train Data: {len(train_loader.dataset)}")
print(f"Test Data: {len(test_loader.dataset)}")


Files already downloaded and verified
Files already downloaded and verified
Train Data: 50000
Test Data: 10000


In [9]:
model = ViT(
    image_size=224,  # 图像大小
    patch_size=16,   # 每个patch的大小
    num_classes=10,  # CIFAR-10有10个类别
    dim=512,         # transformer的维度
    depth=6,         # transformer的深度
    heads=8,         # 注意力头数
    mlp_dim=1024,    # MLP的维度
    dropout=0.1,     # dropout
    emb_dropout=0.1  # embedding层的dropout
).to(device)

# 损失函数和优化器
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

In [10]:
def train_model(model, train_loader, criterion, optimizer, scheduler, device, epochs):
    """
    :param model: 训练的模型
    :param train_loader: 训练数据加载器
    :param criterion: 损失函数
    :param optimizer: 优化器
    :param scheduler: 学习率调度器
    :param device: 设备（'cuda'）
    :param epochs: 训练轮数
    """
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        epoch_accuracy = 0


        for data, label in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
            data = data.to(device)
            label = label.to(device)

            output = model(data)
            loss = criterion(output, label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            acc = (output.argmax(dim=1) == label).float().mean()
            epoch_accuracy += acc / len(train_loader)
            epoch_loss += loss / len(train_loader)


        model.eval()
        epoch_val_accuracy = 0
        epoch_val_loss = 0
        with torch.no_grad():
            for data, label in test_loader:
                data = data.to(device)
                label = label.to(device)

                val_output = model(data)
                val_loss = criterion(val_output, label)

                acc = (val_output.argmax(dim=1) == label).float().mean()
                epoch_val_accuracy += acc / len(test_loader)
                epoch_val_loss += val_loss / len(test_loader)

        print(
            f"Epoch : {epoch+1} - loss : {epoch_loss:.4f} - acc: {epoch_accuracy:.4f} - val_loss : {epoch_val_loss:.4f} - val_acc: {epoch_val_accuracy:.4f}\n")


        scheduler.step()


In [11]:
# 训练模型
train_model(model, train_loader, criterion, optimizer, scheduler, device, epochs)

Epoch 1/10:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch : 1 - loss : 1.9060 - acc: 0.2916 - val_loss : 1.5773 - val_acc: 0.4266



Epoch 2/10:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch : 2 - loss : 1.6751 - acc: 0.3902 - val_loss : 1.3962 - val_acc: 0.4969



Epoch 3/10:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch : 3 - loss : 1.5806 - acc: 0.4288 - val_loss : 1.3557 - val_acc: 0.5149



Epoch 4/10:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch : 4 - loss : 1.5341 - acc: 0.4468 - val_loss : 1.3250 - val_acc: 0.5298



Epoch 5/10:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch : 5 - loss : 1.5036 - acc: 0.4625 - val_loss : 1.2698 - val_acc: 0.5493



Epoch 6/10:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch : 6 - loss : 1.4780 - acc: 0.4729 - val_loss : 1.2785 - val_acc: 0.5462



Epoch 7/10:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch : 7 - loss : 1.4681 - acc: 0.4767 - val_loss : 1.2779 - val_acc: 0.5553



Epoch 8/10:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch : 8 - loss : 1.4568 - acc: 0.4802 - val_loss : 1.2465 - val_acc: 0.5620



Epoch 9/10:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch : 9 - loss : 1.4510 - acc: 0.4805 - val_loss : 1.2511 - val_acc: 0.5614



Epoch 10/10:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch : 10 - loss : 1.4444 - acc: 0.4880 - val_loss : 1.2393 - val_acc: 0.5663



In [6]:
def evaluate_model(model, test_loader, device):
    """
    :param model: 评估模型
    :param test_loader: 测试数据加载器
    :param device: 设备（'cuda'）
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, label in test_loader:
            data = data.to(device)
            label = label.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()

    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")
    return accuracy

In [12]:
# 评估模型
test_accuracy = evaluate_model(model, test_loader, device)

Test Accuracy: 56.69%
