# Dog Breed Classifer

In [4]:
# model
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import models
from tqdm import tqdm

# dataset
import os
import math
import glob
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
from torchvision import transforms
from torch.utils.data import Dataset, Subset, DataLoader
import matplotlib.pyplot as plt

# save result
import pickle

In [5]:
torch.manual_seed(2022) # Set the random seed so that the random numbers generated are the same each time
try:
    device = torch.device("mps") 
except:
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f'Current Device : {device}')



Current Device : mps


## dataset

In [6]:
img_names = glob.glob("./dog-breed-identification/train/*.jpg") 

print(f'Total images in Dataset : {len(img_names)}')



Total images in Dataset : 10222


In [7]:

print(img_names[0])
print(type(img_names[0]))

img = Image.open(img_names[0])
print(img.size)

img1 = Image.open(img_names[1])
print(img1.size)

img2 = Image.open(img_names[2])
print(img2.size)



./dog-breed-identification/train/84accc2dc9f5bb3ebee89fe1bf23639c.jpg
<class 'str'>
(500, 430)
(400, 300)
(272, 350)


In [8]:
# Defines a class called DogDataset, which is a custom PyTorch dataset class

class DogDataset(Dataset):

    def __init__(self, img_path, csv_path):
        self.csv_path = csv_path
        self.transform = None

        self.img_names = glob.glob(f"{img_path}/*.jpg") 

        if csv_path:
            label_df = pd.read_csv(csv_path)
            self.label_idx2name = label_df['breed'].unique() 
            
            self.label_name2idx = {} 
            for i in range(len(self.label_idx2name)):
                self.label_name2idx[self.label_idx2name[i]] = i
                
            
            self.img2label = {}
            for _, row in label_df.iterrows():
                self.img2label[f"{img_path}/{row['id']}.jpg"] = self.label_name2idx[row['breed']]
    
    def __len__(self):
        return len(self.img_names)
    
    def __getitem__(self, index): 
        img = self.img_names[index]

        if self.csv_path:
            label = self.img2label[img]
            label = torch.tensor(label)
        else:
            label = -1
        
        img = Image.open(img).convert("RGB")
        
        img = self.transform(img)
        
        return (img, label) 
    
    
    
    

In [17]:
# Define two preprocessing functions
# vit_train_transform_fn -- used for preprocessing the train dataset
# vit_valid_transform_fn -- used for preprocessing the valid dataset

# The goal is to adjust the original images to the format suitable for the pre-trained Vision Transformer (ViT) model.
# The pre-trained ViT model: the input images are 224x224 RGB images, and the pixel values of the images are normalized (i.e., mean subtraction and standard deviation division).

channel_mean = torch.Tensor([0.485, 0.456, 0.406])
channel_std = torch.Tensor([0.229, 0.224, 0.225])

vit_train_transform_fn = transforms.Compose([
    transforms.Resize(256), 
    transforms.CenterCrop(224), 

    transforms.RandomHorizontalFlip(p=0.6),
    transforms.RandomRotation(degrees=(30)),

    transforms.ToTensor(), 
    transforms.Normalize(mean=channel_mean, std=channel_std), 
])

vit_valid_transform_fn = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
])




In [18]:

dataset = DogDataset(
    img_path="./dog-breed-identification/train",
    csv_path="./dog-breed-identification/labels.csv",
)

# print(type(dataset))
# print(dataset[0])
# csv_path="./dog-breed-identification/labels.csv",
# label_df = pd.read_csv(csv_path) # label_df是pandas的DataFrame
# label_idx2name = label_df['breed'].unique()
    
    

In [19]:

indexes = list(range(len(dataset)))

train_indexes, valid_indexes = train_test_split(indexes, test_size=0.1)

train_dataset = Subset(dataset, train_indexes) 
valid_dataset = Subset(dataset, valid_indexes)

print(f"Number of samples in train_dataset: {len(train_dataset)}")
print(f"Number of samples in valid_dataset: {len(valid_dataset)}")


Number of samples in train_dataset: 9199
Number of samples in valid_dataset: 1023


In [20]:

dataset.transform = vit_train_transform_fn
train_dataset.transform = vit_train_transform_fn
valid_dataset.transform = vit_valid_transform_fn


In [21]:

train_valid_dataloader = DataLoader(
    dataset,
    batch_size=8, 
    shuffle=True
    
)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True
)

valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=32,
    shuffle=True
)



In [23]:

# The main purpose of this function is to visualize a batch of image samples,
# helping us to intuitively understand the content and labels of these samples.

def show_samples(batch_img, batch_label=None, num_samples=16):
    sample_idx = 0
    total_col = 4
    total_row = math.ceil(num_samples / 4)
    col_idx = 0
    row_idx = 0

    fig, axs = plt.subplots(total_row, total_col, figsize=(15, 15))
    
    while sample_idx < num_samples:
        img = batch_img[sample_idx] 
        img = img.view(3, -1) * channel_std.view(3, -1) + channel_mean.view(3, -1)
        
        img = img.view(3, 224, 224) 
        
        img = img.permute(1, 2, 0)
        
        axs[row_idx, col_idx].imshow(img) 

        if batch_label != None: 
            axs[row_idx, col_idx].set_title(dataset.label_idx2name[(batch_label[sample_idx])])

        sample_idx += 1
        col_idx += 1
        if col_idx == 4: 
            col_idx = 0
            row_idx += 1

      

In [None]:


batch_img, batch_label = next(iter(train_dataloader))

show_samples(batch_img, batch_label, 8)



## build model


In [17]:
class PretrainViT(nn.Module):

    def __init__(self):
        super(PretrainViT, self).__init__()

        
        model = models.vit_l_16() ###
        model.load_state_dict(torch.load('./vit_l_16-852ce7e3.pth')) ###


        num_classifier_feature = model.heads.head.in_features
        
        
        model.heads.head = nn.Sequential(
            nn.Linear(num_classifier_feature, 120)
        )
        
        
        self.model = model

        for param in self.model.named_parameters():
            if "heads" not in param[0]:
                param[1].requires_grad = False
                

    def forward(self, x):
        return self.model(x)
    
    
    

In [18]:

net = PretrainViT() 
net.to(device) 
print(f"number of paramaters: {sum([param.numel() for param in net.parameters() if param.requires_grad])}")



number of paramaters: 123000


## train model

In [19]:


criterion = nn.CrossEntropyLoss()

optimizer = optim.SGD(net.parameters(), lr=0.009, momentum=0.9)


In [20]:


def get_accuracy(output, label):
    output = output.to("cpu") 
    label = label.to("cpu")

    sm = F.softmax(output, dim=1)
    _, index = torch.max(sm, dim=1)
    
    return torch.sum((label == index)) / label.size()[0]


In [21]:


def train(model, dataloader):
    model.train()
    
    running_loss = 0.0 
    total_loss = 0.0 
    running_acc = 0.0
    total_acc = 0.0

    for batch_idx, (batch_img, batch_label) in enumerate(dataloader): 

        batch_img = batch_img.to(device)
        batch_label = batch_label.to(device)

        optimizer.zero_grad() 
        
        output = net(batch_img) 
        
        loss = criterion(output, batch_label) 
        
        loss.backward() 
        optimizer.step()

        running_loss += loss.item()
        total_loss += loss.item()
        
        acc = get_accuracy(output, batch_label) 
        
        running_acc += acc
        total_acc += acc
        
        if batch_idx % 100 == 0 and batch_idx != 0:
            print(f"[step: {batch_idx:4d}/{len(dataloader)}] loss: {running_loss / 100:.3f}")
            running_loss = 0.0
            running_acc = 0.0
    
    return total_loss / len(dataloader), total_acc / len(dataloader)



In [22]:

def validate(model, dataloader):
    model.eval()
    total_loss = 0.0
    total_acc = 0.0

    for batch_idx, (batch_img, batch_label) in enumerate(dataloader):

        batch_img = batch_img.to(device)
        batch_label = batch_label.to(device)

        # optimizer.zero_grad()
        output = net(batch_img)
        loss = criterion(output, batch_label)
        # loss.backward()
        # optimizer.step()

        total_loss += loss.item()
        acc = get_accuracy(output, batch_label)
        total_acc += acc
    
    return total_loss / len(dataloader), total_acc / len(dataloader)



In [23]:


EPOCHS = 7 
train_loss_history = []
valid_loss_history = []

train_acc_history = []
valid_acc_history = []

for epoch in range(EPOCHS):
    train_loss, train_acc = train(net, train_dataloader) 
    valid_loss, valid_acc = validate(net, valid_dataloader)
    
    print(f"Epoch: {epoch:2d}, training loss: {train_loss:.3f}, training acc: {train_acc:.3f} validation loss: {valid_loss:.3f}, validation acc: {valid_acc:.3f}")

    train_loss_history.append(train_loss)
    valid_loss_history.append(valid_loss)

    train_acc_history.append(train_acc)
    valid_acc_history.append(valid_acc)

    if valid_loss <= min(valid_loss_history):
        torch.save(net.state_dict(), "net.pt")
        
# Epoch:  0, training loss: 0.400, training acc: 0.924 validation loss: 0.146, validation acc: 0.963
# Epoch:  1, training loss: 0.086, training acc: 0.975 validation loss: 0.137, validation acc: 0.961
# Epoch:  2, training loss: 0.062, training acc: 0.982 validation loss: 0.117, validation acc: 0.964
# Epoch:  3, training loss: 0.049, training acc: 0.987 validation loss: 0.129, validation acc: 0.965
# Epoch:  4, training loss: 0.040, training acc: 0.989 validation loss: 0.130, validation acc: 0.960
# Epoch:  5, training loss: 0.034, training acc: 0.991 validation loss: 0.137, validation acc: 0.961
# Epoch:  6, training loss: 0.030, training acc: 0.992 validation loss: 0.146, validation acc: 0.959

        

[step:  100/1150] loss: 2.540
[step:  200/1150] loss: 0.491
[step:  300/1150] loss: 0.224
[step:  400/1150] loss: 0.210
[step:  500/1150] loss: 0.179
[step:  600/1150] loss: 0.166
[step:  700/1150] loss: 0.146
[step:  800/1150] loss: 0.188
[step:  900/1150] loss: 0.148
[step: 1000/1150] loss: 0.142
[step: 1100/1150] loss: 0.129
Epoch:  0, training loss: 0.400, training acc: 0.924 validation loss: 0.146, validation acc: 0.963
[step:  100/1150] loss: 0.098
[step:  200/1150] loss: 0.091
[step:  300/1150] loss: 0.073
[step:  400/1150] loss: 0.090
[step:  500/1150] loss: 0.083
[step:  600/1150] loss: 0.069
[step:  700/1150] loss: 0.106
[step:  800/1150] loss: 0.083
[step:  900/1150] loss: 0.078
[step: 1000/1150] loss: 0.102
[step: 1100/1150] loss: 0.077
Epoch:  1, training loss: 0.086, training acc: 0.975 validation loss: 0.137, validation acc: 0.961
[step:  100/1150] loss: 0.059
[step:  200/1150] loss: 0.064
[step:  300/1150] loss: 0.070
[step:  400/1150] loss: 0.049
[step:  500/1150] loss

## predict on test dataset and submit to Kaggle

In [24]:

net = PretrainViT()

net.load_state_dict(torch.load("./net.pt", map_location="cpu"))

net.to(device)
net.eval()

PretrainViT(
  (model): VisionTransformer(
    (conv_proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
    (encoder): Encoder(
      (dropout): Dropout(p=0.0, inplace=False)
      (layers): Sequential(
        (encoder_layer_0): EncoderBlock(
          (ln_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (self_attention): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (dropout): Dropout(p=0.0, inplace=False)
          (ln_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (mlp): MLPBlock(
            (0): Linear(in_features=1024, out_features=4096, bias=True)
            (1): GELU(approximate='none')
            (2): Dropout(p=0.0, inplace=False)
            (3): Linear(in_features=4096, out_features=1024, bias=True)
            (4): Dropout(p=0.0, inplace=False)
          )
        )
        (encoder_layer_1): EncoderBlock(
          (ln_

In [25]:


submit_df = pd.read_csv("./dog-breed-identification/sample_submission.csv")
test_names = submit_df["id"].values
columns = list(dataset.label_idx2name)



In [26]:


class TestDataset(Dataset):

    def __init__(self, test_names, transform_fn):
        self.test_names = test_names
        self.transform = transform_fn
    
    def __len__(self):
        return len(self.test_names)

    def __getitem__(self, idx):
        name = self.test_names[idx]
        path = os.path.join("./dog-breed-identification/test", name + ".jpg")
        img = Image.open(path)
        img = self.transform(img)
        return (img, name)
    
    

In [27]:


dataset = TestDataset(
    test_names = test_names,
    transform_fn = vit_valid_transform_fn
)


test_dataloader = DataLoader(
    dataset=dataset,
    batch_size=64,
    shuffle=False
)




In [28]:


len(columns)



120

In [29]:

with torch.no_grad():

    dfs = []

    for batch_idx, (batch_img, batch_name) in tqdm(enumerate(test_dataloader)):
        df = pd.DataFrame(columns=["id"] + columns)
        df["id"] = batch_name

        batch_img = batch_img.to(device)
        output = net(batch_img)
        sm = F.softmax(output, dim=1)
        df[columns] = sm.cpu().numpy()
        dfs.append(df)




162it [27:08, 10.05s/it]


In [30]:

my_submit = pd.concat(dfs)

my_submit.to_csv("submit.csv", index=False)



In [37]:

from PIL import Image
import torchvision.transforms as transforms
import pandas as pd

# load model
net = PretrainViT()
net.load_state_dict(torch.load("net.pt", map_location="cpu"))
net.to(device)
net.eval()

image_path='my_test_bianmu.jpg'

# image_path = "path_to_your_image.jpg"
image_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std)
])
image = Image.open(image_path).convert("RGB")
image = image_transform(image).unsqueeze(0).to(device)

# predict
with torch.no_grad():
    output = net(image)
    probabilities = torch.softmax(output, dim=1)
    print(probabilities)
    _, predicted_label = torch.max(probabilities, dim=1)
    print(predicted_label)


# get the prediction result
csv_path="./dog-breed-identification/labels.csv"
label_df = pd.read_csv(csv_path) 
label_idx2name = label_df['breed'].unique()

predicted_class = label_idx2name[predicted_label.item()]

print("Predicted class:", predicted_class)





tensor([[1.1493e-05, 8.4608e-06, 1.5825e-06, 2.5776e-05, 2.0882e-06, 4.5376e-06,
         6.3245e-06, 7.9643e-06, 4.6323e-06, 1.8938e-05, 1.4869e-06, 2.2787e-05,
         8.4207e-06, 3.7243e-06, 1.2103e-05, 1.8954e-05, 4.4983e-06, 4.2624e-06,
         1.0047e-05, 8.3524e-06, 1.6876e-05, 1.2239e-05, 1.0477e-05, 6.9312e-06,
         9.3976e-06, 1.0263e-05, 5.3778e-06, 3.6557e-05, 8.5943e-06, 8.0441e-06,
         9.7913e-06, 1.1273e-05, 5.6956e-06, 2.0685e-05, 1.2340e-05, 4.5120e-06,
         3.6703e-06, 1.4041e-05, 5.5831e-06, 1.0359e-05, 1.8812e-06, 1.4196e-06,
         7.4255e-06, 8.2518e-05, 7.5308e-06, 6.8385e-06, 3.0926e-05, 9.0590e-06,
         5.1992e-06, 5.4543e-06, 3.5545e-06, 3.7932e-05, 8.8038e-05, 9.8376e-01,
         1.9835e-06, 1.3028e-02, 2.9180e-06, 1.0111e-06, 1.4640e-06, 2.1915e-06,
         2.1954e-06, 1.6943e-05, 9.7967e-06, 3.1048e-06, 2.7619e-06, 7.7830e-06,
         8.5651e-06, 1.7553e-05, 5.2244e-06, 5.2910e-06, 1.8366e-04, 3.6585e-06,
         3.7353e-06, 3.9332e