# 1. Import library

In [1]:
# Regarding pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.optim as optim
# Other
import timm
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm.notebook import tqdm
import os
from sklearn.model_selection import train_test_split

# 2. Dataset

In [2]:
train_path = '/kaggle/input/market-dataset-2/Market-Pytorch/Market/train/'
val_path = '/kaggle/input/market-dataset-2/Market-Pytorch/Market/val/'

In [3]:
def extractImageandLabel(path):
    list_subfolder = os.listdir(path)
    list_images = []
    list_classes = []
    list_labels = []
    for i, subfolder in enumerate(list_subfolder):
        _class = int(subfolder) # Class của đối tượng
        sub_path = path + subfolder + '/' # Đường dẫn đến folder chứa các ảnh thuộc cùng 1 class
        list_image_names = os.listdir(sub_path) # Tên các ảnh của folder ở trên
        list_image_paths = [sub_path + name for name in list_image_names] # Đường dẫn tới các ảnh đó
        for image_path in list_image_paths:
            image = Image.open(image_path)
            list_images.append(image)
            list_classes.append(_class)
            list_labels.append(i)
    return list_images, list_classes, list_labels

In [4]:
train_list_images, train_list_classes, train_list_labels = extractImageandLabel(train_path)

In [5]:
val_list_images, val_list_classes, val_list_labels = extractImageandLabel(val_path)

In [6]:
print('Số lượng ảnh ở tập train: ', len(train_list_images))
print('Số lượng lớp ở tập train: ', len(set(train_list_classes)))
print('Số lượng ảnh ở tập val: ', len(val_list_images))
print('Số lượng lớp ở tập val: ', len(set(val_list_classes)))

Số lượng ảnh ở tập train:  12185
Số lượng lớp ở tập train:  751
Số lượng ảnh ở tập val:  751
Số lượng lớp ở tập val:  751


In [7]:
transform_train_list = [
    transforms.Resize((224,224), interpolation= 3),
    transforms.ToTensor(),
    transforms.RandomHorizontalFlip(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
]
transform_val_list = [
    transforms.Resize((224, 224), interpolation= 3),
    transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
]
data_transforms = [
    transforms.Compose(transform_train_list), # train
    transforms.Compose(transform_val_list) # val
]

In [8]:
class TrainValDataset(Dataset):
    def __init__(self, list_images, list_labels, data_transforms, is_train = True):
        self.list_images = list_images
        self.list_labels = list_labels
        self.data_transforms = data_transforms
        self.is_train = is_train
    def __len__(self):
        return len(self.list_images)
    def __getitem__(self, idx):
        label = self.list_labels[idx] # label
        # Image
        image = self.list_images[idx]
        transform = self.data_transforms[0]   
        if not self.is_train:
            transform = self.data_transforms[1]
        image = transform(image)
        return image, label

In [9]:
train_dataset = TrainValDataset(train_list_images, train_list_labels, data_transforms)
val_dataset = TrainValDataset(val_list_images, val_list_labels, data_transforms, is_train = False)

In [10]:
train_loader = DataLoader(train_dataset, shuffle = True, batch_size = 32)
val_loader = DataLoader(val_dataset, shuffle = True, batch_size = 32)

In [11]:
a, b = next(iter(train_loader)) # Dùng để kiểm tra code TrainValDataset chạy đúng không

# 3. Model

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [13]:
# Load pre-trained ViT
vit_base = timm.create_model('vit_base_patch16_224', pretrained=True, num_classes=751)
vit_base= vit_base.to(device)
vit_base.eval()

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity(

In [14]:
class FCLayer(nn.Module):
    def __init__(self, input_dim=768, num_bottleneck=512, num_classes=751):
        super().__init__()
        self.input_dim = input_dim
        self.num_bottleneck = num_bottleneck
        self.num_classes = num_classes
        self.linear1 = nn.Linear(self.input_dim, self.num_bottleneck)
        self.linear2 = nn.Linear(self.num_bottleneck, self.num_classes)
        self.ReLU = nn.ReLU()
        self.batch_norm = nn.BatchNorm1d(self.num_bottleneck)
        
        
    def forward(self, x):
        x = self.linear1(x)
        x = self.ReLU(x)
        x = self.batch_norm(x)
        x = self.linear2(x)
        return x

In [15]:
class LATransformer(nn.Module):
    def __init__(self, backbone_model, _lambda):
        super().__init__()
        self.backbone_model = backbone_model
        self._lambda = _lambda
        self.avgpool = nn.AdaptiveAvgPool2d((1,768))
        self.fc_layers = nn.ModuleList([FCLayer() for _ in range(3)])
        
    def forward(self, x):
        x = self.backbone_model.patch_embed(x)
        cls_token = self.backbone_model.cls_token.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_token, x), dim=1)
        x = self.backbone_model.pos_drop(x + self.backbone_model.pos_embed)

        # Feed forward through transformer blocks
        for i in range(12):
            x = self.backbone_model.blocks[i](x)
        x = self.backbone_model.norm(x)

        # extract the cls token
        G = x[:,0,:].unsqueeze(1) # cls
        Q = x[:,1:,:] # local feature # (batch_size, 128, 768)
        Q1 = Q[:,0:36,:]
        Q2 = Q[:,28:160,:]
        Q3 = Q[:,128:,:]
        f = [self.avgpool(Q1), self.avgpool(Q2), self.avgpool(Q3)]
        for i in range(3):
            out = torch.mul(f[i], self._lambda).squeeze()
            f[i] = torch.div(torch.add(G.squeeze(),out), 1+self._lambda)
        y = []
        for i in range(3):
            y_i = self.fc_layers[i](f[i])
            y.append(y_i)
            
        return f, y

In [16]:
la_transformer_model = LATransformer(vit_base, 8).to(device)
# la_transformer_model.load_state_dict(torch.load('/kaggle/input/best-model/best_model.pth'))
CE_loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(la_transformer_model.parameters(), lr=0.0001)

In [17]:
def freeze_all_blocks(model):
    num_blocks = 12
    for block in model.backbone_model.blocks[:num_blocks]:
        for param in block.parameters():
            param.requires_grad = False

In [18]:
def unfreeze(model, num_blocks, remaining_count):
    block = model.backbone_model.blocks[remaining_count - 1]
    for param in block.parameters():
        param.requires_grad = True

# 4. Train

In [34]:
# Train mà không dung block wise
def train(model, train_loader, num_epochs):
    softmax = nn.Softmax(dim = 1)
    for epoch in range(num_epochs):
        model.train()
        correct = 0
        total_loss = 0
        for image, label in tqdm(train_loader):
            optimizer.zero_grad()
            image, label = image.to(device), label.to(device)
            _, outputs = model(image)
            score = 0
            loss = 0.0
            for output in outputs:
                score += softmax(output)
                loss += CE_loss(output, label)
            total_loss += loss.item()
            #
            loss.backward()
            optimizer.step()
            #
            pred = torch.argmax(score, dim = 1)
            correct += (pred == label).sum()
            #
        avg_loss = total_loss/len(train_list_images)
        file_path = f'/kaggle/working/model_epoch_{epoch}.pth'
        torch.save(model.state_dict(), file_path)
        print(f'Epoch {epoch}: Loss: {avg_loss} Train_accuracy: {correct/len(train_list_images)}')

In [35]:
train(la_transformer_model, train_loader, 1)

  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 13: Loss: 0.003277631509005045 Train_accuracy: 0.9967993497848511


In [19]:
# Train sử dụng block wise
def train_with_blockwise(model, train_loader, num_epochs, t = 2, lr_decay = 0.85):
    softmax = nn.Softmax(dim = 1)
    freeze_all_blocks(model)
    count = 0
    for epoch in range(num_epochs):
        model.train()
        if epoch > 0 and epoch % t ==0:
            count += 1
            remaining_cout = 12 - count
            unfreeze(model, 12, remaining_cout)
            optimizer.param_groups[0]['lr'] *= lr_decay
        correct = 0
        total_loss = 0
        for image, label in tqdm(train_loader):
            optimizer.zero_grad()
            image, label = image.to(device), label.to(device)
            _, outputs = model(image)
            score = 0
            loss = 0.0
            for output in outputs:
                score += softmax(output)
                loss += CE_loss(output, label)
            total_loss += loss.item()
            #
            loss.backward()
            optimizer.step()
            #
            pred = torch.argmax(score, dim = 1)
            correct += (pred == label).sum()
            #
        avg_loss = total_loss/len(train_list_images)
        file_path = f'/kaggle/working/model_epoch_{epoch}.pth'
        torch.save(model.state_dict(), file_path)
        print(f'Epoch {epoch}: Loss: {avg_loss} Train_accuracy: {correct/len(train_list_images)}')

In [20]:
train_with_blockwise(la_transformer_model,train_loader, 26)

  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 0: Loss: 0.5747545452845796 Train_accuracy: 0.09995896369218826


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 1: Loss: 0.4162617938949967 Train_accuracy: 0.3189987540245056


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 2: Loss: 0.27472100091557744 Train_accuracy: 0.5643003582954407


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 3: Loss: 0.16341195010630186 Train_accuracy: 0.7763643860816956


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 4: Loss: 0.09132582433017297 Train_accuracy: 0.897661030292511


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 5: Loss: 0.04508650524478297 Train_accuracy: 0.9610996842384338


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 6: Loss: 0.022324143201540764 Train_accuracy: 0.9875256419181824


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 7: Loss: 0.010206138169271416 Train_accuracy: 0.9959786534309387


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 8: Loss: 0.006202331093011159 Train_accuracy: 0.9978662133216858


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 9: Loss: 0.0035050896741984855 Train_accuracy: 0.9993434548377991


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 10: Loss: 0.0027046276663824044 Train_accuracy: 0.9995076060295105


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 11: Loss: 0.00265907961500684 Train_accuracy: 0.9989331364631653


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 12: Loss: 0.0023332517248626904 Train_accuracy: 0.9988510608673096


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 13: Loss: 0.001062766525238618 Train_accuracy: 0.9998358488082886


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 14: Loss: 0.0007165659988695694 Train_accuracy: 1.0


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 15: Loss: 0.0005622121790530229 Train_accuracy: 1.0


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 16: Loss: 0.0004736749965515619 Train_accuracy: 1.0


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 17: Loss: 0.00039759337963978924 Train_accuracy: 1.0


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 18: Loss: 0.00034603791075973027 Train_accuracy: 1.0


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 19: Loss: 0.0026108652753789843 Train_accuracy: 0.9975379705429077


  0%|          | 0/381 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [21]:
os.chdir(r'/kaggle/working')
from IPython.display import FileLink
FileLink(r'model_epoch_19.pth')

# 5. Validate

In [22]:
def validate(model, val_loader):
    correct = 0
    total = 751
    total_loss = 0
    softmax = nn.Softmax(dim = 1)
    for image, label in tqdm(val_loader):
        image, label = image.to(device), label.to(device)
        score = 0
        loss = 0
        with torch.no_grad():
            _, outputs = model(image)
            for output in outputs:
                score += softmax(output)
                loss += CE_loss(output, label)
            total_loss += loss.item()
            pred = torch.argmax(score, dim = 1)
            correct += (pred == label).sum()
    avg_loss = total_loss/total
    print(f'Loss: {avg_loss} Val_accuracy: {correct/total}')    

In [23]:
validate(la_transformer_model, val_loader)

  0%|          | 0/24 [00:00<?, ?it/s]

Loss: 0.05136664324848058 Val_accuracy: 0.9001331329345703


# 6. Test

In [24]:
query_path = '/kaggle/input/market-dataset-2/Market-Pytorch/Market/query/'
gallery_path = '/kaggle/input/market-dataset-2/Market-Pytorch/Market/gallery/'

In [25]:
list_subfolder = os.listdir('/kaggle/input/market-dataset-2/Market-Pytorch/Market/gallery/')
dictionary = {}
for i, subfolder in enumerate(list_subfolder):
    _class = int(subfolder)
    dictionary[_class] = i

In [26]:
def extractImageandLabelTest(path, dictionary):
    list_subfolder = os.listdir(path)
    list_images = []
    list_labels = []
    for i, subfolder in enumerate(list_subfolder):
        _class = int(subfolder) # Class của đối tượng
        sub_path = path + subfolder + '/' # Đường dẫn đến folder chứa các ảnh thuộc cùng 1 class
        list_image_names = os.listdir(sub_path) # Tên các ảnh của folder ở trên
        list_image_paths = [sub_path + name for name in list_image_names] # Đường dẫn tới các ảnh đó
        for image_path in list_image_paths:
            image = Image.open(image_path)
            list_images.append(image)   
            list_labels.append(dictionary[_class])
    return list_images, list_labels

In [27]:
gallery_list_images, gallery_list_labels  = extractImageandLabelTest(gallery_path, dictionary)
query_list_images, query_list_labels  = extractImageandLabelTest(query_path, dictionary)

In [28]:
print('Số lượng ảnh trong gallery: ', len(gallery_list_images))
print('Số lượng class trong gallery: ', len(set(gallery_list_labels)))
print('Số lượng ảnh trong query: ', len(query_list_images))
print('Số lượng class trong query: ', len(set(query_list_labels)))

Số lượng ảnh trong gallery:  19732
Số lượng class trong gallery:  752
Số lượng ảnh trong query:  3368
Số lượng class trong query:  750


In [29]:
gallery_dataset = TrainValDataset(gallery_list_images, gallery_list_labels, data_transforms, is_train = False)
query_dataset = TrainValDataset(query_list_images, query_list_labels, data_transforms)

In [30]:
query_loader = DataLoader(query_dataset, batch_size = 16, shuffle=False)
gallery_loader = DataLoader(gallery_dataset, batch_size = 16, shuffle = False)

In [31]:
def extract_feature(model, dataloader):
    features = torch.FloatTensor().to(device)
    for image, label in tqdm(dataloader):
        image, label = image.to(device), label.to(device)
        with torch.no_grad():
            f, _ = model(image)
            temp = torch.cat((f[0], f[1]), dim = 1)
            f123 = torch.cat((temp, f[2]), dim = 1)
            features = torch.cat((features, f123), dim = 0)
    return features

In [32]:
query_feature= extract_feature(la_transformer_model, query_loader)
gallery_feature = extract_feature(la_transformer_model, gallery_loader)

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/1234 [00:00<?, ?it/s]

In [33]:
def concatenated_feature(feature):
    mean = torch.mean(feature, dim=1, keepdim=True).expand_as(feature)
    std_dev = torch.std(feature, dim=1, keepdim=True).expand_as(feature)
    normalized_feature = (feature - mean) / std_dev
    list_feature = [feature.view(-1) for feature in normalized_feature]
    return list_feature

In [34]:
concatenated_query_vectors = concatenated_feature(query_feature)
concatenated_gallery_vectors = concatenated_feature(gallery_feature)

In [35]:
def rank1(_class, output):
    pred_class = output[1]
    true_class = np.array(_class)
    if _class == pred_class[0][0]:
        return True
    return False

In [36]:
def rank5(_class, output):
    pred_class = output[1]
    if _class in pred_class[0][:5]:
        return True
    return False

In [37]:
def rank10(_class, output):
    pred_class = output[1]
    if _class in pred_class[0][:10]:
        return True
    return False

In [38]:
def calc_map(label, output): # mai code lại do chưa hiểu mAP là gì
    count = 0
    score = 0
    good = 0
    for out in output[1][0]:
        count += 1
        if out==label:
            good += 1
            score += (good/count)
    if good==0:
        return 0
    return score/good

In [39]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [40]:
import faiss


index = faiss.IndexIDMap(faiss.IndexFlatIP(2304))

index.add_with_ids(np.array([t.to('cpu').numpy() for t in concatenated_gallery_vectors]),np.array(gallery_list_labels))


def search(query: str, k=1):
    encoded_query = query.unsqueeze(dim=0).to('cpu').numpy()
    top_k = index.search(encoded_query, k)
    return top_k

In [41]:
rank1_score = 0
rank5_score = 0
rank10_score = 0
ap = 0
count = 0
for query, label in zip(concatenated_query_vectors, query_list_labels):
    count += 1
    label = label
    output = search(query, k=10)
    rank1_score += rank1(label, output)
    rank5_score += rank5(label, output)
    rank10_score += rank10(label, output)
    print("Correct: {}, Total: {}, Incorrect: {}".format(rank1_score, count, count-rank1_score), end="\r")
    ap += calc_map(label, output)

print("Rank1: {}, Rank5: {}, Rank10: {}, mAP: {}".format(rank1_score/len(query_feature),
                                                         rank5_score/len(query_feature),
                                                         rank10_score/len(query_feature), ap/len(query_feature)))

Rank1: 0.9848574821852731, Rank5: 0.9979216152019003, Rank10: 0.998812351543943, mAP: 0.9294586456745533


# 5. Visulization