In [31]:
import random
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.utils as utils
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
import matplotlib.pyplot as plt
import numpy as np
import tarfile
import os
from PIL import Image

from timeit import default_timer as timer
from tqdm.auto import tqdm
from torchvision import datasets, models, transforms
from torchsummary import summary
from typing import Callable, Dict, List, Tuple, Union
from torch.optim import lr_scheduler
# from torch.utils.tensorboard import SummaryWriter
# import skimage

import clip #importing clip model

print("Torch version:", torch.__version__)

Torch version: 2.0.0+cu117


In [32]:
device = "cpu"
if torch.cuda.is_available:
  print('cuda available')
  device = torch.device("cuda:0")
else:
  print('Please set GPU via Edit -> Notebook Settings.')

cuda available


In [33]:
model, preprocess = clip.load("ViT-B/32",jit=False) #loading the CLIP model based on ViT
model.cuda().eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [34]:
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# 1. 2차 분류

## {label}

In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import numpy as np
import os
import clip

# GPU 사용 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model, preprocess = clip.load("ViT-B/32",jit=False) #loading the CLIP model based on ViT
model.cuda().eval()

folder=torchvision.datasets.ImageFolder(root='/workspace/classification_exp/dataset/LogoDet-3K') #커스텀데이터 부르기
names = folder.classes

batch_size = 64

for i in range(len(names)):
    
    folder_path = f"/workspace/classification_exp/dataset/LogoDet-3K/{names[i]}"
    
    print(i+1, " : ", names[i])
    
    test_set = torchvision.datasets.ImageFolder(root=folder_path, transform=preprocess) #커스텀데이터 부르기
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False) #데이터를 미니배치 형태로 생성

    print("num of classes : ", len(test_set.classes))

    # 모델 및 텍스트 토큰화 함수 정의
    text_tokens = clip.tokenize(test_set.classes).to(device)

    # 모델 추론 및 정확도 계산
    correct_labels = 0
    total_labels = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)

            # 이미지 특성 추출
            image_features = model.encode_image(images).float()
            image_features /= image_features.norm(dim=-1, keepdim=True)

            # 텍스트 특성 추출
            text_features = model.encode_text(text_tokens).float()
            text_features /= text_features.norm(dim=-1, keepdim=True)

            # 텍스트 특성과의 유사도 계산
            text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
            top_probs, top_labels = text_probs.topk(5, dim=-1)

            # 정확도 계산
            correct_labels += (top_labels[:, 0] == labels).sum().item()
            total_labels += labels.size(0)

    # 정확도 출력
    accuracy = correct_labels / total_labels
    print("The overall accuracy for the CLIP Zero-shot model without ensembling is: {}".format(accuracy))
    print("Number of correct labels:", correct_labels)

1  :  Clothes
num of classes :  604
The overall accuracy for the CLIP Zero-shot model without ensembling is: 0.6547047911469328
Number of correct labels: 20470
2  :  Electronic
num of classes :  224
The overall accuracy for the CLIP Zero-shot model without ensembling is: 0.6793798449612403
Number of correct labels: 6573
3  :  Food
num of classes :  932
The overall accuracy for the CLIP Zero-shot model without ensembling is: 0.6573758200562324
Number of correct labels: 35071
4  :  Leisure
num of classes :  111
The overall accuracy for the CLIP Zero-shot model without ensembling is: 0.8441512752858399
Number of correct labels: 4799
5  :  Medical
num of classes :  47
The overall accuracy for the CLIP Zero-shot model without ensembling is: 0.5802281368821293
Number of correct labels: 2289
6  :  Necessities
num of classes :  432
The overall accuracy for the CLIP Zero-shot model without ensembling is: 0.6200547901055515
Number of correct labels: 15391
7  :  Others
num of classes :  371
The o

## a photo of {label}

In [45]:
# GPU 사용 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model, preprocess = clip.load("ViT-B/32",jit=False) #loading the CLIP model based on ViT
model.cuda().eval()

folder=torchvision.datasets.ImageFolder(root='/workspace/classification_exp/dataset/LogoDet-3K', transform=preprocess) #커스텀데이터 부르기
names = folder.classes

batch_size = 64

for i in range(len(names)):
    
    folder_path = f"/workspace/classification_exp/dataset/LogoDet-3K/{names[i]}"
    
    print(i+1, " : ", names[i])
    
    test_set = torchvision.datasets.ImageFolder(root=folder_path, transform=preprocess) #커스텀데이터 부르기
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False) #데이터를 미니배치 형태로 생성

    print("num of classes : ", len(test_set.classes))

    # 모델 및 텍스트 토큰화 함수 정의
    text_descriptions = [f"a photo of a {label}" for label in test_set.classes]
    text_tokens = clip.tokenize(text_descriptions).to(device)

    # 모델 추론 및 정확도 계산
    correct_labels = 0
    total_labels = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)

            # 이미지 특성 추출
            image_features = model.encode_image(images).float()
            image_features /= image_features.norm(dim=-1, keepdim=True)

            # 텍스트 특성 추출
            text_features = model.encode_text(text_tokens).float()
            text_features /= text_features.norm(dim=-1, keepdim=True)

            # 텍스트 특성과의 유사도 계산
            text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
            top_probs, top_labels = text_probs.topk(5, dim=-1)

            # 정확도 계산
            correct_labels += (top_labels[:, 0] == labels).sum().item()
            total_labels += labels.size(0)

    # 정확도 출력
    accuracy = correct_labels / total_labels
    print("The overall accuracy for the CLIP Zero-shot model with ensembling is: {}".format(accuracy))
    print("Number of correct labels:", correct_labels)
    print("")

1  :  Clothes
num of classes :  604
The overall accuracy for the CLIP Zero-shot model without ensembling is: 0.6519541994498816
Number of correct labels: 20384

2  :  Electronic
num of classes :  224
The overall accuracy for the CLIP Zero-shot model without ensembling is: 0.6775193798449612
Number of correct labels: 6555

3  :  Food
num of classes :  932
The overall accuracy for the CLIP Zero-shot model without ensembling is: 0.637282099343955
Number of correct labels: 33999

4  :  Leisure
num of classes :  111
The overall accuracy for the CLIP Zero-shot model without ensembling is: 0.8402814423922603
Number of correct labels: 4777

5  :  Medical
num of classes :  47
The overall accuracy for the CLIP Zero-shot model without ensembling is: 0.5077313054499366
Number of correct labels: 2003

6  :  Necessities
num of classes :  432
The overall accuracy for the CLIP Zero-shot model without ensembling is: 0.6198130690516477
Number of correct labels: 15385

7  :  Others
num of classes :  371


# 1. 1차 분류

In [56]:
total_num = 0

folder=torchvision.datasets.ImageFolder(root='/workspace/classification_exp/dataset/LogoDet-3K', transform=preprocess) #커스텀데이터 부르기
names = folder.classes

for i in range(len(names)):
    folder_path = f"/workspace/classification_exp/dataset/LogoDet-3K/{names[i]}"
    dataset = torchvision.datasets.ImageFolder(root=folder_path, transform=preprocess) #커스텀데이터 부르기
    total_num += len(dataset)
    
    
print("Total number of data instance is : ", total_num)

Total number of data instance is :  158654


## {label}

In [47]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision
import numpy as np
import os
import clip
from torch.utils.data import ConcatDataset



# GPU 사용 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model, preprocess = clip.load("ViT-B/32",jit=False) #loading the CLIP model based on ViT
model.cuda().eval()

folder=torchvision.datasets.ImageFolder(root='/workspace/classification_exp/dataset/LogoDet-3K', transform=preprocess) #커스텀데이터 부르기
names = folder.classes
class_names = []

batch_size = 64

folder_path = f"/workspace/classification_exp/dataset/LogoDet-3K/{names[0]}"
combined_dataset = torchvision.datasets.ImageFolder(root=folder_path, transform=preprocess) #커스텀데이터 부르기
class_names+=(combined_dataset.classes)

for i in range(len(names)-1):
    folder_path = f"/workspace/classification_exp/dataset/LogoDet-3K/{names[i+1]}"
    dataset = torchvision.datasets.ImageFolder(root=folder_path, transform=preprocess) #커스텀데이터 부르기
    combined_dataset = ConcatDataset([combined_dataset, dataset])
    
    class_names+=(dataset.classes)

print("num of classes : ", len(class_names))
    
test_loader = DataLoader(combined_dataset, batch_size=batch_size, shuffle=False) #데이터를 미니배치 형태로 생성

# 모델 및 텍스트 토큰화 함수 정의
text_tokens = clip.tokenize(class_names).to(device)

# 모델 추론 및 정확도 계산
correct_labels = 0
total_labels = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        # 이미지 특성 추출
        image_features = model.encode_image(images).float()
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # 텍스트 특성 추출
        text_features = model.encode_text(text_tokens).float()
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # 텍스트 특성과의 유사도 계산
        text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        top_probs, top_labels = text_probs.topk(5, dim=-1)

        # 정확도 계산
        correct_labels += (top_labels[:, 0] == labels).sum().item()
        total_labels += labels.size(0)

# 정확도 출력
accuracy = correct_labels / total_labels
print("The overall accuracy for the CLIP Zero-shot model without ensembling is: {}".format(accuracy))
print("Number of correct labels:", correct_labels)

num of classes :  3000
The overall accuracy for the CLIP Zero-shot model without ensembling is: 0.11409734390560591
Number of correct labels: 18102


## a photo of {label}

In [48]:
print("num of classes : ", len(class_names))
    
test_loader = DataLoader(combined_dataset, batch_size=batch_size, shuffle=False) #데이터를 미니배치 형태로 생성

# 모델 및 텍스트 토큰화 함수 정의
text_descriptions = [f"a photo of a {label}" for label in class_names]
text_tokens = clip.tokenize(text_descriptions).to(device)

# 모델 추론 및 정확도 계산
correct_labels = 0
total_labels = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        # 이미지 특성 추출
        image_features = model.encode_image(images).float()
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # 텍스트 특성 추출
        text_features = model.encode_text(text_tokens).float()
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # 텍스트 특성과의 유사도 계산
        text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        top_probs, top_labels = text_probs.topk(5, dim=-1)

        # 정확도 계산
        correct_labels += (top_labels[:, 0] == labels).sum().item()
        total_labels += labels.size(0)

# 정확도 출력
accuracy = correct_labels / total_labels
print("The overall accuracy for the CLIP Zero-shot model with ensembling is: {}".format(accuracy))
print("Number of correct labels:", correct_labels)

num of classes :  3000
The overall accuracy for the CLIP Zero-shot model without ensembling is: 0.11472134330051559
Number of correct labels: 18201
