# ViT (Vision Transformer)

In [22]:
import numpy as np
import pandas as pd
from pathlib import Path
from PIL import Image
from tqdm.notebook import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import timm
from sklearn.preprocessing import normalize

In [23]:
class DiffusionTestDataset(Dataset):
    """Inference-only dataset that loads one image per path and transforms it.

    Args:
        images: Indexable, sized collection of image paths (anything
            ``PIL.Image.open`` accepts).
        transform: Callable applied to the loaded RGB ``PIL.Image``; whatever
            it returns is what ``__getitem__`` returns.
    """

    def __init__(self, images, transform):
        self.images = images
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        # Image.open is lazy and keeps the file open; the context manager
        # releases the handle as soon as the pixels have been decoded.
        with Image.open(self.images[idx]) as handle:
            # convert() forces the decode and guarantees three channels, so
            # grayscale / palette / RGBA files do not break a 3-channel
            # Normalize further down the pipeline.
            image = handle.convert('RGB')
        return self.transform(image)

In [24]:
def predict(
    images,
    model_path,
    model_name,
    input_size,
    batch_size,
    learning_rate,
    n_tta=2,
):
    """Embed ``images`` with a timm model and average over random-flip TTA passes.

    Each pass runs every image through the model, L2-normalises each output
    row, stacks the rows and flattens them. The flattened passes are averaged.

    Args:
        images: Sized, indexable collection of image paths.
        model_path: Path to a ``state_dict`` checkpoint for ``model_name``.
        model_name: timm architecture name passed to ``timm.create_model``.
        input_size: Target size. An ``int`` is treated as a square
            ``(input_size, input_size)``; any other value is passed to
            ``transforms.Resize`` unchanged.
        batch_size: DataLoader batch size.
        learning_rate: Unused at inference time; kept only so existing callers
            that pass it positionally keep working.
        n_tta: Number of test-time-augmentation passes to average (default 2).

    Returns:
        1-D ``numpy.ndarray`` holding the averaged embeddings, image after
        image. With the 384-output head created here this is presumably
        ``len(images) * 384`` values. An empty float32 array is returned when
        ``images`` is empty.

    Raises:
        ValueError: If ``n_tta`` is smaller than 1.
    """
    if n_tta < 1:
        raise ValueError(f"n_tta must be >= 1, got {n_tta}")
    if len(images) == 0:
        # Nothing to embed: skip loading the model; np.vstack([]) would raise.
        return np.empty(0, dtype=np.float32)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Resize(int) only rescales the shorter edge, which leaves non-square
    # images non-square (unbatchable, and rejected by a fixed-size ViT).
    # Force an explicit square target when a single int is given.
    size = (input_size, input_size) if isinstance(input_size, int) else input_size
    transform = transforms.Compose([
        transforms.Resize(size),
        # The random flip is the TTA augmentation: each pass sees a freshly
        # sampled flip per image, so results are not bit-for-bit reproducible.
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    dataset = DiffusionTestDataset(images, transform)
    dataloader = DataLoader(
        dataset=dataset,
        shuffle=False,
        batch_size=batch_size,
        # Pinned memory only helps host-to-GPU copies; on CPU it just warns.
        pin_memory=device.type == 'cuda',
        num_workers=2,
        drop_last=False
    )

    model = timm.create_model(
        model_name,
        pretrained=False,
        num_classes=384
    )
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    # NOTE(security): torch.load unpickles data; only load trusted checkpoints.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    tta_sum = None
    for _ in range(n_tta):
        preds = []
        for X in tqdm(dataloader, leave=False):
            with torch.no_grad():
                X_out = model(X.to(device)).cpu().numpy()
            # Pre-scale each row by its largest magnitude so that squaring
            # inside normalize() cannot overflow; epsilon guards all-zero rows.
            X_out = X_out / (np.abs(X_out).max(axis=-1, keepdims=True) + 1e-7)
            preds.append(normalize(X_out))

        pass_preds = np.vstack(preds).flatten()
        tta_sum = pass_preds if tta_sum is None else tta_sum + pass_preds

    return tta_sum / n_tta

In [25]:
import itertools

# Hyper-parameter grid: every combination of the listed values is evaluated.
# NOTE(review): only model_path, model_name, input_size, batch_size and
# learning_rate are forwarded to predict(); patch_size is never passed, and
# predict() does not read learning_rate. The recorded score is the mean of the
# returned embedding values, so the "best" combination presumably reflects
# random-flip noise rather than model quality -- confirm this is intended.
param_grid = dict(
    model_path=['/kaggle/input/stable-diffusion-vit-baseline-train/vit_base_patch16_224.pth'],
    model_name=['vit_base_patch16_224'],
    input_size=[224],
    batch_size=[16, 32, 64, 128, 256],
    patch_size=[8, 16, 32],
    learning_rate=[1e-5, 1e-4, 1e-3],
)

param_names = list(param_grid)
param_list = list(itertools.product(*param_grid.values()))

# Maps the printed form of each combination to its score.
results = {}

# Run predict() once per combination.
for params in param_list:
    # Pair each value with its parameter name.
    params_dict = dict(zip(param_names, params))
    print("Testing params:", params_dict)

    embeddings = predict(
        images,
        params_dict['model_path'],
        params_dict['model_name'],
        params_dict['input_size'],
        params_dict['batch_size'],
        params_dict['learning_rate'],
    )

    # Score of this combination: mean over all embedding values.
    results[str(params_dict)] = embeddings.mean()

# Show every score.
print("Results:", results)

# Pick the combination with the highest score.
best_params = max(results, key=results.__getitem__)
print("Best params:", best_params)

In [26]:
class CFG:
    """Constants for the final ViT inference run."""

    # Architecture and the checkpoint that holds its weights.
    model_name = 'vit_base_patch16_224'
    model_path = '/kaggle/input/stable-diffusion-vit-baseline-train/vit_base_patch16_224.pth'

    # Data pipeline settings.
    input_size = 224
    batch_size = 256

    # NOTE(review): in the visible code patch_size is never read, and
    # learning_rate is forwarded to predict(), which does not use it.
    patch_size = 8
    learning_rate = 1e-3

In [27]:
# Gather every PNG in the competition image folder; the ids below are derived
# from this same list, so row labels and embeddings share one ordering.
image_dir = Path('/kaggle/input/stable-diffusion-image-to-prompts/images')
images = list(image_dir.glob('*.png'))
imgIds = [path.stem for path in images]

# Number of embedding components written per image.
EMBEDDING_LENGTH = 384

# One "<image id>_<component index>" label per embedding value, image-major.
imgId_eId = [
    f'{img_id}_{component}'
    for img_id in imgIds
    for component in range(EMBEDDING_LENGTH)
]

embeddings3 = predict(
    images=images,
    model_path=CFG.model_path,
    model_name=CFG.model_name,
    input_size=CFG.input_size,
    batch_size=CFG.batch_size,
    learning_rate=CFG.learning_rate,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

# ENSEMBLE THE OUTPUTS

In [28]:
# Weighted blend of the two embedding sets. embeddings2 and both ratio_*
# weights are defined outside this excerpt.
embeddings = ratio_CLIP_Interrogator * embeddings2 + ratio_ViT_B_16 * embeddings3

In [29]:
# Single-column frame: one row per "<image id>_<component index>" label, with
# the index itself named 'imgId_eId'.
submission_index = pd.Index(imgId_eId, name='imgId_eId')
submission = pd.DataFrame(data=embeddings, index=submission_index, columns=['val'])

In [30]:
# Preview the first five rows as a sanity check.
submission.head(n=5)

Unnamed: 0_level_0,val
imgId_eId,Unnamed: 1_level_1
f27825b2c_0,-0.030706
f27825b2c_1,0.045031
f27825b2c_2,-0.031132
f27825b2c_3,-0.045387
f27825b2c_4,-0.060089


In [31]:
# Write the submission file; the named index becomes the first CSV column.
submission.to_csv('submission.csv', index=True)