In [2]:
import torch 
from einops import rearrange # 텐서 연산 돕는 도구로 코드 가독성 증진
from torchvision.transforms import Normalize 
from torchvision.transforms.functional import resize 
from torchvision.utils import save_image 
from torchvision.io.image import read_image, ImageReadMode 

In [3]:
# Load both input images as RGB and resize them to a common resolution.
# 672 = 48 * 14, so each side splits evenly into the 14-pixel cells used
# when the patch grid is rebuilt later in the notebook.
H = W = 672
I1, I2 = (
    resize(read_image(path, ImageReadMode.RGB), (H, W))
    for path in ("img/1.jpg", "img/2.jpg")
)

In [4]:
# Combine the two images along a new leading batch axis (B x C x H x W).
I = torch.stack((I1, I2))

In [5]:
# Normalization following the DINOv2 preprocessing transform.
# Per-channel mean / std, expressed for pixel values already scaled to [0, 1].
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)

norm = Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)

# Divide the raw pixel values by 255 first, then standardize each channel.
I_norm = norm(I.div(255))

In [6]:
# Extract patch tokens with the pretrained DINOv2 ViT-B/14 backbone.
dinov2 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14')

# Inference only: put the model in eval mode and disable autograd so the
# forward pass does not keep activations around for a backward pass that
# never runs (lower memory use, and downstream tensors carry no grad graph).
dinov2.eval()
with torch.no_grad():
    features = dinov2.forward_features(I_norm)

# 3-D tensor of normalized patch tokens: (batch, patches, embedding dim).
E_patch = features["x_norm_patchtokens"]

Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to C:\Users\User/.cache\torch\hub\main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_pretrain.pth" to C:\Users\User/.cache\torch\hub\checkpoints\dinov2_vitb14_pretrain.pth
100%|██████████| 330M/330M [00:35<00:00, 9.75MB/s] 


In [8]:
# Use PCA to remove the background.
# Flatten batch and patch axes so every patch token becomes one row: (B*L, E).
E_patch_norm = rearrange(E_patch, "B L E -> (B L) E")

# torch.pca_lowrank is a randomized algorithm. Seed the global RNG so the
# recovered component (and therefore the mask derived from it) is the same
# on every Restart & Run All.
torch.manual_seed(0)

# Principal directions of the patch embeddings (columns of V).
_, _, V = torch.pca_lowrank(E_patch_norm)

# Project every embedding onto the first principal direction -> (B*L, 1).
E_pca_1 = torch.matmul(E_patch_norm, V[:, :1])


In [9]:
# Min-max scaling maps every column onto a common [0, 1] range.
def minmax_norm(x):
    """Min-max normalize ``x`` along dim 0.

    Args:
        x: Tensor with at least one dimension; the minimum and maximum are
            taken over dim 0 (i.e. independently per column for 2-D input).

    Returns:
        Tensor shaped like ``x`` in which, per column, the smallest value
        maps to 0 and the largest to 1. A column whose values are all equal
        maps to 0 rather than NaN.
    """
    lo = x.min(0).values
    span = x.max(0).values - lo
    # A constant column has zero range; dividing by it would yield NaN.
    # Substitute 1 so (x - lo) / span evaluates to 0 for that column while
    # every non-degenerate column is computed exactly as before.
    span = torch.where(span == 0, torch.ones_like(span), span)
    return (x - lo) / span
    
# Rescale the first-component projection so its smallest value becomes 0
# and its largest becomes 1, ready for a fixed threshold.
E_pca_1_norm = minmax_norm(x=E_pca_1)

In [10]:
# Threshold the normalized first component to find foreground / background patches.
# NOTE(review): the sign of a principal component is arbitrary, so confirm
# visually that the "> 0.5" side really is the foreground.
M_fg = torch.gt(E_pca_1_norm.squeeze(), 0.5)
M_bg = torch.le(E_pca_1_norm.squeeze(), 0.5)

In [11]:
# Compute PCA on the foreground embeddings only.
# Select the foreground rows once and reuse them for both the fit and the projection.
E_patch_fg = E_patch_norm[M_fg]
# Three components are projected below, so at least three foreground rows
# are required; fail here with a clear message instead of a shape error later.
assert E_patch_fg.shape[0] >= 3, "need at least 3 foreground patches for a 3-component PCA"

# torch.pca_lowrank is randomized; seed it so the colours are reproducible.
torch.manual_seed(0)

# Principal directions of the foreground patch embeddings (columns of V).
_, _, V = torch.pca_lowrank(E_patch_fg)

# Project the foreground embeddings onto the first 3 principal directions and
# rescale each component independently so its min maps to 0 and its max to 1.
E_pca_3_fg = minmax_norm(torch.matmul(E_patch_fg, V[:, :3]))

In [12]:
# Blank canvas with one 3-channel row per patch across the whole batch;
# any row that is not written afterwards stays zero.
B, L, _ = E_patch.size()
Z = L * B
I_draw = torch.zeros((Z, 3))

In [13]:
# Write the foreground values into their rows via boolean-mask indexing;
# rows where M_fg is False keep their current contents.
I_draw[M_fg, :] = E_pca_3_fg


In [14]:
# Split the flattened (batch * patch) axis back into separate batch and patch axes.
I_draw = rearrange(tensor=I_draw, pattern="(B L) C -> B L C", B=B)


In [15]:
# Unfold the patch axis into a 2-D grid of (H // 14) x (W // 14) cells.
# 14 is presumably the patch size of the 'dinov2_vitb14' backbone — TODO confirm.
I_draw = rearrange(tensor=I_draw, pattern="B (h w) C -> B h w C", h=H // 14, w=W // 14)


In [17]:
# Take the two per-image PCA maps, convert each to the channel-first layout
# torchvision expects, and resize to the input resolution to ease viewing.
image_1_pca, image_2_pca = (
    resize(rearrange(pca_map, "H W C -> C H W"), (H, W))
    for pca_map in (I_draw[0], I_draw[1])
)

# Write both visualizations to disk.
save_image(image_1_pca, "img/image_1_pca.png")
save_image(image_2_pca, "img/image_2_pca.png")