In [None]:
import os

import torch
from torch import nn
from transformers import AutoConfig, AutoModel, SwinModel, ViTModel


class HuggingfaceImageEncoder(nn.Module):
    """Thin ``nn.Module`` wrapper around a Hugging Face vision backbone.

    The wrapped model is stored in ``self.image_encoder`` and the size of its
    hidden representation (``config.hidden_size``) is exposed as ``self.out_dim``.

    Args:
        name: Hub identifier (or local path) of the checkpoint / config to load.
        pretrained: If True, load pretrained weights with ``from_pretrained``.
            If False, build a randomly initialised model from the config
            (only ``ViTConfig`` is supported in that case).
        gradient_checkpointing: Enable gradient checkpointing when the wrapped
            model reports that it supports it.
        cache_dir: Directory used for downloaded files. A leading ``~`` is
            expanded to the user's home directory before use.
        model_type: ``"vit"`` or ``"swin"``; selects how the model is loaded
            and how it is called in ``forward``.
        local_files_only: Forwarded to ``from_pretrained`` to forbid downloads.

    Raises:
        NotImplementedError: ``pretrained`` is False and the resolved config is
            not a ``ViTConfig``.
    """

    def __init__(
        self,
        name: str = "google/vit-base-patch16-224",
        pretrained: bool = True,
        gradient_checkpointing: bool = False,
        cache_dir: str = "~/.cache/huggingface/hub",
        model_type: str = "vit",
        local_files_only: bool = False,
    ):
        super().__init__()
        self.model_type = model_type

        # Expand "~" ourselves: the path is handed to Hugging Face as-is, and an
        # unexpanded "~" would presumably be treated as a literal directory name
        # relative to the working directory.
        if cache_dir is not None:
            cache_dir = os.path.expanduser(cache_dir)

        if pretrained:
            if self.model_type == "swin":
                # Pass the cache options here too so they are honoured for Swin
                # checkpoints exactly as they are for the AutoModel branch.
                self.image_encoder = SwinModel.from_pretrained(
                    name, cache_dir=cache_dir, local_files_only=local_files_only
                )
            else:
                self.image_encoder = AutoModel.from_pretrained(
                    name, add_pooling_layer=False, cache_dir=cache_dir, local_files_only=local_files_only
                )
        else:
            # initializing with a config file does not load the weights associated with the model
            model_config = AutoConfig.from_pretrained(name, cache_dir=cache_dir, local_files_only=local_files_only)
            if type(model_config).__name__ == "ViTConfig":
                self.image_encoder = ViTModel(model_config, add_pooling_layer=False)
            else:
                # TODO: add vision models if needed
                raise NotImplementedError(f"Not support training from scratch : {type(model_config).__name__}")

        if gradient_checkpointing and self.image_encoder.supports_gradient_checkpointing:
            self.image_encoder.gradient_checkpointing_enable()

        self.out_dim = self.image_encoder.config.hidden_size

    def forward(self, image):
        """Encode ``image`` and return the backbone's ``last_hidden_state``.

        Args:
            image: Tensor passed to the backbone as ``pixel_values``.

        Returns:
            The ``"last_hidden_state"`` entry of the backbone output,
            i.e. (batch, seq_len, hidden_size).

        Raises:
            ValueError: ``self.model_type`` is neither ``"vit"`` nor ``"swin"``.
        """
        if self.model_type == "vit":
            output = self.image_encoder(pixel_values=image, interpolate_pos_encoding=True)
        elif self.model_type == "swin":
            output = self.image_encoder(pixel_values=image)
        else:
            # Without this branch `output` would be unbound and the caller would
            # only see an opaque UnboundLocalError.
            raise ValueError(
                f"Unsupported model_type {self.model_type!r}; expected 'vit' or 'swin'"
            )
        return output["last_hidden_state"]  # (batch, seq_len, hidden_size)

In [None]:
!pip install --upgrade transformers

In [None]:
# Destructive: recursively and forcibly removes /root/.cache/huggingface/.
# Presumably used to drop cached Hugging Face files so later loads start from a
# clean state — confirm before running, everything under that directory is lost.
!rm -rf /root/.cache/huggingface/

In [None]:
from transformers import ViTModel

# Load the ViT checkpoint, keeping downloaded files in a dedicated cache
# directory under /tmp.
model = ViTModel.from_pretrained(
    pretrained_model_name_or_path="google/vit-base-patch16-224",
    cache_dir="/tmp/huggingface_hub",
)

In [None]:
# Invoke huggingface_hub's notebook login helper.
# NOTE(review): presumably this prompts for a Hugging Face access token; no other
# cell visible here obviously needs authentication — confirm it is required.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from PIL import Image
from torchvision import transforms

# Location of the sample image; edit this single constant to point at another file.
IMAGE_PATH = '/content/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg'

# Load the image.
image = Image.open(IMAGE_PATH)

# Define the preprocessing transforms.
# NOTE(review): the mean/std below are the widely used ImageNet statistics; verify
# they match what the chosen checkpoint was trained with (its own image processor
# may specify different values).
preprocess = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),
    transforms.Resize((224, 224)),  # resize to the model's input size
    transforms.ToTensor(),          # convert the image to a tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # normalize
])

# Preprocess the image.
image_tensor = preprocess(image).unsqueeze(0)  # add the batch dimension

# Create the model instance and switch it to inference mode.
encoder = HuggingfaceImageEncoder()
encoder.eval()

# Encode the image.
with torch.no_grad():  # no gradients are needed for inference
    encoded_images = encoder(image_tensor)

# Show the shape of the encoded features as the cell output.
encoded_images.shape

In [None]:
# Destructive: recursively and forcibly removes /root/.cache/huggingface/.
# Presumably run so the following cell loads its files from a clean cache —
# confirm before running, everything under that directory is lost.
!rm -rf /root/.cache/huggingface/

In [None]:
from io import BytesIO

import requests
import torch
from PIL import Image
from transformers import ViTImageProcessor, ViTForImageClassification

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'

# Download the sample image. The timeout keeps the cell from hanging forever, and
# raise_for_status() surfaces HTTP errors instead of handing an error page to PIL.
response = requests.get(url, timeout=30)
response.raise_for_status()
image = Image.open(BytesIO(response.content))

processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')

inputs = processor(images=image, return_tensors="pt")
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits
# model predicts one of the 1000 ImageNet classes
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", model.config.id2label[predicted_class_idx])