In [1]:
from PIL import Image
from transformers import AutoModel, AutoConfig
from transformers import CLIPImageProcessor, pipeline, CLIPTokenizer
import torch
import torchvision.transforms as T
from torchvision.transforms import InterpolationMode

# --- Run configuration -------------------------------------------------------
# Checkpoint to load: a Hugging Face hub id, or a local directory.
model_name_or_path = "BAAI/EVA-CLIP-8B" # or /path/to/local/EVA-CLIP-8B

# NOTE(review): image_size is not referenced by any other cell visible here.
image_size = 224

# Sample image used by the single-image experiments below.
image_path = f"./Scene/0/{0}.jpg"

# Image preprocessing is loaded from the OpenAI CLIP ViT-L/14 preprocessor
# config rather than from `model_name_or_path`.
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")

In [7]:
# Load the checkpoint with float32 weights. `trust_remote_code=True` lets
# transformers run the modelling code shipped with the checkpoint.
model = AutoModel.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
    torch_dtype=torch.float32,
)
# Keep the model on the CPU and switch it to evaluation mode. Both calls
# return the module itself; re-assigning keeps the cell free of output.
model = model.to('cpu')
model = model.eval()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
# Opens the sample image at `image_path` ("./Scene/0/0.jpg"), i.e. the same
# file that `image1` opens further down in this cell.
# NOTE(review): `image` is not referenced again in the cells visible here —
# confirm it is unused before removing it.
image = Image.open(image_path)
# Target classes, in label order (index in this list == predicted label).
class_names = ['building', 'forest', 'glacier', 'mountain', 'sea', 'street']

# Phrasings applied to every class name. Downstream cells map a prompt index
# back to its class by integer division, so prompts must stay grouped by class
# with the same number of phrasings per class.
prompt_templates = (
    "A photo of a {class_name}",
    "An image depicting a {class_name}",
    "A scenic view of {class_name}",
    "A {class_name} landscape",
    "A snapshot of a {class_name}",
    "A beautiful {class_name} scene",
    "An artistic representation of {class_name}",
    "A {class_name} captured in nature",
    "A {class_name} during sunset",
    "A serene {class_name} environment",
)

# Flat list: all phrasings for class 0, then all for class 1, and so on.
prompts = []
for class_name in class_names:
    prompts += [template.format(class_name=class_name) for template in prompt_templates]

# Two sample images (0.jpg and 1.jpg) form the demo batch.
image1, image2 = [Image.open(f"./Scene/0/{n}.jpg") for n in (0, 1)]
images = [image1, image2]

# Text side: tokenize every prompt, padded to a common length and truncated
# to at most 50 tokens.
tokenizer = CLIPTokenizer.from_pretrained(model_name_or_path)
text_batch = tokenizer(
    prompts,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=50,
)
input_ids = text_batch.input_ids.to('cpu')

# Image side: run the CLIP preprocessor and keep the pixel tensor.
image_batch = processor(images=images, return_tensors="pt")
input_pixels = image_batch.pixel_values.to('cpu')



In [9]:
# Zero-shot prediction for the demo batch (`input_pixels`, `input_ids`).
#
# The device is taken from the model so the tensors and the autocast context
# always match where the weights live. Mixed precision is only enabled on
# CUDA; on CPU (the current setup) the context is disabled.
device = next(model.parameters()).device

# Prompts are grouped by class with the same number of phrasings per class, so
# integer division of a prompt index by this value gives the class index.
prompts_per_class = len(prompts) // len(class_names)

with torch.no_grad(), torch.amp.autocast(device.type, enabled=(device.type == 'cuda')):
    image_features = model.encode_image(input_pixels.to(device))
    text_features = model.encode_text(input_ids.to(device))
    # L2-normalise so the matrix product below is a cosine similarity.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

# Scaled similarities -> distribution over all prompts for each image.
label_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
# Best-matching prompt per image, mapped back to its class index.
predict = (label_probs.argmax(dim=1) // prompts_per_class).tolist()
print(predict)

[5, 0]


In [5]:
from datetime import datetime
import pandas as pd
import os

In [77]:
# Zero-shot classification of ./Scene/0/0.jpg ... 809.jpg in batches. The
# submission CSV is rewritten after every batch so partial results survive an
# interrupted run.
num_images = 810
batch_size = 10

# Prompts are grouped by class with the same number of phrasings per class, so
# integer division of a prompt index by this value gives the class index.
prompts_per_class = len(prompts) // len(class_names)

# Create the output directory up front so a missing folder fails (or is fixed)
# before any expensive inference has been done.
output_dir = "/home/jinjinjara1022/ajou/submissions/"
os.makedirs(output_dir, exist_ok=True)

submission = {'id_idx': [], 'label': []}
start_time = datetime.now().strftime("%Y%m%d_%H%M%S")
file_name = f'submission_{start_time}_ver4_1.csv'
output_path = os.path.join(output_dir, file_name)

# Follow the model's device; mixed precision is only enabled on CUDA, on CPU
# the autocast context is disabled.
device = next(model.parameters()).device
use_autocast = device.type == 'cuda'

# The prompts are identical for every batch: load the tokenizer and encode the
# text once instead of once per batch.
tokenizer = CLIPTokenizer.from_pretrained(model_name_or_path)
input_ids = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=50).input_ids.to(device)
with torch.no_grad(), torch.amp.autocast(device.type, enabled=use_autocast):
    text_features = model.encode_text(input_ids)
    text_features /= text_features.norm(dim=-1, keepdim=True)

for idx in range(0, num_images, batch_size):
    print(idx, end=" ")
    # Clamp the last batch so a num_images that is not a multiple of
    # batch_size does not request files past the end.
    batch_end = min(idx + batch_size, num_images)
    images = [Image.open(f"./Scene/0/{i}.jpg") for i in range(idx, batch_end)]
    input_pixels = processor(images=images, return_tensors="pt").pixel_values.to(device)

    with torch.no_grad(), torch.amp.autocast(device.type, enabled=use_autocast):
        image_features = model.encode_image(input_pixels)
        image_features /= image_features.norm(dim=-1, keepdim=True)

    # Scaled cosine similarities -> distribution over all prompts per image;
    # the best prompt index is then mapped back to its class index.
    label_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    predict = (label_probs.argmax(dim=1) // prompts_per_class).tolist()

    submission['label'] += predict
    submission['id_idx'] = list(range(len(submission['label'])))
    pd.DataFrame(submission).to_csv(output_path, index=False)

0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 310 320 330 340 350 360 370 380 390 400 410 420 430 440 450 460 470 480 490 500 510 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660 670 680 690 700 710 720 730 740 750 760 770 780 790 800 