In [7]:
#!pip install transformers einops timm pillow
from transformers import AutoModel

# Initialize the model
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)

# New meaningful sentences
sentences = ['A blue cat', 'A red cat']

# Public image URLs
image_urls = [
    'https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg',
    'https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg'
]

# Encode text and images
text_embeddings = model.encode_text(sentences)
image_embeddings = model.encode_image(image_urls)  # also accepts PIL.image, local filenames, dataURI

# Compute similarities
print(text_embeddings[0] @ text_embeddings[1].T) # text embedding similarity
print(text_embeddings[0] @ image_embeddings[0].T) # text-image cross-modal similarity
print(text_embeddings[0] @ image_embeddings[1].T) # text-image cross-modal similarity
print(text_embeddings[1] @ image_embeddings[0].T) # text-image cross-modal similarity
print(text_embeddings[1] @ image_embeddings[1].T)# text-image cross-modal similarity

0.5635972
0.2905935
0.05685927
0.12773362
0.2915658


In [12]:
import torch
import torch.nn.functional as F

# 레이블 리스트
labels_list = ['Buildings', 'Forests', 'Glacier', 'Mountains', 'Sea', 'Street']

# 이미지 리스트
images = ["./Scene/0/0.jpg", "./Scene/0/1.jpg"]

# 텍스트 및 이미지 임베딩 계산
text_embeddings = model.encode_text(labels_list)
image_embeddings = model.encode_image(images)

# 결과 계산 (점수 매트릭스)
rst = 20 * image_embeddings @ text_embeddings.T

# numpy.ndarray를 torch.Tensor로 변환
rst = torch.tensor(rst)

# 소프트맥스 함수 적용 (차원=1, 각 행에 대해 확률 계산)
softmax_probs = F.softmax(rst, dim=1)

# 출력: softmax 결과 확인
print(softmax_probs)

tensor([[0.1893, 0.0478, 0.0434, 0.0912, 0.0652, 0.5631],
        [0.7001, 0.0112, 0.0320, 0.1071, 0.0651, 0.0846]])


In [14]:
from datetime import datetime
import pandas as pd
import os

In [23]:
model.logit_scale.exp()

tensor(98.5356, grad_fn=<ExpBackward0>)

In [19]:
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")

submission = dict({'id_idx': [], 'label': []})

for idx in range(81):
    print(idx+1, end=" ")
    images = []
    for i in range(100):
        images.append(f"./Scene/0/{idx*100 + i}.jpg")    
    image_embeddings = torch.tensor(model.encode_image(images))
    
    text_probs = F.softmax(image_embeddings @ text_embeddings.T * 10)

    for i in text_probs:
        print(i)
    break
    rst = text_probs.tolist()
    submission['label'] += rst

1 tensor([0.2005, 0.1008, 0.0961, 0.1392, 0.1176, 0.3458])
tensor([0.4195, 0.0531, 0.0897, 0.1641, 0.1279, 0.1458])
tensor([0.0577, 0.4277, 0.1364, 0.1162, 0.1372, 0.1247])
tensor([0.2798, 0.0773, 0.2220, 0.0879, 0.1207, 0.2123])
tensor([0.0499, 0.0841, 0.2467, 0.3115, 0.2542, 0.0536])
tensor([0.1031, 0.3425, 0.1254, 0.1536, 0.0871, 0.1883])
tensor([0.4888, 0.0609, 0.0724, 0.0638, 0.0663, 0.2478])
tensor([0.0379, 0.0781, 0.2587, 0.4989, 0.0993, 0.0271])
tensor([0.0610, 0.4874, 0.0989, 0.1078, 0.0968, 0.1481])
tensor([0.0577, 0.1088, 0.2319, 0.3656, 0.1534, 0.0824])
tensor([0.2522, 0.1069, 0.1622, 0.1465, 0.1485, 0.1837])
tensor([0.0719, 0.0460, 0.1301, 0.1015, 0.5912, 0.0592])
tensor([0.0698, 0.0736, 0.1388, 0.2326, 0.4297, 0.0556])
tensor([0.2789, 0.0637, 0.0576, 0.0961, 0.0497, 0.4540])
tensor([0.0404, 0.1126, 0.2019, 0.5167, 0.0984, 0.0299])
tensor([0.1091, 0.0578, 0.3574, 0.2390, 0.1729, 0.0639])
tensor([0.5538, 0.0394, 0.0997, 0.0971, 0.0718, 0.1381])
tensor([0.1325, 0.0547, 0.083

  text_probs = F.softmax(image_embeddings @ text_embeddings.T * 10)


In [25]:
file_name = f'JINA_soft_{current_time}.csv'
submission['id_idx'] = list(range(len(submission['label'])))
pd.DataFrame(submission).to_csv(os.path.join("./submissions/", file_name), index=False)