In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import os, glob
import requests
import torch, torchvision
import torch.nn.functional as F
from collections import Counter
from PIL import Image
from transformers import CLIPModel, CLIPProcessor, AutoModelForVisualQuestionAnswering, AutoProcessor
from torch.utils.data import Dataset, DataLoader

In [3]:
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", dtype=torch.bfloat16, attn_implementation="sdpa")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [4]:
blip_vqa_model = AutoModelForVisualQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base",
    dtype=torch.float16,
    device_map="auto"
)
blip_vqa_processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

question = "What is the weather in this image?"
inputs = blip_vqa_processor(images=image, text=question, return_tensors="pt").to(blip_vqa_model.device, torch.float16)

output = blip_vqa_model.generate(**inputs)
blip_vqa_processor.batch_decode(output, skip_special_tokens=True)[0]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

'snowy'

# CLIP 기본 체크
clip_processor, clip_model 체크

In [5]:
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"]

inputs = clip_processor(text=labels, images=image, return_tensors="pt", padding=True)
# 'pixel_values', 'input_ids', 'attention_mask'

outputs = clip_model(**inputs)
# 'logits_per_image', 'logits_per_text', 'text_embeds', 'image_embeds', 'text_model_output', 'vision_model_output'
# (1, 3)            , (3, 1)           , (3, 512)     , (1, 512)      , ()

logits_per_image = outputs.logits_per_image # (1, 3)
probs = logits_per_image.softmax(dim=1)
most_likely_idx = probs.argmax(dim=1).item()
most_likely_label = labels[most_likely_idx]
print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_likely_idx].item():.3f}")

Most likely label: a photo of a cat with probability: 0.992


In [6]:
path = '/content/drive/MyDrive/project/data'
landmark_list = ["네모탑", "웅진역사관", "지혜의 숲", "지혜의숲 조각상", "피노키오", "활판 공방"]

image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.webp", "*.jfif"]
image_files = {}

for landmark in landmark_list:
    landmark_dir = os.path.join(path, landmark)

    files = []

    for ext in image_extensions:
        pattern = os.path.join(landmark_dir, ext)
        files.extend(glob.glob(pattern))

    image_files[landmark] = files

for k, v in image_files.items():
    print(k, len(v))

네모탑 26
웅진역사관 3
지혜의 숲 4
지혜의숲 조각상 13
피노키오 23
활판 공방 8


BLIP으로 mood 생성해보기

In [9]:
emotion_words = {}
summary = {}

for landmark, path_list in image_files.items():
    words = []
    for img_path in path_list:
        image = Image.open(img_path).convert("RGB")
        question = "What's the atmosphere of this place like?"
        inputs = blip_vqa_processor(images=image, text=question, return_tensors="pt").to(blip_vqa_model.device, torch.float16)

        output = blip_vqa_model.generate(**inputs)
        words.append(blip_vqa_processor.batch_decode(output, skip_special_tokens=True)[0])
    emotion_words[landmark] = words

for landmark, word_list in emotion_words.items():
    counter = Counter(word_list)
    most_common_list = counter.most_common()
    summary[landmark] = most_common_list

summary

{'네모탑': [('calm', 13),
  ('peaceful', 5),
  ('gloomy', 2),
  ('cold', 2),
  ('happy', 1),
  ('urban', 1),
  ('cloudy', 1),
  ('foggy', 1)],
 '웅진역사관': [('calm', 2), ('modern', 1)],
 '지혜의 숲': [('calm', 4)],
 '지혜의숲 조각상': [('calm', 8), ('urban', 3), ('nostalgic', 1), ('relaxed', 1)],
 '피노키오': [('peaceful', 8),
  ('happy', 6),
  ('urban', 4),
  ('gloomy', 2),
  ('amusement', 1),
  ('sunny', 1),
  ('asian', 1)],
 '활판 공방': [('calm', 3), ('industrial', 2), ('gloomy', 2), ('urban', 1)]}

In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [14]:
def make_prompts_from_keywords(keywords):
    prompts = []
    for kw in keywords:
        prompts.append(f"A photo that conveys a {kw} mood.")
    return prompts

In [17]:
import torch
import torch.nn.functional as F

def clip_similarity_ranking(img_path, prompts):
    # 이미지 가져오기
    image = Image.open(img_path).convert("RGB")

    # processor로 batch 입력 생성
    inputs = clip_processor(
        text=prompts,
        images=image,
        return_tensors="pt",
        padding=True
    )

    with torch.no_grad():
        image_embeds = clip_model.get_image_features(inputs["pixel_values"])
        text_embeds = clip_model.get_text_features(inputs["input_ids"], inputs["attention_mask"])

    # cosine 유사도 비교를 위해 정규화
    image_embeds = F.normalize(image_embeds, p=2, dim=-1)
    text_embeds = F.normalize(text_embeds, p=2, dim=-1)

    # cosine similarity
    sims = image_embeds @ text_embeds.T
    sims = sims.squeeze(0).tolist()

    # prompt와 점수 묶어서 높은 순으로 정렬
    scored = list(zip(prompts, sims))
    scored.sort(key=lambda x: x[1], reverse=True)

    return scored

In [18]:
keywords = ["calm", "peaceful", "gloomy", "cold", "happy", "urban", "cloudy", "foggy"]

prompts = make_prompts_from_keywords(keywords)

test_img = image_files["네모탑"][0]
ranking = clip_similarity_ranking(test_img, prompts)

for p, s in ranking:
    print(f"{s:.3f}  |  {p}")

0.212  |  A photo that conveys a urban mood.
0.208  |  A photo that conveys a gloomy mood.
0.204  |  A photo that conveys a calm mood.
0.204  |  A photo that conveys a cloudy mood.
0.202  |  A photo that conveys a cold mood.
0.202  |  A photo that conveys a foggy mood.
0.201  |  A photo that conveys a peaceful mood.
0.190  |  A photo that conveys a happy mood.


In [22]:
atmosphere = [word for word, count in summary["네모탑"]]
atmosphere

['calm', 'peaceful', 'gloomy', 'cold', 'happy', 'urban', 'cloudy', 'foggy']

In [31]:
atmosphere = [word for word, count in summary["네모탑"]]
labels = [f"A photo that conveys a {kw} mood" for kw in atmosphere]
images = []
for img_path in image_files["네모탑"]:
    images.append(Image.open(img_path).convert("RGB"))

inputs = clip_processor(text=labels, images=images, return_tensors="pt", padding=True)
outputs = clip_model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
top_labels_per_image = []

for i in range(probs.shape[0]):
    most_likely_idx = probs[i].argmax().item()
    most_likely_label = labels[most_likely_idx]
    most_likely_prob = probs[i][most_likely_idx].item()
    top_labels_per_image.append(most_likely_label)

    print(f"[Image {i}] {most_likely_label}  (p={most_likely_prob:.3f})")

[Image 0] A photo that conveys a urban mood  (p=0.262)
[Image 1] A photo that conveys a foggy mood  (p=0.504)
[Image 2] A photo that conveys a urban mood  (p=0.314)
[Image 3] A photo that conveys a urban mood  (p=0.395)
[Image 4] A photo that conveys a urban mood  (p=0.422)
[Image 5] A photo that conveys a urban mood  (p=0.303)
[Image 6] A photo that conveys a urban mood  (p=0.311)
[Image 7] A photo that conveys a urban mood  (p=0.473)
[Image 8] A photo that conveys a urban mood  (p=0.297)
[Image 9] A photo that conveys a urban mood  (p=0.357)
[Image 10] A photo that conveys a cloudy mood  (p=0.295)
[Image 11] A photo that conveys a urban mood  (p=0.424)
[Image 12] A photo that conveys a urban mood  (p=0.357)
[Image 13] A photo that conveys a urban mood  (p=0.233)
[Image 14] A photo that conveys a peaceful mood  (p=0.430)
[Image 15] A photo that conveys a urban mood  (p=0.359)
[Image 16] A photo that conveys a calm mood  (p=0.230)
[Image 17] A photo that conveys a urban mood  (p=0.602)

In [32]:
counter = Counter(top_labels_per_image)
print(counter)

Counter({'A photo that conveys a urban mood': 17, 'A photo that conveys a foggy mood': 3, 'A photo that conveys a calm mood': 3, 'A photo that conveys a cloudy mood': 1, 'A photo that conveys a peaceful mood': 1, 'A photo that conveys a gloomy mood': 1})


상위 5개를 선별

In [35]:
atmosphere = [word for word, count in summary["네모탑"]]
labels = [f"A photo that conveys a {kw} mood" for kw in atmosphere]
images = []
for img_path in image_files["네모탑"]:
    images.append(Image.open(img_path).convert("RGB"))

inputs = clip_processor(text=labels, images=images, return_tensors="pt", padding=True)
outputs = clip_model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
top_labels_per_image = []

all_topk_labels = []

for i in range(probs.shape[0]):
    topk_vals, topk_idxs = torch.topk(probs[i], k=5)
    print(f"[Image {i}] Top-5 moods:")
    for rank in range(5):
        label_idx = topk_idxs[rank].item()
        label_str = labels[label_idx]
        score = topk_vals[rank].item()
        all_topk_labels.append(label_str)
        print(f"  {rank+1}. {label_str}  (p={score:.3f})")

[Image 0] Top-5 moods:
  1. A photo that conveys a urban mood  (p=0.262)
  2. A photo that conveys a gloomy mood  (p=0.159)
  3. A photo that conveys a calm mood  (p=0.141)
  4. A photo that conveys a foggy mood  (p=0.124)
  5. A photo that conveys a cloudy mood  (p=0.109)
[Image 1] Top-5 moods:
  1. A photo that conveys a foggy mood  (p=0.504)
  2. A photo that conveys a calm mood  (p=0.112)
  3. A photo that conveys a cloudy mood  (p=0.112)
  4. A photo that conveys a gloomy mood  (p=0.087)
  5. A photo that conveys a peaceful mood  (p=0.068)
[Image 2] Top-5 moods:
  1. A photo that conveys a urban mood  (p=0.314)
  2. A photo that conveys a foggy mood  (p=0.169)
  3. A photo that conveys a gloomy mood  (p=0.132)
  4. A photo that conveys a cloudy mood  (p=0.132)
  5. A photo that conveys a peaceful mood  (p=0.103)
[Image 3] Top-5 moods:
  1. A photo that conveys a urban mood  (p=0.395)
  2. A photo that conveys a gloomy mood  (p=0.113)
  3. A photo that conveys a cloudy mood  (p=0.1

In [36]:
counter_topk = Counter(all_topk_labels)

for label, cnt in counter_topk.most_common():
    print(f"{label}: {cnt}")

A photo that conveys a calm mood: 23
A photo that conveys a peaceful mood: 23
A photo that conveys a urban mood: 22
A photo that conveys a cloudy mood: 20
A photo that conveys a gloomy mood: 18
A photo that conveys a foggy mood: 15
A photo that conveys a cold mood: 9
