In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [75]:
import os, glob, re
import requests
import torch
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from collections import Counter
from transformers import CLIPModel, CLIPProcessor, AutoModelForVisualQuestionAnswering, AutoProcessor
from torch.utils.data import Dataset, DataLoader

In [3]:
clip_model = CLIPModel.from_pretrained(
    "openai/clip-vit-base-patch32",
    dtype=torch.float16,
    attn_implementation="sdpa") # sdpa=scaled dot product attention
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

blip_vqa_model = AutoModelForVisualQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base",
    dtype=torch.float16,
    device_map="auto"
)
blip_vqa_processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

# Seed 설정

In [None]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

# CLIP 예시
clip_processor, clip_model

In [None]:
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"]

inputs = clip_processor(text=labels, images=image, return_tensors="pt", padding=True)
# 'pixel_values', 'input_ids', 'attention_mask'

outputs = clip_model(**inputs)
# 'logits_per_image', 'logits_per_text', 'text_embeds', 'image_embeds', 'text_model_output', 'vision_model_output'
# (1, 3)            , (3, 1)           , (3, 512)     , (1, 512)

logits_per_image = outputs.logits_per_image # (1, 3)
probs = logits_per_image.softmax(dim=1)
most_likely_idx = probs.argmax(dim=1).item()
most_likely_label = labels[most_likely_idx]
print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_likely_idx].item():.3f}")

Most likely label: a photo of a cat with probability: 0.992


# BLIP 예시

In [18]:
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

question = "What is the weather in this image?"
inputs = blip_vqa_processor(images=image, text=question, return_tensors="pt").to(blip_vqa_model.device, torch.float16)
# 'pixel_values', 'input_ids', 'attention_mask'

output = blip_vqa_model.generate(**inputs)
# token id

blip_vqa_processor.batch_decode(output, skip_special_tokens=True)[0]

'snowy'

# 이미지 load

In [14]:
path = '/content/drive/MyDrive/project/data'
landmark_list = ["네모탑", "웅진역사관", "지혜의 숲", "지혜의숲 조각상", "피노키오", "활판 공방"]

image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.webp", "*.jfif"]
image_files = {}

for landmark in landmark_list:
    landmark_dir = os.path.join(path, landmark)

    files = []

    for ext in image_extensions:
        pattern = os.path.join(landmark_dir, ext)
        files.extend(glob.glob(pattern))

    image_files[landmark] = files

for k, v in image_files.items():
    print(f"랜드마크: {k}, 이미지 개수: {len(v)}")
    print(f"{k} 예시 이미지")
    image = Image.open(v[0]).convert("RGB")
    display(image)

Output hidden; open in https://colab.research.google.com to view.

# BLIP으로 분위기 label 생성해보기

In [22]:
device = "cuda" if torch.cuda.is_available() else "cpu"
emotion_words = {}
summary = {}

questions = [
    "What's the atmosphere of this place like?",
    "How does this place feel?",
    "What kind of mood does this place give off?",
    "Describe the mood or feeling of this scene.",
    "What emotion does this image convey?"
]

for landmark, path_list in image_files.items():
    words = []
    for img_path in path_list:
        image = Image.open(img_path).convert("RGB")

        # 여러 질문을 통해 특정 장소에 해당하는 label 후보 생성
        for question in questions:
            inputs = blip_vqa_processor(
                images=image,
                text=question,
                return_tensors="pt"
            ).to(blip_vqa_model.device, torch.float16)

            output = blip_vqa_model.generate(**inputs)
            answer = blip_vqa_processor.batch_decode(output, skip_special_tokens=True)[0]
            words.append(answer)

    # 각 랜드마크에서 나온 단어 빈도 별로
    counter = Counter(words)
    most_common_list = counter.most_common()
    summary[landmark] = most_common_list

for k, v in summary.items():
    print(f"{k} 랜드마크에서 등장한 분위기 단어 빈도 순: {v}")

네모탑 랜드마크에서 등장한 분위기 단어: [('calm', 65), ('happy', 23), ('happiness', 15), ('joy', 10), ('peaceful', 6), ('cold', 4), ('gloomy', 2), ('sad', 1), ('sadness', 1), ('urban', 1), ('cloudy', 1), ('foggy', 1)]
웅진역사관 랜드마크에서 등장한 분위기 단어: [('calm', 8), ('joy', 2), ('good', 1), ('happiness', 1), ('busy', 1), ('modern', 1), ('relaxed', 1)]
지혜의 숲 랜드마크에서 등장한 분위기 단어: [('calm', 11), ('joy', 3), ('relaxed', 2), ('busy', 1), ('good', 1), ('happy', 1), ('happiness', 1)]
지혜의숲 조각상 랜드마크에서 등장한 분위기 단어: [('calm', 24), ('happiness', 8), ('sad', 8), ('happy', 7), ('relaxed', 6), ('joy', 5), ('urban', 3), ('busy', 2), ('good', 1), ('nostalgic', 1)]
피노키오 랜드마크에서 등장한 분위기 단어: [('happy', 90), ('peaceful', 8), ('happiness', 8), ('urban', 4), ('gloomy', 2), ('amusement', 1), ('sunny', 1), ('asian', 1)]
활판 공방 랜드마크에서 등장한 분위기 단어: [('calm', 13), ('happiness', 6), ('good', 5), ('sad', 4), ('industrial', 2), ('busy', 2), ('gloomy', 2), ('happy', 2), ('joy', 1), ('urban', 1), ('serious', 1), ('boredom', 1)]


# label 검증
**네모탑**: ['calm', 'happy', 'joy', 'peaceful', 'cold', 'gloomy', 'sad', 'urban', 'cloudy', 'foggy']  
**웅진역사관**: ['calm', 'joy', 'good', 'happy', 'busy', 'modern', 'relaxed']  
**지혜의 숲**: ['calm', 'joy', 'relaxed', 'busy', 'good', 'happy']  
**지혜의숲 조각상**: ['calm', 'happy', 'sad', 'relaxed', 'joy', 'urban', 'busy', 'good', 'nostalgic']  
**피노키오**: ['happy', 'peaceful', 'urban', 'gloomy', 'amusement', 'sunny', 'asian']  
**활판 공방**: ['calm', 'happy', 'good', 'sad', 'industrial', 'busy', 'gloomy', 'joy', 'urban', 'serious', 'boredom']



In [20]:
def make_prompts_from_keywords(keywords):
    prompts = []
    for kw in keywords:
        prompts.append(f"A photo that conveys a {kw} mood.")
    return prompts

# 키워드 prompt 강화 버전

In [26]:
def make_prompts_from_keywords(keywords):
    prompts = []
    for kw in keywords:
        variants = [
            f"A photo of a {kw} place.",
            f"A scene with a {kw} atmosphere.",
            f"A landscape that feels {kw}.",
            f"A {kw} environment.",
            f"A picture showing a {kw} feeling."
        ]
        prompts.extend(variants)
    return prompts

In [28]:
make_prompts_from_keywords(["calm", "happy"])

['A photo of a calm place.',
 'A scene with a calm atmosphere.',
 'A landscape that feels calm.',
 'A calm environment.',
 'A picture showing a calm feeling.',
 'A photo of a happy place.',
 'A scene with a happy atmosphere.',
 'A landscape that feels happy.',
 'A happy environment.',
 'A picture showing a happy feeling.']

# cosine similarity로 비교
embedding space에서 구별하기가 어려움, 모든 prompt의 방향이 서로 가깝게 모여서 dot product가 거의 비슷하게 되는 듯

In [32]:
def clip_similarity_ranking(img_path, prompts):
    image = Image.open(img_path).convert("RGB")

    inputs = clip_processor(
        text=prompts,
        images=image,
        return_tensors="pt",
        padding=True
    )

    with torch.no_grad():
        image_embeds = clip_model.get_image_features(inputs["pixel_values"])
        text_embeds = clip_model.get_text_features(inputs["input_ids"], inputs["attention_mask"])

    # cosine 유사도 비교를 위해 정규화(L2 norm)
    image_embeds = F.normalize(image_embeds, p=2, dim=-1)
    text_embeds = F.normalize(text_embeds, p=2, dim=-1)

    # cosine similarity
    logit_scale = clip_model.logit_scale.exp()
    sims = (image_embeds @ text_embeds.T)
    sims = sims.squeeze(0).tolist()

    # prompt와 점수 묶어서 높은 순으로 정렬
    scored = list(zip(prompts, sims))
    scored.sort(key=lambda x: x[1], reverse=True)

    return scored

In [33]:
keywords = ['calm', 'happy', 'joy', 'peaceful', 'cold', 'gloomy', 'sad', 'urban', 'cloudy', 'foggy']

prompts = make_prompts_from_keywords(keywords)

test_img = image_files["네모탑"][0]
ranking = clip_similarity_ranking(test_img, prompts)

for p, s in ranking:
    print(f"{s:.3f}  |  {p}")

0.239  |  A scene with a urban atmosphere.
0.237  |  A photo of a urban place.
0.231  |  A landscape that feels urban.
0.231  |  A urban environment.
0.226  |  A picture showing a urban feeling.
0.220  |  A photo of a calm place.
0.219  |  A scene with a calm atmosphere.
0.218  |  A scene with a cloudy atmosphere.
0.216  |  A scene with a sad atmosphere.
0.216  |  A scene with a gloomy atmosphere.
0.216  |  A picture showing a cold feeling.
0.216  |  A scene with a peaceful atmosphere.
0.215  |  A photo of a foggy place.
0.215  |  A photo of a sad place.
0.214  |  A photo of a cloudy place.
0.210  |  A photo of a gloomy place.
0.210  |  A landscape that feels sad.
0.209  |  A sad environment.
0.209  |  A photo of a peaceful place.
0.208  |  A scene with a foggy atmosphere.
0.208  |  A picture showing a calm feeling.
0.208  |  A picture showing a cloudy feeling.
0.206  |  A picture showing a foggy feeling.
0.205  |  A scene with a cold atmosphere.
0.203  |  A picture showing a peaceful 

# 평균 이미지로 유사도 계산

In [40]:
def clip_similarity_ranking(image_paths, prompts, device="cuda"):
    # 이미지 embedding 평균 구하기
    image_embeds = []
    for img_path in image_paths:
        image = Image.open(img_path).convert("RGB")
        inputs = clip_processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            image_feat = clip_model.get_image_features(**inputs)
            image_feat = F.normalize(image_feat, p=2, dim=-1)
            image_embeds.append(image_feat)

    # 여러 이미지의 임베딩 평균을 구해 랜드마크를 대표하는 벡터를 추출
    image_embeds = torch.stack(image_embeds).mean(0)  # shape: [1, D]

    text_inputs = clip_processor(text=prompts, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        text_embeds = clip_model.get_text_features(**text_inputs)
        text_embeds = F.normalize(text_embeds, p=2, dim=-1)

    # cosine similarity 계산
    logit_scale = clip_model.logit_scale.exp()
    sims = (image_embeds @ text_embeds.T) * logit_scale
    sims = sims.squeeze(0).tolist()

    scored = list(zip(prompts, sims))
    scored.sort(key=lambda x: x[1], reverse=True)
    return scored

In [41]:
keywords = ['calm', 'happy', 'joy', 'peaceful', 'cold', 'gloomy', 'sad', 'urban', 'cloudy', 'foggy']
prompts = make_prompts_from_keywords(keywords)

test_imgs = image_files["네모탑"][:5]
clip_model.to(device)

ranking = clip_similarity_ranking(test_imgs, prompts)

for p, s in ranking:
    print(f"{s:.3f} | {p}")

23.812 | A landscape that feels urban.
23.281 | A photo of a urban place.
23.047 | A scene with a urban atmosphere.
22.953 | A urban environment.
22.562 | A photo of a foggy place.
22.500 | A picture showing a urban feeling.
22.047 | A photo of a sad place.
21.906 | A photo of a peaceful place.
21.875 | A photo of a calm place.
21.766 | A picture showing a cold feeling.
21.703 | A photo of a cold place.
21.688 | A landscape that feels cold.
21.688 | A photo of a cloudy place.
21.547 | A photo of a gloomy place.
21.516 | A scene with a peaceful atmosphere.
21.500 | A landscape that feels sad.
21.422 | A scene with a calm atmosphere.
21.422 | A photo of a joy place.
21.328 | A scene with a cloudy atmosphere.
21.047 | A cold environment.
20.906 | A landscape that feels foggy.
20.906 | A picture showing a foggy feeling.
20.844 | A landscape that feels happy.
20.812 | A scene with a foggy atmosphere.
20.734 | A landscape that feels joy.
20.672 | A peaceful environment.
20.625 | A sad enviro

결론적으로 효과적이지 못한 방법이었던 것 같음  
cosine similarity를 통해서는 확실히 구분할 수 없음

# 후보들을 통해서 직접 inference

In [45]:
atmosphere = ['calm', 'happy', 'joy', 'peaceful', 'cold', 'gloomy', 'sad', 'cloudy', 'foggy']
labels = [f"A photo that conveys a {kw} mood" for kw in atmosphere]
images = []
for img_path in image_files["네모탑"]:
    images.append(Image.open(img_path).convert("RGB"))

inputs = clip_processor(text=labels, images=images, return_tensors="pt", padding=True).to(device)
outputs = clip_model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
top_labels_per_image = []

for i in range(probs.shape[0]):
    most_likely_idx = probs[i].argmax().item()
    most_likely_label = labels[most_likely_idx]
    most_likely_prob = probs[i][most_likely_idx].item()
    top_labels_per_image.append(most_likely_label)

    print(f"[Image {i}] {most_likely_label}  (p={most_likely_prob:.3f})")

[Image 0] A photo that conveys a sad mood  (p=0.178)
[Image 1] A photo that conveys a foggy mood  (p=0.491)
[Image 2] A photo that conveys a foggy mood  (p=0.189)
[Image 3] A photo that conveys a gloomy mood  (p=0.157)
[Image 4] A photo that conveys a cold mood  (p=0.334)
[Image 5] A photo that conveys a peaceful mood  (p=0.259)
[Image 6] A photo that conveys a cloudy mood  (p=0.207)
[Image 7] A photo that conveys a peaceful mood  (p=0.282)
[Image 8] A photo that conveys a calm mood  (p=0.187)
[Image 9] A photo that conveys a calm mood  (p=0.220)
[Image 10] A photo that conveys a cloudy mood  (p=0.299)
[Image 11] A photo that conveys a peaceful mood  (p=0.336)
[Image 12] A photo that conveys a sad mood  (p=0.332)
[Image 13] A photo that conveys a peaceful mood  (p=0.237)
[Image 14] A photo that conveys a peaceful mood  (p=0.419)
[Image 15] A photo that conveys a cold mood  (p=0.413)
[Image 16] A photo that conveys a calm mood  (p=0.203)
[Image 17] A photo that conveys a cloudy mood  (p

In [46]:
counter = Counter(top_labels_per_image)
print(counter)

Counter({'A photo that conveys a peaceful mood': 7, 'A photo that conveys a foggy mood': 5, 'A photo that conveys a calm mood': 5, 'A photo that conveys a cloudy mood': 3, 'A photo that conveys a sad mood': 2, 'A photo that conveys a gloomy mood': 2, 'A photo that conveys a cold mood': 2})


'urban'이 있을 때 많이 나오는 것을 확인할 수 있음  
위 예시는 후보에서 'urban'을 제거했을 때의 결과  
1개만 선택하는 것은 결과를 잘 반영하지 못한다고 생각하여 상위 5개를 선별

# 상위 5개 뽑기

In [47]:
atmosphere = ['calm', 'happy', 'joy', 'peaceful', 'cold', 'gloomy', 'sad', 'cloudy', 'foggy']
labels = [f"A photo that conveys a {kw} mood" for kw in atmosphere]
images = []
for img_path in image_files["네모탑"]:
    images.append(Image.open(img_path).convert("RGB"))

inputs = clip_processor(text=labels, images=images, return_tensors="pt", padding=True).to(device)
outputs = clip_model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
top_labels_per_image = []

all_topk_labels = []

for i in range(probs.shape[0]):
    topk_vals, topk_idxs = torch.topk(probs[i], k=5)
    print(f"[Image {i}] Top-5 moods:")
    for rank in range(5):
        label_idx = topk_idxs[rank].item()
        label_str = labels[label_idx]
        score = topk_vals[rank].item()
        all_topk_labels.append(label_str)
        print(f"  {rank+1}. {label_str}  (p={score:.3f})")

[Image 0] Top-5 moods:
  1. A photo that conveys a sad mood  (p=0.178)
  2. A photo that conveys a gloomy mood  (p=0.175)
  3. A photo that conveys a calm mood  (p=0.160)
  4. A photo that conveys a foggy mood  (p=0.119)
  5. A photo that conveys a cold mood  (p=0.115)
[Image 1] Top-5 moods:
  1. A photo that conveys a foggy mood  (p=0.491)
  2. A photo that conveys a calm mood  (p=0.122)
  3. A photo that conveys a cloudy mood  (p=0.106)
  4. A photo that conveys a gloomy mood  (p=0.100)
  5. A photo that conveys a peaceful mood  (p=0.072)
[Image 2] Top-5 moods:
  1. A photo that conveys a foggy mood  (p=0.189)
  2. A photo that conveys a gloomy mood  (p=0.172)
  3. A photo that conveys a cloudy mood  (p=0.159)
  4. A photo that conveys a peaceful mood  (p=0.128)
  5. A photo that conveys a sad mood  (p=0.118)
[Image 3] Top-5 moods:
  1. A photo that conveys a gloomy mood  (p=0.157)
  2. A photo that conveys a cloudy mood  (p=0.148)
  3. A photo that conveys a cold mood  (p=0.128)
  4

In [48]:
counter_topk = Counter(all_topk_labels)

for label, cnt in counter_topk.most_common():
    print(f"{label}: {cnt}")

A photo that conveys a peaceful mood: 25
A photo that conveys a calm mood: 23
A photo that conveys a gloomy mood: 20
A photo that conveys a cloudy mood: 17
A photo that conveys a sad mood: 16
A photo that conveys a foggy mood: 13
A photo that conveys a cold mood: 12
A photo that conveys a joy mood: 4


5개를 뽑은 다음 사진들을 통해 검증해보니 1개만 뽑았을 때보다 blip의 결과와 유사한 것을 확인할 수 있었음

# 다른 랜드마크

## 웅진역사관

In [49]:
atmosphere = ['calm', 'joy', 'good', 'happy', 'busy', 'modern', 'relaxed']
labels = [f"A photo that conveys a {kw} mood" for kw in atmosphere]
images = []
for img_path in image_files["웅진역사관"]:
    images.append(Image.open(img_path).convert("RGB"))

inputs = clip_processor(text=labels, images=images, return_tensors="pt", padding=True).to(device)
outputs = clip_model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
top_labels_per_image = []

all_topk_labels = []

for i in range(probs.shape[0]):
    topk_vals, topk_idxs = torch.topk(probs[i], k=5)
    print(f"[Image {i}] Top-5 moods:")
    for rank in range(5):
        label_idx = topk_idxs[rank].item()
        label_str = labels[label_idx]
        score = topk_vals[rank].item()
        all_topk_labels.append(label_str)
        print(f"  {rank+1}. {label_str}  (p={score:.3f})")

[Image 0] Top-5 moods:
  1. A photo that conveys a calm mood  (p=0.440)
  2. A photo that conveys a relaxed mood  (p=0.343)
  3. A photo that conveys a busy mood  (p=0.155)
  4. A photo that conveys a modern mood  (p=0.021)
  5. A photo that conveys a happy mood  (p=0.017)
[Image 1] Top-5 moods:
  1. A photo that conveys a busy mood  (p=0.599)
  2. A photo that conveys a calm mood  (p=0.149)
  3. A photo that conveys a relaxed mood  (p=0.088)
  4. A photo that conveys a modern mood  (p=0.064)
  5. A photo that conveys a joy mood  (p=0.053)
[Image 2] Top-5 moods:
  1. A photo that conveys a busy mood  (p=0.302)
  2. A photo that conveys a relaxed mood  (p=0.235)
  3. A photo that conveys a calm mood  (p=0.208)
  4. A photo that conveys a modern mood  (p=0.152)
  5. A photo that conveys a joy mood  (p=0.045)


In [50]:
counter_topk = Counter(all_topk_labels)

for label, cnt in counter_topk.most_common():
    print(f"{label}: {cnt}")

A photo that conveys a calm mood: 3
A photo that conveys a relaxed mood: 3
A photo that conveys a busy mood: 3
A photo that conveys a modern mood: 3
A photo that conveys a joy mood: 2
A photo that conveys a happy mood: 1


## 지혜의 숲

In [53]:
atmosphere = ['calm', 'joy', 'relaxed', 'busy', 'good', 'happy']
labels = [f"A photo that conveys a {kw} mood" for kw in atmosphere]
images = []
for img_path in image_files["지혜의 숲"]:
    images.append(Image.open(img_path).convert("RGB"))

inputs = clip_processor(text=labels, images=images, return_tensors="pt", padding=True).to(device)
outputs = clip_model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
top_labels_per_image = []

all_topk_labels = []

for i in range(probs.shape[0]):
    topk_vals, topk_idxs = torch.topk(probs[i], k=5)
    print(f"[Image {i}] Top-5 moods:")
    for rank in range(5):
        label_idx = topk_idxs[rank].item()
        label_str = labels[label_idx]
        score = topk_vals[rank].item()
        all_topk_labels.append(label_str)
        print(f"  {rank+1}. {label_str}  (p={score:.3f})")

[Image 0] Top-5 moods:
  1. A photo that conveys a busy mood  (p=0.339)
  2. A photo that conveys a relaxed mood  (p=0.256)
  3. A photo that conveys a calm mood  (p=0.222)
  4. A photo that conveys a joy mood  (p=0.084)
  5. A photo that conveys a happy mood  (p=0.067)
[Image 1] Top-5 moods:
  1. A photo that conveys a busy mood  (p=0.460)
  2. A photo that conveys a calm mood  (p=0.311)
  3. A photo that conveys a relaxed mood  (p=0.142)
  4. A photo that conveys a joy mood  (p=0.043)
  5. A photo that conveys a happy mood  (p=0.024)
[Image 2] Top-5 moods:
  1. A photo that conveys a busy mood  (p=0.484)
  2. A photo that conveys a calm mood  (p=0.202)
  3. A photo that conveys a relaxed mood  (p=0.130)
  4. A photo that conveys a joy mood  (p=0.089)
  5. A photo that conveys a happy mood  (p=0.056)
[Image 3] Top-5 moods:
  1. A photo that conveys a calm mood  (p=0.471)
  2. A photo that conveys a relaxed mood  (p=0.314)
  3. A photo that conveys a busy mood  (p=0.153)
  4. A photo t

In [54]:
counter_topk = Counter(all_topk_labels)

for label, cnt in counter_topk.most_common():
    print(f"{label}: {cnt}")

A photo that conveys a busy mood: 4
A photo that conveys a relaxed mood: 4
A photo that conveys a calm mood: 4
A photo that conveys a joy mood: 4
A photo that conveys a happy mood: 4


## 지혜의숲 조각상

In [56]:
atmosphere = ['calm', 'happy', 'sad', 'relaxed', 'joy', 'urban', 'busy', 'good', 'nostalgic']
labels = [f"A photo that conveys a {kw} mood" for kw in atmosphere]
images = []
for img_path in image_files["지혜의숲 조각상"]:
    images.append(Image.open(img_path).convert("RGB"))

inputs = clip_processor(text=labels, images=images, return_tensors="pt", padding=True).to(device)
outputs = clip_model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
top_labels_per_image = []

all_topk_labels = []

for i in range(probs.shape[0]):
    topk_vals, topk_idxs = torch.topk(probs[i], k=5)
    print(f"[Image {i}] Top-5 moods:")
    for rank in range(5):
        label_idx = topk_idxs[rank].item()
        label_str = labels[label_idx]
        score = topk_vals[rank].item()
        all_topk_labels.append(label_str)
        print(f"  {rank+1}. {label_str}  (p={score:.3f})")

[Image 0] Top-5 moods:
  1. A photo that conveys a busy mood  (p=0.211)
  2. A photo that conveys a calm mood  (p=0.189)
  3. A photo that conveys a urban mood  (p=0.183)
  4. A photo that conveys a nostalgic mood  (p=0.147)
  5. A photo that conveys a relaxed mood  (p=0.140)
[Image 1] Top-5 moods:
  1. A photo that conveys a nostalgic mood  (p=0.351)
  2. A photo that conveys a calm mood  (p=0.188)
  3. A photo that conveys a busy mood  (p=0.127)
  4. A photo that conveys a relaxed mood  (p=0.119)
  5. A photo that conveys a urban mood  (p=0.057)
[Image 2] Top-5 moods:
  1. A photo that conveys a busy mood  (p=0.208)
  2. A photo that conveys a urban mood  (p=0.160)
  3. A photo that conveys a joy mood  (p=0.130)
  4. A photo that conveys a nostalgic mood  (p=0.120)
  5. A photo that conveys a sad mood  (p=0.092)
[Image 3] Top-5 moods:
  1. A photo that conveys a calm mood  (p=0.422)
  2. A photo that conveys a nostalgic mood  (p=0.229)
  3. A photo that conveys a busy mood  (p=0.117)

In [57]:
counter_topk = Counter(all_topk_labels)

for label, cnt in counter_topk.most_common():
    print(f"{label}: {cnt}")

A photo that conveys a busy mood: 13
A photo that conveys a nostalgic mood: 12
A photo that conveys a urban mood: 11
A photo that conveys a relaxed mood: 11
A photo that conveys a calm mood: 7
A photo that conveys a joy mood: 7
A photo that conveys a sad mood: 2
A photo that conveys a happy mood: 1
A photo that conveys a good mood: 1


## 피노키오

In [58]:
atmosphere = ['happy', 'peaceful', 'urban', 'gloomy', 'amusement', 'sunny', 'asian']
labels = [f"A photo that conveys a {kw} mood" for kw in atmosphere]
images = []
for img_path in image_files["피노키오"]:
    images.append(Image.open(img_path).convert("RGB"))

inputs = clip_processor(text=labels, images=images, return_tensors="pt", padding=True).to(device)
outputs = clip_model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
top_labels_per_image = []

all_topk_labels = []

for i in range(probs.shape[0]):
    topk_vals, topk_idxs = torch.topk(probs[i], k=5)
    print(f"[Image {i}] Top-5 moods:")
    for rank in range(5):
        label_idx = topk_idxs[rank].item()
        label_str = labels[label_idx]
        score = topk_vals[rank].item()
        all_topk_labels.append(label_str)
        print(f"  {rank+1}. {label_str}  (p={score:.3f})")

[Image 0] Top-5 moods:
  1. A photo that conveys a amusement mood  (p=0.327)
  2. A photo that conveys a sunny mood  (p=0.186)
  3. A photo that conveys a happy mood  (p=0.132)
  4. A photo that conveys a peaceful mood  (p=0.115)
  5. A photo that conveys a asian mood  (p=0.098)
[Image 1] Top-5 moods:
  1. A photo that conveys a urban mood  (p=0.403)
  2. A photo that conveys a sunny mood  (p=0.319)
  3. A photo that conveys a amusement mood  (p=0.090)
  4. A photo that conveys a peaceful mood  (p=0.065)
  5. A photo that conveys a asian mood  (p=0.053)
[Image 2] Top-5 moods:
  1. A photo that conveys a amusement mood  (p=0.236)
  2. A photo that conveys a asian mood  (p=0.219)
  3. A photo that conveys a gloomy mood  (p=0.202)
  4. A photo that conveys a urban mood  (p=0.167)
  5. A photo that conveys a happy mood  (p=0.077)
[Image 3] Top-5 moods:
  1. A photo that conveys a asian mood  (p=0.382)
  2. A photo that conveys a gloomy mood  (p=0.186)
  3. A photo that conveys a amusement 

In [59]:
counter_topk = Counter(all_topk_labels)

for label, cnt in counter_topk.most_common():
    print(f"{label}: {cnt}")

A photo that conveys a amusement mood: 23
A photo that conveys a asian mood: 18
A photo that conveys a sunny mood: 17
A photo that conveys a gloomy mood: 17
A photo that conveys a urban mood: 16
A photo that conveys a happy mood: 15
A photo that conveys a peaceful mood: 9


## 활판 공방

In [60]:
atmosphere = ['calm', 'happy', 'good', 'sad', 'industrial', 'busy', 'gloomy', 'joy', 'urban', 'serious', 'boredom']
labels = [f"A photo that conveys a {kw} mood" for kw in atmosphere]
images = []
for img_path in image_files["활판 공방"]:
    images.append(Image.open(img_path).convert("RGB"))

inputs = clip_processor(text=labels, images=images, return_tensors="pt", padding=True).to(device)
outputs = clip_model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
top_labels_per_image = []

all_topk_labels = []

for i in range(probs.shape[0]):
    topk_vals, topk_idxs = torch.topk(probs[i], k=5)
    print(f"[Image {i}] Top-5 moods:")
    for rank in range(5):
        label_idx = topk_idxs[rank].item()
        label_str = labels[label_idx]
        score = topk_vals[rank].item()
        all_topk_labels.append(label_str)
        print(f"  {rank+1}. {label_str}  (p={score:.3f})")

[Image 0] Top-5 moods:
  1. A photo that conveys a busy mood  (p=0.581)
  2. A photo that conveys a calm mood  (p=0.145)
  3. A photo that conveys a joy mood  (p=0.061)
  4. A photo that conveys a urban mood  (p=0.042)
  5. A photo that conveys a happy mood  (p=0.032)
[Image 1] Top-5 moods:
  1. A photo that conveys a industrial mood  (p=0.584)
  2. A photo that conveys a busy mood  (p=0.272)
  3. A photo that conveys a calm mood  (p=0.050)
  4. A photo that conveys a joy mood  (p=0.019)
  5. A photo that conveys a gloomy mood  (p=0.014)
[Image 2] Top-5 moods:
  1. A photo that conveys a industrial mood  (p=0.877)
  2. A photo that conveys a urban mood  (p=0.097)
  3. A photo that conveys a gloomy mood  (p=0.009)
  4. A photo that conveys a busy mood  (p=0.006)
  5. A photo that conveys a calm mood  (p=0.004)
[Image 3] Top-5 moods:
  1. A photo that conveys a industrial mood  (p=0.428)
  2. A photo that conveys a busy mood  (p=0.202)
  3. A photo that conveys a gloomy mood  (p=0.068)
 

In [61]:
counter_topk = Counter(all_topk_labels)

for label, cnt in counter_topk.most_common():
    print(f"{label}: {cnt}")

A photo that conveys a busy mood: 8
A photo that conveys a calm mood: 7
A photo that conveys a industrial mood: 7
A photo that conveys a joy mood: 4
A photo that conveys a urban mood: 4
A photo that conveys a gloomy mood: 4
A photo that conveys a sad mood: 2
A photo that conveys a boredom mood: 2
A photo that conveys a happy mood: 1
A photo that conveys a serious mood: 1


# 적용

사전에 랜드마크에 대한 힌트를 주고 사용자가 그 랜드마크를 찾아 찍는 시스템  
랜드마크에 대한 정보가 맞는지는 BLIP이 사진을 통해 체크함  
CLIP 모델은 해당 사진에 대한 분위기 정보 체크  
만약 BLIP 모델에서 랜드마크가 틀렸으면 이러한 감성이 부족하다고 알려주기  
랜드마크가 맞았으면 해당 사진에서 어떤 분위기가 나타나는지를 CLIP을 통해 몇%라고 나타냄


In [82]:
device = "cuda" if torch.cuda.is_available() else "cpu"

keywords = {
    "네모탑": ['calm', 'happy', 'joy', 'peaceful', 'cold', 'gloomy', 'sad', 'urban', 'cloudy', 'foggy'],
    "웅진역사관": ['calm', 'joy', 'good', 'happy', 'busy', 'modern', 'relaxed'],
    "지혜의 숲": ['calm', 'joy', 'relaxed', 'busy', 'good', 'happy'],
    "지혜의숲 조각상": ['calm', 'happy', 'sad', 'relaxed', 'joy', 'urban', 'busy', 'good', 'nostalgic'],
    "피노키오": ['happy', 'peaceful', 'urban', 'gloomy', 'amusement', 'sunny', 'asian'],
    "활판 공방": ['calm', 'happy', 'good', 'sad', 'industrial', 'busy', 'gloomy', 'joy', 'urban', 'serious', 'boredom'],
}

def make_prompts_from_keywords(keywords, landmark):
    prompts = []
    for kw in keywords[landmark]:
        prompts.append(f"A photo that conveys a {kw} mood.")
    return prompts

# 이 부분 BLIP이랑 연동(일단은 임의로 구현)
def blip_predict(img_path):
    possible_landmarks = ["네모탑", "피노키오", "지혜의 숲", "활판 공방"]
    predicted = random.choice(possible_landmarks)
    confidence = round(random.uniform(0.5, 0.95), 2)
    return predicted, confidence


def analyze_mood(img_path, landmark):
    image = Image.open(img_path).convert("RGB")
    prompts = make_prompts_from_keywords(keywords, landmark)

    inputs = clip_processor(text=prompts, images=image, return_tensors="pt", padding=True).to(device)
    outputs = clip_model(**inputs)

    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)

    # 분위기 Top3 출력
    topk = torch.topk(probs, k=3)
    for idx, score in zip(topk.indices[0].tolist(), topk.values[0].tolist()):
        match = re.search(r"conveys a (.+?) mood", prompts[idx])
        kw = match.group(1)
        print(f"  - {kw} ({score*100:.1f}%)")

def main(user_img_path, correct_landmark):
    print("사용자가 사진을 업로드했습니다.")
    print(f"파일 경로: {user_img_path}\n")

    # BLIP 모델(나중에 연결해서 붙이기)
    predicted_landmark, conf = blip_predict(user_img_path)
    print(f"사용자님이 찍으신 사진은 다음 랜드마크에서 찍으셨을 확률이 {conf*100:.1f}%입니다: {predicted_landmark} ")

    if predicted_landmark != correct_landmark:
        print(f"감성 부족ㅜㅜ\n")
        analyze_mood(user_img_path, correct_landmark)
        print(f"이 장소의 대표적인 분위기를 보완해보세요 (예: {keywords[correct_landmark][:3]})")
    else:
        print(f"랜드마크 결과가 올바름!\n")
        print("CLIP 모델이 이 사진의 분위기를 분석합니다...\n")
        analyze_mood(user_img_path, correct_landmark)

for img_path in image_files["네모탑"]:
    main(img_path, correct_landmark="네모탑")
    break

사용자가 사진을 업로드했습니다.
파일 경로: /content/drive/MyDrive/project/data/네모탑/wrtFileImageView.jpg

사용자님이 찍으신 사진은 다음 랜드마크에서 찍으셨을 확률이 90.0%입니다: 피노키오 
감성 부족ㅜㅜ

  - urban (21.9%)
  - gloomy (15.1%)
  - sad (11.9%)
이 장소의 대표적인 분위기를 보완해보세요 (예: ['calm', 'happy', 'joy'])


# Fine Tuning(필요시)
아직 데이터가 없어서 못 돌림

In [None]:
class PAJUDataset(Dataset):
    def __init__(self, clip_processor, is_train):
        self.clip_processor = clip_processor
        self.is_train = is_train
        self.dataset = "path"
        self.class_texts = [
            f"A photo of a {kw} place."
            for kw in self.dataset.class_to_idx.keys()
        ]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image, label = self.dataset[idx]
        text = self.class_texts[label]
        return {
            "image": image,
            "label": label,
            "text": text,
        }

    def preprocess(self, batch):
        images = [data["image"] for data in batch]
        labels = [data["label"] for data in batch]
        texts = [data["text"] for data in batch]

        inputs = self.clip_processor(
            text=texts,
            images=images,
            return_tensors="pt",
            padding=True
        )

        return {
            "text": texts,
            "label": labels,
            **inputs,
        }

In [None]:
train_dataset = PAJUDataset(clip_processor, is_train=True)
test_dataset = PAJUDataset(clip_processor, is_train=False)
train_dataloader = DataLoader(train_dataset, collate_fn=train_dataset.preprocess, batch_size=512, shuffle=True)
test_dataloader = DataLoader(test_dataset, collate_fn=train_dataset.preprocess, batch_size=512)

In [None]:
def loss_fn(logits_per_image, logits_per_text):
    labels = torch.arange(logits_per_image.shape[0], device=device)
    loss_i = F.cross_entropy(logits_per_image, labels)
    loss_t = F.cross_entropy(logits_per_text, labels)
    loss = (loss_i + loss_t) / 2

    return loss

In [None]:
from torch.optim import AdamW
from tqdm import tqdm

clip_model = clip_model.to("cuda")
optimizer = AdamW(clip_model.parameters(), lr=5e-5)
clip_model.train()

for epoch in tqdm(range(1, 6), position=0, desc="epoch"):
    for batch in tqdm(train_dataloader, position=0, desc="batch", leave=False):
        optimizer.zero_grad()

        outputs = clip_model(
            pixel_values=batch["pixel_values"].to("cuda"), # image input
            input_ids=batch["input_ids"].to("cuda"), # text input
            attention_mask=batch["attention_mask"].to("cuda"), # text mask
        )

        logits_per_image = outputs.logits_per_image
        logits_per_text = outputs.logits_per_text # logits_per_text == logits_per_image.T
        loss = loss_fn(logits_per_image, logits_per_text)
        loss.backward()

        optimizer.step()

    print(f"Train loss: {loss}")

In [None]:
all_class_texts = clip_processor.tokenizer(test_dataset.class_texts)
all_class_texts = {k: torch.tensor(v, device="cuda") for k, v in all_class_texts.items()}

clip_model.eval()
correct_count = 0
ce_loss_sum = 0

with torch.no_grad():
    for batch in tqdm(test_dataloader):
        outputs = clip_model(
            pixel_values=batch["pixel_values"].to("cuda"),
            **all_class_texts,
        )

        probs = outputs.logits_per_image.cpu().softmax(dim=1)
        pred = probs.argmax(dim=1)
        label = batch["label"]
        pred = torch.tensor(pred)
        label = torch.tensor(label)

        correct_count += (pred == label).sum()
        ce_loss_sum += F.cross_entropy(probs, label).item()

accuracy = correct_count / len(test_dataloader.dataset)
ce_loss = ce_loss_sum / len(test_dataloader)
print(f"Test CE loss: {ce_loss:.4}, Test accuracy: {accuracy:.4}")