In [None]:
# Mount Google Drive at /content/drive so that files stored there can be opened by path
# (a later cell reads an image from /content/drive/MyDrive/...).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -q transformers

In [None]:
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModel, AutoModelForVisualQuestionAnswering, BlipForConditionalGeneration

# CLIP (zero-shot image classification)

In [None]:
# Download the COCO val2017 sample picture and show it as the cell output.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)
image

In [None]:
# Zero-shot classification with CLIP: score one image against a handful of text
# labels and report the label with the highest probability.
clip_model = AutoModel.from_pretrained("openai/clip-vit-base-patch32", dtype=torch.bfloat16, attn_implementation="sdpa")
clip_processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"]

# Tokenise the labels and preprocess the image into one batch of PyTorch tensors.
inputs = clip_processor(text=labels, images=image, return_tensors="pt", padding=True)

# This is inference only, so run the forward pass without autograd tracking
# (the same pattern the Korean CLIP cells in this notebook use).
with torch.inference_mode():
    outputs = clip_model(**inputs)
logits_per_image = outputs.logits_per_image # (image, text) (1, 3)
# Softmax over the label axis turns the per-label logits into probabilities.
probs = logits_per_image.softmax(dim=1)
most_likely_idx = probs.argmax(dim=1).item()
most_likely_label = labels[most_likely_idx]
print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_likely_idx].item():.3f}")

In [None]:
# Add up the element count of every parameter tensor in the CLIP model
# and print the total in millions.
num_params = 0
for param in clip_model.parameters():
    num_params += param.numel()
print(f"Parameters: {num_params / 1e6:.2f}M")

# BLIP (image captioning)

In [None]:
# Download the BLIP demo picture, convert it to RGB, and show it as the cell output.
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
demo_response = requests.get(img_url, stream=True)
raw_image = Image.open(demo_response.raw).convert('RGB')
raw_image

In [None]:
# Image captioning with BLIP, once with a text prompt and once without.

# Use the GPU when one is available; otherwise fall back to the CPU so the cell
# still runs on a runtime without CUDA instead of raising on `.to("cuda")`.
blip_captioning_device = "cuda" if torch.cuda.is_available() else "cpu"

blip_captioning_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_captioning_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(blip_captioning_device)

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

# conditional image captioning: the text is passed to the processor together with the image
text = "a photography of"
inputs = blip_captioning_processor(raw_image, text, return_tensors="pt").to(blip_captioning_device)

out = blip_captioning_model.generate(**inputs)
print(blip_captioning_processor.decode(out[0], skip_special_tokens=True))
# >>> a photography of a woman and her dog

# unconditional image captioning: only the image is passed to the processor
inputs = blip_captioning_processor(raw_image, return_tensors="pt").to(blip_captioning_device)

out = blip_captioning_model.generate(**inputs)
print(blip_captioning_processor.decode(out[0], skip_special_tokens=True))
# >>> a woman sitting on the beach with her dog

# BLIP (visual question answering)


In [None]:
# Download the sample cat picture from the Hugging Face documentation images
# dataset and show it as the cell output.
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)
image

In [None]:
# Visual question answering with the BLIP VQA checkpoint, loaded in float16.
blip_vqa_processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
blip_vqa_model = AutoModelForVisualQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base", dtype=torch.float16, device_map="auto")

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

question = "What is the weather in this image?"
inputs = blip_vqa_processor(images=image, text=question, return_tensors="pt")
# Send the encoded inputs to the model's device, requesting the float16 dtype
# the model was loaded with.
inputs = inputs.to(blip_vqa_model.device, torch.float16)

output = blip_vqa_model.generate(**inputs)
# Decode the generated ids and display the answer for the single image in the batch.
decoded_answers = blip_vqa_processor.batch_decode(output, skip_special_tokens=True)
decoded_answers[0]

In [None]:
# Add up the element count of every parameter tensor in the BLIP VQA model
# and print the total in millions.
num_params = 0
for param in blip_vqa_model.parameters():
    num_params += param.numel()
print(f"Parameters: {num_params / 1e6:.2f}M")

# CLIP (Korean labels)

In [None]:
# Zero-shot classification with a Korean CLIP checkpoint: score the COCO sample
# picture against two Korean captions.
clip_korean_model = AutoModel.from_pretrained("Bingsu/clip-vit-base-patch32-ko")
clip_korean_processor = AutoProcessor.from_pretrained("Bingsu/clip-vit-base-patch32-ko")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

labels = ["고양이 두 마리", "개 두 마리"]
encode_kwargs = dict(text=labels, images=image, return_tensors="pt", padding=True)
inputs = clip_korean_processor(**encode_kwargs)

# Forward pass without autograd tracking.
with torch.inference_mode():
    outputs = clip_korean_model(**inputs)

logits_per_image = outputs.logits_per_image
# Softmax over the label axis, then pick the best label for the single image.
probs = logits_per_image.softmax(dim=1)
best_per_image = probs.argmax(dim=1)
most_likely_idx = best_per_image.item()
most_likely_label = labels[most_likely_idx]
print(f"그림의 가장 그럴듯한 설명은 {most_likely_label}이고 그럴 확률은 다음과 같음: {probs[0][most_likely_idx].item():.3f}")

In [None]:
# Display the field names available on the most recent `outputs` object in the kernel.
outputs.keys()

In [None]:
# Add up the element count of every parameter tensor in the Korean CLIP model
# and print the total in millions.
num_params = 0
for param in clip_korean_model.parameters():
    num_params += param.numel()
print(f"Parameters: {num_params / 1e6:.2f}M")

In [None]:
# Download the koreafashionnews.com picture and show it as the cell output.
url = "https://www.koreafashionnews.com/imgdata/koreafashionnews_com/202412/2024121148366961.png"
response = requests.get(url, stream=True)
image = Image.open(response.raw)
image

In [None]:
# Ask the BLIP VQA model (loaded in an earlier cell) about the atmosphere of the
# koreafashionnews.com picture.
url = "https://www.koreafashionnews.com/imgdata/koreafashionnews_com/202412/2024121148366961.png"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

question = "What's the atmosphere of this place like?"
inputs = blip_vqa_processor(images=image, text=question, return_tensors="pt")
# Send the encoded inputs to the model's device, requesting float16.
inputs = inputs.to(blip_vqa_model.device, torch.float16)

output = blip_vqa_model.generate(**inputs)
# Decode the generated ids and display the first (only) answer.
decoded_answers = blip_vqa_processor.batch_decode(output, skip_special_tokens=True)
decoded_answers[0]

In [None]:
# Score the koreafashionnews.com picture against four Korean sentence-style labels
# with the Korean CLIP model loaded in an earlier cell.
url = "https://www.koreafashionnews.com/imgdata/koreafashionnews_com/202412/2024121148366961.png"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

labels = ["이 장소는 고요함", "이 장소는 실내임", "이 장소에 나무가 있음", "이 장소는 바다임"]
encode_kwargs = dict(text=labels, images=image, return_tensors="pt", padding=True)
inputs = clip_korean_processor(**encode_kwargs)

# Forward pass without autograd tracking.
with torch.inference_mode():
    outputs = clip_korean_model(**inputs)

logits_per_image = outputs.logits_per_image
# Softmax over the label axis, then pick the best label for the single image.
probs = logits_per_image.softmax(dim=1)
best_per_image = probs.argmax(dim=1)
most_likely_idx = best_per_image.item()
most_likely_label = labels[most_likely_idx]
print(f"그림의 가장 그럴듯한 설명은 {most_likely_label}이고 그럴 확률은 다음과 같음: {probs[0][most_likely_idx].item():.3f}")
# Full distribution, one entry per label, scaled to percent.
print([f"{p * 100:.4f}" for p in probs[0].tolist()])

In [None]:
# Score the koreafashionnews.com picture against two single-word Korean labels
# with the Korean CLIP model loaded in an earlier cell.
url = "https://www.koreafashionnews.com/imgdata/koreafashionnews_com/202412/2024121148366961.png"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

labels = ["고요함", "나무"]
encode_kwargs = dict(text=labels, images=image, return_tensors="pt", padding=True)
inputs = clip_korean_processor(**encode_kwargs)

# Forward pass without autograd tracking.
with torch.inference_mode():
    outputs = clip_korean_model(**inputs)

logits_per_image = outputs.logits_per_image
# Softmax over the label axis, then pick the best label for the single image.
probs = logits_per_image.softmax(dim=1)
best_per_image = probs.argmax(dim=1)
most_likely_idx = best_per_image.item()
most_likely_label = labels[most_likely_idx]
print(f"그림의 가장 그럴듯한 설명은 {most_likely_label}이고 그럴 확률은 다음과 같음: {probs[0][most_likely_idx].item():.3f}")
# Full distribution, one entry per label, scaled to percent.
print([f"{p * 100:.4f}" for p in probs[0].tolist()])

In [None]:
# Open an image stored on Google Drive and show it as the cell output.
# The path lives under /content/drive, so the Drive mount cell at the top of the
# notebook must have been run first.
image_path = "/content/drive/MyDrive/project/img.png"
image = Image.open(image_path)
image

In [None]:
# Score the `image` already in the kernel namespace (this cell does not load one
# itself) against two Korean labels with the Korean CLIP model.
labels = ["고요함", "활기참"]
encode_kwargs = dict(text=labels, images=image, return_tensors="pt", padding=True)
inputs = clip_korean_processor(**encode_kwargs)

# Forward pass without autograd tracking.
with torch.inference_mode():
    outputs = clip_korean_model(**inputs)

logits_per_image = outputs.logits_per_image
# Show the raw logits before they are normalised.
print(logits_per_image)
probs = logits_per_image.softmax(dim=1)
best_per_image = probs.argmax(dim=1)
most_likely_idx = best_per_image.item()
most_likely_label = labels[most_likely_idx]
print(f"그림의 가장 그럴듯한 설명은 {most_likely_label}이고 그럴 확률은 다음과 같음: {probs[0][most_likely_idx].item():.3f}")
# Full distribution, one entry per label, scaled to percent.
print([f"{p * 100:.4f}" for p in probs[0].tolist()])

In [None]:
# Download the "Rest-Profit-Margin" picture and show it as the cell output.
url = "https://images.ctfassets.net/rric2f17v78a/30aPwszW6aWnCuKStRcKSq/cbd2a474f56407c31ab01ef6e84c75b2/Rest-Profit-Margin.jpg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)
image

In [None]:
# Ask the BLIP VQA model (loaded in an earlier cell) about the atmosphere of the
# "Rest-Profit-Margin" picture.
url = "https://images.ctfassets.net/rric2f17v78a/30aPwszW6aWnCuKStRcKSq/cbd2a474f56407c31ab01ef6e84c75b2/Rest-Profit-Margin.jpg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

question = "What's the atmosphere of this place like?"
inputs = blip_vqa_processor(images=image, text=question, return_tensors="pt")
# Send the encoded inputs to the model's device, requesting float16.
inputs = inputs.to(blip_vqa_model.device, torch.float16)

output = blip_vqa_model.generate(**inputs)
# Decode the generated ids and display the first (only) answer.
decoded_answers = blip_vqa_processor.batch_decode(output, skip_special_tokens=True)
decoded_answers[0]

In [None]:
# Score the "Rest-Profit-Margin" picture against six Korean labels with the
# Korean CLIP model loaded in an earlier cell.
url = "https://images.ctfassets.net/rric2f17v78a/30aPwszW6aWnCuKStRcKSq/cbd2a474f56407c31ab01ef6e84c75b2/Rest-Profit-Margin.jpg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

labels = ["고요함", "활기참", "시끄러움", "바다", "카페", "식당"]
encode_kwargs = dict(text=labels, images=image, return_tensors="pt", padding=True)
inputs = clip_korean_processor(**encode_kwargs)

# Forward pass without autograd tracking.
with torch.inference_mode():
    outputs = clip_korean_model(**inputs)

logits_per_image = outputs.logits_per_image
# Softmax over the label axis, then pick the best label for the single image.
probs = logits_per_image.softmax(dim=1)
best_per_image = probs.argmax(dim=1)
most_likely_idx = best_per_image.item()
most_likely_label = labels[most_likely_idx]
print(f"그림의 가장 그럴듯한 설명은 {most_likely_label}이고 그럴 확률은 다음과 같음: {probs[0][most_likely_idx].item():.3f}")
# Full distribution, one entry per label, scaled to percent.
print([f"{p * 100:.4f}" for p in probs[0].tolist()])

In [None]:
# Download the tour.paju.go.kr picture and show it as the cell output.
url = "https://tour.paju.go.kr/upload/tour/2019/1/4/f37d31f0-e5cb-4520-9b77-d7c45e77eba5.jpg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)
image

In [None]:
# Ask the BLIP VQA model (loaded in an earlier cell) about the atmosphere of the
# tour.paju.go.kr picture.
url = "https://tour.paju.go.kr/upload/tour/2019/1/4/f37d31f0-e5cb-4520-9b77-d7c45e77eba5.jpg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

question = "What's the atmosphere of this place like?"
inputs = blip_vqa_processor(images=image, text=question, return_tensors="pt")
# Send the encoded inputs to the model's device, requesting float16.
inputs = inputs.to(blip_vqa_model.device, torch.float16)

output = blip_vqa_model.generate(**inputs)
# Decode the generated ids and display the first (only) answer.
decoded_answers = blip_vqa_processor.batch_decode(output, skip_special_tokens=True)
decoded_answers[0]

In [None]:
# Score the tour.paju.go.kr picture against seven Korean labels with the
# Korean CLIP model loaded in an earlier cell.
url = "https://tour.paju.go.kr/upload/tour/2019/1/4/f37d31f0-e5cb-4520-9b77-d7c45e77eba5.jpg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

labels = ["고요함", "활기참", "시끄러움", "서울", "파주", "카페", "식당"]
encode_kwargs = dict(text=labels, images=image, return_tensors="pt", padding=True)
inputs = clip_korean_processor(**encode_kwargs)

# Forward pass without autograd tracking.
with torch.inference_mode():
    outputs = clip_korean_model(**inputs)

logits_per_image = outputs.logits_per_image
# Softmax over the label axis, then pick the best label for the single image.
probs = logits_per_image.softmax(dim=1)
best_per_image = probs.argmax(dim=1)
most_likely_idx = best_per_image.item()
most_likely_label = labels[most_likely_idx]
print(f"그림의 가장 그럴듯한 설명은 {most_likely_label}이고 그럴 확률은 다음과 같음: {probs[0][most_likely_idx].item():.3f}")
# Full distribution, one entry per label, scaled to percent.
print([f"{p * 100:.4f}" for p in probs[0].tolist()])

In [None]:
# Download the media.triple.guide picture and show it as the cell output.
url = "https://media.triple.guide/triple-cms/c_limit,f_auto,h_2048,w_2048/3c03f530-c85a-4fce-a240-200f99576398.jpeg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)
image

In [None]:
# Ask the BLIP VQA model (loaded in an earlier cell) about the atmosphere of the
# media.triple.guide picture.
url = "https://media.triple.guide/triple-cms/c_limit,f_auto,h_2048,w_2048/3c03f530-c85a-4fce-a240-200f99576398.jpeg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

question = "What's the atmosphere of this place like?"
inputs = blip_vqa_processor(images=image, text=question, return_tensors="pt")
# Send the encoded inputs to the model's device, requesting float16.
inputs = inputs.to(blip_vqa_model.device, torch.float16)

output = blip_vqa_model.generate(**inputs)
# Decode the generated ids and display the first (only) answer.
decoded_answers = blip_vqa_processor.batch_decode(output, skip_special_tokens=True)
decoded_answers[0]

In [None]:
# Score the media.triple.guide picture against eight Korean labels with the
# Korean CLIP model loaded in an earlier cell.
url = "https://media.triple.guide/triple-cms/c_limit,f_auto,h_2048,w_2048/3c03f530-c85a-4fce-a240-200f99576398.jpeg"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

labels = ["고요함", "활기참", "시끄러움", "파주", "시골", "정겨움", "옛날", "현대"]
encode_kwargs = dict(text=labels, images=image, return_tensors="pt", padding=True)
inputs = clip_korean_processor(**encode_kwargs)

# Forward pass without autograd tracking.
with torch.inference_mode():
    outputs = clip_korean_model(**inputs)

logits_per_image = outputs.logits_per_image
# Softmax over the label axis, then pick the best label for the single image.
probs = logits_per_image.softmax(dim=1)
best_per_image = probs.argmax(dim=1)
most_likely_idx = best_per_image.item()
most_likely_label = labels[most_likely_idx]
print(f"그림의 가장 그럴듯한 설명은 {most_likely_label}이고 그럴 확률은 다음과 같음: {probs[0][most_likely_idx].item():.3f}")
# Full distribution, one entry per label, scaled to percent.
print([f"{p * 100:.4f}" for p in probs[0].tolist()])

In [None]:
# Download the picture served through search.pstatic.net and show it as the cell output.
url = "https://search.pstatic.net/common/?src=http%3A%2F%2Fblogfiles.naver.net%2FMjAxNzAzMTRfMjYy%2FMDAxNDg5NDgxMjk3ODIx.Lkd-BqCz9qlvNpED2sApe8_3NegO9U4NiwmT_ZQsAO4g.aeD06pO47NfrS9aOO4pR3QJkJVyvQY_eq1Let7OT-Zsg.JPEG.jemiga%2FKakaoTalk_20170314_173941638.jpg&type=sc960_832"
response = requests.get(url, stream=True)
image = Image.open(response.raw)
image

In [None]:
# Score the search.pstatic.net picture against a 20-word Korean mood vocabulary
# with the Korean CLIP model loaded in an earlier cell.
url = "https://search.pstatic.net/common/?src=http%3A%2F%2Fblogfiles.naver.net%2FMjAxNzAzMTRfMjYy%2FMDAxNDg5NDgxMjk3ODIx.Lkd-BqCz9qlvNpED2sApe8_3NegO9U4NiwmT_ZQsAO4g.aeD06pO47NfrS9aOO4pR3QJkJVyvQY_eq1Let7OT-Zsg.JPEG.jemiga%2FKakaoTalk_20170314_173941638.jpg&type=sc960_832"
response = requests.get(url, stream=True)
image = Image.open(response.raw)

# Candidate labels, five per row; the order fixes the order of the printed scores.
labels = [
    "아늑함", "긴장감", "설렘", "우울함", "활기",
    "고요함", "따뜻함", "냉랭함", "신비로움", "낭만",
    "침착함", "불안", "경쾌함", "무거움", "차분함",
    "어두움", "화기애애", "냉정함", "몽환적", "소란스러움",
]
encode_kwargs = dict(text=labels, images=image, return_tensors="pt", padding=True)
inputs = clip_korean_processor(**encode_kwargs)

# Forward pass without autograd tracking.
with torch.inference_mode():
    outputs = clip_korean_model(**inputs)

logits_per_image = outputs.logits_per_image
# Softmax over the label axis, then pick the best label for the single image.
probs = logits_per_image.softmax(dim=1)
best_per_image = probs.argmax(dim=1)
most_likely_idx = best_per_image.item()
most_likely_label = labels[most_likely_idx]
# Unlike the other cells, the headline value here is scaled by 100 (percent).
print(f"그림의 가장 그럴듯한 설명은 {most_likely_label}이고 그럴 확률은 다음과 같음: {probs[0][most_likely_idx].item() * 100:.3f}")
# Full distribution, one entry per label, scaled to percent.
print([f"{p * 100:.4f}" for p in probs[0].tolist()])