In [3]:
# Method 1: zero-shot image classification without `pipeline`, driving
# CLIPProcessor and CLIPModel directly.
import os

# Download models from https://hf-mirror.com instead of huggingface.co.
# NOTE(review): kept above the `transformers` import on purpose -- the endpoint
# is presumably read when the hub client is first imported; confirm before
# reordering.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Load the pretrained CLIP model.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# The processor prepares both the image and the text prompts for the model.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Open the image inside a context manager so the file handle is released.
# convert() decodes the pixel data into a new in-memory image, so `image`
# stays usable after the file has been closed.
with Image.open("./rabbit.jpg") as image_file:
    image = image_file.convert("RGB")

# Candidate text labels to score against the image.
labels = ["a photo of rabbits", "a photo of dogs", "a photo of cars"]
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)

# This is inference only: disable autograd so no computation graph is built.
# As a consequence the printed tensor no longer carries a `grad_fn`.
with torch.no_grad():
    outputs = model(**inputs)

# Image-to-text similarity scores, one row per image and one column per label.
logits_per_image = outputs.logits_per_image

# Softmax across the label dimension turns the scores into probabilities.
probs = logits_per_image.softmax(dim=1)
print(probs)

tensor([[9.9982e-01, 1.3417e-04, 4.3464e-05]], grad_fn=<SoftmaxBackward0>)


In [6]:
# Method 2: the same zero-shot classification, more concisely, via `pipeline`.
import os

# Download models from https://hf-mirror.com instead of huggingface.co.
# NOTE(review): kept above the `transformers` import on purpose -- the endpoint
# is presumably read when the hub client is first imported; confirm before
# reordering.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import torch
from transformers import pipeline

# Pick the device at run time instead of hard-coding GPU 0, so the cell also
# runs on machines without CUDA. On a CUDA machine the configuration is the
# same as before (device 0, bfloat16); on CPU it falls back to float32.
use_cuda = torch.cuda.is_available()

# Load CLIP through the zero-shot image classification pipeline.
clip = pipeline(
   task="zero-shot-image-classification",
   model="openai/clip-vit-base-patch32",
   torch_dtype=torch.bfloat16 if use_cuda else torch.float32,
   device=0 if use_cuda else -1,  # -1 selects the CPU
)

labels = ["a photo of rabbit", "a photo of dogs", "a photo of cars"]

# Returns a list of dicts, each holding a candidate label and its score.
clip("./rabbit.jpg", candidate_labels=labels)

Device set to use cuda:0


[{'score': 1.0, 'label': 'a photo of rabbit'},
 {'score': 2.1457672119140625e-05, 'label': 'a photo of dogs'},
 {'score': 2.562999725341797e-06, 'label': 'a photo of cars'}]