# Zero-shot image classification

## Zero-shot learning with CLIP

In [None]:
from datasets import load_dataset
import matplotlib.pyplot as plt

# Download the e-commerce product dataset and preview the first training
# example: print its text description, then display its image.
dset = "rajuptvs/ecommerce_products_clip"
dataset = load_dataset(dset)
first_example = dataset["train"][0]
print(first_example["Description"])
plt.imshow(first_example["image"])
plt.show()

In [None]:
import torch
from transformers import CLIPProcessor, CLIPModel

# Zero-shot classification with CLIP: score one product image against a fixed
# list of candidate category names and report the best-matching name.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

categories = ["shirt", "trousers", "shoes", "dress", "hat", "bag", "watch"]

# Tokenize every category name and preprocess the image in one call;
# padding=True lets the differently sized text prompts be batched together.
inputs = processor(text=categories, images=dataset["train"][0]["image"], return_tensors="pt", padding=True)

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)

# Softmax over dim=1 normalizes the image's scores across the categories;
# the arg-max index is then mapped back to its category name.
probs = outputs.logits_per_image.softmax(dim=1)
categories[probs.argmax().item()]

## Automated caption quality assessment

In [None]:
from torchmetrics.functional.multimodal import clip_score
from torchvision.transforms import ToTensor

# Rate how well the first product's description matches its image by
# computing a CLIP score for the (image, description) pair.
sample = dataset["train"][0]
description = sample["Description"]

# ToTensor() output is multiplied by 255 before scoring.
# NOTE(review): presumably this restores a 0-255 pixel range for clip_score - confirm.
image = ToTensor()(sample["image"]) * 255
score = clip_score(image, description, "openai/clip-vit-base-patch32")

print(f"CLIP score: {score}")

# Multi-modal sentiment analysis

## Prompting Vision Language Models (VLMs)

In [None]:
from datasets import load_dataset

# Load the "2017-01" configuration of the BBC news dataset and keep the
# `top_image` and `content` fields of row 87 for the sentiment example.
dset = "RealTimeData/bbc_news_alltime"
dataset = load_dataset(dset, "2017-01", split="train")
article = dataset[87]
image, content = article["top_image"], article["content"]
print(content)

In [None]:
from transformers import Qwen2VLForConditionalGeneration
from qwen_vl_utils import process_vision_info

# Load the Qwen2-VL 2B instruct checkpoint; both the device placement and the
# tensor dtype are delegated to the library via "auto".
vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype="auto",
    device_map="auto",
)

In [None]:
from transformers import Qwen2VLProcessor

# Lower and upper pixel-count bounds handed to the processor
# (the areas of a 224x224 and a 448x448 image respectively).
min_pixels = 224 * 224
max_pixels = 448 * 448

# Processor matching the Qwen2-VL checkpoint, configured with those bounds.
vl_model_processor = Qwen2VLProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    max_pixels=max_pixels,
    min_pixels=min_pixels,
)

In [None]:
# Build a single-turn, multi-modal chat message (one image plus one text
# question) asking the model to judge the article's sentiment.
#
# The prompt refers to `article_text` / `article_image`, which were never
# assigned in this notebook; bind them to the article body and top image
# loaded from the BBC news dataset so the cell no longer raises NameError.
# NOTE(review): `image` comes from the dataset's `top_image` field - confirm it
# is in a form (e.g. URL, path or PIL image) that process_vision_info accepts.
article_text = content
article_image = image

text_query = f"Is the sentiment of the following content good or bad for the Ford share price: {article_text}. Provide reasoning."
chat_template = [{
  "role": "user",
  "content": [
    {"type": "image", "image": article_image},
    {"type": "text", "text": text_query}
  ]
}]

## Multi-modal sentiment classification with Qwen

In [None]:
# Render the chat messages into a prompt string (not yet tokenized) that ends
# with the generation prompt, so the model continues with its answer.
text = vl_model_processor.apply_chat_template(chat_template, tokenize=False, add_generation_prompt=True)

# Collect the image inputs referenced by the messages; the second return
# value is unused here.
image_inputs, _ = process_vision_info(chat_template)

inputs = vl_model_processor(text=[text], images=image_inputs, padding=True, return_tensors="pt")

# The model is loaded with device_map="auto", so it may sit on an accelerator
# while the processor returns CPU tensors; move the inputs to the model's
# device before generating to avoid a device mismatch.
inputs = inputs.to(vl_model.device)

generated_ids = vl_model.generate(**inputs, max_new_tokens=500)

# Each generated sequence starts with its prompt tokens; slice them off so
# only the newly generated tokens remain.
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]

In [None]:
# Decode the trimmed token ids back to text (special tokens dropped, spacing
# left untouched) and show the answer for the single prompt in the batch.
output_text = vl_model_processor.batch_decode(
    generated_ids_trimmed,
    clean_up_tokenization_spaces=False,
    skip_special_tokens=True,
)
print(output_text[0])

# Zero-shot video classification

## Video and audio splitting

In [None]:
from moviepy.editor import VideoFileClip
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip

# Cut the span from 0 to 5 of "advert.mp4" into its own file, then write that
# clip's audio track out as an MP3.
ffmpeg_extract_subclip("advert.mp4", 0, 5, "advert_5s.mp4")

# Use the clip as a context manager so it is closed once the audio has been
# written, instead of leaving its resources open for the rest of the session.
with VideoFileClip("advert_5s.mp4") as video:
    audio = video.audio
    # A clip without an audio track exposes `audio` as None; fail with a clear
    # message rather than an AttributeError on None.
    if audio is None:
        raise ValueError("advert_5s.mp4 has no audio track to extract")
    audio.write_audiofile("advert_5s.mp3")

## Video sentiment analysis with CLIP and CLAP

In [None]:
from decord import VideoReader
from PIL import Image
from datasets import Dataset, Audio

# `video_path` / `audio_path` were used below without ever being assigned in
# this notebook; point them at the 5-second clip and MP3 written by the
# video/audio splitting cell.
video_path = "advert_5s.mp4"
audio_path = "advert_5s.mp3"

# Decode the leading frames of the clip (at most 20, fewer if the clip is
# shorter) into a numpy batch.
video_reader = VideoReader(video_path)
num_frames = min(20, len(video_reader))
video = video_reader.get_batch(range(num_frames)).asnumpy()

# Reverse the order of the last (channel) axis of every frame.
# NOTE(review): decord is documented to return RGB frames, in which case this
# flip hands BGR data to PIL - confirm the reversal is intended.
video = video[:, :, :, ::-1]
video = [Image.fromarray(frame) for frame in video]

# Decode the audio file through a one-row dataset. The raw array is later
# passed to a CLAP pipeline, which assumes it is already at the model's
# sampling rate (48 kHz), so resample while decoding.
audio_dataset = Dataset.from_dict({"audio": [audio_path]}).cast_column("audio", Audio(sampling_rate=48_000))
audio_sample = audio_dataset[0]["audio"]["array"]

In [None]:
from transformers import pipeline

# Candidate emotion labels shared by the image and audio classifiers.
emotions = ["joy", "fear", "anger", "sadness", "disgust", "surprise", "neutral"]

# Classify every video frame against the emotion labels with CLIP.
image_classifier = pipeline(
    task="zero-shot-image-classification",
    model="openai/clip-vit-large-patch14",
)
predictions = image_classifier(video, candidate_labels=emotions)

# Reshape each frame's list of {label, score} entries into a label -> score map.
scores = [
    {entry["label"]: entry["score"] for entry in frame_prediction}
    for frame_prediction in predictions
]

# Average each emotion's score over all frames.
avg_image_scores = {
    emotion: sum(frame_scores[emotion] for frame_scores in scores) / len(scores)
    for emotion in emotions
}
print(f"Average scores: {avg_image_scores}")

In [None]:
# Classify the audio sample against the same emotion labels with CLAP.
audio_class = pipeline(
    task="zero-shot-audio-classification",
    model="laion/clap-htsat-unfused",
)

# Collapse the pipeline's list of {label, score} entries into a label -> score map.
audio_scores = {
    entry["label"]: entry["score"]
    for entry in audio_class(audio_sample, candidate_labels=emotions)
}

# Fuse the modalities with an unweighted mean of image and audio scores.
multimodal_scores = {
    emotion: (avg_image_scores[emotion] + audio_scores[emotion]) / 2
    for emotion in emotions
}
print(f"Multimodal scores: {multimodal_scores}")