In [None]:
import os
import sys
import torch
import pandas as pd
import random
from PIL import Image
from tqdm import tqdm

sys.path.append(os.path.join(os.getcwd(), 'CLIP'))
import clip

# Select, for every image listed in captions.csv, the caption (out of
# caption_1 .. caption_5) that CLIP scores as most similar to the image, and
# write the result to captions_cleaned1.csv with two extra columns:
#   training_caption     - the chosen caption text
#   training_clip_score  - cosine similarity between image and chosen caption

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
model = model.to(device)

df = pd.read_csv("captions.csv")
image_folder = "RISCM/resized/"

# Column names are loop-invariant, so build them once.
caption_columns = [f"caption_{i}" for i in range(1, 6)]

training_captions = []
training_clip_scores = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Selecting best captions"):
    # NOTE(review): str() turns a missing (NaN) cell into the literal "nan",
    # which would then be scored like a real caption - confirm the CSV has no
    # empty caption cells.
    captions = [str(row[column]) for column in caption_columns]

    # Open inside a context manager so the file handle is released as soon as
    # the pixels have been converted, instead of leaking one handle per row.
    with Image.open(os.path.join(image_folder, row["image"])) as img:
        image_input = preprocess(img.convert("RGB")).unsqueeze(0).to(device)
    text_inputs = clip.tokenize(captions).to(device)

    with torch.no_grad():
        # Cast to float32 so scoring is not done at reduced precision when the
        # model produces half-precision features.
        image_features = model.encode_image(image_input).float()
        text_features = model.encode_text(text_inputs).float()

        # L2-normalise both sides so the dot product is a cosine similarity.
        # Without this the ranking is biased by the magnitude of each text
        # embedding and the stored score is not comparable across images.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        similarities = (image_features @ text_features.T).squeeze(0).tolist()

    # Break exact ties uniformly at random rather than always taking the first.
    max_score = max(similarities)
    best_indices = [i for i, s in enumerate(similarities) if s == max_score]
    chosen_index = random.choice(best_indices)

    training_captions.append(captions[chosen_index])
    training_clip_scores.append(max_score)

df["training_caption"] = training_captions
df["training_clip_score"] = training_clip_scores
df.to_csv("captions_cleaned1.csv", index=False)

print("Dataset with best captions are saved.")
