In [None]:
import os
import pandas as pd
import torch
import clip
from PIL import Image
import numpy as np
import faiss

# -------------------------------
# Paths
# -------------------------------
# Submission identity -- edit these two values for each submitted run.
GROUP_NAME = "ELITE_CODERS"
APPROACH_NAME = "CLIP"

# Everything else lives underneath BASE_DIR.
BASE_DIR = r"C:/Users/priya/OneDrive/Desktop/mediaval"
FEATURE_DIR = os.path.join(BASE_DIR, "features_batches")
IMAGE_DIR = os.path.join(BASE_DIR, "newsimages_25_v1.1", "newsimages")
CSV_PATH = os.path.join(BASE_DIR, "newsarticles.csv")

# Results are written to <BASE_DIR>/<GROUP_NAME>/RET_<APPROACH_NAME>_LARGE.
# NOTE(review): the folder name says "LARGE" while the model loaded further
# down is "ViT-B/32" -- confirm which subtask/model this folder is meant for.
OUTPUT_DIR = os.path.join(BASE_DIR, GROUP_NAME, f"RET_{APPROACH_NAME}_LARGE")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# -------------------------------
# Load dataset and clean
# -------------------------------
# header=None: the first line of the file is read as data, and the column
# names are supplied explicitly below.
raw_articles = pd.read_csv(
    CSV_PATH,
    header=None,
    names=[
        "article_id",
        "article_url",
        "article_title",
        "article_tags",
        "image_id",
        "image_url",
    ],
)

# Keep only rows that have both an image id and a title.
has_required_fields = (
    raw_articles['image_id'].notna() & raw_articles['article_title'].notna()
)
df = raw_articles[has_required_fields].copy()

# Missing tags become an empty string; image ids become whitespace-trimmed
# strings so they can be compared with ids loaded from other files.
df['article_tags'] = df['article_tags'].fillna("")
df['image_id'] = df['image_id'].astype(str).str.strip()
print(f"✅ Loaded {len(df)} rows after cleaning")

# -------------------------------
# Step 1: Load precomputed features
# -------------------------------
# FEATURE_DIR holds paired files: "features_<suffix>.npy" (feature vectors)
# and "ids_<suffix>.csv" (the matching ids). Row i of a features file is
# assumed to belong to id i of its ids file -- TODO confirm against the
# script that wrote these batches.
all_features, all_ids = [], []

# os.listdir() returns entries in arbitrary order; sorting keeps the row
# order of the combined matrix reproducible from run to run.
for file in sorted(os.listdir(FEATURE_DIR)):
    if not (file.startswith("features_") and file.endswith(".npy")):
        continue

    # atleast_2d: a batch saved as a single 1-D vector counts as one row,
    # which is also how np.vstack below treats it.
    feats = np.atleast_2d(np.load(os.path.join(FEATURE_DIR, file)))

    ids_file = file.replace("features_", "ids_").replace(".npy", ".csv")
    # squeeze("columns") always yields a Series for a one-column file. A bare
    # squeeze() would collapse a one-row file to a scalar, and a numeric
    # scalar's .tolist() returns a plain number that cannot be iterated.
    id_column = pd.read_csv(os.path.join(FEATURE_DIR, ids_file)).squeeze("columns")
    if not isinstance(id_column, pd.Series):
        raise ValueError(f"{ids_file}: expected exactly one column of ids")
    ids = [str(i).strip() for i in id_column.tolist()]  # ensure string

    # A count mismatch would silently pair every later vector with the wrong
    # id, so stop here instead of building a misaligned index.
    if len(ids) != feats.shape[0]:
        raise ValueError(
            f"{file} has {feats.shape[0]} vectors but {ids_file} has {len(ids)} ids"
        )

    all_features.append(feats)
    all_ids.extend(ids)

if not all_features:
    raise FileNotFoundError(f"No features_*.npy files found in {FEATURE_DIR}")

combined_features = np.vstack(all_features).astype("float32")
image_id_list = all_ids
print(f"✅ Loaded {combined_features.shape[0]} feature vectors")

# -------------------------------
# Step 2: Build FAISS Index
# -------------------------------
# IndexFlatIP ranks by raw inner product, which equals cosine similarity only
# when both the stored vectors and the query are unit length. The text query
# is normalised before searching, so the stored vectors are normalised here
# as well; if they were already unit length this changes nothing.
dim = combined_features.shape[1]

row_norms = np.linalg.norm(combined_features, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0  # keep all-zero rows as-is instead of dividing by zero

# FAISS expects a C-contiguous float32 matrix. combined_features itself is
# left unmodified in case it is used elsewhere.
unit_features = np.ascontiguousarray(combined_features / row_norms, dtype="float32")

index = faiss.IndexFlatIP(dim)
index.add(unit_features)
print(f"⚙️ FAISS index built with {index.ntotal} vectors")

# -------------------------------
# Step 3: Run retrieval for ALL queries
# -------------------------------
# For every distinct article title: encode the title with CLIP, look up the
# best-scoring image in the FAISS index, resize it to 460x260 and save it as
# "<article_id>_<GROUP_NAME>_<APPROACH_NAME>.png" in OUTPUT_DIR.
#
# The file is named after the article that issued the query, not after the
# article the retrieved image originally belonged to. Naming by the image's
# own article would write a wrong hit under another article's id and leave
# the querying article without any output file.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

K = 1
unique_titles = df['article_title'].unique()
print(f"\n🔄 Running retrieval for {len(unique_titles)} queries...")

missing_images = []

# Several articles can share one title. Group their ids so each title is
# encoded and searched once while every article still gets its own file.
# Plain dicts keep insertion order, so titles are processed in file order.
article_ids_by_title = {}
for article_id, title in zip(df['article_id'], df['article_title']):
    if pd.isna(article_id):
        continue  # no usable id to build a file name from
    ids_for_title = article_ids_by_title.setdefault(title, [])
    if article_id not in ids_for_title:
        ids_for_title.append(article_id)

for query_title, article_ids in article_ids_by_title.items():
    query = str(query_title).strip()
    if not query:
        continue

    # Encode query. truncate=True stops clip.tokenize from raising when a
    # title is still longer than the model's context length after the
    # 200-character cut.
    text = clip.tokenize([query[:200]], truncate=True).to(device)
    with torch.no_grad():
        q_feat = model.encode_text(text)
        q_feat /= q_feat.norm(dim=-1, keepdim=True)

    # Search in FAISS
    search_vector = q_feat.cpu().numpy().astype("float32")
    D, I = index.search(search_vector, K)

    # Walk the hits best-first and stop at the first one that can be saved,
    # so with K > 1 a lower-ranked hit is only used as a fallback and never
    # overwrites a better one.
    for idx in I[0]:
        if idx == -1:  # FAISS pads with -1 when it has fewer than K results
            continue
        image_id = image_id_list[idx]

        # Check for jpg or png
        candidate_paths = (
            os.path.join(IMAGE_DIR, f"{image_id}.jpg"),
            os.path.join(IMAGE_DIR, f"{image_id}.png"),
        )
        img_path = next((p for p in candidate_paths if os.path.exists(p)), None)
        if img_path is None:
            missing_images.append(image_id)
            print(f"❌ Image not found: {image_id}")
            continue

        # Load + resize + save
        try:
            # The context manager releases the file handle once the pixels
            # have been copied by convert().
            with Image.open(img_path) as source_image:
                image = source_image.convert("RGB").resize((460, 260))
            for article_id in article_ids:
                save_name = f"{article_id}_{GROUP_NAME}_{APPROACH_NAME}.png"
                save_path = os.path.join(OUTPUT_DIR, save_name)
                image.save(save_path, format="PNG")
                print(f"✅ Saved: {save_name}")
        except Exception as e:
            # Best effort: one unreadable or unwritable image must not abort
            # the whole run, so report it and try the next hit.
            print(f"❌ Error saving {img_path}: {e}")
            continue
        break

print("🎯 Retrieval complete!")
if missing_images:
    print(f"⚠️ Missing {len(missing_images)} images. See list below:")
    print(missing_images)
print("Submission files ready in:", OUTPUT_DIR)


✅ Loaded 8501 rows after cleaning
✅ Loaded 8499 feature vectors
⚙️ FAISS index built with 8499 vectors

🔄 Running retrieval for 8159 queries...
✅ Saved: 5787_ELITE_CODERS_CLIP.png
✅ Saved: 1_ELITE_CODERS_CLIP.png
✅ Saved: 2_ELITE_CODERS_CLIP.png
✅ Saved: 3_ELITE_CODERS_CLIP.png
✅ Saved: 4_ELITE_CODERS_CLIP.png
✅ Saved: 5_ELITE_CODERS_CLIP.png
✅ Saved: 2448_ELITE_CODERS_CLIP.png
✅ Saved: 7_ELITE_CODERS_CLIP.png
✅ Saved: 8_ELITE_CODERS_CLIP.png
✅ Saved: 8185_ELITE_CODERS_CLIP.png
✅ Saved: 10_ELITE_CODERS_CLIP.png
✅ Saved: 11_ELITE_CODERS_CLIP.png
✅ Saved: 12_ELITE_CODERS_CLIP.png
✅ Saved: 8404_ELITE_CODERS_CLIP.png
✅ Saved: 14_ELITE_CODERS_CLIP.png
✅ Saved: 691_ELITE_CODERS_CLIP.png
✅ Saved: 16_ELITE_CODERS_CLIP.png
✅ Saved: 17_ELITE_CODERS_CLIP.png
✅ Saved: 18_ELITE_CODERS_CLIP.png
✅ Saved: 19_ELITE_CODERS_CLIP.png
✅ Saved: 20_ELITE_CODERS_CLIP.png
✅ Saved: 21_ELITE_CODERS_CLIP.png
✅ Saved: 615_ELITE_CODERS_CLIP.png
✅ Saved: 23_ELITE_CODERS_CLIP.png
✅ Saved: 24_ELITE_CODERS_CLIP.png
✅ S