<a href="https://colab.research.google.com/github/kapoor1309/AI_Hackathon-Flickd/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch

# Pick the compute backend once: the GPU if PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Echo the selected device so the active backend is visible in the cell output.
device

'cpu'

In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [None]:
!pip install transformers



In [None]:
import transformers

In [None]:
from transformers import AutoImageProcessor, AutoModelForObjectDetection, CLIPProcessor, CLIPModel, pipeline

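I used the `valentinafeve/yolos-fashionpedia` model from Hugging Face for object detection. Each detected crop is embedded with a CLIP model (`openai/clip-vit-base-patch32`, fine-tuned via supervised fine-tuning) and matched against the product catalog with a FAISS nearest-neighbour search, which yields the similarity score for that detection. For color classification, I used an open-source dataset of approximately 835 color names with their corresponding RGB values: the dominant color of each detected crop is extracted with k-means clustering, and a 1-nearest-neighbour (KNN) classifier maps it to the closest named color, giving each product a representative color.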

In [None]:
import os
import torch
import faiss
import json
import numpy as np
import cv2
import pandas as pd
from PIL import Image
from sklearn.neighbors import KNeighborsClassifier
from transformers import AutoImageProcessor, AutoModelForObjectDetection, CLIPProcessor, CLIPModel
from tqdm import tqdm

# Run every model on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Object detector -------------------------------------------------------
det_ckpt = "valentinafeve/yolos-fashionpedia"
processor = AutoImageProcessor.from_pretrained(det_ckpt)
det_model = AutoModelForObjectDetection.from_pretrained(det_ckpt)
det_model = det_model.to(device).eval()

# --- CLIP image encoder ----------------------------------------------------
# Start from the public base checkpoint, then overwrite its weights with the
# state dict stored in /content/clip_model.pth.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model = clip_model.to(device).eval()
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_state = torch.load("/content/clip_model.pth", map_location=device)
clip_model.load_state_dict(clip_state)

# --- Catalog search index and its metadata ---------------------------------
# presumably catalog_metadata[i] describes vector i of the index — TODO confirm
index = faiss.read_index("/content/faiss_catalog.index")
with open("/content/metadata.json", "r") as metadata_file:
    catalog_metadata = json.load(metadata_file)

# Detector class names; the loop below looks a predicted label id up by position
# in this list, so the order must not change.
CATS = [
    'shirt, blouse',
    'top, t-shirt, sweatshirt',
    'sweater', 'cardigan', 'jacket', 'vest', 'pants', 'shorts', 'skirt', 'coat',
    'dress', 'jumpsuit', 'cape', 'glasses', 'hat',
    'headband, head covering, hair accessory',
    'tie', 'glove', 'watch', 'belt', 'leg warmer',
    'tights, stockings',
    'sock', 'shoe',
    'bag, wallet',
    'scarf', 'umbrella', 'hood', 'collar', 'lapel', 'epaulette', 'sleeve',
    'pocket', 'neckline', 'buckle', 'zipper', 'applique', 'bead', 'bow',
    'flower', 'fringe', 'ribbon', 'rivet', 'ruffle', 'sequin', 'tassel',
]

# Coarse product type -> detector classes that collapse into it. Detector classes
# that appear in no group are ignored by the processing loop.
_coarse_to_fine = {
    "top": ["shirt, blouse", "top, t-shirt, sweatshirt", "sweater", "cardigan", "vest"],
    "bottom": ["pants", "shorts", "skirt"],
    "dress": ["dress", "jumpsuit"],
    "jacket": ["jacket", "coat"],
    "shoes": ["shoe"],
    "bag": ["bag, wallet"],
    "accessory": ["watch", "glasses", "hat", "headband, head covering, hair accessory"],
}
custom_label_map = {
    fine_label: coarse_label
    for coarse_label, fine_labels in _coarse_to_fine.items()
    for fine_label in fine_labels
}

# --- Named-colour lookup table ----------------------------------------------
# NOTE(review): the column names are supplied here rather than read from the
# file — confirm they line up with the CSV's actual columns.
color_df = pd.read_csv(
    "https://raw.githubusercontent.com/codebrainz/color-names/master/output/colors.csv",
    names=["name", "hex", "red", "green", "blue"],
)
color_names = color_df["name"].tolist()
color_rgbs = color_df[["red", "green", "blue"]].values

# A 1-nearest-neighbour classifier: an RGB triple is labelled with the name of
# the closest colour in the table.
knn = KNeighborsClassifier(n_neighbors=1).fit(color_rgbs, color_names)

def detect_dominant_color_knn(pil_image):
    """Return the name of the dominant colour of an image crop.

    The image is shrunk to 50x50, its pixels are clustered into three groups
    with OpenCV k-means, and the centre of the most populated cluster is handed
    to the module-level ``knn`` classifier, whose prediction is returned.

    k-means is initialised with ``cv2.KMEANS_RANDOM_CENTERS``, so the result can
    vary slightly between runs.

    Args:
        pil_image: The crop to analyse. PIL images are converted to RGB first;
            any other input is passed to ``np.array`` unchanged and must
            already be an H x W x 3 RGB array.

    Returns:
        The label ``knn`` predicts for the dominant RGB triple.
    """
    # Force three channels so RGBA / greyscale / palette crops reshape correctly.
    if isinstance(pil_image, Image.Image):
        pil_image = pil_image.convert("RGB")

    # Stay in RGB throughout: the classifier was fitted on (red, green, blue)
    # rows, and neither the resize nor k-means depends on channel order.
    rgb = cv2.resize(np.array(pil_image), (50, 50))
    pixels = rgb.reshape((-1, 3)).astype(np.float32)

    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    _, labels, palette = cv2.kmeans(pixels, 3, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)

    # np.unique only reports cluster ids that actually occur, so translate the
    # winning position back to a cluster id before indexing the palette.
    cluster_ids, counts = np.unique(labels, return_counts=True)
    dominant = palette[cluster_ids[np.argmax(counts)]].astype(int)
    return knn.predict([dominant])[0]

video_dir = "/content/drive/MyDrive/AI Hackathon/videos"
os.makedirs("outputs", exist_ok=True)

# Tunables, named so the numbers used below are self-explanatory.
NUM_SAMPLED_FRAMES = 25     # frames sampled evenly across each video
DETECTION_THRESHOLD = 0.5   # minimum detector score for a box to be kept
EXACT_THRESHOLD = 0.9       # sim_score above this is reported as "exact"
SIMILAR_THRESHOLD = 0.75    # sim_score above this is reported as "similar"

# For every .mp4 in video_dir: sample frames, detect garments, match each crop
# against the catalog index, and write outputs/<video_id>.json.
# Sorted so videos are processed in a reproducible order.
for file in tqdm(sorted(os.listdir(video_dir))):
    if not file.endswith(".mp4"):
        continue

    video_path = os.path.join(video_dir, file)
    # splitext removes only the trailing extension; str.replace would also strip
    # any ".mp4" occurring earlier in the file name.
    video_id = os.path.splitext(file)[0]
    all_products = []

    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames > 0:
            # For clips shorter than NUM_SAMPLED_FRAMES linspace repeats indices;
            # np.unique keeps each frame once so products are not duplicated.
            frame_indices = np.unique(
                np.linspace(0, total_frames - 1, NUM_SAMPLED_FRAMES, dtype=int)
            )
        else:
            print(f"No readable frames in: {video_path}")
            frame_indices = []

        for frame_index in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_index))
            ret, frame = cap.read()
            if not ret:
                continue

            # OpenCV decodes to BGR; the models are fed an RGB PIL image.
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            width, height = image.size

            det_inputs = processor(images=image, return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = det_model(**det_inputs)

            # PIL reports (width, height); reversed here to (height, width).
            target_sizes = torch.tensor([image.size[::-1]]).to(device)
            results = processor.post_process_object_detection(
                outputs, threshold=DETECTION_THRESHOLD, target_sizes=target_sizes
            )[0]

            for label, box in zip(results["labels"], results["boxes"]):
                label_name = CATS[label.item()]
                mapped_label = custom_label_map.get(label_name)
                if mapped_label is None:
                    # Detector class we do not report (e.g. garment parts).
                    continue

                # Clamp the box to the frame: PIL pads out-of-bounds regions with
                # black, which would pollute both the embedding and the colour.
                x1, y1, x2, y2 = map(int, box.tolist())
                x1, y1 = max(x1, 0), max(y1, 0)
                x2, y2 = min(x2, width), min(y2, height)
                if x2 <= x1 or y2 <= y1:
                    # Degenerate box: nothing to embed.
                    continue
                crop = image.crop((x1, y1, x2, y2))

                clip_inputs = clip_processor(images=crop, return_tensors="pt").to(device)
                with torch.no_grad():
                    emb = clip_model.get_image_features(**clip_inputs)
                    # L2-normalise so the embedding has unit length.
                    emb = emb / emb.norm(p=2, dim=-1, keepdim=True)
                emb = emb.cpu().numpy().astype('float32')

                D, I = index.search(emb, k=1)
                top_idx = int(I[0][0])
                if top_idx < 0:
                    # FAISS reports -1 when it has no neighbour to return; indexing
                    # the metadata with it would silently pick the last product.
                    continue

                # NOTE(review): this treats the FAISS score as a distance. If the
                # index is inner-product based the score is already a similarity,
                # and for a squared-L2 index over unit vectors cosine = 1 - D/2.
                # Confirm against how faiss_catalog.index was built.
                sim_score = float(1 - D[0][0])
                match = catalog_metadata[top_idx]

                if sim_score > EXACT_THRESHOLD:
                    match_type = "exact"
                elif sim_score > SIMILAR_THRESHOLD:
                    match_type = "similar"
                else:
                    continue  # "no_match": not reported

                all_products.append({
                    "frame": int(frame_index),
                    "type": mapped_label,
                    "color": detect_dominant_color_knn(crop),
                    "matched_product_id": match["product_id"],
                    "match_type": match_type,
                    # Rounded as a Python float so the JSON shows 3 decimals
                    # rather than float32 noise.
                    "confidence": round(sim_score, 3),
                })
    finally:
        # Release the capture even if a frame fails part-way through.
        cap.release()

    # "vibes" is left empty here; it is filled in by the classification cell.
    output = {
        "video_id": video_id,
        "vibes": [],
        "products": all_products
    }

    with open(f"outputs/{video_id}.json", "w") as f:
        json.dump(output, f, indent=4)

    print(f"Saved: outputs/{video_id}.json")


  4%|▍         | 1/24 [01:50<42:19, 110.41s/it]

✅ Saved: outputs/2025-05-22_08-25-12_UTC.json


 25%|██▌       | 6/24 [04:00<10:54, 36.34s/it] 

✅ Saved: outputs/2025-05-27_13-46-16_UTC.json


 33%|███▎      | 8/24 [06:28<13:02, 48.92s/it]

✅ Saved: outputs/2025-05-28_13-40-09_UTC.json


 50%|█████     | 12/24 [08:41<08:14, 41.23s/it]

✅ Saved: outputs/2025-05-28_13-42-32_UTC.json


 58%|█████▊    | 14/24 [11:07<08:15, 49.53s/it]

✅ Saved: outputs/2025-05-31_14-01-37_UTC.json


100%|██████████| 24/24 [13:17<00:00, 33.23s/it]

✅ Saved: outputs/2025-06-02_11-31-19_UTC.json





In [None]:
!pip install langchain langchain-core langchain-community


Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

In [None]:
!pip install langchain_groq

Collecting langchain_groq
  Downloading langchain_groq-0.3.2-py3-none-any.whl.metadata (2.6 kB)
Collecting groq<1,>=0.4.1 (from langchain_groq)
  Downloading groq-0.26.0-py3-none-any.whl.metadata (15 kB)
Downloading langchain_groq-0.3.2-py3-none-any.whl (15 kB)
Downloading groq-0.26.0-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.6/129.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq, langchain_groq
Successfully installed groq-0.26.0 langchain_groq-0.3.2


Using LangChain and Groq, I integrated an open-source LLM (`gemma2-9b-it`) to perform zero-shot classification, assigning one to three aesthetic vibe categories to each video reel based on its subtitle/caption text.



In [None]:
import os
import json
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Take the Groq key from the environment (prompting once if it is absent) instead
# of hard-coding it in the notebook, where it would be saved and shared.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass
    os.environ["GROQ_API_KEY"] = getpass("Groq API key: ")

llm = ChatGroq(
    temperature=0,
    model_name="gemma2-9b-it",
    groq_api_key=os.environ["GROQ_API_KEY"],
)

prompt = PromptTemplate.from_template(
    '''Given the subtitle/caption:
"{text}"

Select 1–3 matching aesthetic vibes from:
Coquette, Clean Girl, Cottagecore, Streetcore, Y2K, Boho, Party Glam

Return a comma-separated list only.'''
)

# prompt -> model -> plain string
chain = prompt | llm | StrOutputParser()

video_dir = "/content/drive/MyDrive/AI Hackathon/videos"
output_dir = "outputs"

# For every caption file that has a matching outputs/<video_id>.json, ask the
# model for vibes and store them under the "vibes" key of that JSON file.
for fname in sorted(os.listdir(video_dir)):
    if not fname.endswith(".txt"):
        continue

    txt_path = os.path.join(video_dir, fname)
    # splitext removes only the trailing extension; str.replace would also strip
    # any ".txt" occurring earlier in the file name.
    video_id = os.path.splitext(fname)[0]
    json_path = os.path.join(output_dir, f"{video_id}.json")

    # Only annotate videos that already have a product JSON.
    if not os.path.exists(json_path):
        continue

    # Captions can contain emoji and other non-ASCII text; read them as UTF-8
    # regardless of the platform default.
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read().strip()

    if not text:
        print(f"Empty text for {video_id}")
        continue

    try:
        raw_vibes = chain.invoke({"text": text})
    except Exception as e:
        # Best effort: report the failure and move on to the next video.
        print(f"Failed to classify {video_id}: {e}")
        continue

    # Split on bare commas and trim each item so "A,B", "A, B" and "A ,  B\n"
    # all parse the same way; empty items are dropped.
    vibes = [vibe.strip() for vibe in raw_vibes.split(",") if vibe.strip()]
    if not vibes:
        print(f"No vibes returned for {video_id}")
        continue

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    data["vibes"] = vibes

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    print(f"Updated vibes for: {video_id}")


✅ Updated vibes for: 2025-05-22_08-25-12_UTC
✅ Updated vibes for: 2025-05-27_13-46-16_UTC
✅ Updated vibes for: 2025-05-28_13-40-09_UTC
✅ Updated vibes for: 2025-05-28_13-42-32_UTC
✅ Updated vibes for: 2025-05-31_14-01-37_UTC
✅ Updated vibes for: 2025-06-02_11-31-19_UTC


In [None]:
!zip -r /content/outputs.zip /content/outputs


updating: content/outputs/ (stored 0%)
updating: content/outputs/2025-05-27_13-46-16_UTC.json (deflated 88%)
updating: content/outputs/2025-05-31_14-01-37_UTC.json (deflated 89%)
updating: content/outputs/2025-06-02_11-31-19_UTC.json (deflated 83%)
updating: content/outputs/2025-05-22_08-25-12_UTC.json (deflated 27%)
updating: content/outputs/2025-05-28_13-40-09_UTC.json (deflated 90%)
updating: content/outputs/2025-05-28_13-42-32_UTC.json (deflated 88%)


In [None]:
from google.colab import files

# Hand the zipped results to the browser as a download (Colab only).
archive_path = '/content/outputs.zip'
files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>