In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import clip
import torch, os, shutil, json
from tqdm import tqdm
from PIL import Image, ImageOps
# Root folder of the input data; each immediate sub-directory is treated as
# one car model by the batch loop at the bottom of this cell.
MAIN_FOLDER = "/content/drive/MyDrive/Colab Notebooks/Maruti"
# Base folder for the classified copies. This is a relative path, so it is
# resolved against the current working directory at run time.
OUTPUT_BASE = "Maruti"
# Checkpoint id handed to BlipProcessor / BlipForConditionalGeneration.
BLIP_MODEL = "Salesforce/blip-image-captioning-large"
# Model name handed to clip.load().
CLIP_MODEL = "ViT-B/32"
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)
# Load both models once; the functions below use these module-level objects.
processor = BlipProcessor.from_pretrained(BLIP_MODEL)
blip = BlipForConditionalGeneration.from_pretrained(BLIP_MODEL).to(device)
clip_model, clip_preprocess = clip.load(CLIP_MODEL, device=device)
print("Models loaded.\n")
def resize_letterbox(img, target_w=900, target_h=600):
    """Scale *img* to fit inside a target canvas, padding the rest with black.

    The aspect ratio is preserved: the image is scaled by the largest factor
    that keeps both sides within ``target_w`` x ``target_h`` and is then
    pasted centred onto a black RGB canvas of exactly that size.

    Args:
        img: PIL image to resize.
        target_w: Width of the returned canvas in pixels.
        target_h: Height of the returned canvas in pixels.

    Returns:
        A new RGB PIL image of size ``(target_w, target_h)``.
    """
    img_w, img_h = img.size
    scale = min(target_w / img_w, target_h / img_h)

    # Truncation can yield 0 for very elongated images (e.g. 50000x50 scaled
    # to fit 900 px wide gives a height of 0.9), and resize() rejects a
    # zero-sized target, so keep at least one pixel per side.
    new_w = max(1, int(img_w * scale))
    new_h = max(1, int(img_h * scale))

    img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS)

    new_img = Image.new("RGB", (target_w, target_h), (0, 0, 0))
    # Integer-divide the leftover space so the picture sits in the centre.
    new_img.paste(img_resized, ((target_w - new_w) // 2, (target_h - new_h) // 2))

    return new_img
def load_safe_image(path):
    """Load an image for classification, returning ``None`` if it is unusable.

    The file is opened, rotated according to its EXIF orientation, converted
    to RGB and letterboxed to 900x600. Images narrower or shorter than
    50 pixels are rejected.

    Args:
        path: Path of the image file to load.

    Returns:
        A 900x600 RGB PIL image, or ``None`` when the file cannot be read or
        is smaller than 50 pixels on either side.
    """
    try:
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied out by convert(); without it every processed
        # image would leave an open file handle behind.
        with Image.open(path) as raw:
            img = ImageOps.exif_transpose(raw)
            img = img.convert("RGB")

        w, h = img.size
        if w < 50 or h < 50:
            return None

        return resize_letterbox(img, 900, 600)

    except Exception:
        # Deliberately best-effort: any decoding problem marks the file as
        # unusable so the caller can file it under "Others" and carry on.
        return None
def is_image_file(filepath):
    """Return ``True`` if *filepath* can be opened and verified as an image.

    Args:
        filepath: Path of the file to check.

    Returns:
        ``True`` when ``Image.open`` followed by ``verify()`` succeeds,
        ``False`` when either step raises.
    """
    try:
        with Image.open(filepath) as img:
            img.verify()
        return True
    except Exception:
        # Not a bare ``except:`` so that Ctrl-C (KeyboardInterrupt) and
        # SystemExit still stop a long folder scan instead of being
        # reported as "not an image".
        return False
# Caption keywords used by blip_classify(). Each keyword is tested with a
# plain substring check against the lower-cased caption.

# Keywords that vote for the "Exterior" category.
EXTERIOR_KEYWORDS = [
    "exterior",
    "outside",
    "road",
    "side view",
    "front view",
    "rear view",
    "profile",
    "headlight",
    "wheel",
    "grille",
    "engine",
    "hood",
    "roof",
]

# Keywords that vote for the "Interior" category.
INTERIOR_KEYWORDS = [
    "interior",
    "inside",
    "cabin",
    "cockpit",
    "seat",
    "dashboard",
    "steering",
    "display",
    "screen",
    "infotainment",
    "gear",
    "shifter",
    "sunroof",
]

# Keywords hinting that the picture is not a car photo; two or more hits
# send the image to "Others".
NON_CAR_KEYWORDS = [
    "person",
    "phone",
    "holding",
    "laptop",
    "logo",
    "advertisement",
    "poster",
    "graphic",
    "youtube",
    "instagram",
]



# Fine-grained labels per coarse category. The label text itself is what gets
# tokenized for CLIP in build_clip_text_embeds(), and clip_label() returns one
# of these (or "<category>-Closeup" when the best score is below its cut-off).
STRICT_LABELS = dict(
    Exterior=[
        "Front-view", "Rear-view",
        "Left-View", "Right-View",
        "Front-Left", "Front-Right",
        "Rear-Left", "Rear-Right",
        "Car-Roof",
        "Engine-Motor-Bay",
        "Battery(EV-Hybrid)",
        "Exterior-Closeup",
        "Exterior-General",
    ],
    Interior=[
        "Dashboard",
        "Infotainment-System",
        "Front-seats", "Back-seats",
        "Gearbox",
        "Sunroof",
        "Boot-Trunk",
        "Interior-Closeup",
        "Interior-General",
    ],
)
def safe_label(label):
    """Return *label* with every "/" and every space replaced by "-"."""
    return label.translate(str.maketrans({"/": "-", " ": "-"}))
def build_clip_text_embeds():
    """Encode every label in ``STRICT_LABELS`` with the CLIP text encoder.

    Returns:
        A nested dict ``{category: {label: embedding}}`` where each embedding
        is the CLIP text encoding of the label string, divided by its own
        norm along the last dimension.
    """
    def _encode(text):
        # One label at a time: tokenize, encode without tracking gradients,
        # then scale to unit length.
        tokens = clip.tokenize(text).to(device)
        with torch.no_grad():
            vector = clip_model.encode_text(tokens)
            return vector / vector.norm(dim=-1, keepdim=True)

    return {
        category: {name: _encode(name) for name in names}
        for category, names in STRICT_LABELS.items()
    }
# Encode the label texts once up front; clip_label() reads this table for
# every image instead of re-encoding the labels.
TEXT_EMBEDS = build_clip_text_embeds()
print("CLIP text embeddings ready.\n")
def blip_classify(caption):
    """Map a caption to "Interior", "Exterior" or "Others" by keyword votes.

    The caption is lower-cased and each keyword list is scored by how many of
    its entries occur in it as substrings. Two or more non-car keywords, or
    no interior/exterior keyword at all, give "Others". Otherwise the list
    with more hits wins, and a tie goes to "Exterior".

    NOTE(review): matching is by substring, so a caption containing
    "sunroof" also scores a hit for the exterior keyword "roof".

    Args:
        caption: Caption text to classify.

    Returns:
        One of the strings "Interior", "Exterior" or "Others".
    """
    text = caption.lower()

    def hits(keywords):
        # Number of keywords that appear anywhere in the caption.
        return sum(1 for word in keywords if word in text)

    if hits(NON_CAR_KEYWORDS) >= 2:
        return "Others"

    interior_hits = hits(INTERIOR_KEYWORDS)
    exterior_hits = hits(EXTERIOR_KEYWORDS)

    if not (interior_hits or exterior_hits):
        return "Others"

    if interior_hits > exterior_hits:
        return "Interior"
    return "Exterior"



def clip_label(image, category):
    """Pick the fine-grained label of *category* that best matches *image*.

    The image is encoded with CLIP, normalised to unit length and compared
    (dot product) with every pre-computed label embedding of the category.

    Args:
        image: PIL image to label.
        category: Key into ``TEXT_EMBEDS`` ("Exterior" or "Interior").

    Returns:
        The best-scoring label, or ``"<category>-Closeup"`` when the best
        score is below 0.22.
    """
    batch = clip_preprocess(image).unsqueeze(0).to(device)

    with torch.no_grad():
        image_vec = clip_model.encode_image(batch)
        image_vec = image_vec / image_vec.norm(dim=-1, keepdim=True)

    # Seed with the fallback so it is kept unless some label scores strictly
    # higher; max() keeps the first of equal maxima.
    candidates = [(-999, f"{category}-General")]
    candidates += [
        (float((image_vec @ text_vec.T).cpu().squeeze()), name)
        for name, text_vec in TEXT_EMBEDS[category].items()
    ]
    top_score, top_label = max(candidates, key=lambda pair: pair[0])

    # Weak matches are reported as a generic close-up of the category.
    return f"{category}-Closeup" if top_score < 0.22 else top_label



def process_model(input_path, output_path):
    """Classify every image of one model folder and copy it to its category.

    Creates ``Exterior``, ``Interior`` and ``Others`` under *output_path*,
    then for each file in *input_path* that passes ``is_image_file``:

    * files that ``load_safe_image`` rejects are copied to ``Others`` with an
      ``error_`` prefix;
    * the rest are captioned with BLIP and assigned a coarse category by
      ``blip_classify``; "Others" images are copied under their own name;
    * Interior/Exterior images get a fine label from ``clip_label`` and are
      copied as ``<label>_<original name>``, with a numeric counter inserted
      when that name already exists.

    A ``classification_log.json`` describing the Interior/Exterior images is
    written to *output_path*.

    Args:
        input_path: Folder containing the images of one model.
        output_path: Folder that receives the classified copies.

    Returns:
        A dict with the number of images per category ("Exterior",
        "Interior", "Others"), or ``None`` if the folder has no images.
    """
    categories = ("Exterior", "Interior", "Others")
    for bucket in categories:
        os.makedirs(os.path.join(output_path, bucket), exist_ok=True)

    image_names = []
    for entry in os.listdir(input_path):
        if is_image_file(os.path.join(input_path, entry)):
            image_names.append(entry)

    if len(image_names) == 0:
        print("  No images found.")
        return None

    counts = dict.fromkeys(categories, 0)
    records = []
    others_dir = os.path.join(output_path, "Others")

    for filename in tqdm(image_names, desc="  Classifying", leave=False):
        source = os.path.join(input_path, filename)
        picture = load_safe_image(source)

        # Unreadable or too-small images are kept, but flagged in the name.
        if picture is None:
            shutil.copy(source, os.path.join(others_dir, f"error_{filename}"))
            counts["Others"] += 1
            continue

        # Caption the image with BLIP.
        encoded = processor(images=picture, return_tensors="pt").to(device)
        generated = blip.generate(**encoded, max_length=50)
        caption = processor.decode(generated[0], skip_special_tokens=True)

        # Coarse category from the caption keywords.
        category = blip_classify(caption)
        if category == "Others":
            shutil.copy(source, os.path.join(others_dir, filename))
            counts["Others"] += 1
            continue

        # Fine label from CLIP, made safe for use in a file name.
        detail = safe_label(clip_label(picture, category))

        # Build a destination name that does not clash with an existing file.
        stem, suffix = os.path.splitext(filename)
        suffix = suffix or ".jpg"
        target_dir = os.path.join(output_path, category)
        final_name = f"{detail}_{stem}{suffix}"
        attempt = 0
        while os.path.exists(os.path.join(target_dir, final_name)):
            attempt += 1
            final_name = f"{detail}_{attempt}_{stem}{suffix}"

        shutil.copy(source, os.path.join(target_dir, final_name))
        counts[category] += 1

        records.append({
            "original": filename,
            "caption": caption,
            "coarse": category,
            "label": detail,
            "new_filename": final_name,
        })

    log_path = os.path.join(output_path, "classification_log.json")
    with open(log_path, "w") as handle:
        json.dump(records, handle, indent=2)

    return counts



# Batch driver: classify every model folder found under MAIN_FOLDER and write
# the results below OUTPUT_BASE.
print("\nStarting batch classification...\n")

# Every immediate sub-directory of MAIN_FOLDER is treated as one model.
models = [f for f in os.listdir(MAIN_FOLDER)
          if os.path.isdir(os.path.join(MAIN_FOLDER, f))]

# Per-model category counts, keyed by model folder name.
all_results = {}

for model in models:
    in_path  = os.path.join(MAIN_FOLDER, model)
    out_path = os.path.join(OUTPUT_BASE, model)

    # OUTPUT_BASE is relative, so depending on the working directory it can
    # resolve to the input folder itself (or one of its parents). Wiping it
    # below would then delete the source images, so refuse to continue.
    in_real = os.path.normcase(os.path.realpath(in_path))
    out_real = os.path.normcase(os.path.realpath(out_path))
    try:
        clobbers_input = os.path.commonpath([in_real, out_real]) == out_real
    except ValueError:
        # Raised when the paths cannot share a prefix (e.g. different
        # drives on Windows), in which case they cannot overlap.
        clobbers_input = False
    if clobbers_input:
        raise ValueError(
            f"Output folder {out_path!r} resolves to the input folder "
            f"{in_path!r} or one of its parents; clearing it would delete "
            "the source images. Change OUTPUT_BASE or the working directory."
        )

    # Start from a clean output folder so stale results never mix in.
    if os.path.exists(out_path):
        shutil.rmtree(out_path)

    print(f"\nProcessing model: {model}")
    res = process_model(in_path, out_path)

    # process_model() returns None when the folder contains no images.
    if res:
        all_results[model] = res
        print(f" → Interior: {res['Interior']}  Exterior: {res['Exterior']}  Others: {res['Others']}")


print("\nDONE.\n")
print("Output saved to:", OUTPUT_BASE)


Using device: cuda
Models loaded.

CLIP text embeddings ready.


Starting batch classification...


Processing model: invicto




 → Interior: 35  Exterior: 7  Others: 97

Processing model: xl6




 → Interior: 26  Exterior: 9  Others: 92

Processing model: brezza




 → Interior: 15  Exterior: 44  Others: 92

Processing model: victoris




 → Interior: 13  Exterior: 12  Others: 117

Processing model: fronx




 → Interior: 27  Exterior: 18  Others: 98

Processing model: ciaz




 → Interior: 4  Exterior: 1  Others: 75

Processing model: dzire




 → Interior: 15  Exterior: 9  Others: 122

Processing model: ertiga




 → Interior: 15  Exterior: 16  Others: 111

Processing model: s-presso




 → Interior: 14  Exterior: 26  Others: 104

Processing model: baleno




 → Interior: 27  Exterior: 11  Others: 104

Processing model: alto-k10




 → Interior: 12  Exterior: 7  Others: 119

Processing model: celerio




 → Interior: 15  Exterior: 13  Others: 117

Processing model: swift




 → Interior: 33  Exterior: 88  Others: 1447

Processing model: wagon-r




 → Interior: 8  Exterior: 29  Others: 104

Processing model: eeco




 → Interior: 8  Exterior: 6  Others: 119

Processing model: ignis




 → Interior: 11  Exterior: 15  Others: 91

Processing model: e-vitara


                                                                

 → Interior: 14  Exterior: 18  Others: 112

DONE.

Output saved to: Maruti


