In [None]:
import os
import csv
import zipfile
from PIL import Image
from tqdm import tqdm
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
from google.colab import drive

# ======================
# 1. SETUP
# ======================
# Mount Google Drive so the dataset and the final archive are reachable under
# /content/drive. force_remount=True refreshes a stale mount from an earlier
# run of this cell.
drive.mount('/content/drive', force_remount=True)

# Hugging Face checkpoint to load. Defined outside the try block because a
# plain assignment cannot fail; only the downloads/loads below can.
MODEL_NAME = "fancyfeast/llama-joycaption-alpha-two-hf-llava"

# Initialize model
try:
    processor = AutoProcessor.from_pretrained(MODEL_NAME)
    llava_model = LlavaForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
except Exception as e:
    # Re-raise with a clearer message, explicitly chaining the original error
    # so the underlying traceback (network, OOM, missing files, ...) is kept.
    raise RuntimeError(f"Model loading failed: {e}") from e

# ======================
# 2. CONFIGURATION
# ======================
# Root of the mounted Drive; the dataset is read from here and the final zip
# archive is written back here.
_DRIVE_BASE = "/content/drive/MyDrive"

DRIVE_DATA_ROOT = _DRIVE_BASE + "/monuments_1"
# The CSV is staged on Colab's temporary storage while captions are produced.
OUTPUT_CSV = "/content/" + "monuments_captions.csv"
# Final Drive location of the zipped CSV.
ZIP_PATH = _DRIVE_BASE + "/monuments_captions.zip"

# Instruction sent with every image: line 1 is the task, line 2 the required
# pipe-separated column layout, line 3 a worked example of that layout.
PROMPT = "\n".join((
    "Analyze this monument image. Respond ONLY in this exact pipe-separated format (don't be too specific):",
    "Monument Type | Architecture Type | Material | Texture | Construction Period(century) | Key Features | Lighting",
    "Example: Basilica | Baroque | Marble | Smooth | 17th century | Ornate facade, central arch, tower | Daylight, soft shadows",
))

# ======================
# 3. IMAGE DISCOVERY
# ======================
def find_images(root_dir):
    """Recursively find all image files under ``root_dir``.

    A file counts as an image when its name, compared case-insensitively,
    ends with one of: .png, .jpg, .jpeg, .webp.

    Args:
        root_dir: Directory to walk. A path that does not exist simply
            yields an empty result (``os.walk`` ignores errors by default).

    Returns:
        A sorted list of full paths (``root_dir`` joined with each relative
        location). Sorting makes the order reproducible across runs, so
        slicing the list to a prefix always selects the same images.
    """
    # Lower-case only: file names are lower-cased before matching, so an
    # upper-case entry such as '.JPG' could never match and is not needed.
    image_exts = ('.png', '.jpg', '.jpeg', '.webp')
    image_paths = []

    for dirpath, _, filenames in os.walk(root_dir):
        for name in filenames:
            if name.lower().endswith(image_exts):
                image_paths.append(os.path.join(dirpath, name))

    # os.walk yields entries in arbitrary filesystem order; sort for determinism.
    image_paths.sort()

    print(f"Found {len(image_paths)} images in {root_dir}")
    return image_paths

# Discover every image under the Drive dataset root once, up front; the
# resulting list is what the caption-generation loop below iterates over.
image_paths = find_images(DRIVE_DATA_ROOT)

# ======================
# 4. CAPTION GENERATION
# ======================
def generate_caption(image_path):
    """Caption a single image and return the model's structured answer.

    Relies on the module-level ``processor``, ``llava_model`` and ``PROMPT``.

    Args:
        image_path: Path to an image file readable by PIL.

    Returns:
        The last non-empty line of the model's *newly generated* text
        (whitespace-stripped), an empty string if the model produced no text,
        or ``None`` if anything failed while processing this image. Failures
        are printed rather than raised so that one bad file does not abort a
        long batch run.
    """
    try:
        # The context manager closes the underlying file handle; convert()
        # forces the pixel data to load and normalizes palette / alpha /
        # grayscale images to the 3-channel RGB input the vision tower expects.
        with Image.open(image_path) as img:
            image = img.convert("RGB")

        # Prepare conversation
        convo = [
            {"role": "system", "content": "You are a precise architectural image captioner."},
            {"role": "user", "content": PROMPT}
        ]
        # add_generation_prompt=True appends the assistant-turn header so the
        # model answers the request instead of continuing the user message.
        convo_string = processor.apply_chat_template(
            convo, tokenize=False, add_generation_prompt=True
        )

        # Process inputs; follow the model's own device rather than assuming 'cuda'.
        inputs = processor(
            text=[convo_string],
            images=[image],
            return_tensors="pt"
        ).to(llava_model.device)
        # The processor emits float32 pixels; match the dtype the weights were
        # loaded in to avoid a dtype mismatch inside the vision tower.
        inputs["pixel_values"] = inputs["pixel_values"].to(llava_model.dtype)

        # Greedy decoding. Sampling-only (top_p) and beam-only (early_stopping)
        # options are omitted: they have no effect with do_sample=False and
        # num_beams=1 and only trigger warnings.
        generate_ids = llava_model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=False,
            num_beams=1
        )

        # generate() returns prompt + completion. Drop the prompt tokens so the
        # prompt's own "Example: ..." line can never be mistaken for the answer.
        prompt_len = inputs["input_ids"].shape[1]
        new_tokens = generate_ids[0][prompt_len:]

        caption = processor.tokenizer.decode(
            new_tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        ).strip()

        # Keep only the final non-empty line of the response.
        return next(
            (line.strip() for line in reversed(caption.splitlines()) if line.strip()),
            ""
        )

    except Exception as e:
        # Deliberate best-effort boundary: report and let the caller skip this image.
        print(f"Error processing {image_path}: {e}")
        return None
# ======================
# 5. MAIN PROCESSING
# ======================
# Optional cap on how many images to process. None means "all discovered
# images"; set a small number (e.g. 100) for a quick trial run.
MAX_IMAGES = None
paths_to_process = image_paths[:MAX_IMAGES]

# Number of rows actually written; images whose captioning failed or came
# back empty are skipped and must not be counted.
captions_written = 0

# Create CSV (explicit UTF-8 so non-ASCII file names/captions are written
# the same way regardless of the runtime's default locale)
with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["image_path", "caption"])

    # Process with progress bar
    for img_path in tqdm(paths_to_process, desc="Generating captions"):
        caption = generate_caption(img_path)
        if not caption:
            continue
        # Write relative path for portability
        rel_path = os.path.relpath(img_path, DRIVE_DATA_ROOT)
        writer.writerow([rel_path, caption])
        captions_written += 1

# ======================
# 6. COMPRESS & STORE
# ======================
print("\nCompressing results...")
with zipfile.ZipFile(ZIP_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(OUTPUT_CSV, arcname="monuments_captions.csv")

# Verify
print(f"\n✅ Done! Results saved to:")
print(f"- Temporary CSV: {OUTPUT_CSV}")
print(f"- Drive ZIP: {ZIP_PATH}")
print(f"Total captions generated: {captions_written} of {len(paths_to_process)} images")

# Free up space (optional). os.remove is plain Python, so this also works
# outside IPython, unlike the `!rm` shell escape.
os.remove(OUTPUT_CSV)

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Found 8647 images in /content/drive/MyDrive/monuments_1


Generating captions:   1%|          | 100/8647 [05:22<8:03:23,  3.39s/it]