In [None]:
# SCENE ASSESSMENT WITH LLaVA
# Version: 1.0
# Git repo: https://github.com/perezjoan/SAGAI
# This script enables batch visual analysis of image datasets using LLaVA (Large
# Language and Vision Assistant) directly within Google Colab. It works in two
# parts: first, it clones the LLaVA source code from GitHub to set up the model
# architecture and logic; second, it downloads pretrained model weights
# (e.g. llava-v1.6-mistral-7b) from Hugging Face to actually run the model.
# Users can customize the prompt by defining a role, theory, task, and output
# format—allowing the model to follow a consistent and domain-specific evaluation
# protocol. The script then loops through all images in a Google Drive folder,
# applies the visual-language model to each image, and appends each image's
# score to a single CSV file (columns: image_name, score). This makes it suitable
# for use cases such as walkability audits, architectural evaluations, and
# structured image interpretation workflows.

In [None]:
# Clone LLaVA and restart the session
%cd /content
!git clone https://github.com/haotian-liu/LLaVA.git
%cd LLaVA
!pip install -e .

/content
Cloning into 'LLaVA'...
remote: Enumerating objects: 2297, done.[K
remote: Total 2297 (delta 0), reused 0 (delta 0), pack-reused 2297 (from 1)[K
Receiving objects: 100% (2297/2297), 13.71 MiB | 11.91 MiB/s, done.
Resolving deltas: 100% (1404/1404), done.
/content/LLaVA
Obtaining file:///content/LLaVA
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting torch==2.1.2 (from llava==1.2.2.post1)
  Downloading torch-2.1.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchvision==0.16.2 (from llava==1.2.2.post1)
  Downloading torchvision-0.16.2-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting transformers==4.37.2 (from llava==1.2.2.post1)
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [None]:
# ------------------------ USER PARAMETERS -------------------------------------
# Everything a user is expected to edit lives in this cell.
import os
from google.colab import drive

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Case study name, task identifier and optional image preview
case_study = "nice"  # e.g., "vienna", "nice", etc.
selected_task = "T1"   # Options: "T1" (CATEGORIZATION), "T2" (COUNTING), or "T3" (MEASURING)
display_images = False # Display images for visual verification (optional)

# ------------------------ FOLDERS STRUCTURE -----------------------------------

# All paths are derived from the project root on Drive and the case study name
root_path = "/content/drive/MyDrive/SAGAI"
case_label = case_study.capitalize()  # folder/file names use the capitalized form
image_folder = os.path.join(root_path, f"StreetViewBatchDownload_{case_label}")
output_path = os.path.join(
    root_path,
    f"Image_Analysis/Score_Analysis_LLaVA_{case_label}_{selected_task}.csv",
)

# Create the results folder up front so the CSV can be opened later
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# -------------- LOAD THE MODEL  ----------------------
# Download the LLaVA v1.6 (Mistral-7B) weights from Hugging Face, load them with
# 4-bit quantization, and expose `model`, `tokenizer` and `image_processor` for
# the following cells.
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, BitsAndBytesConfig
from llava.utils import disable_torch_init
from llava.model import LlavaLlamaForCausalLM  # no longer used in this cell; kept in case another cell references it
from llava.model.language_model.llava_mistral import LlavaMistralForCausalLM
import torch

# Repo id and local weights folder are defined once and reused below
model_repo = "liuhaotian/llava-v1.6-mistral-7b"
model_path = "/content/lllava-v1.6-mistral-7b"
snapshot_download(repo_id=model_repo, local_dir=model_path)

# Quantization config (4-bit)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4'
)

# Load model and tokenizer from the local folder.
# The checkpoint is of type `llava_mistral`; loading it through the Llama wrapper
# triggers transformers' "not supported for all configurations" warning, so the
# matching Mistral wrapper class is used instead.
model = LlavaMistralForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    low_cpu_mem_usage=True,
    quantization_config=quant_config
)

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

# Load and prepare the vision tower (for image encoding)
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
    vision_tower.load_model()
vision_tower.to(device="cuda")
image_processor = vision_tower.image_processor

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/262M [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/719k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

You are using a model of type llava_mistral to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.76k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# -------------------------- IMAGE ANALYSIS FUNCTION --------------------------
# This function sends an image + prompt to the LLaVA model and returns the response.
# It uses LLaVA's internal conversation templates and image preprocessing tools.

from IPython.display import display
from llava.conversation import conv_templates, SeparatorStyle
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria
import os
from PIL import Image

def caption_image(image_file, prompt, conv_mode="llava_v0", temperature=0.3,
                  top_p=0.9, max_new_tokens=10):
    """Send one image plus a text prompt to the loaded LLaVA model and return its reply.

    Relies on the notebook-level objects `model`, `tokenizer` and
    `image_processor` created by the model-loading cell, and on a CUDA device.

    Args:
        image_file: Local file path, or a URL starting with 'http'.
        prompt: Instruction text sent together with the image.
        conv_mode: Key into LLaVA's `conv_templates` selecting the chat format.
        temperature: Sampling temperature; sampling is disabled when it is 0.
        top_p: Nucleus-sampling threshold.
        max_new_tokens: Upper bound on generated tokens (short answers expected).

    Returns:
        The assistant's reply as a stripped string, or "[No output generated]"
        when the reply is empty. With the default temperature the reply is
        sampled, so repeated calls on the same image may differ.
    """
    # Local stdlib imports: the URL branch used `requests` and `BytesIO` without
    # importing them, which raised NameError for any http(s) input.
    from io import BytesIO
    from urllib.request import urlopen

    # Load image from URL or file path; handles are closed once pixels are copied
    if image_file.startswith('http'):
        with urlopen(image_file, timeout=30) as remote:
            image = Image.open(BytesIO(remote.read())).convert('RGB')
    else:
        with Image.open(image_file) as local_image:
            image = local_image.convert('RGB')

    # LLaVA helper; upstream documents it as skipping torch's default weight
    # initialization to speed up model creation (it does not seed any RNG).
    disable_torch_init()

    # Select LLaVA's conversation format
    conv = conv_templates[conv_mode].copy()
    roles = conv.roles

    # Preprocess image and move to GPU
    image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda()

    # Build prompt with special tokens and user input
    inp = f"{roles[0]}: {prompt}"
    inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp

    # Fill conversation structure (assistant turn left open for generation)
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)

    # Tokenize full prompt (with image tokens)
    raw_prompt = conv.get_prompt()
    input_ids = tokenizer_image_token(
        raw_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
    ).unsqueeze(0).cuda()

    # Set stopping condition using template-defined separator
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)

    # Run model inference; sampling stays on for any positive temperature
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            do_sample=temperature > 0,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_new_tokens,
            stopping_criteria=[stopping_criteria],
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode and keep only the text after the last assistant-role marker
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    response = output_text.split(roles[1] + ":")[-1].strip()

    # Generation halts once the stop string appears, so it can trail the answer;
    # remove it so the returned score is clean.
    if stop_str and response.endswith(stop_str):
        response = response[:-len(stop_str)].strip()

    return response if response else "[No output generated]"

In [None]:
# ------------------------ TASK PROMPT SELECTION ------------------------
# Each task id maps to the four prompt parts, in this order:
# (role_description, theory_model, task, response_format).
_TASK_PROMPTS = {
    # TASK 1: CATEGORIZATION
    "T1": (
        "You are an AI trained to visually analyze street-level images. "
        "Your task is to determine whether the environment shown in the image is urban or rural.",
        "Classification Guide:\n"
        "- 0: Rural area — sparse built environment, natural surroundings, few or no buildings.\n"
        "- 1: Urban area — dense built environment, visible infrastructure, buildings.",
        "Carefully observe the image and determine whether it depicts a rural or urban environment.\n"
        "Use the classification guide above to assign a score.\n"
        "Return only the classification (0 or 1). Do not explain your answer or add extra text.",
        "Answer format: 0 or 1",
    ),
    # TASK 2: COUNTING
    "T2": (
        "You are an AI trained to visually analyze street-level images. "
        "Your job is to detect the presence of commercial storefronts, such as shops, restaurants, or businesses.",
        "Scoring Guide:\n"
        "- 0: No visible shops or commercial storefronts.\n"
        "- 1: One visible shop or storefront.\n"
        "- 2: More than one shop or storefront is visible.",
        "Look at the image carefully and apply the scoring guide above.\n"
        "Return only the score (0, 1, or 2) based on how many shops are visible.\n"
        "Do not explain your answer or add text. Only output the number.",
        "Answer format: 0, 1, or 2",
    ),
    # TASK 3: MEASURING
    "T3": (
        "You are an AI trained to visually analyze street-level images. "
        "Your task is to estimate the visible width of a sidewalk.",
        "Scoring Guide:\n"
        "- 0: No visible sidewalk or the sidewalk is not clearly identifiable.\n"
        "- Otherwise: Return the estimated width of the sidewalk in meters, rounded to the nearest 0.5 (e.g., 1.0, 1.5, 2.0, 2.5, 3.0).",
        "Look at the image carefully. If a sidewalk is visible, estimate its width in meters.\n"
        "If no sidewalk is visible or it's unclear, return 0.\n"
        "Do not explain your answer or add any text. Only output a single number.",
        "Answer format: 0 or a number (e.g., 1.0, 1.5, 2.0, 2.5, 3.0)",
    ),
}

# Look up the parts for the task chosen in the parameters cell; anything that is
# not a known task id is rejected with the same error as before.
try:
    _selected_parts = _TASK_PROMPTS[selected_task]
except (KeyError, TypeError):
    raise ValueError("Invalid selected_task. Choose from: 'T1', 'T2', or 'T3'.") from None

role_description, theory_model, task, response_format = _selected_parts

# Combine into full prompt: one part per line, with a leading and trailing newline
prompt = "\n" + "\n".join(
    [role_description, theory_model, task, response_format]
) + "\n"

In [None]:
import csv
import time
import pandas as pd

# Start timer
start_time = time.time()

# ------------------ Load already processed image names ------------------
# Resume support: every image name already present in the CSV is skipped below.
# Rows recorded as "NA" or "ERROR: ..." count as processed too, so they are not
# retried on a later run.
if os.path.exists(output_path) and os.stat(output_path).st_size > 0:
    df_existing = pd.read_csv(output_path)
    already_processed = set(df_existing['image_name'].tolist())
    print(f"🔁 Resuming from previous run. Already processed: {len(already_processed)} images.")
else:
    already_processed = set()
    print("🆕 Starting fresh. No existing results or empty file.")

# ------------------ Open CSV for appending ------------------
# Explicit UTF-8 so written rows match what pandas reads back on resume.
with open(output_path, 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)

    def _write_row(row):
        """Write one CSV row and flush it immediately.

        Without the flush, rows sit in the file buffer and are lost if the
        session is interrupted, which defeats the resume logic above.
        """
        writer.writerow(row)
        csvfile.flush()

    # If file was just created (or is empty), write header
    if os.stat(output_path).st_size == 0:
        _write_row(["image_name", "score"])  # header

    for fname in sorted(os.listdir(image_folder)):
        # Only image files are scored
        if not fname.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        if fname in already_processed:
            continue  # Skip already processed

        image_path = os.path.join(image_folder, fname)

        # File names containing "_NA" are recorded as NA without running the model
        if "_NA" in fname:
            print(f"⚠️ Skipping analysis for missing imagery: {fname}")
            _write_row([fname, "NA"])
            continue

        print(f"\n🔍 Processing: {fname}")
        try:
            result = caption_image(image_path, prompt)
            print(f"📝 Score: {result}\n")

            # Conditionally display image; the handle is closed after rendering
            if display_images:
                with Image.open(image_path) as image:
                    display(image)

            _write_row([fname, result.strip()])

        except Exception as e:
            # Best effort: record the failure and continue with the next image
            print(f"❌ Error processing {fname}: {e}")
            _write_row([fname, f"ERROR: {e}"])

# ------------------ Summary ------------------
elapsed_time = time.time() - start_time
print("\n✅ Scoring completed.")
print(f"📁 Scores saved in: {output_path}")
print(f"⏱️ Total runtime: {elapsed_time:.2f} seconds")

🆕 Starting fresh. No existing results or empty file.

🔍 Processing: point_100_0.jpg
📝 Score: 0


🔍 Processing: point_100_180.jpg
📝 Score: 0


🔍 Processing: point_100_270.jpg
📝 Score: 0


🔍 Processing: point_100_90.jpg
📝 Score: 0


🔍 Processing: point_101_0.jpg
📝 Score: 0


🔍 Processing: point_101_180.jpg
📝 Score: 0


🔍 Processing: point_101_270.jpg
📝 Score: 0


🔍 Processing: point_101_90.jpg
📝 Score: 0


🔍 Processing: point_102_0.jpg
📝 Score: 2


🔍 Processing: point_102_180.jpg
📝 Score: 0


🔍 Processing: point_102_270.jpg
📝 Score: 0


🔍 Processing: point_102_90.jpg
📝 Score: 0


🔍 Processing: point_103_0.jpg
📝 Score: 0


🔍 Processing: point_103_180.jpg
📝 Score: 0


🔍 Processing: point_103_270.jpg
📝 Score: 0


🔍 Processing: point_103_90.jpg
📝 Score: 0


🔍 Processing: point_104_0.jpg
📝 Score: 0


🔍 Processing: point_104_180.jpg
📝 Score: 0


🔍 Processing: point_104_270.jpg
📝 Score: 0


🔍 Processing: point_104_90.jpg
📝 Score: 0


🔍 Processing: point_105_0.jpg
📝 Score: 0


🔍 Processing: poin