In [1]:
from google.colab import drive
import os
import sys
import shutil
import subprocess

# Make the Drive-hosted project folder reachable from this runtime
drive.mount('/content/drive')

# Project paths: every location below hangs off the single project folder
PROJECT_PATH = '/content/drive/MyDrive/SVGEditor'
CODE_PATH, MODEL_PATH, OUTPUT_PATH = (
    f"{PROJECT_PATH}/{subdir}"
    for subdir in ("code", "models/qwen-image-edit", "output")
)

# Put the project code first on the import path (only once per kernel)
if CODE_PATH not in sys.path:
    sys.path.insert(0, CODE_PATH)

# Target configuration
TARGET_NAME = "153_B"
TARGET_OUTPUT_PATH = "/".join((OUTPUT_PATH, TARGET_NAME))

# Input image path (from project resource folder)
INPUT_IMAGE = "/".join((CODE_PATH, "resource", f"{TARGET_NAME}.png"))

Mounted at /content/drive


In [2]:
# Install dependencies
%pip install diffusers accelerate safetensors transformers huggingface_hub segment-anything \
            opencv-python pillow matplotlib scikit-image scikit-learn cairosvg moviepy \
            shapely networkx lxml skan rdp openai requests lpips opencv-python scikit-image

import shutil
import subprocess

# Clean and clone repository
# Any previous checkout is discarded so the clone always mirrors the remote.
if os.path.exists(CODE_PATH):
    shutil.rmtree(CODE_PATH)

# The clone is created relative to the project folder, so it has to exist first
os.makedirs(PROJECT_PATH, exist_ok=True)
os.chdir(PROJECT_PATH)
result = subprocess.run(['git', 'clone', 'https://github.com/huanbasara/SVGEditor.git', 'code'],
                       capture_output=True, text=True)
if result.returncode != 0:
    # Without a checkout every following step fails with an unrelated error,
    # so stop here and surface git's own diagnostics instead.
    raise RuntimeError(
        f"Repository clone failed (exit code {result.returncode}):\n{result.stderr.strip()}"
    )
print("Repository successfully cloned")

# Display latest commit info
os.chdir(CODE_PATH)
commit_info = subprocess.run(['git', 'log', '-1', '--pretty=format:%ci|%s'],
                           capture_output=True, text=True)

# Only parse the log line when git succeeded and printed "<date>|<subject>"
if commit_info.returncode == 0 and '|' in commit_info.stdout:
    commit_time, commit_msg = commit_info.stdout.strip().split('|', 1)
    print(f"Latest commit: {commit_time.split()[0]} - {commit_msg}")

# Add to Python path and reload modules
if CODE_PATH not in sys.path:
    sys.path.insert(0, CODE_PATH)

# Clear custom modules from cache.
# Match the package itself or its submodules only; a bare prefix test would
# also evict unrelated modules whose names merely start with the same letters.
for base in ['sam_processor', 'svglib', 'utils']:
    to_remove = [m for m in sys.modules if m == base or m.startswith(base + '.')]
    for m in to_remove:
        del sys.modules[m]

# The source tree was just recreated on disk; drop the import finders' cached
# directory listings so the fresh files are picked up on the next import.
import importlib
importlib.invalidate_caches()

# Create necessary directories
os.makedirs(OUTPUT_PATH, exist_ok=True)
os.makedirs(TARGET_OUTPUT_PATH, exist_ok=True)

print("Setup complete!")

Collecting segment-anything
  Downloading segment_anything-1.0-py3-none-any.whl.metadata (487 bytes)
Collecting cairosvg
  Downloading cairosvg-2.8.2-py3-none-any.whl.metadata (2.7 kB)
Collecting skan
  Downloading skan-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Collecting rdp
  Downloading rdp-0.8.tar.gz (4.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting lpips
  Downloading lpips-0.1.4-py3-none-any.whl.metadata (10 kB)
Collecting cairocffi (from cairosvg)
  Downloading cairocffi-1.7.1-py3-none-any.whl.metadata (3.3 kB)
Collecting cssselect2 (from cairosvg)
  Downloading cssselect2-0.8.0-py3-none-any.whl.metadata (2.9 kB)
Collecting magicgui>=0.7.3 (from skan)
  Downloading magicgui-0.10.1-py3-none-any.whl.metadata (5.5 kB)
Collecting qtpy>=2.4.0 (from magicgui>=0.7.3->skan)
  Downloading QtPy-2.4.3-py3-none-any.whl.metadata (12 kB)
Collecting superqt>=0.7.2 (from superqt[iconify]>=0.7.2->magicgui>=0.7.3->skan)
  Downloading superqt-0.7.6-py3-none-any.whl.metadata 

In [4]:
# ==================== Cell: Batch Evaluation for All Targets and Models ====================

import torch
import os
import pandas as pd
from utils.evaluation_metrics import evaluate_single_image

# Models included in the evaluation sweep
# (qwen_image_edit_2509 and openai_api were removed from this list)
MODELS_TO_EVALUATE = [
    "sd15_base_img2img", "sd15_ip_adapter",
    "instruct_pix2pix", "instruct_pix2pix_sdxl",
    "aam_xl_animemix",
    "sdxl_base_img2img", "sdxl_ip_adapter",
    "qwen_image_edit",
]

# One edit scenario per sample id:
# (sample_id, edit_instruction, text_source, text_target)
_EDIT_SCENARIOS = [
    ("41", "Change armor to casual dress", "armor", "casual dress"),
    ("65", "Remove scarf and reveal neck", "scarf", "no scarf"),
    ("153", "Change long ponytail to short layered hair", "long ponytail hair", "short layered hair"),
    ("254", "Change braided pigtails to straight long hair", "braided pigtails", "straight long hair"),
    ("710", "Change angry expression to happy expression", "not happy expression", "happy expression"),
    ("1061", "Change conflict to hugging", "not hugging", "hugging"),
]

# All targets with their edit instructions and text prompts.
# Format: (target_name, edit_instruction, text_source, text_target)
# Each scenario is expanded to its "<id>_A" and "<id>_B" target names, in that order.
targets = [
    (f"{sample_id}_{suffix}", instruction, source_text, target_text)
    for sample_id, instruction, source_text, target_text in _EDIT_SCENARIOS
    for suffix in ("A", "B")
]

# Batch evaluation function
def batch_evaluate_all(eval_targets=None, model_names=None, evaluate_fn=None,
                       code_path=None, output_path=None, device=None):
    """Run batch evaluation for all targets and models.

    Called without arguments it evaluates the notebook-level ``targets``
    against ``MODELS_TO_EVALUATE`` using ``evaluate_single_image``; every
    argument exists only to override one of those notebook-level defaults.

    Args:
        eval_targets: Iterable of ``(target_name, edit_instruction,
            text_source, text_target)`` tuples. Defaults to ``targets``.
        model_names: Iterable of model names. Defaults to ``MODELS_TO_EVALUATE``.
        evaluate_fn: Callable invoked with the keyword arguments
            ``img_before_path``, ``img_after_path``, ``text_source``,
            ``text_target``, ``edit_prompt`` and ``device``; it must return a
            mapping of score names to values. Defaults to
            ``evaluate_single_image``.
        code_path: Folder whose ``resource/<target>.png`` files are the input
            images. Defaults to ``CODE_PATH``.
        output_path: Folder holding
            ``<model>/<target>/<target>_default.png`` outputs. Defaults to
            ``OUTPUT_PATH``.
        device: Device string handed to ``evaluate_fn``. Defaults to ``'cuda'``
            when available, otherwise ``'cpu'``.

    Returns:
        A ``(results, failed_evaluations)`` tuple. ``results`` holds one dict
        per successful evaluation (target, model, prompts, plus the returned
        scores). ``failed_evaluations`` holds one dict per skipped or failed
        (target, model) pair with a human-readable ``reason``.
    """
    # Defaults are resolved at call time so edits made to the notebook globals
    # after this function was defined are still honoured.
    eval_targets = list(targets if eval_targets is None else eval_targets)
    model_names = list(MODELS_TO_EVALUATE if model_names is None else model_names)
    evaluate_fn = evaluate_single_image if evaluate_fn is None else evaluate_fn
    code_path = CODE_PATH if code_path is None else code_path
    output_path = OUTPUT_PATH if output_path is None else output_path
    if device is None:
        # Decided once: the available hardware does not change between evaluations
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    results = []
    failed_evaluations = []

    print(f"Starting batch evaluation for {len(eval_targets)} targets and {len(model_names)} models...")
    print(f"Total evaluations: {len(eval_targets) * len(model_names)}")

    for target_name, edit_instruction, text_source, text_target in eval_targets:
        print(f"\nProcessing target: {target_name}")

        # Input image path
        input_image_path = f"{code_path}/resource/{target_name}.png"

        # A missing input makes every model fail for this target; report it
        # once per model with a clear reason instead of an opaque exception.
        if not os.path.exists(input_image_path):
            failed_evaluations.extend(
                {
                    'target': target_name,
                    'model': model_name,
                    'reason': 'Input image not found'
                }
                for model_name in model_names
            )
            continue

        for model_name in model_names:
            # Output image path
            output_image_path = f"{output_path}/{model_name}/{target_name}/{target_name}_default.png"

            # Check if output file exists
            if not os.path.exists(output_image_path):
                failed_evaluations.append({
                    'target': target_name,
                    'model': model_name,
                    'reason': 'Output image not found'
                })
                continue

            try:
                # Run evaluation (silent mode)
                scores = evaluate_fn(
                    img_before_path=input_image_path,
                    img_after_path=output_image_path,
                    text_source=text_source,
                    text_target=text_target,
                    edit_prompt=edit_instruction,
                    device=device
                )

                # Store results
                results.append({
                    'target': target_name,
                    'model': model_name,
                    'edit_instruction': edit_instruction,
                    'text_source': text_source,
                    'text_target': text_target,
                    **scores
                })

            except Exception as e:
                # Best effort: one broken pair must not abort the whole sweep.
                # The exception type is kept because the message alone can be
                # uninformative (e.g. a KeyError prints only the missing key).
                failed_evaluations.append({
                    'target': target_name,
                    'model': model_name,
                    'reason': f"{type(e).__name__}: {e}"
                })

    return results, failed_evaluations

# Execute the full sweep; successes and failures come back as separate lists
results, failed_evaluations = batch_evaluate_all()

# One row per successful evaluation, used by the aggregations further down
df = pd.DataFrame(results)

# Summary banner followed by the success / failure counts
print(
    "\n" + "=" * 100,
    "📊 BATCH EVALUATION SUMMARY",
    "=" * 100,
    f"✅ Successful evaluations: {len(results)}",
    f"❌ Failed evaluations: {len(failed_evaluations)}",
    sep="\n",
)

if results:
    # ---- Per-model averages (the headline comparison) ----
    print(f"\n🏆 MODEL PERFORMANCE (averaged across all {len(targets)} samples):")
    print(f"{'Model':<25} {'Total':<8} {'Edit':<8} {'Style':<8} {'Struct':<8} {'Aesthetic':<8} {'Count':<6}")
    print("-" * 100)

    # Named aggregation yields the flat column names directly; rows are then
    # rounded and ranked by mean total score, best model first.
    model_stats = (
        df.groupby('model')
        .agg(
            total_mean=('total_score', 'mean'),
            total_std=('total_score', 'std'),
            edit_mean=('edit_compliance', 'mean'),
            style_mean=('style_consistency', 'mean'),
            struct_mean=('structural_plausibility', 'mean'),
            aesthetic_mean=('aesthetic_quality', 'mean'),
            count=('target', 'count'),
        )
        .round(3)
        .sort_values('total_mean', ascending=False)
    )

    for model, row in model_stats.iterrows():
        print(f"{model:<25} {row['total_mean']:<8.3f} {row['edit_mean']:<8.3f} "
              f"{row['style_mean']:<8.3f} {row['struct_mean']:<8.3f} "
              f"{row['aesthetic_mean']:<8.3f} {int(row['count']):<6}")

    # ---- Mean ± std of every metric over all successful evaluations ----
    print("\n📈 OVERALL STATISTICS:")
    for metric_label, metric_column in (
        ("Total Score", 'total_score'),
        ("Edit Compliance", 'edit_compliance'),
        ("Style Consistency", 'style_consistency'),
        ("Structural Plausibility", 'structural_plausibility'),
        ("Aesthetic Quality", 'aesthetic_quality'),
    ):
        metric_values = df[metric_column]
        print(f"   Average {metric_label}: {metric_values.mean():.3f} ± {metric_values.std():.3f}")

    # ---- Per-target averages across models, highest mean first ----
    print("\n🎯 TARGET PERFORMANCE (averaged across all models):")
    target_avg = (
        df.groupby('target')['total_score']
        .agg(['mean', 'std', 'count'])
        .round(3)
        .sort_values('mean', ascending=False)
    )

    for target, row in target_avg.iterrows():
        print(f"   {target}: {row['mean']:.3f} ± {row['std']:.3f} (n={int(row['count'])})")

    # ---- Every individual result: grouped by target, best score first ----
    print("\n📋 DETAILED RESULTS TABLE:")
    print(f"{'Target':<8} {'Model':<25} {'Total':<8} {'Edit':<8} {'Style':<8} {'Struct':<8} {'Aesthetic':<8}")
    print("-" * 100)

    df_sorted = df.sort_values(['target', 'total_score'], ascending=[True, False])
    detail_columns = ['total_score', 'edit_compliance', 'style_consistency',
                      'structural_plausibility', 'aesthetic_quality']
    for _, row in df_sorted.iterrows():
        # Metric cells are 8 wide, 3 decimals, separated by single spaces
        metric_cells = " ".join(f"{row[column]:<8.3f}" for column in detail_columns)
        print(f"{row['target']:<8} {row['model']:<25} {metric_cells}")

# List every (target, model) pair that could not be scored, with its reason
if len(failed_evaluations) > 0:
    print(f"\n⚠️  FAILED EVALUATIONS ({len(failed_evaluations)}):")
    for failure in failed_evaluations:
        print("   {target} + {model}: {reason}".format(**failure))

print("\n✅ Batch evaluation completed!")

Starting batch evaluation for 12 targets and 8 models...
Total evaluations: 96

Processing target: 41_A
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]




Loading model from: /usr/local/lib/python3.12/dist-packages/lpips/weights/v0.1/alex.pth
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]
Loading model from: /usr/local/lib/python3.12/dist-packages/lpips/weights/v0.1/alex.pth
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]
Loading model from: /usr/local/lib/python3.12/dist-packages/lpips/weights/v0.1/alex.pth
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]
Loading model from: /usr/local/lib/python3.12/dist-packages/lpips/weights/v0.1/alex.pth
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]
Loading model from: /usr/local/lib/python3.12/dist-packages/lpips/weights/v0.1/alex.pth
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]
Loading model from: /usr/local/lib/python3.12/dist-packages/lpips/weights/v0.1/alex.pth
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]
Loading model from: /usr/local/lib/pytho