In [3]:
import json
from typing import List, Dict

# -------------- CONFIGURATION --------------
# JSON file holding, per item, the outputs of both the original and the
# fine-tuned model.
INPUT_FILE = "model_comparison_v3.json"  # Update path if needed

# Keys looked up inside each output's 'dimensions' dictionary.
# The order here is the order in which fields are compared and reported.
FIELDS_TO_COMPARE = [
    "shapeType", "aircraft",
    "diameter", "width", "length", "height",
    "rotation", "transparency",
    "markingType", "markingColor", "markingThickness",
    "dashLength", "dashDistance",
    "landingMarker", "markerColor",
    "lightColor", "lightScale", "lightDistance", "lightRadius", "lightHeight",
    "safetyAreaType", "dValue", "multiplier", "offsetDistance",
]

# -------------- EVALUATION FUNCTION --------------

def _extract_dimensions(output) -> dict:
    """Return the 'dimensions' dict of the first 'TLOF' entry, or {} if absent/malformed."""
    if not isinstance(output, dict):
        return {}
    tlof_entries = output.get("TLOF")
    # An empty or non-list 'TLOF' has no first entry to inspect.
    if not isinstance(tlof_entries, list) or not tlof_entries:
        return {}
    first_entry = tlof_entries[0]
    if not isinstance(first_entry, dict):
        return {}
    dimensions = first_entry.get("dimensions")
    return dimensions if isinstance(dimensions, dict) else {}


def compare_fields(original: dict, fine_tuned: dict, keys: List[str]) -> Dict:
    """
    Compare selected keys of the first TLOF entry's 'dimensions' in two outputs.

    Only the first element of each output's 'TLOF' list is inspected. A key
    missing from a 'dimensions' dict is read as None, so a key absent on both
    sides counts as a match. An output that is not a dict, has a missing or
    empty 'TLOF' list, or has a non-dict 'dimensions' value is treated as
    having no dimensions at all instead of raising.

    Args:
        original: Output of the original model.
        fine_tuned: Output of the fine-tuned model.
        keys: Names of the 'dimensions' fields to compare.

    Returns:
        A dictionary with:
            - "total_fields": number of keys compared
            - "matched_fields": number of keys whose values are equal
            - "mismatched_fields": number of keys whose values differ
            - "differences": {key: {"original": ..., "fine_tuned": ...}}
              for every mismatching key
    """
    orig_dim = _extract_dimensions(original)
    ft_dim = _extract_dimensions(fine_tuned)

    differences = {}
    matched = 0
    total = 0

    for key in keys:
        total += 1
        orig_value = orig_dim.get(key)
        ft_value = ft_dim.get(key)
        if orig_value == ft_value:
            matched += 1
        else:
            differences[key] = {"original": orig_value, "fine_tuned": ft_value}

    return {
        "total_fields": total,
        "matched_fields": matched,
        "mismatched_fields": total - matched,
        "differences": differences,
    }

# -------------- MAIN SCRIPT --------------

def main():
    """
    Evaluate field-level agreement between original and fine-tuned outputs.

    Reads the list of comparison items from INPUT_FILE, compares each item's
    "original_output" and "fine_tuned_output" with compare_fields() over
    FIELDS_TO_COMPARE, writes the per-item results to
    'evaluation_field_accuracy.json', and prints the first result as a sample.

    Each item must provide the keys "id", "prompt", "original_output" and
    "fine_tuned_output"; a missing key raises KeyError.
    """
    # Load input file
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        comparisons = json.load(f)

    # Results
    evaluation_results = []

    for item in comparisons:
        result = compare_fields(
            original=item["original_output"],
            fine_tuned=item["fine_tuned_output"],
            keys=FIELDS_TO_COMPARE
        )
        total_fields = result["total_fields"]
        # Guard against an empty field list so the run does not die on 0/0.
        if total_fields:
            accuracy_percent = round((result["matched_fields"] / total_fields) * 100, 2)
        else:
            accuracy_percent = 0.0
        evaluation_results.append({
            "id": item["id"],
            "prompt": item["prompt"],
            "matched_fields": result["matched_fields"],
            "mismatched_fields": result["mismatched_fields"],
            "accuracy_percent": accuracy_percent,
            "differences": result["differences"]
        })

    # Save results
    with open("evaluation_field_accuracy.json", "w", encoding="utf-8") as f:
        json.dump(evaluation_results, f, indent=2)

    print("✅ Evaluation complete. Results saved to 'evaluation_field_accuracy.json'")
    if evaluation_results:
        print(f"🔢 Sample Result:\n{json.dumps(evaluation_results[0], indent=2)}")
    else:
        # An empty input list leaves nothing to sample; say so instead of raising IndexError.
        print("No comparison items found in the input file; nothing to sample.")

# Run the evaluation only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


✅ Evaluation complete. Results saved to 'evaluation_field_accuracy.json'
🔢 Sample Result:
{
  "id": 1,
  "prompt": "Generate a rectangular TLOF for a tiltrotor aircraft with 30m x 40m dimensions, elevation 5m, rotation 15 degrees, and 0.6 transparency. Location is [139.6917, 35.6895]. Add a 'V' landing marker in blue, scaled to 8, rotated to 90 degrees.",
  "matched_fields": 22,
  "mismatched_fields": 2,
  "accuracy_percent": 91.67,
  "differences": {
    "height": {
      "original": 0.01,
      "fine_tuned": 0.6
    },
    "markingThickness": {
      "original": 0.5,
      "fine_tuned": 1
    }
  }
}
