In [1]:
import json
import random
import itertools
import math
from typing import Dict, List, Tuple, Any

class TLOFTestDataGenerator:
    """
    High-quality test data generator for TLOF configurations.
    Generates diverse, realistic test examples covering all parameters.
    Output format: {"input": "...", "output": "{\"TLOF\": [...]}", "expected_output": "{\"TLOF\": [...]}"}
    """
    
    def __init__(self):
        # Aircraft types not used in training
        self.aircraft_configs = {
            "helicopter": {"typical_size": (3, 6), "weight_class": "light"},
            "aerial_taxi": {"typical_size": (20, 30), "weight_class": "heavy"},
            "hybrid_airship": {"typical_size": (60, 120), "weight_class": "heavy"},
            "surveillance_uav": {"typical_size": (4, 10), "weight_class": "light"},
            "firefighting_copter": {"typical_size": (16, 24), "weight_class": "medium"},
            "agricultural_drone": {"typical_size": (2, 7), "weight_class": "light"},
            "amphibious_eVTOL": {"typical_size": (10, 18), "weight_class": "medium"},
            "rescue_hovercraft": {"typical_size": (14, 22), "weight_class": "medium"}
        }
        
        self.shape_types = ["Rectangle", "Circle", "Polygon"]
        self.colors = ["white", "yellow", "blue", "red", "green", "black", "purple", "orange", "gray", "brown"]
        self.landing_markers = ["H", "V"]
        self.marking_types = ["solid", "dashed"]
        self.tdpc_types = ["circle", "cross", "square"]
        self.safety_area_types = ["offset", "multiplier"]
        
        # Realistic location clusters (major cities worldwide)
        self.location_clusters = [
            {"center": [37.6173, 55.7558], "name": "Moscow", "radius": 0.3},
            {"center": [121.4737, 31.2304], "name": "Shanghai", "radius": 0.4},
            {"center": [19.0402, 47.4979], "name": "Budapest", "radius": 0.2},
            {"center": [90.4125, 23.8103], "name": "Dhaka", "radius": 0.2},
            {"center": [77.1025, 28.7041], "name": "New Delhi", "radius": 0.3},
            {"center": [30.0444, 31.2357], "name": "Cairo", "radius": 0.3},
            {"center": [18.4241, -33.9249], "name": "Cape Town", "radius": 0.2},
            {"center": [144.9631, -37.8136], "name": "Melbourne", "radius": 0.3},
            {"center": [35.9132, 31.9522], "name": "Amman", "radius": 0.2},
            {"center": [24.7536, 59.4370], "name": "Tallinn", "radius": 0.2}
        ]

    def generate_realistic_coordinates(self) -> List[float]:
        """Generate realistic coordinates near major cities"""
        cluster = random.choice(self.location_clusters)
        center_lng, center_lat = cluster["center"]
        radius = cluster["radius"]
        
        # Generate random offset within radius
        angle = random.uniform(0, 2 * math.pi)
        distance = random.uniform(0, radius)
        
        lng = center_lng + distance * math.cos(angle)
        lat = center_lat + distance * math.sin(angle)
        
        return [round(lng, 4), round(lat, 4)]

    def generate_aircraft_appropriate_dimensions(self, aircraft: str, shape_type: str) -> Tuple[int, int, float]:
        """Generate realistic dimensions based on aircraft type"""
        config = self.aircraft_configs.get(aircraft, self.aircraft_configs["helicopter"])
        min_size, max_size = config["typical_size"]
        
        # Base size with some variation
        base_size = random.randint(min_size, max_size)
        
        if shape_type == "Rectangle":
            # Rectangular TLOFs often have length > width
            width = base_size
            length = random.randint(base_size, int(base_size * 1.5))
        elif shape_type == "Circle":
            # For circles, width = length = diameter
            width = length = base_size
        else:  # Polygon
            # Polygons use average dimension
            width = length = base_size
            
        # Height based on aircraft weight class
        if config["weight_class"] == "light":
            height = round(random.uniform(0.1, 2.0), 2)
        elif config["weight_class"] == "medium":
            height = round(random.uniform(1.0, 4.0), 2)
        else:  # heavy
            height = round(random.uniform(2.0, 5.0), 2)
            
        return width, length, height

    def generate_natural_language_description(self, params: Dict[str, Any]) -> str:
        """Generate natural, varied language descriptions"""
        templates = [
            "Create a {shape} TLOF for {aircraft}",
            "Generate a {shape} landing pad for {aircraft}",
            "Design a {shape} touchdown area for {aircraft}",
            "Build a {shape} TLOF suitable for {aircraft}",
            "I need a {shape} landing surface for {aircraft}"
        ]
        
        template = random.choice(templates)
        description_parts = [template.format(
            shape=params["shape_type"].lower(),
            aircraft=params["aircraft"].replace("_", " ")
        )]
        
        # Add dimensions with natural variation
        if params["shape_type"] == "Rectangle":
            dim_templates = [
                "with dimensions {width}m x {length}m",
                "measuring {width}m by {length}m",
                "sized {width}m × {length}m",
                "with {width}m width and {length}m length"
            ]
            description_parts.append(random.choice(dim_templates).format(
                width=params["width"], length=params["length"]
            ))
        elif params["shape_type"] == "Circle":
            dim_templates = [
                "with {diameter}m diameter",
                "with a diameter of {diameter}m",
                "measuring {diameter}m across"
            ]
            description_parts.append(random.choice(dim_templates).format(
                diameter=params["width"]
            ))
        else:  # Polygon
            description_parts.append(f"with {params['sides']} sides and {params['width']}m width")
        
        # Add optional parameters with natural language
        if params["elevation"] > 0:
            elev_templates = [
                "at {elevation}m elevation",
                "elevated {elevation}m above ground",
                "with base height of {elevation}m",
                "{elevation}m above sea level"
            ]
            description_parts.append(random.choice(elev_templates).format(elevation=params["elevation"]))
        
        if params["rotation"] > 0:
            rot_templates = [
                "rotated {rotation} degrees",
                "with {rotation}° rotation",
                "oriented at {rotation} degrees"
            ]
            description_parts.append(random.choice(rot_templates).format(rotation=params["rotation"]))
        
        if params["transparency"] < 1.0:
            trans_templates = [
                "with {transparency} transparency",
                "at {transparency} opacity",
                "{transparency} transparent"
            ]
            description_parts.append(random.choice(trans_templates).format(transparency=params["transparency"]))
        
        # Add location
        description_parts.append(f"Location coordinates: [{params['position'][0]}, {params['position'][1]}]")
        
        # Add landing marker if enabled
        if params["landing_marker_enabled"]:
            marker_templates = [
                "Add a '{marker}' landing marker in {color}",
                "Include a {color} '{marker}' marker",
                "Place a {color} '{marker}' symbol",
                "With a {color} '{marker}' landing indicator"
            ]
            marker_desc = random.choice(marker_templates).format(
                marker=params["landing_marker"],
                color=params["marker_color"]
            )
            if params["marker_scale"] != 5:
                marker_desc += f" scaled to {params['marker_scale']}"
            if params["marker_rotation"] > 0:
                marker_desc += f" rotated {params['marker_rotation']} degrees"
            description_parts.append(marker_desc)
        
        # Add markings if enabled
        if params["marking_enabled"]:
            marking_templates = [
                "with {type} markings in {color}",
                "featuring {color} {type} boundary lines",
                "including {type} {color} perimeter markings"
            ]
            description_parts.append(random.choice(marking_templates).format(
                type=params["marking_type"],
                color=params["marking_color"]
            ))
        
        # Add lighting if enabled
        if params["light_enabled"]:
            light_templates = [
                "with {color} perimeter lighting",
                "including {color} LED lights around the edge",
                "equipped with {color} boundary lights"
            ]
            description_parts.append(random.choice(light_templates).format(
                color=params["light_color"]
            ))
        
        # Add safety features
        if params["safety_area_enabled"]:
            description_parts.append("with safety area included")
        
        if params["safety_net_enabled"]:
            description_parts.append("including safety netting")
        
        # Join with natural connectors
        connectors = [", ", ". ", ", and ", ". Also, ", ". Include "]
        result = description_parts[0]
        for i, part in enumerate(description_parts[1:], 1):
            if i == len(description_parts) - 1 and len(description_parts) > 2:
                result += ", and " + part
            else:
                result += random.choice(connectors[:2]) + part
        
        return result + "."

    def generate_single_example(self) -> Dict[str, Any]:
        """Generate one high-quality test example"""
        # Select aircraft and appropriate parameters
        aircraft = random.choice(list(self.aircraft_configs.keys()))
        shape_type = random.choice(self.shape_types)
        position = self.generate_realistic_coordinates()
        
        # Generate appropriate dimensions
        width, length, height = self.generate_aircraft_appropriate_dimensions(aircraft, shape_type)
        
        # Basic parameters with realistic constraints
        rotation = random.randint(0, 359)
        transparency = round(random.uniform(0.3, 1.0), 1)  # Usually not too transparent
        elevation = random.choice([0] * 3 + list(range(1, 51)))  # Ground level most common
        
        # Feature enablement (realistic probabilities)
        marking_enabled = random.random() < 0.8  # 80% have markings
        landing_marker_enabled = random.random() < 0.9  # 90% have landing markers
        light_enabled = random.random() < 0.6  # 60% have lighting
        tdpc_enabled = random.random() < 0.3  # 30% have TDPC
        safety_area_enabled = random.random() < 0.4  # 40% have safety area
        safety_net_enabled = random.random() < 0.2  # 20% have safety net
        
        # Generate feature-specific parameters
        marking_type = random.choice(self.marking_types) if marking_enabled else "dashed"
        marking_color = random.choice(self.colors) if marking_enabled else "white"
        marking_thickness = round(random.uniform(0.1, 1.5), 1) if marking_enabled else 0.5
        
        landing_marker = random.choice(self.landing_markers) if landing_marker_enabled else "H"
        marker_color = random.choice(self.colors) if landing_marker_enabled else "white"
        marker_scale = random.randint(1, 20) if landing_marker_enabled else 5
        marker_rotation = random.randint(0, 359) if landing_marker_enabled else 0
        
        light_color = random.choice(self.colors) if light_enabled else "white"
        light_scale = random.randint(-20, 100) if light_enabled else 1
        light_distance = random.randint(1, 50) if light_enabled else 1
        
        # Polygon-specific
        sides = random.randint(4, 8) if shape_type == "Polygon" else 4
        
        # Package parameters for description generation
        params = {
            "aircraft": aircraft,
            "shape_type": shape_type,
            "position": position,
            "width": width,
            "length": length,
            "height": height,
            "rotation": rotation,
            "transparency": transparency,
            "elevation": elevation,
            "sides": sides,
            "marking_enabled": marking_enabled,
            "marking_type": marking_type,
            "marking_color": marking_color,
            "landing_marker_enabled": landing_marker_enabled,
            "landing_marker": landing_marker,
            "marker_color": marker_color,
            "marker_scale": marker_scale,
            "marker_rotation": marker_rotation,
            "light_enabled": light_enabled,
            "light_color": light_color,
            "safety_area_enabled": safety_area_enabled,
            "safety_net_enabled": safety_net_enabled
        }
        
        # Generate natural language description
        user_input = self.generate_natural_language_description(params)
        
        # Generate complete JSON response
        tlof_json = {
            "TLOF": [
                {
                    "position": position,
                    "dimensions": {
                        "unit": "m",
                        "aircraftCategory": False,
                        "aircraft": aircraft,
                        "diameter": width,
                        "isVisible": True,
                        "layerName": f"Generated_TLOF_{aircraft}",
                        "shapeType": shape_type,
                        "scaleCategory": False,
                        "textureScaleU": 1,
                        "textureScaleV": 1,
                        "safetyNetScaleU": 1,
                        "safetyNetScaleV": 1,
                        "sides": sides,
                        "width": width,
                        "length": length,
                        "height": height,
                        "rotation": rotation,
                        "transparency": transparency,
                        "baseHeight": elevation,
                        
                        # Markings
                        "markingsCategory": marking_enabled,
                        "markingType": marking_type,
                        "markingColor": marking_color,
                        "markingThickness": marking_thickness,
                        "dashDistance": round(random.uniform(0.5, 5), 1) if marking_type == "dashed" else 1.5,
                        "dashLength": round(random.uniform(0.5, 5), 1) if marking_type == "dashed" else 1.0,
                        
                        # Landing Marker
                        "landingMarkerCategory": landing_marker_enabled,
                        "landingMarker": landing_marker,
                        "markerScale": marker_scale,
                        "markerThickness": round(random.uniform(0.1, 1.0), 2),
                        "markerRotation": marker_rotation,
                        "markerColor": marker_color,
                        "letterThickness": round(random.uniform(0.05, 0.5), 2),
                        
                        # TDPC
                        "tdpcCategory": tdpc_enabled,
                        "tdpcType": random.choice(self.tdpc_types),
                        "tdpcScale": random.randint(1, 50) if tdpc_enabled else 5,
                        "tdpcThickness": round(random.uniform(0.1, 2.0), 1) if tdpc_enabled else 0.5,
                        "tdpcRotation": random.randint(0, 359) if tdpc_enabled else 0,
                        "tdpcExtrusion": round(random.uniform(0.01, 0.1), 3),
                        "tdpcColor": random.choice(self.colors) if tdpc_enabled else "white",
                        
                        # Lighting
                        "lightCategory": light_enabled,
                        "lightColor": light_color,
                        "lightScale": light_scale,
                        "lightDistance": light_distance,
                        "lightRadius": round(random.uniform(0.1, 1.0), 1),
                        "lightHeight": round(random.uniform(0.1, 0.25), 2),
                        
                        # Safety Area
                        "safetyAreaCategory": safety_area_enabled,
                        "safetyAreaType": random.choice(self.safety_area_types),
                        "dValue": random.randint(5, 20) if safety_area_enabled else 10,
                        "multiplier": round(random.uniform(1.0, 3.0), 1) if safety_area_enabled else 1.5,
                        "offsetDistance": random.randint(1, 20) if safety_area_enabled else 3,
                        
                        # Safety Net
                        "safetyNetCategory": safety_net_enabled,
                        "curveAngle": random.randint(30, 90) if safety_net_enabled else 45,
                        "netHeight": random.randint(10, 30) if safety_net_enabled else 15,
                        "safetyNetTransparency": round(random.uniform(0.3, 0.8), 1),
                        "safetyNetColor": "#FF0000"
                    }
                }
            ]
        }
        
        # Generate compact JSON string
        tlof_json_str = json.dumps(tlof_json, separators=(',', ':'))
        
        return {
            "input": user_input,
            "output": tlof_json_str,
            "expected_output": tlof_json_str
        }

    def generate_dataset(self, num_examples: int = 300) -> List[Dict]:
        """Generate complete test dataset"""
        print(f"Generating {num_examples} high-quality test examples...")
        
        all_examples = []
        for i in range(num_examples):
            example = self.generate_single_example()
            all_examples.append(example)
            
            if (i + 1) % 50 == 0:
                print(f"Generated {i + 1}/{num_examples} examples...")
        
        print(f"✅ Generated {len(all_examples)} test examples")
        
        return all_examples

    def save_dataset(self, test_examples: List[Dict], 
                    filename: str = "tlof_test_data_v2.jsonl"):
        """Save test dataset in JSONL format"""
        
        # Save test data
        with open(filename, 'w') as f:
            for example in test_examples:
                f.write(json.dumps(example) + '\n')
        
        print(f"💾 Saved {len(test_examples)} test examples to {filename}")

    def validate_dataset(self, filename: str) -> bool:
        """Comprehensive dataset validation"""
        print(f"🔍 Validating dataset: {filename}")
        
        valid_count = 0
        total_count = 0
        issues = []
        
        with open(filename, 'r') as f:
            for line_num, line in enumerate(f, 1):
                total_count += 1
                try:
                    data = json.loads(line)
                    
                    # Check structure
                    required_fields = ["input", "output", "expected_output"]
                    missing_fields = [field for field in required_fields if field not in data]
                    if missing_fields:
                        issues.append(f"Line {line_num}: Missing fields: {missing_fields}")
                        continue
                    
                    # Check content exists
                    if not data["input"] or not data["output"] or not data["expected_output"]:
                        issues.append(f"Line {line_num}: Empty input, output, or expected_output")
                        continue
                    
                    # Validate JSON responses
                    try:
                        output_json = json.loads(data["output"])
                        expected_json = json.loads(data["expected_output"])
                        
                        if "TLOF" not in output_json or "TLOF" not in expected_json:
                            issues.append(f"Line {line_num}: Missing TLOF key in output or expected_output")
                            continue
                            
                        # Check if output and expected_output match
                        if data["output"] != data["expected_output"]:
                            issues.append(f"Line {line_num}: output and expected_output don't match")
                            continue
                            
                    except json.JSONDecodeError:
                        issues.append(f"Line {line_num}: Invalid JSON in output or expected_output")
                        continue
                    
                    valid_count += 1
                    
                except json.JSONDecodeError:
                    issues.append(f"Line {line_num}: Invalid JSON structure")
        
        # Report results
        print(f"📊 Validation Results:")
        print(f"   Valid examples: {valid_count}/{total_count}")
        print(f"   Success rate: {(valid_count/total_count)*100:.1f}%")
        
        if issues:
            print(f"⚠️  Found {len(issues)} issues:")
            for issue in issues[:10]:  # Show first 10 issues
                print(f"   - {issue}")
            if len(issues) > 10:
                print(f"   ... and {len(issues) - 10} more issues")
        else:
            print("✅ No validation issues found!")
        
        return len(issues) == 0

def main(num_examples: int = 300, filename: str = "tlof_test_data_v2.jsonl"):
    """Generate, save and validate a TLOF test dataset, printing a report.

    Args:
        num_examples: Number of examples to generate.
        filename: Path of the JSONL file to write and then validate. The same
            path is passed to both steps, so the file that is validated is
            always the file that was just written.
    """
    print("🚀 TLOF Test Data Generator V2")
    print("=" * 50)

    # Initialize generator
    generator = TLOFTestDataGenerator()

    # Generate test dataset
    test_examples = generator.generate_dataset(num_examples=num_examples)

    # Save dataset
    generator.save_dataset(test_examples, filename)

    # Validate the file that was just written
    is_valid = generator.validate_dataset(filename)

    if is_valid:
        print("\n🎉 SUCCESS! High-quality test data is ready!")
        print("\n📋 Dataset Summary:")
        print(f"   Test examples: {len(test_examples)}")
        print("\n🔥 Quality Features:")
        print("   ✅ Realistic aircraft-appropriate dimensions")
        print("   ✅ Natural language variation (5+ templates)")
        print("   ✅ Geographically distributed coordinates")
        print("   ✅ Balanced feature representation")
        print("   ✅ Comprehensive parameter coverage")
        print("   ✅ Validated JSON structure")
        print("   ✅ Format: 'input', 'output', and 'expected_output' fields")
        print("\n▶️  Test data is ready for evaluation!")
    else:
        print("❌ Dataset validation failed. Please check the issues above.")

# Entry point: runs main() when this code is executed directly. The recorded
# cell output shows that this includes running the cell in the notebook.
if __name__ == "__main__":
    main()


🚀 TLOF Test Data Generator V2
Generating 300 high-quality test examples...
Generated 50/300 examples...
Generated 100/300 examples...
Generated 150/300 examples...
Generated 200/300 examples...
Generated 250/300 examples...
Generated 300/300 examples...
✅ Generated 300 test examples
💾 Saved 300 test examples to tlof_test_data_v2.jsonl
🔍 Validating dataset: tlof_test_data_v2.jsonl
📊 Validation Results:
   Valid examples: 300/300
   Success rate: 100.0%
✅ No validation issues found!

🎉 SUCCESS! High-quality test data is ready!

📋 Dataset Summary:
   Test examples: 300

🔥 Quality Features:
   ✅ Realistic aircraft-appropriate dimensions
   ✅ Natural language variation (5+ templates)
   ✅ Geographically distributed coordinates
   ✅ Balanced feature representation
   ✅ Comprehensive parameter coverage
   ✅ Validated JSON structure
   ✅ Format: 'input', 'output', and 'expected_output' fields

▶️  Test data is ready for evaluation!


In [2]:
# Demo: build a single test example and preview how each field is laid out
generator = TLOFTestDataGenerator()
example = generator.generate_single_example()

# Prompts longer than 100 characters are cut and marked with an ellipsis
input_preview = example["input"]
if len(input_preview) > 100:
    input_preview = input_preview[:100] + "..."

# The JSON payloads always get the ellipsis, whatever their length
output_preview = example["output"][:200] + "..."
expected_preview = example["expected_output"][:200] + "..."
fields_match = example["output"] == example["expected_output"]

print("Sample test data format:")
print("Input:", input_preview)
print("\nOutput (first 200 chars):", output_preview)
print("\nExpected output (first 200 chars):", expected_preview)
print("\nOutput == Expected output:", fields_match)


Sample test data format:
Input: Create a rectangle TLOF for firefighting copter, with dimensions 24m x 36m, at 4m elevation. oriente...

Output (first 200 chars): {"TLOF":[{"position":[90.4111,23.8096],"dimensions":{"unit":"m","aircraftCategory":false,"aircraft":"firefighting_copter","diameter":24,"isVisible":true,"layerName":"Generated_TLOF_firefighting_copter...

Expected output (first 200 chars): {"TLOF":[{"position":[90.4111,23.8096],"dimensions":{"unit":"m","aircraftCategory":false,"aircraft":"firefighting_copter","diameter":24,"isVisible":true,"layerName":"Generated_TLOF_firefighting_copter...

Output == Expected output: True
