# Data set exploration

## Overview

In [1]:
import os

def parse_instance_file(filepath):
    """
    Parses a single instance file and returns its structured data.

    Expected layout (whitespace-separated numbers; blank lines are ignored):
        line 1:               |N| |A| |R|
        next A lines (arcs):  index from_node to_node cost
        next R lines (rels):  index from_trigger to_trigger trigger_arc_index
                              from_target to_target target_arc_index new_target_arc_cost

    Parsing is all-or-nothing: if the file is missing, truncated or malformed,
    an error message is printed and the zeroed structure is returned, so the
    'A' / 'R' counts always agree with the lengths of 'arcs' / 'relationships'.
    No exception is propagated to the caller.

    Args:
        filepath (str): The full path to the instance file.

    Returns:
        dict: A dictionary containing the parsed data with keys:
              'N': Number of nodes
              'A': Number of arcs
              'R': Number of relationships
              'arcs': A list of dictionaries, each representing an arc with keys:
                      'index', 'from_node', 'to_node', 'cost'
              'relationships': A list of dictionaries, each representing a relationship with keys:
                               'index', 'from_trigger', 'to_trigger', 'trigger_arc_index',
                               'from_target', 'to_target', 'target_arc_index', 'new_target_arc_cost'
    """
    def empty_result():
        # Fresh dict on every call so callers never share the mutable lists.
        return {'N': 0, 'A': 0, 'R': 0, 'arcs': [], 'relationships': []}

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            # Keep the 1-based line number next to the tokens so that error
            # messages still point at the right place after blank lines are dropped.
            rows = [
                (lineno, line.split())
                for lineno, line in enumerate(f, start=1)
                if line.strip()
            ]

        if not rows:
            raise ValueError("file is empty")

        # Parse the first line: |N| |A| |R|
        header_lineno, header_tokens = rows[0]
        header = [int(token) for token in header_tokens]
        if len(header) < 3:
            raise ValueError(
                f"line {header_lineno}: header needs 3 fields (N A R), found {len(header)}"
            )
        n_nodes, n_arcs, n_rels = header[:3]

        arc_rows = _read_numeric_rows(rows, 1, n_arcs, 4, "arc")
        rel_rows = _read_numeric_rows(rows, 1 + len(arc_rows), n_rels, 8, "relationship")

        # Only build the result once every line has been validated.
        data = empty_result()
        data['N'] = n_nodes
        data['A'] = n_arcs
        data['R'] = n_rels
        data['arcs'] = [
            {
                'index': int(parts[0]),
                'from_node': int(parts[1]),
                'to_node': int(parts[2]),
                'cost': parts[3],
            }
            for parts in arc_rows
        ]
        data['relationships'] = [
            {
                'index': int(parts[0]),
                'from_trigger': int(parts[1]),
                'to_trigger': int(parts[2]),
                'trigger_arc_index': int(parts[3]),
                'from_target': int(parts[4]),
                'to_target': int(parts[5]),
                'target_arc_index': int(parts[6]),
                'new_target_arc_cost': parts[7],
            }
            for parts in rel_rows
        ]
        return data

    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
    except Exception as e:
        # Best-effort by design: one bad file must not abort a whole dataset scan.
        print(f"Error parsing file {filepath}: {e}")

    return empty_result()


def _read_numeric_rows(rows, start, count, width, label):
    """Convert `count` tokenised rows starting at `start` to float lists, validating their presence and field count."""
    count = max(count, 0)  # a negative count in the header means "no rows", as with range()
    available = len(rows) - start
    if count > available:
        raise ValueError(
            f"header announces {count} {label} line(s) but only {available} remain"
        )

    converted = []
    for lineno, fields in rows[start:start + count]:
        if len(fields) < width:
            raise ValueError(
                f"line {lineno}: {label} line needs {width} fields, found {len(fields)}"
            )
        converted.append([float(field) for field in fields])
    return converted

def explore_datasets(base_folder_path="../instances/", vocal=False):
    """
    Explores all instance files within subdirectories of the specified base folder.
    The structure is expected to be base_folder_path/dataset_name/instance_file.txt

    Dataset folders and instance files are visited in sorted name order, so the
    returned dictionaries (and everything printed from them) have the same order
    on every machine and run; os.listdir alone gives no ordering guarantee.

    Args:
        base_folder_path (str): The path to the main 'instances' folder.
        vocal (bool): When True, print the name of every instance plus a short
                      preview (N/A/R counts, first three arcs and relationships),
                      or a "No data parsed" note when the parsed result is empty.
                      When False, only the per-dataset banner is printed.

    Returns:
        dict: A nested dictionary where the first level keys are dataset names,
              and the second level keys are instance filenames, holding the parsed data.
              Example: {'dataset_name_1': {'instance1.txt': {...}, 'instance2.txt': {...}}, ...}
              An empty dict is returned (after printing an error) when
              base_folder_path is not a directory.
    """
    all_parsed_datasets = {}
    if not os.path.isdir(base_folder_path):
        print(f"Error: Base folder '{base_folder_path}' not found.")
        return all_parsed_datasets

    for dataset_name in sorted(os.listdir(base_folder_path)):
        dataset_folder_path = os.path.join(base_folder_path, dataset_name)

        # Stray files next to the dataset folders are not datasets.
        if not os.path.isdir(dataset_folder_path):
            continue

        print(f"\n--- Exploring Dataset: {dataset_name} ---")
        parsed_instances_in_dataset = {}

        for filename in sorted(os.listdir(dataset_folder_path)):
            if not filename.endswith(".txt"):
                continue

            filepath = os.path.join(dataset_folder_path, filename)
            if vocal:
                print(f"  Parsing instance: {filename}")
            parsed_data = parse_instance_file(filepath)
            parsed_instances_in_dataset[filename] = parsed_data

            if vocal:
                if _has_parsed_content(parsed_data):
                    _print_instance_preview(parsed_data)
                else:
                    print(f"    No data parsed for {filename}")

        all_parsed_datasets[dataset_name] = parsed_instances_in_dataset

    return all_parsed_datasets


def _has_parsed_content(parsed_data):
    """Tell whether a parsed instance holds anything beyond the zeroed default structure."""
    # The parser hands back a dict even on failure, so plain truthiness of the
    # dict cannot distinguish "parsed" from "nothing parsed".
    return bool(parsed_data) and any(
        parsed_data.get(key) for key in ('N', 'A', 'R', 'arcs', 'relationships')
    )


def _print_instance_preview(parsed_data):
    """Print the N/A/R counts and the first three arcs and relationships of one parsed instance."""
    print(f"    Nodes (N): {parsed_data['N']}")
    print(f"    Arcs (A): {parsed_data['A']}")
    print(f"    Relationships (R): {parsed_data['R']}")

    if parsed_data['arcs']:
        print("    First 3 arcs (if available):")
        for i, arc in enumerate(parsed_data['arcs'][:3]):
            print(f"      Arc {i}: From {arc['from_node']} to {arc['to_node']}, Cost {arc['cost']}")

    if parsed_data['relationships']:
        print("    First 3 relationships (if available):")
        for i, rel in enumerate(parsed_data['relationships'][:3]):
            print(f"      Rel {i}: Trigger Arc ({rel['from_trigger']}, {rel['to_trigger']}, Index {rel['trigger_arc_index']}), "
                  f"Target Arc ({rel['from_target']}, {rel['to_target']}, Index {rel['target_arc_index']}), "
                  f"New Cost {rel['new_target_arc_cost']}")

In [2]:
# Parse every instance under the default "../instances/" folder. With vocal left at
# False only the per-dataset banners are printed; the nested result
# {dataset_name: {instance_filename: parsed_data}} is kept for the cells below.
all_datasets_data = explore_datasets()



--- Exploring Dataset: instances_generic ---

--- Exploring Dataset: instances_release_2 ---

--- Exploring Dataset: instances_release_1 ---


In [None]:
import os
import collections

# parse_instance_file and explore_datasets are defined in the first code cell of this
# notebook; run that cell before this one.

def summarize_parsed_data(all_datasets_data):
    """
    Print an overall summary and a per-dataset comparison table.

    For every dataset the table shows the number of instances, the
    min/avg/max of the node, arc and relationship counts, and the average
    (over instances) of three per-instance metrics: mean arc cost, mean
    relationship 'new_target_arc_cost', and the percentage of relationships
    whose new cost differs from the original cost of their target arc.

    Instances that are empty or lack any of the keys 'N', 'A', 'R', 'arcs',
    'relationships' are reported with a warning and left out of the
    statistics (they still count in the '# Inst.' column).

    Args:
        all_datasets_data (dict): The nested dictionary returned by explore_datasets.
                                  Format: {'dataset_name': {'instance_name': parsed_data, ...}, ...}

    Returns:
        None. Everything is written to standard output.
    """
    banner = "=" * 120
    print("\n" + banner)
    print("                 DATASET PARSING SUMMARY AND COMPARISON")
    print(banner)

    if not all_datasets_data:
        print("No datasets were parsed or found.")
        return

    instance_total = 0
    for instances in all_datasets_data.values():
        instance_total += len(instances)

    print("\nOverall Summary:")
    print("----------------")
    print(f"Total number of datasets found: {len(all_datasets_data)}")
    print(f"Total number of instances parsed across all datasets: {instance_total}")

    print("\nDetailed Dataset Comparison:")
    print("----------------------------")

    # All statistics are gathered first, so any "Skipping incomplete data"
    # warnings appear above the table rather than between its rows.
    summaries = {}
    for dataset_name, instances_data in all_datasets_data.items():
        summaries[dataset_name] = _summarize_dataset(dataset_name, instances_data)

    # (title, width) of every table column, left to right.
    columns = (
        ('Dataset', 20),
        ('# Inst.', 9),
        ('Nodes (Min/Avg/Max)', 25),
        ('Arcs (Min/Avg/Max)', 23),
        ('Rels (Min/Avg/Max)', 23),
        ('Avg Arc Cost (Avg)', 18),
        ('Avg Rel Cost (Avg)', 20),
        ('Rel Cost % Change (Avg)', 25),
    )
    print(" | ".join(f"{title:<{width}}" for title, width in columns))
    print("-" * 185)

    for dataset_name, summary in summaries.items():
        cells = (
            dataset_name,
            summary['num_instances'],
            _format_min_avg_max(summary['nodes_stats']),
            _format_min_avg_max(summary['arcs_stats']),
            _format_min_avg_max(summary['relationships_stats']),
            _format_average(summary['avg_instance_arc_cost_stats']),
            _format_average(summary['avg_instance_relation_cost_stats']),
            _format_average(summary['rel_cost_change_pct_stats'], suffix="%"),
        )
        print(" | ".join(
            f"{cell:<{width}}" for cell, (_, width) in zip(cells, columns)
        ))

    print("\n" + banner)
    print("End of Summary")
    print(banner)


def _summarize_dataset(dataset_name, instances_data):
    """Collect the per-instance numbers of one dataset and reduce them to min/max/avg statistics."""
    required_keys = ('N', 'A', 'R', 'arcs', 'relationships')
    node_counts, arc_counts, rel_counts = [], [], []
    arc_cost_means, rel_cost_means, changed_percentages = [], [], []

    for instance_name, data in instances_data.items():
        if not data or any(key not in data for key in required_keys):
            print(f"  Warning: Skipping incomplete data for {dataset_name}/{instance_name}")
            continue

        node_counts.append(data['N'])
        arc_counts.append(data['A'])
        rel_counts.append(data['R'])

        arc_mean, rel_mean, changed_pct = _instance_cost_metrics(data)
        arc_cost_means.append(arc_mean)
        rel_cost_means.append(rel_mean)
        changed_percentages.append(changed_pct)

    return {
        'num_instances': len(instances_data),
        'valid_instances': len(node_counts),
        'nodes_stats': _min_max_avg(node_counts),
        'arcs_stats': _min_max_avg(arc_counts),
        'relationships_stats': _min_max_avg(rel_counts),
        # The three entries below are statistics over per-instance averages.
        'avg_instance_arc_cost_stats': _min_max_avg(arc_cost_means),
        'avg_instance_relation_cost_stats': _min_max_avg(rel_cost_means),
        'rel_cost_change_pct_stats': _min_max_avg(changed_percentages),
    }


def _instance_cost_metrics(data):
    """Return (mean arc cost, mean relationship cost, % of relationships changing their target arc's cost) for one instance."""
    arcs = data['arcs']
    relationships = data['relationships']

    # An instance without arcs (or without relationships) contributes 0 to the averages.
    if arcs:
        arc_mean = sum(arc['cost'] for arc in arcs) / len(arcs)
    else:
        arc_mean = 0

    if not relationships:
        return arc_mean, 0, 0

    rel_mean = sum(rel['new_target_arc_cost'] for rel in relationships) / len(relationships)

    cost_by_arc_index = {arc['index']: arc['cost'] for arc in arcs}
    changed = 0
    for rel in relationships:
        original_cost = cost_by_arc_index.get(rel['target_arc_index'])
        # Relationships whose target arc is unknown are not counted as a change.
        if original_cost is not None and rel['new_target_arc_cost'] != original_cost:
            changed += 1
    changed_pct = (changed / len(relationships)) * 100

    return arc_mean, rel_mean, changed_pct


def _min_max_avg(values):
    """Return {'min', 'max', 'avg'} for a list of numbers, with 'N/A' placeholders when the list is empty."""
    if not values:
        return {'min': 'N/A', 'max': 'N/A', 'avg': 'N/A'}
    return {
        'min': min(values),
        'max': max(values),
        'avg': sum(values) / len(values),
    }


def _format_min_avg_max(stats):
    """Render a stats dict as 'min/avg/max' (avg rounded to an integer), or 'N/A' when no average exists."""
    if not isinstance(stats['avg'], float):
        return 'N/A'
    return f"{stats['min']}/{stats['avg']:.0f}/{stats['max']}"


def _format_average(stats, suffix=""):
    """Render the 'avg' of a stats dict with two decimals plus an optional suffix, or 'N/A' when no average exists."""
    if not isinstance(stats['avg'], float):
        return 'N/A'
    return f"{stats['avg']:.2f}{suffix}"
    

In [6]:
# Print the summary and per-dataset comparison table for the instances that
# explore_datasets() loaded into all_datasets_data.
summarize_parsed_data(all_datasets_data)


                 DATASET PARSING SUMMARY AND COMPARISON

Overall Summary:
----------------
Total number of datasets found: 3
Total number of instances parsed across all datasets: 235

Detailed Dataset Comparison:
----------------------------
Dataset              | # Inst.   | Nodes (Min/Avg/Max)       | Arcs (Min/Avg/Max)      | Rels (Min/Avg/Max)      | Avg Arc Cost (Avg) | Avg Rel Cost (Avg)   | Rel Cost % Change (Avg)  
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
instances_generic    | 180       | 10/18/25                  | 90/320/600              | 100/2092/10000          | 2587.19            | 3042.15              | 100.00%                  
instances_release_2  | 34        | 18/68/142                 | 90/580/1562             | 1144/47834/208020       | 1.00               | 1.08                 | 100.00%                  
instances_releas