In [2]:

def _parse_optional_float(raw):
    """Return ``raw`` as a float, or None if it is 'N/A' (any case) or not numeric."""
    text = raw.strip()
    if text.upper() == "N/A":
        return None
    try:
        return float(text)
    except ValueError:
        return None


def read_dat_file(filepath, cost_key, time_key):
    """
    Read a tab-delimited .dat file whose first five columns are:
      circuit_name, num_qubits, num_partitions, <cost>, <time>

    The first line is treated as a header and discarded without validation.
    Blank lines, lines with fewer than five tab-separated fields, lines with
    an empty circuit name, and lines whose num_qubits / num_partitions are not
    integers are skipped. Columns beyond the fifth are ignored.

    Args:
        filepath: Path of the file to read.
        cost_key: Name under which the fourth column is stored in each value.
        time_key: Name under which the fifth column is stored in each value.

    Returns:
        A dictionary keyed by (circuit_name, num_qubits, num_partitions).
        Each value is a dict: { cost_key: <float or None>, time_key: <float or None> }.
        None stands for an 'N/A', empty, or otherwise non-numeric field.
        If a key occurs on several lines, the last occurrence wins.
    """
    data_dict = {}
    with open(filepath, "r") as f:
        # Consume the header line; its contents are not used.
        f.readline()

        for line in f:
            # Strip only the line terminator: a full strip() would also eat a
            # trailing tab, shifting the field count and dropping rows whose
            # last column is empty.
            line = line.rstrip("\r\n")
            if not line.strip():
                continue  # skip blank / whitespace-only lines

            parts = line.split("\t")
            if len(parts) < 5:
                continue  # skip malformed lines

            circuit_name = parts[0].strip()
            if not circuit_name:
                continue  # a row without a circuit name cannot be keyed

            try:
                num_qubits = int(parts[1])
                num_partitions = int(parts[2])
            except ValueError:
                continue  # non-integer qubit/partition count: skip the line

            data_dict[(circuit_name, num_qubits, num_partitions)] = {
                cost_key: _parse_optional_float(parts[3]),
                time_key: _parse_optional_float(parts[4]),
            }
    return data_dict

def merge_data(dicts):
    """
    Combine several per-source dictionaries into one.

    Every input maps (circuit_name, num_qubits, num_partitions) to a dict of
    column values. The result has one entry for each key seen in any input,
    holding the union of that key's columns; when the same column name shows
    up in more than one input, the value from the later input is kept.
    The inputs themselves are not modified.
    """
    # First pass: every key that appears in at least one source.
    every_key = set().union(*dicts)

    # Second pass: start each row empty, then layer the sources on in order.
    combined = {record_key: {} for record_key in every_key}
    for source in dicts:
        for record_key, columns in source.items():
            combined[record_key].update(columns)

    return combined



# Input files: one result table per method, all keyed by
# (circuit_name, num_qubits, num_partitions).
file1 = "/Users/ftb123/MLQCP_FM/benchmarking/dat_files/PE_QASM_large_new.dat"
file2 = "/Users/ftb123/MLQCP_FM/benchmarking/dat_files/ESD_QASM_large_new.dat"
file3 = "/Users/ftb123/MLQCP_FM/benchmarking/dat_files/MLFM-R_QASM_large_new.dat"


# Read data from each file, giving each unique cost/time column names.
# The key names must match the output column names in header_cols below:
# the first file's cost was previously stored as "part_cost" but looked up
# as "pe_cost", so that column was always written as "N/A".
data1 = read_dat_file(file1, cost_key="pe_cost", time_key="pe_time")
data2 = read_dat_file(file2, cost_key="esd_cost", time_key="esd_time")
data3 = read_dat_file(file3, cost_key="r_cost", time_key="r_time")

# Merge them
merged_dict = merge_data([data1, data2, data3])

# Sort keys for a consistent output order: the keys are
# (circuit_name, num_qubits, num_partitions) tuples, so plain tuple ordering
# sorts by circuit_name, then num_qubits, then num_partitions.
sorted_keys = sorted(merged_dict)

# Write to a new .dat file
output_file = "merged_results_large.dat"
header_cols = [
    "circuit_name", "num_qubits", "num_partitions",
    "pe_cost", "pe_time",
    "esd_cost", "esd_time",
    "r_cost", "r_time",
]
# Value columns are taken from the header itself so the header and the
# per-row lookups cannot drift apart.
value_cols = header_cols[3:]

with open(output_file, "w") as out:
    # Header line
    out.write("\t".join(header_cols) + "\n")

    # Data rows
    for (circuit_name, num_qubits, num_partitions) in sorted_keys:
        row_data = merged_dict[(circuit_name, num_qubits, num_partitions)]

        fields = [str(circuit_name), str(num_qubits), str(num_partitions)]
        for col in value_cols:
            value = row_data.get(col)
            # A missing column or a None value is written as "N/A".
            fields.append("N/A" if value is None else str(value))

        out.write("\t".join(fields) + "\n")
