In [18]:
import pandas as pd
import os
import re

In [14]:
# Validation summary folders, one per experiment run, in run order
# (position in this list determines the 1-based run number used downstream).
run_folders = [
    '../../output/{}/validation/summary-statistics'.format(run_name)
    for run_name in ('run1', 'run2', 'run3')
]

# Base names (without ".csv") of the summary files read from each run folder.
graph_types = [
    'summary_observation',
    'summary_action',
]

In [None]:
# Merge every "<run folder>/<graph type>.csv" summary into a single table
# (`merged_df`), tagging each row with its run number, model, graph type and
# method, then convert percentage strings ("12.5%") into fractions (0.125).
all_dfs = []

# Both tags are parsed out of the path text held in each CSV's first column.
model_pattern = re.compile(r'output/([^/]+)/')         # path segment right after "output/"
method_pattern = re.compile(r'graph/([^/]+)/kg\.ttl')  # path segment between "graph/" and "/kg.ttl"

# Identifier columns, placed first in every per-file frame.
id_cols = ['run', 'model', 'graph_type', 'method']

for run_index, folder in enumerate(run_folders, start=1):
    for graph_type in graph_types:
        file_path = os.path.join(folder, f"{graph_type}.csv")
        # index_col=False: never promote the first column to the index.
        df = pd.read_csv(file_path, index_col=False)

        # The first column is treated as the source path of the evaluated
        # graph, whatever its header is -- assumption, confirm against the
        # tool that writes these CSVs.
        first_col = df.columns[0]
        df = df.rename(columns={first_col: 'file_path'})
        paths = df['file_path'].astype(str)

        # Vectorised extraction: one regex pass per column instead of two
        # searches per row; rows whose path does not match become 'UNKNOWN'.
        df = df.assign(
            file_path=paths,
            model=paths.str.extract(model_pattern.pattern, expand=False).fillna('UNKNOWN'),
            method=paths.str.extract(method_pattern.pattern, expand=False).fillna('UNKNOWN'),
            graph_type=graph_type,
            run=run_index,
        )

        # Identifier columns first, then every metric column in file order;
        # the raw path is dropped once the tags have been derived from it.
        metric_cols = [c for c in df.columns if c not in id_cols + ['file_path']]
        all_dfs.append(df[id_cols + metric_cols])

merged_df = pd.concat(all_dfs, ignore_index=True)

# A column is treated as a percentage column if any of its values, rendered
# as text, contains a '%' sign.
percentage_cols = [
    col for col in merged_df.columns
    if merged_df[col].astype(str).str.contains('%').any()
]
for col in percentage_cols:
    without_sign = merged_df[col].astype(str).str.replace('%', '', regex=False)
    # A value that is empty once the sign is stripped counts as 0 %.
    merged_df[col] = without_sign.replace('', '0').astype(float) / 100

#merged_df.to_csv('merged_output.csv', index=False)
print(merged_df.head())

   run         model           graph_type    method  Full Parse OK  \
0    1  llava-llama3  summary_observation       dpe           True   
1    1  llava-llama3  summary_observation      i2kg           True   
2    1  llava-llama3  summary_observation      d2kg           True   
3    1  llava-llama3  summary_observation  d2kg-rag           True   
4    1  llama4-scout  summary_observation       dpe           True   

   Total triples in KG  Valid triples  Invalid triples  Distinct classes used  \
0                   17             17                0                      0   
1                   31             31                0                      5   
2                   99             99                0                      6   
3                   89             89                0                      6   
4                   35             35                0                      0   

   Class Compliance  Class Coverage  Distinct properties used  \
0               0.0        