In [11]:
import json
import pandas as pd
import glob
import os


json_files = glob.glob("./*.json")

all_rows = []

In [16]:
# A dict to hold data per model: { model_name: {section_index: (section_title, "title (score)")} }
model_data = {}

all_sections = set()  # To track all section indexes across all models

for file_path in json_files:
    # Extract model name from filename
    model_name = os.path.splitext(os.path.basename(file_path))[0]

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    model_data[model_name] = {}

    for section in data:
        try:
            section_index = int(section["section_index"])
        except ValueError:
            section_index = section["section_index"] 
        section_title = section["section_title"]
        top_match = section.get("top_matches", [])
        if top_match:
            top1 = top_match[0]
            article = str(top1.get("article", "Unknown"))
            clause = top1.get("clause", "")

            if clause:
                article_or_clause = f"{article}-{clause}"
            else:
                article_or_clause = article
            similarity = top1.get("similarity", 0)
            model_data[model_name][section_index] = (section_title, f"{article_or_clause} ({similarity:.4f})")
        else:
            model_data[model_name][section_index] = (section_title, "")

        all_sections.add(section_index)

# Prepare DataFrame rows for each section
rows = []
for section_index in sorted(all_sections):
    # Get the section title from any model (assume all same titles for the same section)
    section_title = None
    for model_name in model_data:
        if section_index in model_data[model_name]:
            section_title = model_data[model_name][section_index][0]
            break

    row = {
        "section_index": section_index,
        "section_title": section_title
    }

    # For each model, get the article/clause + similarity string or empty string if missing
    for model_name in model_data:
        row[model_name] = model_data[model_name].get(section_index, (None, ""))[1]

    rows.append(row)

# Create DataFrame
df = pd.DataFrame(rows)
df = df.sort_values("section_index").reset_index(drop=True)

In [17]:
df

Unnamed: 0,section_index,section_title,ikea_llm,ikea_tfidf,ikea_knowledgeGraphV2,ikea_knowledgeGraph,ikea_sentenceTransformer
0,1,1. Who is the responsible controller for the d...,28 (0.4693),87 (0.1111),28-8. (0.3805),30 (0.3899),28 (0.3952)
1,2,2. What data is being processed and from which...,13 (0.4975),33 (0.1820),17-7. (0.4437),14 (0.4291),17 (0.4358)
2,3,3. For which purpose and for how long is the d...,13 (0.5906),15 (0.3178),5-6. (0.6763),5 (0.6618),13 (0.7248)
3,4,4. On which legal basis is personal data being...,6 (0.6778),29 (0.1919),13-8. (0.6850),7 (0.7125),13 (0.6803)
4,5,5. Who will be receiving your data?,28 (0.4983),23 (0.2248),70-10. (0.5533),14 (0.5388),28 (0.5519)
5,6,6. Are you obliged to provide data?,13 (0.4364),37 (0.1188),13-13. (0.4600),14 (0.3903),13 (0.4782)
6,7,7. Is there automated decision-making includin...,22 (0.6317),31 (0.1756),15-9. (0.5376),31 (0.4555),13 (0.5337)
7,8,8. Your rights in processing your personal data,15 (0.6323),15 (0.1729),50-1. (0.5931),14 (0.6057),50 (0.6304)


In [18]:
df.to_clipboard(index=False, excel=True)