In [None]:
import pandas as pd
import json

def load_jsonl_as_dict_orlm(filepath='IndustryOR_orlm_hugging.jsonl'):
    """
    Load the ORLM-format JSONL file into a dictionary keyed by record id.

    Each non-blank line is parsed as one JSON object. The object's
    ``en_question`` and ``en_answer`` fields are exposed under the common
    keys ``question`` and ``ground_truth``.

    Args:
        filepath: Path to the UTF-8 encoded JSONL file to read.

    Returns:
        dict mapping each record's ``id`` to a sub-dictionary
        ``{'id', 'question', 'ground_truth'}``. A field that is absent from
        the record is stored as ``None``. If an id occurs more than once,
        the last occurrence wins.

    Raises:
        KeyError: If a record has no ``id`` field.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_dict = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank or whitespace-only lines (e.g. a trailing newline at the
            # end of the file) are not records; json.loads would reject them.
            if not line.strip():
                continue
            item = json.loads(line)
            item_id = item["id"]
            data_dict[item_id] = {
                'id': item_id,
                'question': item.get('en_question'),
                'ground_truth': item.get('en_answer')
            }
    return data_dict


def load_jsonl_as_dict_dong(filepath='testset_IndustryOR.jsonl'):
    """
    Load a JSONL file into a dictionary keyed by record id.

    Each non-blank line is parsed as one JSON object whose ``question`` and
    ``ground_truth`` fields are copied into the result.

    Args:
        filepath: Path to the UTF-8 encoded JSONL file to read.

    Returns:
        dict mapping each record's ``id`` to a sub-dictionary
        ``{'id', 'question', 'ground_truth'}``. A field that is absent from
        the record is stored as ``None``. If an id occurs more than once,
        the last occurrence wins.

    Raises:
        KeyError: If a record has no ``id`` field.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_dict = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank or whitespace-only lines (e.g. a trailing newline at the
            # end of the file) are not records; json.loads would reject them.
            if not line.strip():
                continue
            item = json.loads(line)
            item_id = item["id"]
            data_dict[item_id] = {
                'id': item_id,
                'question': item.get('question'),
                'ground_truth': item.get('ground_truth')
            }
    return data_dict

def load_jsonl_as_dict_dong_corrected(filepath='testset_IndustryOR_correct.jsonl'):
    """
    Load a JSONL file into a dictionary keyed by record id.

    Each non-blank line is parsed as one JSON object whose ``question`` and
    ``ground_truth`` fields are copied into the result.

    Args:
        filepath: Path to the UTF-8 encoded JSONL file to read.

    Returns:
        dict mapping each record's ``id`` to a sub-dictionary
        ``{'id', 'question', 'ground_truth'}``. A field that is absent from
        the record is stored as ``None``. If an id occurs more than once,
        the last occurrence wins.

    Raises:
        KeyError: If a record has no ``id`` field.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_dict = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank or whitespace-only lines (e.g. a trailing newline at the
            # end of the file) are not records; json.loads would reject them.
            if not line.strip():
                continue
            item = json.loads(line)
            item_id = item["id"]
            data_dict[item_id] = {
                'id': item_id,
                'question': item.get('question'),
                'ground_truth': item.get('ground_truth')
            }
    return data_dict



import pandas as pd

# Load the three versions of the dataset (each is a dict keyed by 'id')
all_data = [
    load_jsonl_as_dict_orlm(),           # index 0 → orlm
    load_jsonl_as_dict_dong(),           # index 1 → dong
    load_jsonl_as_dict_dong_corrected()  # index 2 → ours
]
keys = ['orlm', 'dong', 'ours']

# Collect every id that appears in any file. dict.fromkeys keeps first-seen
# order, so the rows of the summary come out in the same order on every run
# (iterating a set of string ids does not guarantee that).
ordered_ids = list(dict.fromkeys(
    sample_id for data in all_data for sample_id in data
))
all_ids = set(ordered_ids)

# Fixed column layout: the table has the same columns in the same order no
# matter which rows differ, and stays well-formed when nothing differs.
summary_columns = [
    "id",
    "gt_orlm", "gt_dong", "gt_ours",
    "question_orlm", "question_dong", "question_ours",
]

# Build the summary table, keeping only ids where orlm or dong disagrees
# with ours on the question text or on the ground truth.
summary_rows = []
for sample_id in ordered_ids:
    # An id missing from a file yields an empty entry, i.e. None fields.
    entry_orlm = all_data[0].get(sample_id, {})
    entry_dong = all_data[1].get(sample_id, {})
    entry_ours = all_data[2].get(sample_id, {})

    # Questions
    question_orlm = entry_orlm.get("question")
    question_dong = entry_dong.get("question")
    question_ours = entry_ours.get("question")

    # Ground truths
    gt_orlm = entry_orlm.get("ground_truth")
    gt_dong = entry_dong.get("ground_truth")
    gt_ours = entry_ours.get("ground_truth")

    # Compare orlm vs ours and dong vs ours
    question_diff_orlm = question_orlm != question_ours
    question_diff_dong = question_dong != question_ours
    gt_diff_orlm = gt_orlm != gt_ours
    gt_diff_dong = gt_dong != gt_ours

    if not (question_diff_orlm or question_diff_dong or gt_diff_orlm or gt_diff_dong):
        continue

    any_question_diff = question_diff_orlm or question_diff_dong
    summary_rows.append({
        "id": sample_id,
        "gt_orlm": gt_orlm,
        "gt_dong": gt_dong,
        "gt_ours": gt_ours,
        # Question text is only shown for the sources that actually differ
        # from ours; our own text is shown whenever at least one differs.
        "question_orlm": question_orlm if question_diff_orlm else None,
        "question_dong": question_dong if question_diff_dong else None,
        "question_ours": question_ours if any_question_diff else None,
    })

# Convert to DataFrame and index by id. Passing the columns explicitly keeps
# set_index from failing with a KeyError when there are no differing rows.
df = pd.DataFrame(summary_rows, columns=summary_columns).set_index("id")

# Save or display
print(df.head())
df.to_csv("IndustryOR_comparison_with_orlm_and_dong.csv")




In [None]:
# Rows whose ground truth differs between the orlm file and ours.
# The summary frame `df` stores these values in the "gt_orlm" and "gt_ours"
# columns (there are no "gt_file_*" columns, so those names raise KeyError).
gt_both_missing = df["gt_orlm"].isna() & df["gt_ours"].isna()

# pandas evaluates `missing != missing` as True, so rows where neither side
# has a ground truth are excluded explicitly: they are not a disagreement.
gt_diff = (df["gt_orlm"] != df["gt_ours"]) & ~gt_both_missing

diff_df = df[gt_diff]
print("Differences in ground truth:")
print(diff_df)
diff_df.to_csv("ground_truth_differences.csv")

In [None]:
import pandas as pd

# `df` is the summary frame indexed by sample id; `all_data` holds the loaded
# dictionaries in the order [orlm, dong, ours].

# Step 1: keep only samples whose id is present in both the orlm file and ours.
# The id is the index of `df` (there are no "id_file_*" columns), so presence
# is checked against the keys of the loaded dictionaries.
in_orlm = df.index.isin(list(all_data[0]))
in_ours = df.index.isin(list(all_data[2]))
id_match = pd.Series(in_orlm & in_ours, index=df.index)

# Step 2: among those, check where ground truth values differ.
# pandas evaluates `missing != missing` as True, so rows where neither side
# has a ground truth are excluded explicitly.
gt_both_missing = df["gt_orlm"].isna() & df["gt_ours"].isna()
gt_diff = (df["gt_orlm"] != df["gt_ours"]) & ~gt_both_missing

# Step 3: apply both conditions
filtered_df = df[id_match & gt_diff]

# Show the resulting rows
print(filtered_df)
