In [27]:
import pandas as pd
import json
import re

In [28]:
# Load the model predictions: one JSON object per line (JSONL), even though the
# file name carries a .tsv extension.
records = []
with open("predictions_baseline.tsv", encoding="utf-8") as f:
    for line in f:
        # Skip whitespace-only lines (e.g. a trailing blank line); json.loads
        # raises JSONDecodeError when given an empty document.
        if line.strip():
            records.append(json.loads(line))

In [29]:
# One row per prediction record; the columns come from the keys of the JSON objects.
df = pd.DataFrame(records)
# Preview the first rows as a sanity check on the loaded predictions.
df.head()

Unnamed: 0,left,right,match,match_confidence
0,COL make VAL WACO COL model VAL KNF COL series...,COL make VAL CE COL model VAL 337 COL series V...,0,0.999723
1,COL make VAL PIPER COL model VAL PA46R COL ser...,COL make VAL G COL model VAL 32 COL series VAL...,0,0.99978
2,COL make VAL PILATUS COL model VAL PC12 COL se...,COL make VAL A COL model VAL 330 COL series VA...,0,0.999375
3,COL make VAL CESSNA COL model VAL 550 COL seri...,COL make VAL MD COL model VAL 10 COL series VA...,0,0.999685
4,COL make VAL BEECH COL model VAL 18 COL series...,COL make VAL M COL model VAL 4 COL series VAL ...,0,0.999808


In [30]:
# Read the labelled pairs from a headerless, tab-separated file whose three
# columns are named here as the left record, the right record and the gold label.
gold = pd.read_csv(
    "data/er_magellan/Structured/ditto_aircraft/all_pairs.txt",
    sep="\t",
    header=None,
    names=["left", "right", "gold"],
)
print(gold.head())

                                                left  \
0  COL make VAL WACO COL model VAL KNF COL series...   
1  COL make VAL PIPER COL model VAL PA46R COL ser...   
2  COL make VAL PILATUS COL model VAL PC12 COL se...   
3  COL make VAL CESSNA COL model VAL 550 COL seri...   
4  COL make VAL BEECH COL model VAL 18 COL series...   

                                               right  gold  
0  COL make VAL CE COL model VAL 337 COL series V...     0  
1  COL make VAL G COL model VAL 32 COL series VAL...     0  
2  COL make VAL A COL model VAL 330 COL series VA...     0  
3  COL make VAL MD COL model VAL 10 COL series VA...     0  
4  COL make VAL M COL model VAL 4 COL series VAL ...     0  


In [31]:
# Assigning a Series to a column aligns on index labels, not on position: a gold
# file with extra rows would be silently truncated and one with missing rows
# would leave NaN labels. Refuse to attach labels unless both frames share the
# same index, so every prediction is scored against its own gold label.
if not df.index.equals(gold.index):
    raise ValueError(
        f"predictions ({len(df)} rows) and gold labels ({len(gold)} rows) "
        "do not share the same index; cannot attach gold labels row by row"
    )
df["gold"] = gold["gold"]

In [32]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Gold labels versus the model's predicted match labels.
y_true, y_pred = df["gold"], df["match"]

# Print each metric under its heading, in the order accuracy, report, matrix.
for heading, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("\nClassification report:\n", classification_report),
    ("\nConfusion matrix:\n", confusion_matrix),
):
    print(heading, metric_fn(y_true, y_pred))

Accuracy: 0.9977811514656079

Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     31130
           1       0.99      0.99      0.99      3122

    accuracy                           1.00     34252
   macro avg       0.99      0.99      0.99     34252
weighted avg       1.00      1.00      1.00     34252


Confusion matrix:
 [[31097    33]
 [   43  3079]]


In [33]:
# Keep only the pairs whose predicted label disagrees with the gold label.
errors = df.loc[df["gold"].ne(df["match"])]
# Show the first 20 disagreements with the columns needed for review.
print(
    errors.loc[:, ["left", "right", "gold", "match", "match_confidence"]].head(20)
)

                                                    left  \
2721   COL make VAL BEECH COL model VAL 33 COL series...   
3643   COL make VAL BOEING COL model VAL 737 COL seri...   
4007   COL make VAL HUGHES COL model VAL 369 COL seri...   
4393   COL make VAL BELL COL model VAL UH1 COL series...   
4455   COL make VAL BOEING COL model VAL 737 COL seri...   
5818   COL make VAL BOEING COL model VAL 737 COL seri...   
6010   COL make VAL CHAMPION COL model VAL 7KC COL se...   
6796   COL make VAL PIPER COL model VAL PA18 COL seri...   
6970   COL make VAL MOONEY COL model VAL M20V COL ser...   
7003   COL make VAL DOUGLAS COL model VAL C53 COL ser...   
7326   COL make VAL BOEING COL model VAL 737 COL seri...   
7902   COL make VAL BOEING COL model VAL 737 COL seri...   
8013   COL make VAL BOEING COL model VAL 737 COL seri...   
8642   COL make VAL AEROSPATIALE COL model VAL AS350 ...   
8943   COL make VAL BOEING COL model VAL 737 COL seri...   
9444   COL make VAL CONVAIR COL model VA

In [34]:
# Display the misclassified pairs (rows where gold != match) as a table.
errors

Unnamed: 0,left,right,match,match_confidence,gold
2721,COL make VAL BEECH COL model VAL 33 COL series...,COL make VAL BE COL model VAL 35 COL series VA...,1,0.982970,0
3643,COL make VAL BOEING COL model VAL 737 COL seri...,COL make VAL B COL model VAL 737 COL series VA...,0,0.886307,1
4007,COL make VAL HUGHES COL model VAL 369 COL seri...,COL make VAL HU COL model VAL 369 COL series V...,1,0.985612,0
4393,COL make VAL BELL COL model VAL UH1 COL series...,COL make VAL BHT COL model VAL UH1E COL series...,1,0.892723,0
4455,COL make VAL BOEING COL model VAL 737 COL seri...,COL make VAL B COL model VAL 737 COL series VA...,0,0.846245,1
...,...,...,...,...,...
32119,COL make VAL BOEING COL model VAL 737 COL seri...,COL make VAL B COL model VAL 737 COL series VA...,0,0.940837,1
32623,COL make VAL BELL COL model VAL OH58 COL serie...,COL make VAL BHT COL model VAL 206 COL series ...,0,0.999704,1
33460,COL make VAL BEECH COL model VAL 65 COL series...,COL make VAL BE COL model VAL 65 COL series VA...,1,0.998841,0
33703,COL make VAL BOEING COL model VAL 737 COL seri...,COL make VAL B COL model VAL 737 COL series VA...,0,0.956166,1


In [35]:
# Save the misclassified pairs for manual review. index=False leaves the
# DataFrame index (the row labels carried over from df) out of the file.
errors.to_csv("errors_review.csv", index=False)

In [38]:
# Matches one "COL <name> VAL" field header. The lookarounds require COL and VAL
# to be whole whitespace-delimited tokens, so text such as "NAVAL " inside a
# value is not mistaken for a VAL marker.
_FIELD_HEADER = re.compile(r"(?<!\S)COL\s+(\S+)\s+VAL(?!\S)")


def parse_record(record: str):
    """Parse Ditto serialized record into a dict of {field: value}.

    The record is expected to look like ``COL <name> VAL <value> COL <name> VAL
    <value> ...``. Each field name is a single whitespace-free token; its value
    is everything up to the next field header (or the end of the string), with
    surrounding whitespace stripped.

    Parameters
    ----------
    record : str
        The serialized record.

    Returns
    -------
    dict
        Field name -> value. A field with nothing after ``VAL`` maps to ``""``.
        If a field name repeats, the last occurrence wins. A string containing
        no field header yields an empty dict.
    """
    headers = list(_FIELD_HEADER.finditer(record))
    fields = {}
    for pos, header in enumerate(headers):
        # The value runs from the end of this header to the start of the next
        # header; the last field's value runs to the end of the record.
        if pos + 1 < len(headers):
            value_end = headers[pos + 1].start()
        else:
            value_end = len(record)
        fields[header.group(1)] = record[header.end():value_end].strip()
    return fields

# Break each misclassified pair into per-attribute columns so the two sides can
# be compared at a glance: for every attribute the left record's value goes in
# "cictt_<attribute>" and the right record's value in "<attribute>".
parsed = []

for _, row in errors.iterrows():
    left = parse_record(row["left"])
    right = parse_record(row["right"])

    entry = {}
    for attr in ("make", "model", "series", "cert", "name"):
        # .get() leaves None where a record lacks the attribute.
        entry["cictt_" + attr] = left.get(attr)
        entry[attr] = right.get(attr)
    entry["gold"] = row["gold"]
    entry["predicted"] = row["match"]
    entry["confidence"] = row["match_confidence"]
    parsed.append(entry)

aligned = pd.DataFrame(parsed)

In [39]:
# Save the attribute-aligned error table for manual review, without the index column.
aligned.to_csv("aligned_errors_review.csv", index=False)