In [57]:
import pandas as pd
import json
import re
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [58]:
run_name = "make_model_eval"

# Load the prediction records. Despite the ".tsv" extension, every non-empty
# line of this file is parsed as one JSON object (JSON Lines).
predictions_path = "../aircraft_er_predictions/" + run_name + "_predictions_all.tsv"
# Explicit UTF-8 so the result does not depend on the platform's default encoding.
with open(predictions_path, encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing empty line at EOF); json.loads would
    # raise on an empty string.
    records = [json.loads(line) for line in f if line.strip()]



In [59]:
# One row per parsed prediction record.
df = pd.DataFrame(data=records)
# Report the row count, then preview the first rows.
print(df.shape[0])
df.head(n=5)

549228


Unnamed: 0,left,right,match,match_confidence
0,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Aero Commander 200\n,0,0.219201
1,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Aero Macchi AL-60\n,1,0.999991
2,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Aeronca 7-AC\n,0,0.336774
3,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Beech Bonanza 35A/C/D/E/G/...,0,0.999993
4,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Bellanca CH-300\n,0,0.991457


In [60]:
# Read the raw candidate pairs for this run (comma-separated, header taken
# from the file).
candidates_path = "../data/aircraft_er/" + run_name + "/candidates_raw.txt"
df_raw = pd.read_csv(candidates_path, delimiter=",")
# Report the row count, then preview the first rows.
print(df_raw.shape[0])
df_raw.head(n=5)

549228


Unnamed: 0,left_id,right_id,left,right
0,AERMACCHI AL60,7,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Aero Commander 200
1,AERMACCHI AL60,8,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Aero Macchi AL-60
2,AERMACCHI AL60,9,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Aeronca 7-AC
3,AERMACCHI AL60,10,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Beech Bonanza 35A/C/D/E/G/...
4,AERMACCHI AL60,20,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Bellanca CH-300


In [61]:
# The id columns are copied from df_raw by row position, so both frames must
# contain the same number of rows in the same order. Fail loudly on a length
# mismatch instead of silently producing NaN or truncated ids.
if len(df) != len(df_raw):
    raise ValueError(
        "predictions ({} rows) and candidates ({} rows) are not aligned".format(
            len(df), len(df_raw)
        )
    )
# .to_numpy() makes the copy strictly positional, independent of either index.
df["left_id"] = df_raw["left_id"].to_numpy()
df["right_id"] = df_raw["right_id"].to_numpy()


In [62]:
# Preview the predictions frame, which now also carries left_id / right_id.
df.head()

Unnamed: 0,left,right,match,match_confidence,left_id,right_id
0,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Aero Commander 200\n,0,0.219201,AERMACCHI AL60,7
1,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Aero Macchi AL-60\n,1,0.999991,AERMACCHI AL60,8
2,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Aeronca 7-AC\n,0,0.336774,AERMACCHI AL60,9
3,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Beech Bonanza 35A/C/D/E/G/...,0,0.999993,AERMACCHI AL60,10
4,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Bellanca CH-300\n,0,0.991457,AERMACCHI AL60,20


In [63]:
# Pull the text that follows "COL description VAL" out of the serialized right
# record. The capture is lazy and stops before a newline that starts another
# COL field, or at the end of the string.
description_pattern = r"COL\s+description\s+VAL\s*(.*?)(?=\n\s*COL|$)"
df["description"] = df["right"].str.extract(description_pattern, expand=False)

In [64]:
# Keep only the candidate pairs whose `match` value is 1.
df_matches = df[df["match"] == 1]
# Report how many pairs were evaluated and how many of them are matches.
print('all evaluated potential pairs: ', len(df))
print('matching pairs: ', len(df_matches))

all evaluated potential paris:  549228
matching paris:  196821


In [65]:
# Preview the first rows of the pairs with match == 1.
df_matches.head()

Unnamed: 0,left,right,match,match_confidence,left_id,right_id,description
1,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Aero Macchi AL-60\n,1,0.999991,AERMACCHI AL60,8,Aero Macchi AL-60
12,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Helio H-250/295/395\n,1,0.953049,AERMACCHI AL60,34,Helio H-250/295/395
25,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Noorduyn UC-64AS\n,1,0.90725,AERMACCHI AL60,65,Noorduyn UC-64AS
34,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Stinson SR-9\n,1,0.987475,AERMACCHI AL60,85,Stinson SR-9
36,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Stinson V-77\n,1,0.948848,AERMACCHI AL60,87,Stinson V-77


In [66]:
# Collapse the matched pairs to one row per (right_id, description):
#   match_confidence_max - highest match_confidence within the group
#   left_ids             - distinct left_id values of the group, as a list
#   left_ids_count       - length of that list
def _distinct_left_ids(left_id_values):
    """Return the distinct values of one group's left_id column as a list."""
    return pd.unique(left_id_values).tolist()


matches_by_right = df_matches.groupby(['right_id', 'description'], as_index=False)
df_max = matches_by_right.agg({
    'match_confidence': "max",
    'left_id': _distinct_left_ids,
})
df_max = df_max.rename(
    columns={'match_confidence': 'match_confidence_max', 'left_id': 'left_ids'}
)
df_max['left_ids_count'] = df_max['left_ids'].str.len()



In [67]:
# Display df_max, the per-(right_id, description) summary of the matched pairs.
df_max

Unnamed: 0,right_id,description,match_confidence_max,left_ids,left_ids_count
0,7,Aero Commander 200,0.999994,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",872
1,8,Aero Macchi AL-60,0.999991,"[AERMACCHI AL60, AERO COMMANDER 100, AERO COMM...",820
2,9,Aeronca 7-AC,0.999995,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",823
3,10,Beech Bonanza 35A/C/D/E/G/H/J/K/S/V/ 36A,0.999980,"[BEECH 35, BEECH 36, LEARJET 36]",3
4,20,Bellanca CH-300,0.999992,"[AERO COMMANDER 200, AERO COMMANDER 500, AERO ...",416
...,...,...,...,...,...
439,887,B787-800 Dreamliner,0.999992,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",809
440,888,Boeing 737-900ER,0.999996,"[AEROMOT AMT200, AEROMOT AMT300, AMERICAN GENE...",54
441,889,B787-900 Dreamliner,0.999985,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",808
442,890,Antonov 225 (6 Engine),0.999991,"[AERMACCHI AL60, AERO COMMANDER 100, AERO COMM...",882


In [68]:
# Attach the per-right_id summary columns (match_confidence_max, left_ids,
# left_ids_count) to every matched pair. `description` is dropped from the
# summary because df_matches already has that column.
# validate="many_to_one" makes the merge raise if a right_id occurs more than
# once in df_max (it is grouped by right_id AND description), which would
# otherwise silently duplicate matched rows.
df_join = df_matches.merge(
    df_max.drop(columns=['description']),
    on="right_id",
    how='left',
    validate="many_to_one",
)
# Flag the pairs whose confidence equals the highest confidence among the
# matches for the same right_id; ties are all flagged.
df_join["is_max"] = df_join["match_confidence"] == df_join["match_confidence_max"]

In [69]:
# Display the matched pairs joined with their per-right_id summary and is_max flag.
df_join

Unnamed: 0,left,right,match,match_confidence,left_id,right_id,description,match_confidence_max,left_ids,left_ids_count,is_max
0,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Aero Macchi AL-60\n,1,0.999991,AERMACCHI AL60,8,Aero Macchi AL-60,0.999991,"[AERMACCHI AL60, AERO COMMANDER 100, AERO COMM...",820,True
1,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Helio H-250/295/395\n,1,0.953049,AERMACCHI AL60,34,Helio H-250/295/395,0.999985,"[AERMACCHI AL60, AERO COMMANDER 100, AERO COMM...",744,False
2,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Noorduyn UC-64AS\n,1,0.907250,AERMACCHI AL60,65,Noorduyn UC-64AS,0.999996,"[AERMACCHI AL60, AERO COMMANDER 100, AERO COMM...",876,False
3,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Stinson SR-9\n,1,0.987475,AERMACCHI AL60,85,Stinson SR-9,0.999995,"[AERMACCHI AL60, AERO COMMANDER 100, AERO COMM...",945,False
4,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Stinson V-77\n,1,0.948848,AERMACCHI AL60,87,Stinson V-77,0.999994,"[AERMACCHI AL60, AERO COMMANDER 100, AERO COMM...",913,False
...,...,...,...,...,...,...,...,...,...,...,...
196816,COL make VAL ZLIN COL model VAL Z526 COL name ...,COL description VAL Antonov 124\n,1,0.997433,ZLIN Z526,880,Antonov 124,0.999978,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",776,False
196817,COL make VAL ZLIN COL model VAL Z526 COL name ...,COL description VAL Ilyushiin Il-96-400t\n,1,0.999028,ZLIN Z526,881,Ilyushiin Il-96-400t,0.999989,"[AERMACCHI AL60, AERO COMMANDER 100, AERO COMM...",966,False
196818,COL make VAL ZLIN COL model VAL Z526 COL name ...,COL description VAL B787-800 Dreamliner\n,1,0.963216,ZLIN Z526,887,B787-800 Dreamliner,0.999992,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",809,False
196819,COL make VAL ZLIN COL model VAL Z526 COL name ...,COL description VAL B787-900 Dreamliner\n,1,0.971513,ZLIN Z526,889,B787-900 Dreamliner,0.999985,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",808,False


In [70]:
# Persist every matched pair (with the summary columns) as CSV, without the index.
all_matches_path = "../aircraft_er_predictions/" + run_name + "_all_matches.csv"
df_join.to_csv(all_matches_path, index=False)

In [71]:
# Keep only the rows flagged as the highest-confidence match for their right_id.
# `is_max` is already a boolean column, so it is used directly as the mask
# rather than being compared to True.
df_join_max = df_join[df_join["is_max"]]

df_join_max

Unnamed: 0,left,right,match,match_confidence,left_id,right_id,description,match_confidence_max,left_ids,left_ids_count,is_max
0,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Aero Macchi AL-60\n,1,0.999991,AERMACCHI AL60,8,Aero Macchi AL-60,0.999991,"[AERMACCHI AL60, AERO COMMANDER 100, AERO COMM...",820,True
284,COL make VAL AERO COMMANDER COL model VAL 1121...,COL description VAL Aero Commander 1121\n,1,0.999995,AERO COMMANDER 1121,611,Aero Commander 1121,0.999995,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",651,True
347,COL make VAL AERO COMMANDER COL model VAL 200 ...,COL description VAL Aero Commander 200\n,1,0.999994,AERO COMMANDER 200,7,Aero Commander 200,0.999994,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",872,True
687,COL make VAL AERO COMMANDER COL model VAL 200 ...,COL description VAL British Aerospace BAe-146-...,1,0.999987,AERO COMMANDER 200,867,British Aerospace BAe-146-200,0.999987,"[AERO COMMANDER 1121, AERO COMMANDER 200, AERO...",747,True
1405,COL make VAL AERO COMMANDER COL model VAL 680 ...,COL description VAL Grand Commander 680FL\n,1,0.999995,AERO COMMANDER 680,104,Grand Commander 680FL,0.999995,"[AERO COMMANDER 200, AERO COMMANDER 500, AERO ...",233,True
...,...,...,...,...,...,...,...,...,...,...,...
170265,COL make VAL STINSON COL model VAL SR9 COL nam...,COL description VAL Stinson SR-9\n,1,0.999995,STINSON SR9,85,Stinson SR-9,0.999995,"[AERMACCHI AL60, AERO COMMANDER 100, AERO COMM...",945,True
170533,COL make VAL STINSON COL model VAL V77 COL cer...,COL description VAL Stinson V-77\n,1,0.999994,STINSON V77,87,Stinson V-77,0.999994,"[AERMACCHI AL60, AERO COMMANDER 100, AERO COMM...",913,True
170924,COL make VAL SWEARINGEN COL model VAL SA226 CO...,COL description VAL Swearingen Metro Merlin\n,1,0.999988,SWEARINGEN SA226,462,Swearingen Metro Merlin,0.999988,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",730,True
176328,COL make VAL TECNAM COL model VAL P2012 COL na...,COL description VAL P2012 Traveler\n,1,0.999993,TECNAM P2012,102,P2012 Traveler,0.999993,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",730,True


In [72]:
# Display df_join_max (the rows of df_join where is_max is True).
df_join_max

Unnamed: 0,left,right,match,match_confidence,left_id,right_id,description,match_confidence_max,left_ids,left_ids_count,is_max
0,COL make VAL AERMACCHI COL model VAL AL60 COL ...,COL description VAL Aero Macchi AL-60\n,1,0.999991,AERMACCHI AL60,8,Aero Macchi AL-60,0.999991,"[AERMACCHI AL60, AERO COMMANDER 100, AERO COMM...",820,True
284,COL make VAL AERO COMMANDER COL model VAL 1121...,COL description VAL Aero Commander 1121\n,1,0.999995,AERO COMMANDER 1121,611,Aero Commander 1121,0.999995,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",651,True
347,COL make VAL AERO COMMANDER COL model VAL 200 ...,COL description VAL Aero Commander 200\n,1,0.999994,AERO COMMANDER 200,7,Aero Commander 200,0.999994,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",872,True
687,COL make VAL AERO COMMANDER COL model VAL 200 ...,COL description VAL British Aerospace BAe-146-...,1,0.999987,AERO COMMANDER 200,867,British Aerospace BAe-146-200,0.999987,"[AERO COMMANDER 1121, AERO COMMANDER 200, AERO...",747,True
1405,COL make VAL AERO COMMANDER COL model VAL 680 ...,COL description VAL Grand Commander 680FL\n,1,0.999995,AERO COMMANDER 680,104,Grand Commander 680FL,0.999995,"[AERO COMMANDER 200, AERO COMMANDER 500, AERO ...",233,True
...,...,...,...,...,...,...,...,...,...,...,...
170265,COL make VAL STINSON COL model VAL SR9 COL nam...,COL description VAL Stinson SR-9\n,1,0.999995,STINSON SR9,85,Stinson SR-9,0.999995,"[AERMACCHI AL60, AERO COMMANDER 100, AERO COMM...",945,True
170533,COL make VAL STINSON COL model VAL V77 COL cer...,COL description VAL Stinson V-77\n,1,0.999994,STINSON V77,87,Stinson V-77,0.999994,"[AERMACCHI AL60, AERO COMMANDER 100, AERO COMM...",913,True
170924,COL make VAL SWEARINGEN COL model VAL SA226 CO...,COL description VAL Swearingen Metro Merlin\n,1,0.999988,SWEARINGEN SA226,462,Swearingen Metro Merlin,0.999988,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",730,True
176328,COL make VAL TECNAM COL model VAL P2012 COL na...,COL description VAL P2012 Traveler\n,1,0.999993,TECNAM P2012,102,P2012 Traveler,0.999993,"[AERO COMMANDER 100, AERO COMMANDER 1121, AERO...",730,True


In [73]:


# Persist only the highest-confidence matches as CSV, without the index.
max_matches_path = "../aircraft_er_predictions/" + run_name + "_max_matches.csv"
df_join_max.to_csv(max_matches_path, index=False)