# Evaluate ClassSim and PD of fgvc.

In [2]:
import os
import json
import glob
import scipy
import numpy as np

import pandas as pd
import glob

import warnings
warnings.filterwarnings('ignore')

In [3]:
VALID_SIMILARITY_DICT_PATH="results/valid_sim_df_fgvc.dat"
PD_SIMILARITY_PATH="results/GMMDisttances_fgvc.dat"

In [5]:
simdf = pd.read_pickle(VALID_SIMILARITY_DICT_PATH)
pddf=pd.read_pickle(PD_SIMILARITY_PATH)

### Build maker-family-variant hierachy

In [7]:
def category_file_to_id_dict(fpath):
    with open(fpath) as f:
        return {cat.rstrip("\n"): id for id, cat in enumerate(f)}

In [8]:
variants_to_id_dict = category_file_to_id_dict("data_fgvc/fgvc-aircraft-2013b/data/variants.txt")
families_to_id_dict = category_file_to_id_dict("data_fgvc/fgvc-aircraft-2013b/data/families.txt")
manufacturers_to_id_dict = category_file_to_id_dict("data_fgvc/fgvc-aircraft-2013b/data/manufacturers.txt")

In [13]:
def file_type_to_var_list(fpath):
    with open(fpath) as f:
         return [line.rstrip("\n").split(" ", 1) for line in f]

In [14]:
file_var_list = file_type_to_var_list("data_fgvc/fgvc-aircraft-2013b/data/images_variant_val.txt")
file_fam_list = file_type_to_var_list("data_fgvc/fgvc-aircraft-2013b/data/images_family_val.txt")
file_man_list = file_type_to_var_list("data_fgvc/fgvc-aircraft-2013b/data/images_manufacturer_val.txt")

In [24]:
variant_to_family_dict = {}
family_to_manufacturer_dict = {}

In [25]:
for idx, _ in enumerate(file_var_list):
    vari, fam, man = file_var_list[idx][1], file_fam_list[idx][1], file_man_list[idx][1]
    variant_to_family_dict[vari] = fam
    family_to_manufacturer_dict[fam] = man

Check dict integrity

In [26]:
variants_to_id_dict.keys() == variant_to_family_dict.keys()

True

In [28]:
families_to_id_dict.keys() == family_to_manufacturer_dict.keys()

True

In [31]:
manufacturers_to_id_dict.keys() == set(family_to_manufacturer_dict.values())

True

In [32]:
file_var_list[0], file_fam_list[0], file_man_list[0]

(['0481847', '707-320'], ['0481847', 'Boeing 707'], ['0481847', 'Boeing'])

In [33]:
variant_to_family_dict['707-320'], family_to_manufacturer_dict['Boeing 707']

('Boeing 707', 'Boeing')

# Is manufacturers similar each other?

first check Boeing by hand

In [240]:
boeing_families = [key for key, val in family_to_manufacturer_dict.items() if val == "Boeing"]

In [241]:
boeing_variants = [key for key, val in variant_to_family_dict.items() if val in boeing_families]

In [242]:
boeing_variants[0:5]

['737-200', '757-200', '747-300', '727-200', '747-400']

In [243]:
boeing_variants_ids = sorted([str(variants_to_id_dict[vari]) for vari in boeing_variants])

In [244]:
pddf['0'].loc[list(set(boeing_variants_ids)-set("0"))]

41    2654.367182
14    2590.393673
16    2650.326974
8     2667.412616
15    2594.210519
3     2492.388843
4     2572.018896
11    2711.677687
10    2766.734586
12    2688.777579
1     3046.101723
13    2743.839675
5     2596.947708
6     2653.518319
7     2507.363585
9     2776.091813
18    2589.787980
17    2577.656687
19    2722.166783
20    2669.536654
2     3208.201787
Name: 0, dtype: float64

In [245]:
# average PD between '0' and other boeing manufacturers
pddf['0'].loc[list(set(boeing_variants_ids)-set("0"))].mean()

2689.5010128967497

In [246]:
# average PD between '0' and all non boeing variants
pddf['0'].loc[list(set(pddf.columns)-set(boeing_variants_ids))].mean()

2802.9170090293455

In [247]:
# average similarity between '0' and other boeing manufacturers
simdf['0'].loc[list(set(boeing_variants_ids)-set("0"))].mean()

0.28911564625850333

In [248]:
# average similarity between '0' and non boeing variants
simdf['0'].loc[list(set(simdf.columns)-set(boeing_variants_ids))].mean()

0.18040293040293034

Compare average among each manufacturer's similarity and whole average.

In [249]:
def manufacturer_to_families(manufacturer):
    return [key for key, val in family_to_manufacturer_dict.items() if val == manufacturer]

def one_family_to_variants(family):
    return [key for key, val in variant_to_family_dict.items() if val == family]

def families_to_variants(families):
    return [key for key, val in variant_to_family_dict.items() if val in families]

# sort here
def variants_to_ids(variants):
    return sorted([str(variants_to_id_dict[vari]) for vari in variants])

def manufacturer_to_variants_ids(manufacturer):
    return variants_to_ids(families_to_variants(manufacturer_to_families(manufacturer)))

def family_to_variants_ids(fam):
    return variants_to_ids(one_family_to_variants(fam))



In [250]:
def sum_distance_from(df, fromone, to_list):
    ser = df[fromone].loc[to_list]
    return ser.sum(), ser.count()

In [251]:
def sum_distance(df, keys):
    rest = set(keys)
    sum_, count_ = 0, 0
    for key in keys:
        rest = rest-set(key)
        if len(rest) > 0:
            onesum, onecount = sum_distance_from(df, key, list(rest))
            sum_ += onesum
            count_ += onecount
    return sum_, count_

def average_distance(df, keys):
    sum_, count_ = sum_distance(df, keys)
    return sum_/count_

In [252]:
def whole_average(df):
    rest = set(df.columns)
    sum_ = 0
    count_ = 0
    for key in df.columns:
        rest = rest - set(key)
        ser = df[key].loc[list(rest)]
        sum_ += ser.sum()
        count_ += ser.count()
    return sum_/count_

In [253]:
average_distance(simdf, boeing_variants_ids)

0.39714285714285719

In [254]:
average_distance(pddf, boeing_variants_ids)

2047.9312082754971

In [255]:
whole_average(simdf)

0.21154240307755376

In [256]:
whole_average(pddf)

2437.7053103806484

In [257]:
one_variant_manufacturers = [man for man in manufacturers_to_id_dict.keys() if len(manufacturer_to_variants_ids(man)) == 1]

In [258]:
one_variant_manufacturers

['Dornier',
 'Robin',
 'Lockheed Martin',
 'Eurofighter',
 'Fairchild',
 'Ilyushin',
 'Yakovlev',
 'Piper',
 'Antonov',
 'Bombardier Aerospace',
 'Panavia',
 'Cirrus Aircraft',
 'Supermarine']

In [259]:
def avg_manufacturer_df(df):
    avg_pair = [(man, average_distance(df, manufacturer_to_variants_ids(man))) for man in manufacturers_to_id_dict.keys() if man not in one_variant_manufacturers]
    return pd.DataFrame({"manufacturers":[man for man,_ in avg_pair], "avg": [avg for _,avg in avg_pair]})[["manufacturers","avg"]]

In [260]:
avgsim = avg_manufacturer_df(simdf)

In [262]:
avgsim["whole"] = whole_average(simdf)
avgsim["larger?"] = avgsim["avg"]>avgsim["whole"]

In [263]:
avgsim

Unnamed: 0,manufacturers,avg,whole,larger?
0,ATR,0.607143,0.211542,True
1,Lockheed Corporation,0.535714,0.211542,True
2,Gulfstream Aerospace,0.857143,0.211542,True
3,McDonnell Douglas,0.355685,0.211542,True
4,Cessna,0.410714,0.211542,True
5,Douglas Aircraft Company,0.580357,0.211542,True
6,Tupolev,0.321429,0.211542,True
7,Beechcraft,0.5,0.211542,True
8,de Havilland,0.485714,0.211542,True
9,Airbus,0.522401,0.211542,True


In [264]:
avgsim["larger?"].sum()

17

In [261]:
avgpd = avg_manufacturer_df(pddf)

In [265]:
avgpd["whole"] = whole_average(pddf)
avgpd["closer?"] = avgpd["avg"]<avgpd["whole"]

In [266]:
avgpd

Unnamed: 0,manufacturers,avg,whole,closer?
0,ATR,941.369542,2437.70531,True
1,Lockheed Corporation,1615.385673,2437.70531,True
2,Gulfstream Aerospace,963.738817,2437.70531,True
3,McDonnell Douglas,2294.98878,2437.70531,True
4,Cessna,2203.575576,2437.70531,True
5,Douglas Aircraft Company,2451.572112,2437.70531,False
6,Tupolev,1056.567427,2437.70531,True
7,Beechcraft,1574.868853,2437.70531,True
8,de Havilland,2701.402261,2437.70531,False
9,Airbus,1793.441364,2437.70531,True


In [267]:
avgpd["closer?"].sum()

15

In [268]:
avgsim["larger?"] == avgpd["closer?"]

0      True
1      True
2      True
3      True
4      True
5     False
6      True
7      True
8     False
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
dtype: bool

Conclusion 1:

Both can basically explain similarity of variants among same manufacturer.

- By PD, 15/17 average of manufacturers is closer than whole mean.
- By ClassSim. 17/17 average of manufactuers is more similar than whole mean

### How about average among families?

In [269]:
one_variant_families = [fam for fam in families_to_id_dict.keys() if len(one_family_to_variants(fam)) == 1]

In [270]:
len(one_variant_families)

54

In [271]:
len(families_to_id_dict.keys())

70

In [272]:
[fam for fam in families_to_id_dict if fam not in one_variant_families]

['A330',
 'Boeing 767',
 'Cessna Citation',
 'A340',
 'Boeing 737',
 'Embraer E-Jet',
 'MD-80',
 'Dash 8',
 'Boeing 747',
 'Boeing 757',
 'Boeing 777',
 'BAE 146',
 'Embraer ERJ 145',
 'Gulfstream',
 'A320',
 'CRJ-700']

In [273]:
family_to_variants_ids("A330")

['27', '28']

In [274]:
def avg_family_df(df):
    avg_pair = [(fam, average_distance(df, family_to_variants_ids(fam))) for fam in families_to_id_dict.keys() if fam not in one_variant_families]
    return pd.DataFrame({"family":[fam for fam,_ in avg_pair], "avg": [avg for _,avg in avg_pair]})[["family","avg"]]

In [275]:
avg_fam_sim = avg_family_df(simdf)

In [276]:
avg_fam_sim["whole"] = whole_average(simdf)
avg_fam_sim["larger?"] = avg_fam_sim["avg"]>avg_fam_sim["whole"]

In [281]:
avg_fam_sim

Unnamed: 0,family,avg,whole,larger?
0,A330,0.785714,0.211542,True
1,Boeing 767,0.777778,0.211542,True
2,Cessna Citation,0.607143,0.211542,True
3,A340,0.723214,0.211542,True
4,Boeing 737,0.688776,0.211542,True
5,Embraer E-Jet,0.809524,0.211542,True
6,MD-80,0.714286,0.211542,True
7,Dash 8,0.464286,0.211542,True
8,Boeing 747,0.571429,0.211542,True
9,Boeing 757,0.678571,0.211542,True


In [282]:
avg_fam_pd = avg_family_df(pddf)

In [283]:
avg_fam_pd["whole"] = whole_average(pddf)
avg_fam_pd["closer?"] = avg_fam_pd["avg"]<avg_fam_pd["whole"]

In [284]:
avg_fam_pd

Unnamed: 0,family,avg,whole,closer?
0,A330,815.631849,2437.70531,True
1,Boeing 767,1289.046835,2437.70531,True
2,Cessna Citation,1189.884357,2437.70531,True
3,A340,1232.226903,2437.70531,True
4,Boeing 737,2201.989026,2437.70531,True
5,Embraer E-Jet,1302.018526,2437.70531,True
6,MD-80,1013.910031,2437.70531,True
7,Dash 8,1414.531627,2437.70531,True
8,Boeing 747,1586.966396,2437.70531,True
9,Boeing 757,1053.980489,2437.70531,True


Conclusion2: Same family is very similar.