# Evaluate ClassSim and PD of fgvc.

In [1]:
import os
import json
import glob
import scipy
import numpy as np

import pandas as pd
import glob

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Result files that the next cell loads with pd.read_pickle.
# Similarity table (used below as `simdf`, the ClassSim side of the comparison).
VALID_SIMILARITY_DICT_PATH="results/valid_sim_df_fgvc.dat"
# Distance table (used below as `pddf`, the PD side of the comparison).
# NOTE(review): "Disttances" looks like a typo; presumably it matches the file
# name on disk - confirm before renaming anything.
PD_SIMILARITY_PATH="results/GMMDisttances_fgvc.dat"

In [3]:
# NOTE: unpickling can execute arbitrary code, so only load result files that
# were produced by this project.
simdf = pd.read_pickle(VALID_SIMILARITY_DICT_PATH)
pddf=pd.read_pickle(PD_SIMILARITY_PATH)

### Build manufacturer-family-variant hierarchy

In [4]:
def category_file_to_id_dict(fpath):
    """Map every category name listed in ``fpath`` to its 0-based line number.

    The file is read line by line and only the trailing newline is stripped
    from each name. If a name appears on several lines, the last line number
    is the one kept.
    """
    name_to_index = {}
    with open(fpath) as handle:
        for line_no, raw_line in enumerate(handle):
            name_to_index[raw_line.rstrip("\n")] = line_no
    return name_to_index

In [5]:
# Category name -> 0-based line number in the corresponding list file,
# one dict per level of the hierarchy (variant, family, manufacturer).
variants_to_id_dict = category_file_to_id_dict("data_fgvc/fgvc-aircraft-2013b/data/variants.txt")
families_to_id_dict = category_file_to_id_dict("data_fgvc/fgvc-aircraft-2013b/data/families.txt")
manufacturers_to_id_dict = category_file_to_id_dict("data_fgvc/fgvc-aircraft-2013b/data/manufacturers.txt")

In [6]:
def file_type_to_var_list(fpath):
    """Read ``fpath`` and split every line once at its first space.

    Returns a list with one entry per line. Each entry is
    ``[text_before_first_space, rest_of_line]``, or a one-element list when
    the line contains no space. Only the trailing newline is stripped.
    """
    rows = []
    with open(fpath) as handle:
        for raw_line in handle:
            rows.append(raw_line.rstrip("\n").split(" ", 1))
    return rows

In [7]:
# One [first token, rest of line] pair per line of each validation list,
# e.g. ['0481847', '707-320']; the first token is presumably the image id.
# The three lists are later paired row by row, so they must be in the same order.
file_var_list = file_type_to_var_list("data_fgvc/fgvc-aircraft-2013b/data/images_variant_val.txt")
file_fam_list = file_type_to_var_list("data_fgvc/fgvc-aircraft-2013b/data/images_family_val.txt")
file_man_list = file_type_to_var_list("data_fgvc/fgvc-aircraft-2013b/data/images_manufacturer_val.txt")

In [8]:
# Hierarchy lookups, filled from the validation lists in the next cell:
# variant name -> family name, and family name -> manufacturer name.
variant_to_family_dict = {}
family_to_manufacturer_dict = {}

In [9]:
# Rows of the three validation lists are paired by position, so make sure they
# really describe the same images before deriving the hierarchy from them.
if not (len(file_var_list) == len(file_fam_list) == len(file_man_list)):
    raise ValueError("variant/family/manufacturer annotation lists differ in length")

for var_row, fam_row, man_row in zip(file_var_list, file_fam_list, file_man_list):
    # The first token of every row identifies the image; a mismatch means the
    # files are not aligned and the derived hierarchy would be wrong.
    if not (var_row[0] == fam_row[0] == man_row[0]):
        raise ValueError(
            "annotation lists are not aligned: {} / {} / {}".format(
                var_row[0], fam_row[0], man_row[0]
            )
        )
    vari, fam, man = var_row[1], fam_row[1], man_row[1]
    variant_to_family_dict[vari] = fam
    family_to_manufacturer_dict[fam] = man

Check dict integrity

In [10]:
variants_to_id_dict.keys() == variant_to_family_dict.keys()

True

In [11]:
families_to_id_dict.keys() == family_to_manufacturer_dict.keys()

True

In [12]:
manufacturers_to_id_dict.keys() == set(family_to_manufacturer_dict.values())

True

In [13]:
file_var_list[0], file_fam_list[0], file_man_list[0]

(['0481847', '707-320'], ['0481847', 'Boeing 707'], ['0481847', 'Boeing'])

In [14]:
variant_to_family_dict['707-320'], family_to_manufacturer_dict['Boeing 707']

('Boeing 707', 'Boeing')

# Are variants of the same manufacturer similar to each other?

First, check Boeing by hand.

In [15]:
boeing_families = [key for key, val in family_to_manufacturer_dict.items() if val == "Boeing"]

In [16]:
boeing_variants = [key for key, val in variant_to_family_dict.items() if val in boeing_families]

In [17]:
boeing_variants[0:5]

['737-700', '737-200', '737-600', '757-300', '767-400']

In [18]:
boeing_variants_ids = sorted([str(variants_to_id_dict[vari]) for vari in boeing_variants])

In [19]:
pddf['0'].loc[list(set(boeing_variants_ids)-{"0"})]

16    2650.326974
2     3208.201787
17    2577.656687
7     2507.363585
41    2654.367182
4     2572.018896
6     2653.518319
18    2589.787980
1     3046.101723
14    2590.393673
3     2492.388843
20    2669.536654
8     2667.412616
19    2722.166783
12    2688.777579
15    2594.210519
11    2711.677687
13    2743.839675
10    2766.734586
5     2596.947708
9     2776.091813
Name: 0, dtype: float64

In [20]:
# average PD between '0' and the other Boeing variants
pddf['0'].loc[list(set(boeing_variants_ids)-{"0"})].mean()

2689.5010128967497

In [21]:
# average PD between '0' and all non boeing variants
pddf['0'].loc[list(set(pddf.columns)-set(boeing_variants_ids))].mean()

2802.9170090293464

In [22]:
# average similarity between '0' and the other Boeing variants
simdf['0'].loc[list(set(boeing_variants_ids)-{"0"})].mean()

0.28911564625850333

In [23]:
# average similarity between '0' and non boeing variants
simdf['0'].loc[list(set(simdf.columns)-set(boeing_variants_ids))].mean()

0.18040293040293034

Compare the average pairwise value (similarity or PD) among each manufacturer's variants with the average over all variant pairs.

In [24]:
def manufacturer_to_families(manufacturer):
    """Return the families mapped to ``manufacturer`` in ``family_to_manufacturer_dict``."""
    families = []
    for family, maker in family_to_manufacturer_dict.items():
        if maker == manufacturer:
            families.append(family)
    return families

def one_family_to_variants(family):
    """Return the variants mapped to ``family`` in ``variant_to_family_dict``."""
    variants = []
    for variant, owner in variant_to_family_dict.items():
        if owner == family:
            variants.append(variant)
    return variants

def families_to_variants(families):
    """Return the variants whose family (per ``variant_to_family_dict``) is in ``families``."""
    variants = []
    for variant, owner in variant_to_family_dict.items():
        if owner in families:
            variants.append(variant)
    return variants

# sort here
def variants_to_ids(variants):
    """Translate variant names into their ids as strings, in sorted order.

    Ids are looked up in ``variants_to_id_dict`` and converted with ``str``
    before sorting, so the order is lexicographic ('10' sorts before '2').
    """
    id_strings = [str(variants_to_id_dict[name]) for name in variants]
    id_strings.sort()
    return id_strings

def manufacturer_to_variants_ids(manufacturer):
    """Return the sorted string ids of every variant built by ``manufacturer``."""
    families = manufacturer_to_families(manufacturer)
    variants = families_to_variants(families)
    return variants_to_ids(variants)

def family_to_variants_ids(fam):
    """Return the sorted string ids of every variant belonging to family ``fam``."""
    variants = one_family_to_variants(fam)
    return variants_to_ids(variants)



In [25]:
def sum_distance_from(df, fromone, to_list):
    """Sum and count the values of column ``fromone`` at the row labels ``to_list``.

    Returns a ``(sum, count)`` tuple computed with ``Series.sum`` and
    ``Series.count`` on the selected entries.
    """
    column = df[fromone]
    picked = column.loc[to_list]
    total = picked.sum()
    n_values = picked.count()
    return total, n_values

In [26]:
def sum_distance(df, keys):
    """Sum and count ``df`` values over every unordered pair of ``keys``.

    Each pair is visited once: for a key, the values of its column are taken
    at the row labels of all keys that come after it. Returns ``(sum, count)``;
    with fewer than two distinct keys this is ``(0, 0)``.

    The later keys are passed as an ordered list rather than a set, so the
    floating-point summation order (and therefore the exact result) does not
    depend on string hash randomization between kernel restarts. Repeated keys
    are ignored after their first occurrence so no pair is counted twice.
    """
    # De-duplicate while keeping the caller's order.
    ordered, seen = [], set()
    for key in keys:
        if key not in seen:
            seen.add(key)
            ordered.append(key)

    sum_, count_ = 0, 0
    for pos, key in enumerate(ordered[:-1]):
        onesum, onecount = sum_distance_from(df, key, ordered[pos + 1:])
        sum_ += onesum
        count_ += onecount
    return sum_, count_

def average_distance(df, keys):
    """Return the mean ``df`` value over all pairs of ``keys``.

    The sum and count come from ``sum_distance``. With fewer than two keys
    that function returns ``(0, 0)`` and the division raises
    ``ZeroDivisionError``, so callers must filter out single-member groups.
    """
    total, n_pairs = sum_distance(df, keys)
    return total / n_pairs

In [42]:
def whole_average(df):
    """Return the mean of ``df`` over every unordered pair of its column labels.

    Each pair is visited once: for a column, its values are taken at the row
    labels of all columns that come after it. The row labels are therefore
    expected to include the column labels. Missing values are skipped by
    ``Series.sum`` / ``Series.count``.

    Pairs are walked in column order instead of through a set, so the
    floating-point summation order (and the exact result) is the same on every
    kernel restart regardless of string hash randomization.

    Raises ``ZeroDivisionError`` when ``df`` has fewer than two columns.
    """
    labels = list(df.columns)
    sum_ = 0
    count_ = 0
    for pos, key in enumerate(labels[:-1]):
        ser = df[key].loc[labels[pos + 1:]]
        sum_ += ser.sum()
        count_ += ser.count()
    return sum_ / count_

In [28]:
average_distance(simdf, boeing_variants_ids)

0.42145949288806422

In [29]:
average_distance(pddf, boeing_variants_ids)

2185.9100791014562

In [43]:
whole_average(simdf)

0.20961038961038964

In [44]:
whole_average(pddf)

2461.1847923272503

In [45]:
one_variant_manufacturers = [man for man in manufacturers_to_id_dict.keys() if len(manufacturer_to_variants_ids(man)) == 1]

In [46]:
one_variant_manufacturers

['Robin',
 'Eurofighter',
 'Fairchild',
 'Cirrus Aircraft',
 'Bombardier Aerospace',
 'Yakovlev',
 'Piper',
 'Panavia',
 'Supermarine',
 'Antonov',
 'Lockheed Martin',
 'Ilyushin',
 'Dornier']

In [47]:
def avg_manufacturer_df(df):
    """Build a table of the average pairwise ``df`` value per manufacturer.

    For every manufacturer in ``manufacturers_to_id_dict`` the ids of its
    variants are collected and ``average_distance`` is computed over them.
    Manufacturers with fewer than two variants are skipped because they have
    no pair to average. The check is made on the ids themselves, so the
    function does not depend on a list prepared in another cell.

    Returns a DataFrame with the columns ``manufacturers`` and ``avg``.
    """
    names, averages = [], []
    for man in manufacturers_to_id_dict.keys():
        variant_ids = manufacturer_to_variants_ids(man)
        if len(variant_ids) < 2:
            continue
        names.append(man)
        averages.append(average_distance(df, variant_ids))
    return pd.DataFrame({"manufacturers": names, "avg": averages})[["manufacturers", "avg"]]

In [48]:
avgsim = avg_manufacturer_df(simdf)

In [49]:
avgsim["whole"] = whole_average(simdf)
avgsim["larger?"] = avgsim["avg"]>avgsim["whole"]

In [50]:
avgsim

Unnamed: 0,manufacturers,avg,whole,larger?
0,Saab,0.285714,0.20961,True
1,Lockheed Corporation,0.214286,0.20961,True
2,Airbus,0.498168,0.20961,True
3,Gulfstream Aerospace,0.857143,0.20961,True
4,Fokker,0.261905,0.20961,True
5,Boeing,0.421459,0.20961,True
6,McDonnell Douglas,0.29932,0.20961,True
7,Canadair,0.642857,0.20961,True
8,Cessna,0.309524,0.20961,True
9,ATR,0.5,0.20961,True


In [51]:
avgsim["larger?"].sum()

16

In [52]:
avgpd = avg_manufacturer_df(pddf)

In [53]:
avgpd["whole"] = whole_average(pddf)
avgpd["closer?"] = avgpd["avg"]<avgpd["whole"]

In [54]:
avgpd

Unnamed: 0,manufacturers,avg,whole,closer?
0,Saab,2191.509674,2461.184792,True
1,Lockheed Corporation,3230.771347,2461.184792,False
2,Airbus,1942.894811,2461.184792,True
3,Gulfstream Aerospace,1927.477633,2461.184792,True
4,Fokker,2210.544017,2461.184792,True
5,Boeing,2185.910079,2461.184792,True
6,McDonnell Douglas,2677.48691,2461.184792,False
7,Canadair,1968.508887,2461.184792,True
8,Cessna,2938.100768,2461.184792,False
9,ATR,1882.739083,2461.184792,True


In [55]:
avgpd["closer?"].sum()

11

In [56]:
avgsim["larger?"] == avgpd["closer?"]

0      True
1     False
2      True
3      True
4      True
5      True
6     False
7      True
8     False
9      True
10    False
11    False
12     True
13     True
14    False
15     True
16    False
dtype: bool

Conclusion 1:

Both measures largely capture the similarity of variants from the same manufacturer.

- By PD, for 11 of 17 manufacturers the within-manufacturer average distance is smaller than the overall mean.
- By ClassSim, for 16 of 17 manufacturers the within-manufacturer average similarity is larger than the overall mean.

### What about the average within families?

In [57]:
one_variant_families = [fam for fam in families_to_id_dict.keys() if len(one_family_to_variants(fam)) == 1]

In [58]:
len(one_variant_families)

54

In [59]:
len(families_to_id_dict.keys())

70

In [60]:
[fam for fam in families_to_id_dict if fam not in one_variant_families]

['A340',
 'A320',
 'Embraer E-Jet',
 'Cessna Citation',
 'Boeing 757',
 'BAE 146',
 'Dash 8',
 'MD-80',
 'CRJ-700',
 'Boeing 777',
 'Boeing 767',
 'Embraer ERJ 145',
 'Boeing 747',
 'Gulfstream',
 'Boeing 737',
 'A330']

In [65]:
family_to_variants_ids("A330")

['27', '28']

In [66]:
def avg_family_df(df):
    """Build a table of the average pairwise ``df`` value per family.

    For every family in ``families_to_id_dict`` the ids of its variants are
    collected and ``average_distance`` is computed over them. Families with
    fewer than two variants are skipped because they have no pair to average.
    The check is made on the ids themselves, so the function does not depend
    on a list prepared in another cell.

    Returns a DataFrame with the columns ``family`` and ``avg``.
    """
    names, averages = [], []
    for fam in families_to_id_dict.keys():
        variant_ids = family_to_variants_ids(fam)
        if len(variant_ids) < 2:
            continue
        names.append(fam)
        averages.append(average_distance(df, variant_ids))
    return pd.DataFrame({"family": names, "avg": averages})[["family", "avg"]]

In [67]:
avg_fam_sim = avg_family_df(simdf)

In [68]:
avg_fam_sim["whole"] = whole_average(simdf)
avg_fam_sim["larger?"] = avg_fam_sim["avg"]>avg_fam_sim["whole"]

In [69]:
avg_fam_sim

Unnamed: 0,family,avg,whole,larger?
0,A340,0.678571,0.20961,True
1,A320,0.821429,0.20961,True
2,Embraer E-Jet,0.761905,0.20961,True
3,Cessna Citation,0.571429,0.20961,True
4,Boeing 757,0.785714,0.20961,True
5,BAE 146,0.785714,0.20961,True
6,Dash 8,0.214286,0.20961,True
7,MD-80,0.571429,0.20961,True
8,CRJ-700,0.928571,0.20961,True
9,Boeing 777,0.642857,0.20961,True


In [70]:
avg_fam_pd = avg_family_df(pddf)

In [71]:
avg_fam_pd["whole"] = whole_average(pddf)
avg_fam_pd["closer?"] = avg_fam_pd["avg"]<avg_fam_pd["whole"]

In [72]:
avg_fam_pd

Unnamed: 0,family,avg,whole,closer?
0,A340,1642.969204,2461.184792,True
1,A320,2161.060661,2461.184792,True
2,Embraer E-Jet,1953.027788,2461.184792,True
3,Cessna Citation,2379.768713,2461.184792,True
4,Boeing 757,2107.960979,2461.184792,True
5,BAE 146,1911.716129,2461.184792,True
6,Dash 8,2829.063253,2461.184792,False
7,MD-80,2027.820062,2461.184792,True
8,CRJ-700,1915.666108,2461.184792,True
9,Boeing 777,1753.764321,2461.184792,True


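Conclusion 2: Variants of the same family are very similar.
With PD, 15 of 16 families have a within-family average distance smaller than the overall average; with ClassSim, all 16 of 16 have a within-family average similarity larger than the overall average.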