# Evaluate ClassSim and PD of fgvc.

In [2]:
import os
import json
import glob
import scipy
import numpy as np

import pandas as pd
import glob

import warnings
warnings.filterwarnings('ignore')

In [3]:
VALID_SIMILARITY_DICT_PATH="results/valid_sim_df_fgvc.dat"
PD_SIMILARITY_PATH="results/GMMDisttances_fgvc.dat"

In [5]:
simdf = pd.read_pickle(VALID_SIMILARITY_DICT_PATH)
pddf=pd.read_pickle(PD_SIMILARITY_PATH)

### Build maker-family-variant hierachy

In [7]:
def category_file_to_id_dict(fpath):
    with open(fpath) as f:
        return {cat.rstrip("\n"): id for id, cat in enumerate(f)}

In [8]:
variants_to_id_dict = category_file_to_id_dict("data_fgvc/fgvc-aircraft-2013b/data/variants.txt")
families_to_id_dict = category_file_to_id_dict("data_fgvc/fgvc-aircraft-2013b/data/families.txt")
manufacturers_to_id_dict = category_file_to_id_dict("data_fgvc/fgvc-aircraft-2013b/data/manufacturers.txt")

In [13]:
def file_type_to_var_list(fpath):
    with open(fpath) as f:
         return [line.rstrip("\n").split(" ", 1) for line in f]

In [14]:
file_var_list = file_type_to_var_list("data_fgvc/fgvc-aircraft-2013b/data/images_variant_val.txt")
file_fam_list = file_type_to_var_list("data_fgvc/fgvc-aircraft-2013b/data/images_family_val.txt")
file_man_list = file_type_to_var_list("data_fgvc/fgvc-aircraft-2013b/data/images_manufacturer_val.txt")

In [24]:
variant_to_family_dict = {}
family_to_manufacturer_dict = {}

In [25]:
for idx, _ in enumerate(file_var_list):
    vari, fam, man = file_var_list[idx][1], file_fam_list[idx][1], file_man_list[idx][1]
    variant_to_family_dict[vari] = fam
    family_to_manufacturer_dict[fam] = man

Check dict integrity

In [26]:
variants_to_id_dict.keys() == variant_to_family_dict.keys()

True

In [28]:
families_to_id_dict.keys() == family_to_manufacturer_dict.keys()

True

In [31]:
manufacturers_to_id_dict.keys() == set(family_to_manufacturer_dict.values())

True

In [32]:
file_var_list[0], file_fam_list[0], file_man_list[0]

(['0481847', '707-320'], ['0481847', 'Boeing 707'], ['0481847', 'Boeing'])

In [33]:
variant_to_family_dict['707-320'], family_to_manufacturer_dict['Boeing 707']

('Boeing 707', 'Boeing')

# Is manufacturers similar each other?

first check Boeing by hand

In [35]:
boeing_families = [key for key, val in family_to_manufacturer_dict.items() if val == "Boeing"]

In [37]:
boeing_variants = [key for key, val in variant_to_family_dict.items() if val in boeing_families]

In [39]:
boeing_variants[0:5]

['737-200', '757-200', '747-300', '727-200', '747-400']

In [46]:
boeing_variants_ids = sorted([str(variants_to_id_dict[vari]) for vari in boeing_variants])

In [53]:
pddf['0'].iloc[list(set(boeing_variants_ids)-set("0"))]

46    2605.404566
21    2620.881085
23    2723.611698
16    2650.326974
22    2618.020668
11    2711.677687
12    2688.777579
19    2722.166783
18    2589.787980
2     3208.201787
1     3046.101723
20    2669.536654
13    2743.839675
14    2590.393673
15    2594.210519
17    2577.656687
25    2679.386613
24    2640.486420
26    2637.558114
27    2604.871014
10    2766.734586
Name: 0, dtype: float64

In [55]:
# average PD between '0' and other boeing manufacturers
pddf['0'].iloc[list(set(boeing_variants_ids)-set("0"))].mean()

2699.5063088816278

In [65]:
# average PD between '0' and all non boeing variants
pddf['0'].iloc[list(set(pddf.columns)-set(boeing_variants_ids))].mean()

2800.2232754949555

In [63]:
# average similarity between '0' and other boeing manufacturers
simdf['0'].iloc[list(set(boeing_variants_ids)-set("0"))].mean()

0.31632653061224486

In [66]:
# average similarity between '0' and non boeing variants
simdf['0'].iloc[list(set(simdf.columns)-set(boeing_variants_ids))].mean()

0.17307692307692304

Compare average among each manufacturer's similarity and whole average.

In [104]:
def manufacturer_to_families(manufacturer):
    return [key for key, val in family_to_manufacturer_dict.items() if val == manufacturer]

def families_to_variants(families):
    return [key for key, val in variant_to_family_dict.items() if val in families]

# sort here
def variants_to_ids(variants):
    return sorted([str(variants_to_id_dict[vari]) for vari in variants])

def manufacturer_to_variants_ids(manufacturer):
    return variants_to_ids(families_to_variants(manufacturer_to_families(manufacturer)))

In [82]:
def sum_distance_from(df, fromone, to_list):
    ser = df[fromone].iloc[to_list]
    return ser.sum(), ser.count()

In [89]:
def sum_distance(df, keys):
    rest = set(keys)
    sum_, count_ = 0, 0
    for key in keys:
        rest = rest-set(key)
        onesum, onecount = sum_distance_from(df, key, list(rest))
        sum_ += onesum
        count_ += onecount
    return sum_, count_

def average_distance(df, keys):
    sum_, count_ = sum_distance(df, keys)
    return sum_/count_

In [101]:
def whole_average(df):
    rest = set(df.columns)
    sum_ = 0
    count_ = 0
    for key in df.columns:
        rest = rest - set(key)
        ser = df[key].iloc[list(rest)]
        sum_ += ser.sum()
        count_ += ser.count()
    return sum_/count_

In [94]:
average_distance(simdf, boeing_variants_ids)

0.4652747252747253

In [95]:
average_distance(pddf, boeing_variants_ids)

2112.1622851258453

In [102]:
whole_average(simdf)

0.21341069260724918

In [103]:
whole_average(pddf)

2441.335788166723

In [142]:
one_variant_manufacturers = [man for man in manufacturers_to_id_dict.keys() if len(manufacturer_to_variants_ids(man)) == 1]

In [143]:
one_variant_manufacturers

['Dornier',
 'Robin',
 'Lockheed Martin',
 'Eurofighter',
 'Fairchild',
 'Ilyushin',
 'Yakovlev',
 'Piper',
 'Antonov',
 'Bombardier Aerospace',
 'Panavia',
 'Cirrus Aircraft',
 'Supermarine']

In [147]:
def avg_manufacturer_df(df):
    avg_pair = [(man, average_distance(df, manufacturer_to_variants_ids(man))) for man in manufacturers_to_id_dict.keys() if man not in one_variant_manufacturers]
    return pd.DataFrame({"manufacturers":[man for man,_ in avg_pair], "avg": [avg for _,avg in avg_pair]})[["manufacturers","avg"]]

In [149]:
avgsim = avg_manufacturer_df(simdf)

In [162]:
avgpd = avg_manufacturer_df(pddf)

In [153]:
avgsim["whole"] = whole_average(simdf)
avgsim["larger?"] = avgsim["avg"]>avgsim["whole"]

In [158]:
avgsim

Unnamed: 0,manufacturers,avg,whole,larger?
0,ATR,0.089286,0.213411,False
1,Lockheed Corporation,0.178571,0.213411,False
2,Gulfstream Aerospace,0.446429,0.213411,True
3,McDonnell Douglas,0.279883,0.213411,True
4,Cessna,0.191964,0.213411,False
5,Douglas Aircraft Company,0.196429,0.213411,False
6,Tupolev,0.321429,0.213411,True
7,Beechcraft,0.392857,0.213411,True
8,de Havilland,0.308571,0.213411,True
9,Airbus,0.32164,0.213411,True


In [169]:
avgsim["larger?"].sum()

11

In [163]:
avgpd["whole"] = whole_average(pddf)
avgpd["closer?"] = avgpd["avg"]<avgpd["whole"]

In [164]:
avgpd

Unnamed: 0,manufacturers,avg,whole,closer?
0,ATR,2471.333214,2441.335788,False
1,Lockheed Corporation,3267.940094,2441.335788,False
2,Gulfstream Aerospace,1705.62439,2441.335788,True
3,McDonnell Douglas,2327.731791,2441.335788,True
4,Cessna,3006.44108,2441.335788,False
5,Douglas Aircraft Company,3418.337399,2441.335788,False
6,Tupolev,1056.567427,2441.335788,True
7,Beechcraft,2148.496446,2441.335788,True
8,de Havilland,2863.756084,2441.335788,False
9,Airbus,1853.979635,2441.335788,True


In [168]:
avgpd["closer?"].sum()

11

In [167]:
avgsim["larger?"] == avgpd["closer?"]

0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8     False
9      True
10     True
11     True
12     True
13     True
14    False
15     True
16     True
dtype: bool