## Analyse distribution in datasets

In [2]:
import json
import pandas as pd


def load_data_as_df(metadata_path: str) -> pd.DataFrame:
    """Load a JSON metadata file into a DataFrame, one row per top-level key.

    Args:
        metadata_path: Path to a JSON file. Its top-level value is passed to
            ``pd.DataFrame.from_dict(..., orient="index")``, so it is expected
            to be an object mapping each record key to a dict of fields.

    Returns:
        A DataFrame indexed by the top-level JSON keys.
    """
    # JSON is UTF-8 by specification; pin the encoding so decoding does not
    # depend on the platform's locale default.
    with open(metadata_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return pd.DataFrame.from_dict(data, orient="index")

def _counts_by_level(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Count rows per (level, ``column``) pair and give each count as a % of its level."""
    counts = df.groupby(["level", column]).size()
    # transform("sum") broadcasts every level's total back onto the same
    # MultiIndex, so the division below is index-aligned instead of relying on
    # the positional order of a separate groupby/apply result.
    level_totals = counts.groupby(level=0).transform("sum")
    return pd.DataFrame({
        "Count": counts,
        "Percentage": 100 * counts / level_totals,
    }).reset_index()


def analyse(metadata_path: str):
    """Print and display distribution tables for one dataset metadata file.

    Loads the metadata with ``load_data_as_df`` and shows four tables:

    1. row count and percentage per ``level``;
    2. ``train_object_name`` counts per level (percentage within the level);
    3. ``test_object_name`` counts per level (percentage within the level);
    4. ``transformation_domain`` counts per level (percentage within the level).

    The tables are rendered with ``display``, which this function expects to be
    available as it is in a Jupyter/IPython session. Nothing is returned.

    Args:
        metadata_path: Path to the JSON metadata file. The part of the file name
            before the first ``.`` is used as the printed title.

    Raises:
        KeyError: If one of the columns ``level``, ``train_object_name``,
            ``test_object_name`` or ``transformation_domain`` is missing.
    """
    df = load_data_as_df(metadata_path)

    print("=" * 50)
    print(f"Analyzing {metadata_path.split('/')[-1].split('.')[0]}")
    print("=" * 50)

    # Overall share of each level in the dataset.
    level_counts = df["level"].value_counts()
    level_table = pd.DataFrame({
        'Count': level_counts,
        'Percentage': level_counts / level_counts.sum() * 100,
    })

    # Per-level breakdowns; percentages are relative to the level's own total.
    train_object_counts_by_level = _counts_by_level(df, 'train_object_name')
    test_object_counts_by_level = _counts_by_level(df, 'test_object_name')
    transformation_domain_by_level = _counts_by_level(df, 'transformation_domain')

    display(level_table)
    display(train_object_counts_by_level)
    display(test_object_counts_by_level)
    display(transformation_domain_by_level)

# Run the same distribution report on every split. The dataset location lives
# in one place so it can be repointed without editing each call.
DATA_DIR = "/home/ubuntu/kiva-iccv/data"
for split_name in ("train", "validation"):
    analyse(f"{DATA_DIR}/{split_name}.json")


Analyzing train


Unnamed: 0_level_0,Count,Percentage
level,Unnamed: 1_level_1,Unnamed: 2_level_1
kiva-functions-compositionality,2208,80.232558
kiva-functions,368,13.372093
kiva,176,6.395349


Unnamed: 0,level,train_object_name,Count,Percentage
0,kiva,airplane,1,0.568182
1,kiva,barn,1,0.568182
2,kiva,basket,4,2.272727
3,kiva,bass,1,0.568182
4,kiva,birdie,2,1.136364
...,...,...,...,...
276,kiva-functions-compositionality,tea,15,0.679348
277,kiva-functions-compositionality,toaster,23,1.041667
278,kiva-functions-compositionality,train,21,0.951087
279,kiva-functions-compositionality,trainpeople,28,1.268116


Unnamed: 0,level,test_object_name,Count,Percentage
0,kiva,airplane,3,1.704545
1,kiva,barn,1,0.568182
2,kiva,basket,1,0.568182
3,kiva,bass,3,1.704545
4,kiva,birdie,2,1.136364
...,...,...,...,...
275,kiva-functions-compositionality,tea,30,1.358696
276,kiva-functions-compositionality,toaster,22,0.996377
277,kiva-functions-compositionality,train,23,1.041667
278,kiva-functions-compositionality,trainpeople,24,1.086957


Unnamed: 0,level,transformation_domain,Count,Percentage
0,kiva,Counting,64,36.363636
1,kiva,Reflect,32,18.181818
2,kiva,Resizing,32,18.181818
3,kiva,Rotation,48,27.272727
4,kiva-functions,Counting,128,34.782609
5,kiva-functions,Reflect,32,8.695652
6,kiva-functions,Resizing,96,26.086957
7,kiva-functions,Rotation,112,30.434783
8,kiva-functions-compositionality,"Counting,Reflect",256,11.594203
9,kiva-functions-compositionality,"Counting,Resizing",768,34.782609


Analyzing validation


Unnamed: 0_level_0,Count,Percentage
level,Unnamed: 1_level_1,Unnamed: 2_level_1
kiva-functions-compositionality,4278,80.232558
kiva-functions,713,13.372093
kiva,341,6.395349


Unnamed: 0,level,train_object_name,Count,Percentage
0,kiva,airplane,1,0.293255
1,kiva,barn,4,1.173021
2,kiva,basket,2,0.586510
3,kiva,bass,1,0.293255
4,kiva,birdie,5,1.466276
...,...,...,...,...
294,kiva-functions-compositionality,tea,41,0.958392
295,kiva-functions-compositionality,toaster,43,1.005143
296,kiva-functions-compositionality,train,42,0.981767
297,kiva-functions-compositionality,trainpeople,35,0.818139


Unnamed: 0,level,test_object_name,Count,Percentage
0,kiva,airplane,6,1.759531
1,kiva,barn,5,1.466276
2,kiva,basket,3,0.879765
3,kiva,bass,4,1.173021
4,kiva,birdie,1,0.293255
...,...,...,...,...
293,kiva-functions-compositionality,tea,46,1.075269
294,kiva-functions-compositionality,toaster,43,1.005143
295,kiva-functions-compositionality,train,41,0.958392
296,kiva-functions-compositionality,trainpeople,42,0.981767


Unnamed: 0,level,transformation_domain,Count,Percentage
0,kiva,Counting,124,36.363636
1,kiva,Reflect,62,18.181818
2,kiva,Resizing,62,18.181818
3,kiva,Rotation,93,27.272727
4,kiva-functions,Counting,248,34.782609
5,kiva-functions,Reflect,62,8.695652
6,kiva-functions,Resizing,186,26.086957
7,kiva-functions,Rotation,217,30.434783
8,kiva-functions-compositionality,"Counting,Reflect",496,11.594203
9,kiva-functions-compositionality,"Counting,Resizing",1488,34.782609
