In [3]:
import json
import collections
from collections import defaultdict
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
from collections import Counter
import math
from statistics import mean

import numpy as np
from numpy import dot
from numpy.linalg import norm

import scipy
from scipy.optimize import linear_sum_assignment

In [4]:
import nltk

# Fetch every NLTK resource this notebook reads so it runs on a fresh environment:
# 'punkt' was already downloaded here (presumably for word_tokenize), and
# 'stopwords' is required by stopwords.words('english') in a later cell.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/anyaji/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Import only the class that is used; a wildcard import would leak unrelated
# names from nltk.stem.porter into the notebook namespace.
from nltk.stem.porter import PorterStemmer

# Shared Porter stemmer instance, used by clean() to stem tokens.
stemmer = PorterStemmer()

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize as tokenize

# English stop words held in a set; clean() tests every token against it.
stop_words = set(stopwords.words('english'))

In [42]:
def _load_json(path):
    """Read one JSON annotation file, decoding it as UTF-8 regardless of the platform default."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)


# Each dataset is later iterated as {file: [annotation, ...]} by stats_SA / stats_ND.
sparse = _load_json('../full.json')
dense = _load_json('../dense.json')
# NOTE(review): the variable is named sparse_74 but is read from dense10.json — confirm this is the intended file.
sparse_74 = _load_json('../dense10.json')

# segmentation agreement

In [43]:
# https://cstheory.stackexchange.com/questions/6569/edit-distance-between-two-partitions/6582#6582
def make_sets(d):
    '''
    Group the keys of a piece-annotation dictionary by their annotation.

    d: {piece: annotation}, e.g. {1:str, 2:str, ..., 7:str}
    returns: list of key lists, one per distinct annotation, in order of first
             appearance, e.g. [[1,2,5],[4],[3,6,7]]
    '''
    groups = {}
    for piece, label in d.items():
        # Pieces sharing the same annotation string end up in the same list.
        groups.setdefault(label, []).append(piece)
    return list(groups.values())

def weight(s1, s2):
    '''
    Number of distinct elements that s1 and s2 have in common
    (the number of pieces on which two groups match).
    '''
    return len(set(s1).intersection(s2))

def cost_matrix(l1, l2):
    '''
    Pairwise overlap between two lists of groups.

    l1,l2: lists of groups, e.g. [[1,2,5],[4],[3,6,7]]
    returns: len(l1) x len(l2) float matrix whose entry [i][j] is
             weight(l1[i], l2[j]), i.e. the number of shared elements
    '''
    mat = np.zeros((len(l1), len(l2)))
    for row, left_group in enumerate(l1):
        for col, right_group in enumerate(l2):
            mat[row, col] = weight(left_group, right_group)
    return mat

def seg_agreement(d1, d2):
    '''
    Segmentation agreement between two piece-annotation dictionaries.

    The keys of each dictionary are grouped by annotation (make_sets), the
    groups of d1 are matched one-to-one with the groups of d2 so that the total
    overlap is maximal, and that total is returned.

    d1,d2: piece-annotation dictionaries
    returns: number, higher value == higher agreement
             (MAX number of pieces that do not change)
    '''
    overlap = cost_matrix(make_sets(d1), make_sets(d2))
    # maximize=True turns the assignment into a maximum-weight matching.
    matched_rows, matched_cols = linear_sum_assignment(overlap, maximize=True)
    return overlap[matched_rows, matched_cols].sum()
            
# print(seg_agreement({1:'body',2:'body',3:'body',4:'body',5:'face',6:'side fin',7:'tail fin of whale'},{1:'road',2:'crosswalk',3:'crosswalk',4:'crosswalk',5:'grass',6:'grass',7:'road'}))

In [69]:
def stats_SA(data):
    '''
    Mean and standard deviation of the per-file segmentation agreement.

    For each file, seg_agreement() is computed on the 'piece' dictionaries of
    every unordered pair of annotations and averaged over the pairs. A file
    with fewer than two annotations has no pair to compare, so it is skipped
    instead of raising ZeroDivisionError.

    data: {file: [annotation, ...]}, each annotation having a 'piece' entry
    returns: (mean, std) over the scored files (nan if no file was scored);
             also prints the number of files that were scored
    '''
    file_to_segagr = {}
    for file, anns in data.items():
        piece_anns = [detail['piece'] for detail in anns]
        l = len(piece_anns)
        if l < 2:
            # No annotation pair exists, so pairwise agreement is undefined.
            continue
        total_agr = 0
        for i in range(l - 1):
            for j in range(i + 1, l):
                total_agr += seg_agreement(piece_anns[i], piece_anns[j])
        # l*(l-1)/2 is the number of unordered annotation pairs.
        file_to_segagr[file] = total_agr / (l * (l - 1) / 2)
    print(len(file_to_segagr))
    scores = np.array(list(file_to_segagr.values()))
    return np.mean(scores), np.std(scores)

In [62]:
# Segmentation agreement (mean, std) over the files of `sparse` (loaded from ../full.json).
stats_SA(sparse)

1016


(5.30105981070548, 0.6178078374155623)

In [63]:
# Segmentation agreement (mean, std) over the files of `dense` (loaded from ../dense.json).
stats_SA(dense)

74


(5.092532628701481, 0.5311708922677416)

In [64]:
# Segmentation agreement (mean, std) over the files of `sparse_74` (loaded from ../dense10.json).
stats_SA(sparse_74)

74


(5.338957138957139, 0.7690646011005382)

# shape / part naming divergence (ND)

In [73]:
def clean(x):
    '''
    Normalise a free-text annotation into a list of word stems.

    The text is lower-cased and tokenized; stop words are dropped, as are
    tokens for which neither islower() nor isalnum() holds (e.g. tokens made
    only of punctuation). Each remaining token is stemmed.

    x: annotation string
    returns: list of stems, in token order (duplicates kept)
    '''
    stems = []
    for token in tokenize(x.lower()):
        if token in stop_words:
            continue
        if not (token.islower() or token.isalnum()):
            continue
        stems.append(stemmer.stem(token))
    return stems

def naming_div(anns, is_whole_anns):
    '''
    Naming divergence among the annotations of one file.

    Every annotation is reduced to a list of cleaned word stems. For each word,
    1 - p is computed, where p is the proportion of the OTHER annotations that
    contain the word. These values are averaged within each annotation, and the
    per-annotation means are then averaged over all annotations.

    anns: list of annotations, each with a 'whole' string and a 'piece'
          dictionary whose values are part names
    is_whole_anns: True  -> use the cleaned 'whole' description;
                   False -> use the cleaned distinct part names from 'piece'
    returns: divergence in [0, 1]; 0.0 when there are fewer than two
             annotations, since there is nothing to diverge from (this case
             used to raise ZeroDivisionError)

    Annotations whose cleaned word list is empty add 0 to the sum but are still
    counted in the denominator (unchanged behaviour).
    NOTE(review): confirm that this is intended.
    '''
    cleaned_ann_list = []
    for ann in anns:
        if is_whole_anns:
            cleaned_ann_list.append(clean(ann['whole']))
        else:
            cleaned_ann = []
            # set(): each distinct part name contributes its words once
            for part_name in set(ann['piece'].values()):
                cleaned_ann += clean(part_name)
            cleaned_ann_list.append(cleaned_ann)

    num_ann = len(cleaned_ann_list)
    if num_ann < 2:
        return 0.0

    # ann_freq[w] = number of annotations whose word list contains w
    ann_freq = Counter()
    for wl in cleaned_ann_list:
        ann_freq.update(set(wl))

    nd = 0
    for wl in cleaned_ann_list:
        if len(wl) == 0:
            continue
        frq = 0
        for w in wl:
            # w occurs in the current annotation, so subtract it to count only the others
            appeared = ann_freq[w] - 1
            frq += 1 - appeared / (num_ann - 1)
        nd += frq / len(wl)  # mean (1 - p) of this annotation
    return nd / num_ann

In [74]:
def stats_ND(data, is_whole):
    '''
    Mean and standard deviation of the per-file naming divergence.

    data: {file: [annotation, ...]}
    is_whole: forwarded to naming_div(); True scores the 'whole' descriptions,
              False scores the part names
    returns: (mean, std) over all files; also prints the number of files
    '''
    # {file: naming divergence of that file's annotations}
    file_to_nd = {file: naming_div(anns, is_whole) for file, anns in data.items()}
    print(len(file_to_nd))
    divergences = np.array(list(file_to_nd.values()))
    return np.mean(divergences), np.std(divergences)

### shape ND

In [81]:
# Naming divergence (mean, std) of the 'whole' descriptions in `sparse`.
stats_ND(sparse, True)

1016


(0.9083859841856889, 0.10913124904487614)

In [76]:
# Naming divergence (mean, std) of the 'whole' descriptions in `dense`.
stats_ND(dense, True)

74


(0.9254725233684035, 0.05837862268068055)

In [77]:
# Naming divergence (mean, std) of the 'whole' descriptions in `sparse_74`.
stats_ND(sparse_74, True)

74


(0.8954085579085579, 0.14878097698694126)

### part ND

In [78]:
# Naming divergence (mean, std) of the part names ('piece' values) in `sparse`.
stats_ND(sparse, False)

1016


(0.7647113108587915, 0.18553479097236833)

In [79]:
# Naming divergence (mean, std) of the part names ('piece' values) in `dense`.
stats_ND(dense, False)

74


(0.7900945754417004, 0.14607280818213772)

In [80]:
# Naming divergence (mean, std) of the part names ('piece' values) in `sparse_74`.
stats_ND(sparse_74, False)

74


(0.7270269866519867, 0.19591484995477443)