In [1]:
import json
import collections
from collections import defaultdict
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
from collections import Counter
import math
from statistics import mean

import numpy as np
from numpy import dot
from numpy.linalg import norm

import scipy
from scipy.optimize import linear_sum_assignment
from scipy import stats
from scipy.stats import bootstrap

import pickle

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/anyaji/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

In [4]:
from nltk.tokenize import word_tokenize as tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [5]:
with open('../sparse-1006.json') as f:
    sparse = json.load(f)
with open('../dense-74.json') as f:
    dense = json.load(f)
with open('../sparse-74.json') as f:
    sparse_74 = json.load(f)

In [6]:
plt.rcParams.update({
    "text.usetex": True,
    "font.family": "serif",
    "font.serif": ["Times"],
    'axes.labelsize': 'x-large',
    'axes.titlesize':'x-large',
    'xtick.labelsize':'x-large',
    'ytick.labelsize':'x-large',
    'font.size': 20
})

In [7]:
def clean(x):
    words = tokenize(x.lower())
    wl = [stemmer.stem(w) for w in words if w not in stop_words and (w.islower() or w.isalnum())]
    return wl

In [8]:
def save_pickle(savename, d):
    with open('./pickles/'+savename+'.pkl', 'wb') as handle:
        pickle.dump(d, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
def unpickle(savename):
    with open('./pickles/'+savename+'.pkl', 'rb') as handle:
        return pickle.load(handle)

In [10]:
sampled = [
  "page2-34.svg",
  "page9-46.svg",
  "page3-85.svg",
  "page7-107.svg",
  "page8-159.svg",
  "page6-203.svg",
  "page2-112.svg",
  "page1-116.svg",
  "page1-69.svg",
  "page8-234.svg",
  "page8-21.svg",
  "page5-75.svg",
  "page1-0.svg",
  "page5-59.svg",
  "page3-121.svg",
  "page6-164.svg",
  "page4-128.svg",
  "page5-136.svg",
  "page6-99.svg",
  "page7-14.svg",
  "page5-128.svg",
  "page9-27.svg",
  "page7-105.svg",
  "page6-162.svg",
  "page9-13.svg",
  "page1-128.svg",
  "page5-186.svg",
  "page3-72.svg",
  "page4-157.svg",
  "page3-182.svg",
  "page7-197.svg",
  "page7-180.svg",
  "page6-143.svg",
  "page7-81.svg",
  "page3-136.svg",
  "page5-64.svg",
  "page7-218.svg",
  "page3-128.svg",
  "page7-26.svg",
  "page6-78.svg",
  "page4-24.svg",
  "page5-153.svg",
  "page7-248.svg",
  "page5-244.svg",
  "page4-93.svg",
  "page5-28.svg",
  "page8-235.svg",
  "page5-200.svg",
  "page2-131.svg",
  "page8-183.svg",
  "page1-119.svg",
  "page5-232.svg",
  "page1-129.svg",
  "page4-162.svg",
  "page3-41.svg",
  "page6-180.svg",
  "page6-149.svg",
  "page1-105.svg",
  "page4-10.svg",
  "page5-178.svg",
  "page2-137.svg",
  "page3-35.svg",
  "page-A.svg",
  "page-B.svg",
  "page-C.svg",
  "page-D.svg",
  "page-E.svg",
  "page-F.svg",
  "page-G.svg",
  "page-H.svg",
  "page-I.svg",
  "page-J.svg",
  "page-K.svg",
  "page-L.svg",
]

# Naming Divergence

In [11]:
def clean_anns(data, is_whole):
    '''
    [data]: {file:[{whole:..., part:{'1':xxx,'2':xxx,...}}]}
    returns: {file: [[cleaned, anns, one],[...],...]}
    '''
    file_to_cleaned_ann_list = {}
    if is_whole:
        for file, anns_dicts in data.items():
            cleaned_ann_list=[]
            for ann in anns_dicts:
                whole_anns_data = ann['whole']
                cleaned_ann = clean(whole_anns_data)
                cleaned_ann_list.append(list(cleaned_ann))
            file_to_cleaned_ann_list[file] = cleaned_ann_list
    else:
        for file, anns_dicts in data.items():
            cleaned_ann_list=[]
            for ann in anns_dicts:
                cleaned_ann = []
                piece_anns_data = ann['piece']
                parts_set = set(piece_anns_data.values()) # parts, excluding duplicates, {head, dog body, tail}
                for word in parts_set:
                    wl = clean(word) # [dog, body]
                    cleaned_ann+= wl # [head, dog, body]
                cleaned_ann_list.append(cleaned_ann)
            file_to_cleaned_ann_list[file] = cleaned_ann_list
        
    return file_to_cleaned_ann_list

In [12]:
SPARSE_WHOLE=clean_anns(sparse, True)

In [13]:
DENSE_WHOLE=clean_anns(dense, True)

In [14]:
SPARSE74_WHOLE=clean_anns(sparse_74, True)

In [15]:
SPARSE_PART=clean_anns(sparse, False)

In [16]:
DENSE_PART=clean_anns(dense, False)

In [17]:
SPARSE74_PART=clean_anns(sparse_74, False)

In [82]:
def calc_ND(cleaned_ann_list):
    '''
    [cleaned_ann_list]: a list of lists of cleaned annotations, e.g. [[dog], [hous, hill], ...]
    '''
    nd = 0 
    num_ann = len(cleaned_ann_list)
    # each annotation
    for i in range(num_ann):
        frq = 0
        # each word in one annotation
        wl = cleaned_ann_list[i]
        for w in wl:
            appeared=0
            for j in range(num_ann):
                if j!=i:
                    wll = cleaned_ann_list[j]
                    if w in wll:
                        appeared+=1
            frq += 1 - appeared / (num_ann-1) #proportion of the word appearing in other annotations
        if len(wl) != 0:
            nd+=frq/len(wl)  # nd += mean frq (1-p) of each annotation
    rs=nd/num_ann
    assert rs<=1
    RESULT.append(rs)
    return rs

In [61]:
def bootstrap_ND(file_to_clean_anns_list, resample):
    rerun=[] # all same distributions
    file_to_nd = {}
    k=0
    global RESULT
    RESULT=[]
    for file, cleaned_anns_list in file_to_clean_anns_list.items():
        if k%20==0:
            print(k)
        k+=1
        
        input_data=(cleaned_anns_list,)
        res = bootstrap(input_data, calc_ND, confidence_level=0.95, vectorized=False, n_resamples=resample, method='percentile') 
         
        try:
            assert len(RESULT)==resample
        except:
            print('***', len(RESULT), file)
        ci_l,ci_u=np.percentile(RESULT, [2.5, 97.5])
        m=np.mean(RESULT)
        file_to_nd[file] = (ci_l, ci_u, m)
        RESULT=[]
        
    return file_to_nd

### sparse

In [45]:
#SPARSE
file_to_nd = bootstrap_ND(file_to_clean_anns_list=SPARSE_WHOLE, resample=1000)
len(file_to_nd)

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000


1016

In [46]:
save_pickle('CUSTOM_sparse_whole_1000', file_to_nd)

In [62]:
file_to_nd = bootstrap_ND(file_to_clean_anns_list=SPARSE_PART, resample=1000)
len(file_to_nd)

0
*** 3000 page1-116
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000


1016

### temp fix for page1-116: it automatically turned into 10x3 np.array...

In [63]:
file_to_nd['page1-116']

(0.15000000000000005, 0.7574074074074074, 0.5039521693121694)

In [64]:
class Wrapper:
    def __init__(self, data):
        self.data=data
    def get_data(self):
        return self.data

In [76]:
def TEMP_calc_ND(cleaned_ann_list):
    '''
    [cleaned_ann_list]: a list of lists of cleaned annotations, e.g. [Wrapper[dog], Wrapper[hous, hill], ...]
    '''
    nd = 0 
    num_ann = len(cleaned_ann_list)
    
    # each annotation
    for i in range(num_ann):
        frq = 0
        # each word in one annotation
        wl = cleaned_ann_list[i].data
        for w in wl:
            appeared=0
            for j in range(num_ann):
                if j!=i:
                    wll = cleaned_ann_list[j].data
                    if w in wll:
                        appeared+=1
            frq += 1 - appeared / (num_ann-1) #proportion of the word appearing in other annotations
        if len(wl) != 0:
            nd+=frq/len(wl)  # nd += mean frq (1-p) of each annotation
    TEMP_RESULT.append(nd/num_ann)
    return nd/num_ann

In [77]:
wrappers=[Wrapper(a) for a in SPARSE_PART['page1-116']]

In [78]:
TEMP_RESULT=[]
res=bootstrap((wrappers,), TEMP_calc_ND, confidence_level=0.95, vectorized=False, n_resamples=1000, method='percentile') 

print(len(TEMP_RESULT))
assert len(TEMP_RESULT)==1000
ci_l,ci_u=np.percentile(TEMP_RESULT, [2.5, 97.5])
m=np.mean(TEMP_RESULT)
file_to_nd['page1-116'] = (ci_l, ci_u, m)
file_to_nd['page1-116']

1000


(0.4222222222222222, 0.7925925925925925, 0.6449851851851852)

In [79]:
save_pickle('CUSTOM_sparse_part_1000', file_to_nd)

### dense

In [80]:
#DENSE
dense_file_to_nd = bootstrap_ND(file_to_clean_anns_list=DENSE_WHOLE, resample=1000)
len(dense_file_to_nd)
save_pickle('CUSTOM_dense_whole_1000', dense_file_to_nd)

0
20
40
60


In [81]:
dense_file_to_nd = bootstrap_ND(file_to_clean_anns_list=DENSE_PART, resample=1000)
len(dense_file_to_nd)
save_pickle('CUSTOM_dense_part_1000', dense_file_to_nd)

0
20
40
60


In [83]:
dense_file_to_nd

{'page7-218': (0.8264002029882535, 0.9197545700087633, 0.8796221885294999),
 'page1-129': (0.9324262431750634, 0.9637721594443296, 0.951103598209966),
 'page-G': (0.4928875661375659, 0.6707766203703702, 0.5816236225348725),
 'page5-200': (0.7468890278353724, 0.8693076109556064, 0.8139776576934595),
 'page-L': (0.5134702168389038, 0.6759923285889196, 0.5952376322994253),
 'page-K': (0.6414147192827748, 0.796886056330501, 0.7190443665490888),
 'page-I': (0.444355264404507, 0.6391613911481335, 0.5438078850433145),
 'page3-41': (0.5582371902861292, 0.7594656787787686, 0.6668301392920496),
 'page4-162': (0.4987949668256271, 0.7641708134632664, 0.6398846499412536),
 'page-J': (0.7154141414141413, 0.8682225629308962, 0.8024456087328308),
 'page5-128': (0.7131146634055442, 0.8745626116158091, 0.8044877979656806),
 'page4-24': (0.4064828757458473, 0.6009429714677358, 0.5007036317587025),
 'page9-46': (0.8689831557186117, 0.9385724580712791, 0.9105177090612626),
 'page-E': (0.6290672999839667, 0