# Code for Reproducing Experimental Results

## Import libraries and functions

In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
import os

import numpy as np
import pandas as pd
import pickle as pkl

from utils import mae_quant_level
from utils import mae_tail_synthetic, mae_varyk_real, mae_varyk_synthetic, mae_varyn_real, mae_varyn_synthetic, quant_synthetic
from utils import supe_tail_synthetic, supe_varyn_real, supe_varyn_synthetic, supe_varyk_real, supe_varyk_synthetic
from utils import plot_dist_est, plot_quant_error, plot_quant_level, plot_stat_bound

In [None]:
# Output folders: `prefix` is handed to the `mae_*` helpers and `prefix_df`
# to the `supe_*` helpers below; 'results/real' receives the real-data pickles.
prefix = 'results/mae'
prefix_df = 'results/supe'

# Create the folders with the standard library instead of shelling out to
# `mkdir -p`, so the notebook does not depend on a POSIX shell and a failure
# raises instead of being silently ignored.
for out_dir in (prefix, prefix_df, 'results/real'):
    os.makedirs(out_dir, exist_ok=True)

## Synthetic data

### Statistical error bound for frontier integral

We first vary the sample size.

In [None]:
# Support size is fixed; the sample size sweeps 12 log-spaced values
# from 1e4 to 1e5.
supp = 1000
nrange = np.logspace(4, 5, num=12).astype(int)
# Each entry pairs two (family, parameter) distribution specs.
dist_pairs = [
    [('zipf', 2), ('uniform', '')],
    [('zipf', 2), ('zipf', 2)],
    [('zipf', 0), ('zipf', 0)],
    [('uniform', ''), ('dirichlet', '')],
]
b_varyn = mae_varyn_synthetic(
    supp, nrange, dist_pairs, prefix=prefix, save=True)

Figure 9 (top) in the appendix can be reproduced as follows.

In [None]:
# Cached sample-size sweeps, one pickle per panel.
fnames = [
    'nvary-zipf2-uniform-supp1000',
    'nvary-zipf2-zipf2-supp1000',
    'nvary-zipf0-zipf0-supp1000',
    'nvary-uniform-dirichlet-supp1000',
]
b_varyn = list(map(pd.read_pickle, (f'{prefix}/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Sample size']
titles = [
    '(a)',
    '(b)',
    '(c)',
    '(d)',
]
fname = 'graphs/synthetic-bound-nvary.pdf'
plot_stat_bound(
    b_varyn, xlabels, titles,
    const=50, fname=fname, save=False)

We then vary the support size.

In [None]:
# Sample size is fixed; the support size sweeps 12 log-spaced values
# from 10 to 1e4.
n = 20000
krange = np.logspace(1, 4, num=12).astype(int)
# Each entry pairs two (family, parameter) distribution specs.
dist_pairs = [
    [('zipf', 2), ('uniform', '')],
    [('zipf', 2), ('zipf', 2)],
    [('zipf', 0), ('zipf', 0)],
    [('uniform', ''), ('dirichlet', '')],
]
b_varyk = mae_varyk_synthetic(
    krange, n, dist_pairs, prefix=prefix, save=True)

Figure 10 (top) in the appendix can be reproduced as follows.

In [None]:
# Cached support-size sweeps, one pickle per panel.
fnames = [
    'kvary-zipf2-uniform-size20000',
    'kvary-zipf2-zipf2-size20000',
    'kvary-zipf0-zipf0-size20000',
    'kvary-uniform-dirichlet-size20000',
]
b_varyk = list(map(pd.read_pickle, (f'{prefix}/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Support size']
titles = [
    '(a)',
    '(b)',
    '(c)',
    '(d)',
]
fname = 'graphs/synthetic-bound-kvary.pdf'
plot_stat_bound(
    b_varyk, xlabels, titles,
    const=50, fname=fname, save=False)

Finally, we vary the tail decay index of the distribution $Q$.

In [None]:
# Sample and support sizes are fixed; `orderq` sweeps 12 evenly spaced
# values from 0.5 to 2.
n = 10000
supp = 1000
orderq = np.linspace(0.5, 2, num=12)
dists = [
    ('uniform', ''),
    ('dirichlet', ''),
    ('zipf', 1),
    ('zipf', 2),
]
b_tail = mae_tail_synthetic(
    supp, n, orderq, dists, prefix=prefix, save=True)

Figure 11 (top) in the appendix can be reproduced as follows.

In [None]:
# Cached tail-decay sweeps, one pickle per panel.
fnames = [
    'qvary-uniform-supp1000-size10000',
    'qvary-dirichlet-supp1000-size10000',
    'qvary-zipf1-supp1000-size10000',
    'qvary-zipf2-supp1000-size10000',
]
b_tail = list(map(pd.read_pickle, (f'{prefix}/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Tail decay']
titles = [
    '(a)',
    '(b)',
    '(c)',
    '(d)',
]
fname = 'graphs/synthetic-bound-qvary.pdf'
plot_stat_bound(
    b_tail, xlabels, titles,
    const=100, fname=fname, save=False, log_scale=False)

Figure 4 in the main text can be reproduced as follows.

In [None]:
# Main-text figure: one panel from each sweep (sample size, support size)
# plus two tail-decay panels.
const = 100
fnames = [
    'nvary-zipf2-zipf2-supp1000',
    'kvary-zipf2-zipf2-size20000',
    'qvary-uniform-supp1000-size10000',
    'qvary-zipf2-supp1000-size10000',
]
dfs = list(map(pd.read_pickle, (f'{prefix}/{file}.pkl' for file in fnames)))

xlabels = [
    'Sample size',
    'Support size',
    'Tail decay',
    'Tail decay',
]
titles = [
    r'(a) $k=10^3$',
    r'(b) $n=2\times 10^4$',
    r'(c) $k=10^3, n=10^4$',
    r'(d) $k=10^3, n=10^4$',
]
fname = 'graphs/synthetic-bound.pdf'
plot_stat_bound(
    dfs, xlabels, titles,
    const=const, fname=fname, save=False)

### Statistical error bound for divergence frontiers

We first vary the sample size.

In [None]:
# Support size is fixed; the sample size sweeps 12 log-spaced values
# from 1e4 to 1e5.
supp = 1000
nrange = np.logspace(4, 5, num=12).astype(int)
dist_pairs = [
    [('zipf', 2), ('uniform', '')],
    [('zipf', 2), ('zipf', 2)],
    [('zipf', 0), ('zipf', 0)],
    [('uniform', ''), ('dirichlet', '')],
]
# 100 evenly spaced values in [0.01, 0.99], passed on as `lambdas`.
lambdas = np.linspace(0.01, 0.99, num=100)
b_varyn_df = supe_varyn_synthetic(
    supp, nrange, dist_pairs, lambdas,
    nrepeat=100, prefix=prefix_df, save=True)

Figure 9 (bottom) in the appendix can be reproduced as follows.

In [None]:
# Cached divergence-frontier sample-size sweeps, one pickle per panel.
fnames = [
    'nvary-zipf2-uniform-supp1000-df',
    'nvary-zipf2-zipf2-supp1000-df',
    'nvary-zipf0-zipf0-supp1000-df',
    'nvary-uniform-dirichlet-supp1000-df',
]
b_varyn_df = list(map(pd.read_pickle, (f'{prefix_df}/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Sample size']
titles = [
    '(a)',
    '(b)',
    '(c)',
    '(d)',
]
fname = 'graphs/synthetic-bound-nvary-df.pdf'
plot_stat_bound(
    b_varyn_df, xlabels, titles,
    const=10, fname=fname, save=False)

We then vary the support size.

In [None]:
# Sample size is fixed; the support size sweeps 12 log-spaced values
# from 10 to 1e4.
n = 20000
krange = np.logspace(1, 4, num=12).astype(int)
dist_pairs = [
    [('zipf', 2), ('uniform', '')],
    [('zipf', 2), ('zipf', 2)],
    [('zipf', 0), ('zipf', 0)],
    [('uniform', ''), ('dirichlet', '')],
]
# 100 evenly spaced values in [0.01, 0.99], passed on as `lambdas`.
lambdas = np.linspace(0.01, 0.99, num=100)
b_varyk_df = supe_varyk_synthetic(
    krange, n, dist_pairs, lambdas,
    nrepeat=100, prefix=prefix_df, save=True)

Figure 10 (bottom) in the appendix can be reproduced as follows.

In [None]:
# Cached divergence-frontier support-size sweeps, one pickle per panel.
fnames = [
    'kvary-zipf2-uniform-size20000-df',
    'kvary-zipf2-zipf2-size20000-df',
    'kvary-zipf0-zipf0-size20000-df',
    'kvary-uniform-dirichlet-size20000-df',
]
b_varyk_df = list(map(pd.read_pickle, (f'{prefix_df}/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Support size']
titles = [
    '(a)',
    '(b)',
    '(c)',
    '(d)',
]
fname = 'graphs/synthetic-bound-kvary-df.pdf'
plot_stat_bound(
    b_varyk_df, xlabels, titles,
    const=10, fname=fname, save=False)

Finally, we vary the tail decay index of the distribution $Q$.

In [None]:
# Sample and support sizes are fixed; `orderq` sweeps 12 evenly spaced
# values from 0.5 to 2.
n = 10000
supp = 1000
orderq = np.linspace(0.5, 2, num=12)
dists = [
    ('uniform', ''),
    ('dirichlet', ''),
    ('zipf', 1),
    ('zipf', 2),
]
# 100 evenly spaced values in [0.01, 0.99], passed on as `lambdas`.
lambdas = np.linspace(0.01, 0.99, num=100)
b_tail_df = supe_tail_synthetic(
    supp, n, orderq, dists, lambdas,
    nrepeat=100, prefix=prefix_df, save=True)

Figure 11 (bottom) in the appendix can be reproduced as follows.

In [None]:
# Cached divergence-frontier tail-decay sweeps, one pickle per panel.
fnames = [
    'qvary-uniform-supp1000-size10000-df',
    'qvary-dirichlet-supp1000-size10000-df',
    'qvary-zipf1-supp1000-size10000-df',
    'qvary-zipf2-supp1000-size10000-df',
]
b_tail_df = list(map(pd.read_pickle, (f'{prefix_df}/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Tail decay']
titles = [
    '(a)',
    '(b)',
    '(c)',
    '(d)',
]
fname = 'graphs/synthetic-bound-qvary-df.pdf'
plot_stat_bound(
    b_tail_df, xlabels, titles,
    const=7, fname=fname, save=False, log_scale=False)

### Distribution estimators for frontier integral

We first vary the sample size.

In [None]:
# Support size is fixed; the sample size sweeps 12 log-spaced values
# from 1e4 to 1e5, for the estimator-comparison distribution pairs.
supp = 1000
nrange = np.logspace(4, 5, num=12).astype(int)
dist_pairs = [
    [('zipf', 1), ('step', '')],
    [('zipf', 0), ('dirichlet', '')],
    [('zipf', 2), ('uniform', '')],
    [('zipf', 1), ('zipf', 1)],
]
est_varyn = mae_varyn_synthetic(
    supp, nrange, dist_pairs, prefix=prefix, save=True)

Figure 12 (top) in the appendix can be reproduced as follows.

In [None]:
# Cached sample-size sweeps for the estimator comparison.
fnames = [
    'nvary-zipf1-step-supp1000',
    'nvary-zipf0-dirichlet-supp1000',
    'nvary-zipf2-uniform-supp1000',
    'nvary-zipf1-zipf1-supp1000',
]
est_varyn = list(map(pd.read_pickle, (f'{prefix}/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Sample size']
titles = [
    '(a)',
    '(b)',
    '(c)',
    '(d)',
]
fname = 'graphs/synthetic-smoothing-nvary.pdf'
plot_dist_est(
    est_varyn, xlabels, titles,
    fname=fname, save=False)

We then vary the support size.

In [None]:
# Sample size is fixed; the support size sweeps 12 log-spaced values
# from 10 to 1e4, for the estimator-comparison distribution pairs.
n = 20000
krange = np.logspace(1, 4, num=12).astype(int)
dist_pairs = [
    [('zipf', 1), ('step', '')],
    [('zipf', 0), ('dirichlet', '')],
    [('zipf', 2), ('uniform', '')],
    [('zipf', 1), ('zipf', 1)],
]
est_varyk = mae_varyk_synthetic(
    krange, n, dist_pairs, prefix=prefix, save=True)

Figure 13 (top) in the appendix can be reproduced as follows.

In [None]:
# Cached support-size sweeps for the estimator comparison.
fnames = [
    'kvary-zipf1-step-size20000',
    'kvary-zipf0-dirichlet-size20000',
    'kvary-zipf2-uniform-size20000',
    'kvary-zipf1-zipf1-size20000',
]
est_varyk = list(map(pd.read_pickle, (f'{prefix}/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Support size']
titles = [
    '(a)',
    '(b)',
    '(c)',
    '(d)',
]
fname = 'graphs/synthetic-smoothing-kvary.pdf'
plot_dist_est(
    est_varyk, xlabels, titles,
    fname=fname, save=False)

Finally, we vary the tail decay index of the distribution $Q$.

In [None]:
# Sample and support sizes are fixed; `orderq` sweeps 12 evenly spaced
# values from 0.5 to 2.
n = 10000
supp = 1000
orderq = np.linspace(0.5, 2, num=12)
dists = [
    ('uniform', ''),
    ('step', ''),
    ('zipf', 0),
    ('zipf', 2),
]
est_tail = mae_tail_synthetic(
    supp, n, orderq, dists, prefix=prefix, save=True)

Figure 14 (top) in the appendix can be reproduced as follows.

In [None]:
# Cached tail-decay sweeps for the estimator comparison.
fnames = [
    'qvary-uniform-supp1000-size10000',
    'qvary-step-supp1000-size10000',
    'qvary-zipf0-supp1000-size10000',
    'qvary-zipf2-supp1000-size10000',
]
est_tail = list(map(pd.read_pickle, (f'{prefix}/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Tail decay']
titles = [
    '(a)',
    '(b)',
    '(c)',
    '(d)',
]
fname = 'graphs/synthetic-smoothing-qvary.pdf'
plot_dist_est(
    est_tail, xlabels, titles,
    fname=fname, save=False, log_scale=False)

Figure 6 in the main text can be reproduced as follows.

In [None]:
# Main-text figure: one panel from each sweep (sample size, support size)
# plus two tail-decay panels.
fnames = [
    'nvary-zipf0-dirichlet-supp1000',
    'kvary-zipf0-dirichlet-size20000',
    'qvary-uniform-supp1000-size10000',
    'qvary-zipf2-supp1000-size10000',
]
dfs = list(map(pd.read_pickle, (f'{prefix}/{file}.pkl' for file in fnames)))

xlabels = [
    'Sample size',
    'Support size',
    'Tail decay',
    'Tail decay',
]
titles = [
    r'(a) $k=10^3$',
    r'(b) $n=2\times 10^4$',
    r'(c) $k=10^3, n=10^4$',
    r'(d) $k=10^3, n=10^4$',
]
fname = 'graphs/synthetic-smoothing.pdf'
plot_dist_est(
    dfs, xlabels, titles,
    fname=fname, save=False)

### Distribution estimators for divergence frontiers

We first vary the sample size.

In [None]:
# Support size is fixed; the sample size sweeps 12 log-spaced values
# from 1e4 to 1e5.
supp = 1000
nrange = np.logspace(4, 5, num=12).astype(int)
dist_pairs = [
    [('zipf', 1), ('step', '')],
    [('zipf', 0), ('dirichlet', '')],
    [('zipf', 2), ('uniform', '')],
    [('zipf', 1), ('zipf', 1)],
]
# 100 evenly spaced values in [0.01, 0.99], passed on as `lambdas`.
lambdas = np.linspace(0.01, 0.99, num=100)
est_varyn_df = supe_varyn_synthetic(
    supp, nrange, dist_pairs, lambdas, prefix=prefix_df, save=True)

Figure 12 (bottom) in the appendix can be reproduced as follows.

In [None]:
# Cached divergence-frontier sample-size sweeps for the estimator comparison.
fnames = [
    'nvary-zipf1-step-supp1000-df',
    'nvary-zipf0-dirichlet-supp1000-df',
    'nvary-zipf2-uniform-supp1000-df',
    'nvary-zipf1-zipf1-supp1000-df',
]
est_varyn_df = list(map(pd.read_pickle, (f'{prefix_df}/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Sample size']
titles = [
    '(a)',
    '(b)',
    '(c)',
    '(d)',
]
fname = 'graphs/synthetic-smoothing-nvary-df.pdf'
plot_dist_est(
    est_varyn_df, xlabels, titles,
    fname=fname, save=False)

We then vary the support size.

In [None]:
# Sample size is fixed; the support size sweeps 12 log-spaced values
# from 10 to 1e4.
n = 20000
krange = np.logspace(1, 4, num=12).astype(int)
dist_pairs = [
    [('zipf', 1), ('step', '')],
    [('zipf', 0), ('dirichlet', '')],
    [('zipf', 2), ('uniform', '')],
    [('zipf', 1), ('zipf', 1)],
]
# 100 evenly spaced values in [0.01, 0.99], passed on as `lambdas`.
lambdas = np.linspace(0.01, 0.99, num=100)
est_varyk_df = supe_varyk_synthetic(
    krange, n, dist_pairs, lambdas, prefix=prefix_df, save=True)

Figure 13 (bottom) in the appendix can be reproduced as follows.

In [None]:
# Cached divergence-frontier support-size sweeps for the estimator comparison.
fnames = [
    'kvary-zipf1-step-size20000-df',
    'kvary-zipf0-dirichlet-size20000-df',
    'kvary-zipf2-uniform-size20000-df',
    'kvary-zipf1-zipf1-size20000-df',
]
est_varyk_df = list(map(pd.read_pickle, (f'{prefix_df}/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Support size']
titles = [
    '(a)',
    '(b)',
    '(c)',
    '(d)',
]
fname = 'graphs/synthetic-smoothing-kvary-df.pdf'
plot_dist_est(
    est_varyk_df, xlabels, titles,
    fname=fname, save=False)

Finally, we vary the tail decay index of the distribution $Q$.

In [None]:
# Sample and support sizes are fixed; `orderq` sweeps 12 evenly spaced
# values from 0.5 to 2.
n = 10000
supp = 1000
orderq = np.linspace(0.5, 2, num=12)
dists = [
    ('uniform', ''),
    ('step', ''),
    ('zipf', 0),
    ('zipf', 2),
]
# 100 evenly spaced values in [0.01, 0.99], passed on as `lambdas`.
lambdas = np.linspace(0.01, 0.99, num=100)
est_tail_df = supe_tail_synthetic(
    supp, n, orderq, dists, lambdas, prefix=prefix_df, save=True)

Figure 14 (bottom) in the appendix can be reproduced as follows.

In [None]:
# Cached divergence-frontier tail-decay sweeps for the estimator comparison.
fnames = [
    'qvary-uniform-supp1000-size10000-df',
    'qvary-step-supp1000-size10000-df',
    'qvary-zipf0-supp1000-size10000-df',
    'qvary-zipf2-supp1000-size10000-df',
]
est_tail_df = list(map(pd.read_pickle, (f'{prefix_df}/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Tail decay']
titles = [
    '(a)',
    '(b)',
    '(c)',
    '(d)',
]
fname = 'graphs/synthetic-smoothing-qvary-df.pdf'
plot_dist_est(
    est_tail_df, xlabels, titles,
    fname=fname, save=False, log_scale=False)

### Quantization error

In [None]:
# Support size is fixed; `krange` holds 12 log-spaced integers
# from 10 to 10**2.7.
supp = 1000
krange = np.logspace(1, 2.7, num=12).astype(int)
dist_pairs = [
    [('uniform', ''), ('dirichlet', '')],
    [('zipf', 0), ('dirichlet', '')],
    [('zipf', 2), ('step', '')],
    [('zipf', 1), ('zipf', 2)],
]
quant = quant_synthetic(
    supp, krange, dist_pairs, prefix=prefix, save=True)

Figure 15 in the appendix can be reproduced as follows.

In [None]:
# Cached quantization-error results, one pickle per panel.
fnames = [
    'quant-uniform-dirichlet-supp1000',
    'quant-zipf0-dirichlet-supp1000',
    'quant-zipf2-step-supp1000',
    'quant-zipf1-zipf2-supp1000',
]
quant = list(map(pd.read_pickle, (f'{prefix}/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Number of bins']
titles = [
    '(a)',
    '(b)',
    '(c)',
    '(d)',
]
fname = 'graphs/synthetic-quant.pdf'
plot_quant_error(
    quant, xlabels, titles,
    fname=fname, save=False)

### Quantization level

In [None]:
# 10 log-spaced sample sizes from 1e2 to 1e5, in dimension 2.
nrange = np.logspace(2, 5, 10).astype(int)
dim = 2
# One distribution family per experiment; `pars[i]` holds the two parameter
# tuples for experiment i.
# Presumably (location, scale matrix[, degrees of freedom]) -- confirm
# against `mae_quant_level`.
dists = ['normal', 'normal', 't', 't']
pars = [[(np.zeros(dim), np.identity(dim)),
         (np.ones(dim), np.identity(dim))],
        [(np.zeros(dim), np.identity(dim)),
         (np.zeros(dim), 5*np.identity(dim))],
        # NOTE(review): this experiment is saved as "t-mean", yet the second
        # tuple also scales the matrix by 5 (unlike the "normal-mean" pair).
        # Confirm this is intended.
        [(np.zeros(dim), np.identity(dim), 4),
         (np.ones(dim), 5*np.identity(dim), 4)],
        [(np.zeros(dim), np.identity(dim), 4),
         (np.zeros(dim), 5*np.identity(dim), 4)]]
true_fis = [0.2769, 0.3008, 0.2094, 0.3376]
# Output names, in the same order as `dists`/`pars`. The second entry is the
# normal/variance experiment; it was previously mislabelled
# 'est-fi-t-mean-...-const10', which did not match the file name that the
# plotting cell loads.
fnames = ['est-fi-normal-mean-dim2-nrates4-const5-kmeans',
          'est-fi-normal-var-dim2-nrates4-const10-kmeans',
          'est-fi-t-mean-dim2-nrates4-const5-kmeans',
          'est-fi-t-var-dim2-nrates4-const10-kmeans']

dfs = mae_quant_level(nrange, dists, pars, true_fis, nrepeat=10,
                      prefix='results/mae', fnames=fnames, save=True)

Figure 8 in the main text can be reproduced as follows.

In [None]:
# Load the saved quantization-level results (plain-text arrays) and plot
# one panel per experiment.
nrates = 4
fnames = [
    'est-fi-normal-mean-dim2-nrates4-const5-kmeans',
    'est-fi-normal-var-dim2-nrates4-const10-kmeans',
    'est-fi-t-mean-dim2-nrates4-const5-kmeans',
    'est-fi-t-var-dim2-nrates4-const10-kmeans',
]
dfs = list(map(np.loadtxt, (f'results/mae/{file}.txt' for file in fnames)))

titles = [
    '(a)',
    '(b)',
    '(c)',
    '(d)',
]
fname = 'graphs/synthetic-cont-quant-level.pdf'
plot_quant_level(
    dfs, nrates, titles,
    fname=fname, save=False)

## Real data

In [None]:
# Download and unpack the parsed outputs that the real-data cells read from
# `parsed_outputs/`.
#
# The standard library is used instead of `wget`/`unzip` so the cell does not
# depend on external binaries, raises on failure instead of ignoring the exit
# status, and can be re-run safely: the archive is fetched only when missing,
# and extraction overwrites existing files without prompting.
import urllib.request
import zipfile

data_url = 'https://www.stat.washington.edu/~liu16/divergence-frontier-bounds/parsed_outputs.zip'
archive = 'parsed_outputs.zip'
if not os.path.exists(archive):
    # Download to a temporary name first so an interrupted transfer never
    # leaves a truncated archive under the final name.
    partial = archive + '.part'
    urllib.request.urlretrieve(data_url, partial)
    os.replace(partial, archive)
with zipfile.ZipFile(archive) as zf:
    zf.extractall()

### Statistical error bound for frontier integral

We first vary the sample size.

In [None]:
# 12 log-spaced sample sizes from 1e2 to 10**4.4; `nrepeat` is forwarded
# to the sweep helper.
nrange = np.logspace(2, 4.4, 12).astype(int)
nrepeat = 100

# load vision data
# NOTE: unpickling can execute arbitrary code; only load archives obtained
# from a trusted source.
with open('parsed_outputs/cifar10_lattice.p', 'rb') as f:
    outs = pkl.load(f)

# 128 bins
o = outs[100000][128]
p = o.p_hist
q = o.q_hist
print(p.shape, q.shape, np.linalg.norm(p), np.linalg.norm(q))
varyn_vision_1 = mae_varyn_real(p, q, nrange, nrepeat)
varyn_vision_1.to_pickle('results/real/nvary-vision-supp128.pkl')

# 1024 bins (same first-level key, larger quantization)
o = outs[100000][1024]
p = o.p_hist
q = o.q_hist
print(p.shape, q.shape, np.linalg.norm(p), np.linalg.norm(q))
varyn_vision_2 = mae_varyn_real(p, q, nrange, nrepeat)
varyn_vision_2.to_pickle('results/real/nvary-vision-supp1024.pkl')

In [None]:
# load nlp data (reuses `nrange` and `nrepeat` from the previous cell)
# NOTE: unpickling can execute arbitrary code; only load archives obtained
# from a trusted source.
with open('parsed_outputs/wikitext103.p', 'rb') as f:
    outs = pkl.load(f)

# 64 bins
o = outs[2000][64]
p = o.p_hist
q = o.q_hist
print(p.shape, q.shape, np.linalg.norm(p), np.linalg.norm(q))
varyn_nlp_1 = mae_varyn_real(p, q, nrange, nrepeat)
varyn_nlp_1.to_pickle('results/real/nvary-nlp-supp64.pkl')

# 2048 bins (same first-level key, larger quantization)
o = outs[2000][2048]
p = o.p_hist
q = o.q_hist
print(p.shape, q.shape, np.linalg.norm(p), np.linalg.norm(q))
varyn_nlp_2 = mae_varyn_real(p, q, nrange, nrepeat)
varyn_nlp_2.to_pickle('results/real/nvary-nlp-supp2048.pkl')

Figure 16 (top) in the appendix can be reproduced as follows.

In [None]:
# Load the real-data sample-size sweeps pickled by the cells above.
fnames = [
    'nvary-vision-supp128',
    'nvary-vision-supp1024',
    'nvary-nlp-supp64',
    'nvary-nlp-supp2048',
]
dfs = list(map(pd.read_pickle, (f'results/real/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Sample size']
titles = [
    r'(a) Images ($k=128$)',
    r'(b) Images ($k=1024$)',
    r'(c) Text ($k=64$)',
    r'(d) Text ($k=2048$)',
]

fname = 'graphs/real-bound-n.pdf'
plot_stat_bound(
    dfs, xlabels, titles,
    const=15, fname=fname, save=False)

We then vary the support size.

In [None]:
# load vision data
# NOTE: unpickling can execute arbitrary code; only load archives obtained
# from a trusted source.
with open('parsed_outputs/cifar10_lattice.p', 'rb') as f:
    outs = pkl.load(f)

# Sweep over the support sizes stored under key 100000, at two sample sizes.
o = outs[100000]
varyk_vision_1 = mae_varyk_real(o, n=1000)
varyk_vision_2 = mae_varyk_real(o, n=10000)
varyk_vision_1.to_pickle('results/real/kvary-vision-size1000.pkl')
varyk_vision_2.to_pickle('results/real/kvary-vision-size10000.pkl')

# load text data
with open('parsed_outputs/wikitext103.p', 'rb') as f:
    outs = pkl.load(f)

# Same sweep for the text data stored under key 2000.
o = outs[2000]
varyk_nlp_1 = mae_varyk_real(o, n=1000)
varyk_nlp_2 = mae_varyk_real(o, n=10000)
varyk_nlp_1.to_pickle('results/real/kvary-nlp-size1000.pkl')
varyk_nlp_2.to_pickle('results/real/kvary-nlp-size10000.pkl')

Figure 17 (top) in the appendix can be reproduced as follows.

In [None]:
# Load the real-data support-size sweeps pickled by the cell above.
fnames = [
    'kvary-vision-size1000',
    'kvary-vision-size10000',
    'kvary-nlp-size1000',
    'kvary-nlp-size10000',
]
dfs = list(map(pd.read_pickle, (f'results/real/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Support size']
titles = [
    r'(a) Images ($n=1000$)',
    r'(b) Images ($n=10000$)',
    r'(c) Text ($n=1000$)',
    r'(d) Text ($n=10000$)',
]

fname = 'graphs/real-bound-k.pdf'
plot_stat_bound(
    dfs, xlabels, titles,
    const=15, fname=fname, save=False)

### Statistical error bound for divergence frontiers

We first vary the sample size.

In [None]:
# 12 log-spaced sample sizes from 1e2 to 10**4.4, and 100 evenly spaced
# values in [0.01, 0.99] passed on as `lambdas`.
nrange = np.logspace(2, 4.4, 12).astype(int)
lambdas = np.linspace(0.01, 0.99, 100)
nrepeat = 100

# load vision data
# NOTE: unpickling can execute arbitrary code; only load archives obtained
# from a trusted source.
with open('parsed_outputs/cifar10_lattice.p', 'rb') as f:
    outs = pkl.load(f)

# 128 bins
o = outs[100000][128]
p = o.p_hist
q = o.q_hist
print(p.shape, q.shape, np.linalg.norm(p), np.linalg.norm(q))
varyn_vision_1 = supe_varyn_real(p, q, nrange, lambdas, nrepeat)
varyn_vision_1.to_pickle('results/real/nvary-vision-supp128-df.pkl')

# 1024 bins (same first-level key, larger quantization)
o = outs[100000][1024]
p = o.p_hist
q = o.q_hist
print(p.shape, q.shape, np.linalg.norm(p), np.linalg.norm(q))
varyn_vision_2 = supe_varyn_real(p, q, nrange, lambdas, nrepeat)
varyn_vision_2.to_pickle('results/real/nvary-vision-supp1024-df.pkl')

In [None]:
# load nlp data (reuses `nrange`, `lambdas` and `nrepeat` from the previous cell)
# NOTE: unpickling can execute arbitrary code; only load archives obtained
# from a trusted source.
with open('parsed_outputs/wikitext103.p', 'rb') as f:
    outs = pkl.load(f)

# 64 bins
o = outs[2000][64]
p = o.p_hist
q = o.q_hist
print(p.shape, q.shape, np.linalg.norm(p), np.linalg.norm(q))
varyn_nlp_1 = supe_varyn_real(p, q, nrange, lambdas, nrepeat)
varyn_nlp_1.to_pickle('results/real/nvary-nlp-supp64-df.pkl')

# 2048 bins (same first-level key, larger quantization)
o = outs[2000][2048]
p = o.p_hist
q = o.q_hist
print(p.shape, q.shape, np.linalg.norm(p), np.linalg.norm(q))
varyn_nlp_2 = supe_varyn_real(p, q, nrange, lambdas, nrepeat)
varyn_nlp_2.to_pickle('results/real/nvary-nlp-supp2048-df.pkl')


Figure 16 (bottom) in the appendix can be reproduced as follows.

In [None]:
# Load the real-data divergence-frontier sample-size sweeps pickled above.
fnames = [
    'nvary-vision-supp128-df',
    'nvary-vision-supp1024-df',
    'nvary-nlp-supp64-df',
    'nvary-nlp-supp2048-df',
]
dfs = list(map(pd.read_pickle, (f'results/real/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Sample size']
titles = [
    r'(a) Images ($k=128$)',
    r'(b) Images ($k=1024$)',
    r'(c) Text ($k=64$)',
    r'(d) Text ($k=2048$)',
]
fname = 'graphs/real-bound-n-df.pdf'
plot_stat_bound(
    dfs, xlabels, titles,
    const=7, fname=fname, save=False)

We then vary the support size.

In [None]:
# 100 evenly spaced values in [0.01, 0.99], passed on as `lambdas`.
lambdas = np.linspace(0.01, 0.99, 100)
nrepeat = 100

# load vision data
# NOTE: unpickling can execute arbitrary code; only load archives obtained
# from a trusted source.
with open('parsed_outputs/cifar10_lattice.p', 'rb') as f:
    outs = pkl.load(f)

# Sweep over the support sizes stored under key 100000, at two sample sizes.
o = outs[100000]
varyk_vision_1 = supe_varyk_real(o, 1000, lambdas, nrepeat)
varyk_vision_2 = supe_varyk_real(o, 10000, lambdas, nrepeat)
varyk_vision_1.to_pickle('results/real/kvary-vision-size1000-df.pkl')
varyk_vision_2.to_pickle('results/real/kvary-vision-size10000-df.pkl')

# load text data
with open('parsed_outputs/wikitext103.p', 'rb') as f:
    outs = pkl.load(f)

# Same sweep for the text data stored under key 2000.
o = outs[2000]
varyk_nlp_1 = supe_varyk_real(o, 1000, lambdas, nrepeat)
varyk_nlp_2 = supe_varyk_real(o, 10000, lambdas, nrepeat)
varyk_nlp_1.to_pickle('results/real/kvary-nlp-size1000-df.pkl')
varyk_nlp_2.to_pickle('results/real/kvary-nlp-size10000-df.pkl')

Figure 17 (bottom) in the appendix can be reproduced as follows.

In [None]:
# Load the real-data divergence-frontier support-size sweeps pickled above.
fnames = [
    'kvary-vision-size1000-df',
    'kvary-vision-size10000-df',
    'kvary-nlp-size1000-df',
    'kvary-nlp-size10000-df',
]
dfs = list(map(pd.read_pickle, (f'results/real/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Support size']
titles = [
    r'(a) Images ($n=1000$)',
    r'(b) Images ($n=10000$)',
    r'(c) Text ($n=1000$)',
    r'(d) Text ($n=10000$)',
]
fname = 'graphs/real-bound-k-df.pdf'
plot_stat_bound(
    dfs, xlabels, titles,
    const=7, fname=fname, save=False)

### Distribution estimators for frontier integral

We first vary the sample size.

Figure 18 (top) in the appendix can be reproduced as follows.

In [None]:
# Re-use the pickled real-data sample-size sweeps for the estimator plot.
fnames = [
    'nvary-vision-supp128',
    'nvary-vision-supp1024',
    'nvary-nlp-supp64',
    'nvary-nlp-supp2048',
]
dfs = list(map(pd.read_pickle, (f'results/real/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Sample size']
titles = [
    r'(a) Images ($k=128$)',
    r'(b) Images ($k=1024$)',
    r'(c) Text ($k=64$)',
    r'(d) Text ($k=2048$)',
]
fname = 'graphs/real-smoothing-n.pdf'
plot_dist_est(
    dfs, xlabels, titles,
    fname=fname, save=False)

We then vary the support size.

Figure 19 (top) in the appendix can be reproduced as follows.

In [None]:
# Re-use the pickled real-data support-size sweeps for the estimator plot.
fnames = [
    'kvary-vision-size1000',
    'kvary-vision-size10000',
    'kvary-nlp-size1000',
    'kvary-nlp-size10000',
]
dfs = list(map(pd.read_pickle, (f'results/real/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Support size']
titles = [
    r'(a) Images ($n=1000$)',
    r'(b) Images ($n=10000$)',
    r'(c) Text ($n=1000$)',
    r'(d) Text ($n=10000$)',
]

fname = 'graphs/real-smoothing-k.pdf'
plot_dist_est(
    dfs, xlabels, titles,
    fname=fname, save=False)

Figure 7 in the main text can be reproduced as follows.

In [None]:
# Main-text figure: two sample-size panels followed by two support-size panels.
fnames = [
    'nvary-vision-supp128',
    'nvary-nlp-supp2048',
    'kvary-vision-size1000',
    'kvary-nlp-size10000',
]
dfs = list(map(pd.read_pickle, (f'results/real/{file}.pkl' for file in fnames)))

xlabels = [
    'Sample size',
    'Sample size',
    'Support size',
    'Support size',
]
titles = [
    r'(a) Images ($k=128$)',
    r'(b) Text ($k=2048$)',
    r'(c) Images ($n=1000$)',
    r'(d) Text ($n=10000$)',
]
fname = 'graphs/real-smoothing.pdf'
plot_dist_est(
    dfs, xlabels, titles,
    fname=fname, save=False)

### Distribution estimators for divergence frontiers

We first vary the sample size.

Figure 18 (bottom) in the appendix can be reproduced as follows.

In [None]:
# Re-use the pickled divergence-frontier sample-size sweeps for the estimator plot.
fnames = [
    'nvary-vision-supp128-df',
    'nvary-vision-supp1024-df',
    'nvary-nlp-supp64-df',
    'nvary-nlp-supp2048-df',
]
dfs = list(map(pd.read_pickle, (f'results/real/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Sample size']
titles = [
    r'(a) Images ($k=128$)',
    r'(b) Images ($k=1024$)',
    r'(c) Text ($k=64$)',
    r'(d) Text ($k=2048$)',
]
fname = 'graphs/real-smoothing-n-df.pdf'
plot_dist_est(
    dfs, xlabels, titles,
    fname=fname, save=False)

We then vary the support size.

Figure 19 (bottom) in the appendix can be reproduced as follows.

In [None]:
# Re-use the pickled divergence-frontier support-size sweeps for the estimator plot.
fnames = [
    'kvary-vision-size1000-df',
    'kvary-vision-size10000-df',
    'kvary-nlp-size1000-df',
    'kvary-nlp-size10000-df',
]
dfs = list(map(pd.read_pickle, (f'results/real/{file}.pkl' for file in fnames)))

xlabels = 4 * ['Support size']
titles = [
    r'(a) Images ($n=1000$)',
    r'(b) Images ($n=10000$)',
    r'(c) Text ($n=1000$)',
    r'(d) Text ($n=10000$)',
]
fname = 'graphs/real-smoothing-k-df.pdf'
plot_dist_est(
    dfs, xlabels, titles,
    fname=fname, save=False)

### TODO: Correlations between K-means and Lattice quantization

On the image data.

In [None]:
# Support sizes at which the two quantization schemes are compared.
k_list = [8, 16, 32, 64, 128, 256, 512, 1024]

# NOTE: unpickling can execute arbitrary code; only load archives obtained
# from a trusted source.
with open('parsed_outputs/cifar10_lattice.p', 'rb') as f:
    outs_new_lattice = pkl.load(f)
o1 = outs_new_lattice[100000]  # lattice quantization
# (A stray `o1.keys()` expression was dropped: mid-cell it displayed nothing.)
arr1 = [o1[k].line_mauve for k in k_list]
print(arr1)

In [None]:
# Same values as `arr1`, but read from the k-means quantization results.
# The loaded object is bound to `outs_new_kmeans`: it was previously called
# `outs_new_lattice` and commented "from lattice", although it is read from
# the k-means file.
# NOTE: unpickling can execute arbitrary code; only load archives obtained
# from a trusted source.
with open('parsed_outputs/cifar10_kmeans.p', 'rb') as f:
    outs_new_kmeans = pkl.load(f)
o2 = outs_new_kmeans[100000]  # k-means quantization
arr2 = [o2[k].line_mauve for k in k_list]
print(arr2)

In [None]:
from scipy.stats import pearsonr, spearmanr

# Rank correlation first, then linear correlation, between the lattice
# (`arr1`) and k-means (`arr2`) values.
for corr in (spearmanr, pearsonr):
    print(corr(arr1, arr2))

On the text data.

In [None]:
# Support sizes 8, 16, ..., 1024 (powers of two).
k_list = [2 ** exponent for exponent in range(3, 11)]

with open('parsed_outputs/wikitext103_lattice.p', 'rb') as f:
    outs_new_lattice = pkl.load(f)
# Lattice-quantization results stored under key 1800.
o1 = outs_new_lattice[1800]
arr1 = [getattr(o1[k], 'line_mauve') for k in k_list]
print(arr1)

In [None]:
# K-means results for the same checkpoint; entries are keyed by the string
# 'mauve_k=<k>' rather than by the integer k.
with open('parsed_outputs/wikitext103_kmeans_checkpoint-1800.pkl', 'rb') as f:
    outs_new_kmeans = pkl.load(f)
arr2 = [
    getattr(outs_new_kmeans[f'mauve_k={k}'], 'line_mauve')
    for k in k_list
]
print(arr2)

In [None]:
from scipy.stats import pearsonr, spearmanr

# Rank correlation first, then linear correlation, between the lattice
# (`arr1`) and k-means (`arr2`) values.
for corr in (spearmanr, pearsonr):
    print(corr(arr1, arr2))