In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from galton import galton_rvs
from galton import galton_rvs_ratio

In [None]:
from galton import n_nails
from galton import n_rows

# The two reference parameter values compared throughout the notebook.
theta_0, theta_1 = -0.8, -0.6

# Density estimation with histograms

In [None]:
# Histogram settings shared by both estimates: n_nails unit-width bins
# covering [0, n_nails], normalised to a density.
hist_options = dict(bins=n_nails, range=(0, n_nails), density=True)

# 20000 simulated runs at each reference parameter value.
samples_0, _, scores_0, _ = galton_rvs(theta_0, n_runs=20000, random_state=1234)
p_estimated_0, _ = np.histogram(samples_0, **hist_options)

samples_1, _, scores_1, _ = galton_rvs(theta_1, n_runs=20000, random_state=1234)
p_estimated_1, _ = np.histogram(samples_1, **hist_options)

In [None]:
# Overlay the two histogram density estimates on a common x axis.
nail_positions = range(len(p_estimated_0))
for density, colour, legend_label in [
    (p_estimated_0, "r", r"$p(x|\theta_0)$"),
    (p_estimated_1, "b", r"$p(x|\theta_1)$"),
]:
    plt.step(nail_positions, density, c=colour, lw=1.5, label=legend_label)

plt.xlabel("x")
plt.ylabel("p(x)")
plt.legend()
plt.show()

# NDE + SCANDAL

In [None]:
from galton import check_random_state
from keras.utils import to_categorical


def draw_nde(n_samples, n_thetas=10, random_state=0):
    """Simulate a shuffled training set for the NDE and SCANDAL models.

    ``n_thetas`` parameter values are taken evenly spaced on [-1.0, -0.4] and
    ``n_samples // n_thetas`` runs are simulated with ``galton_rvs`` for each
    of them, the k-th value being seeded with ``random_state + k``.

    Returns a tuple ``(x, t_xz, thetas)`` of row-aligned arrays: ``x`` is the
    first output of ``galton_rvs`` one-hot encoded over ``n_nails`` classes,
    ``t_xz`` is its third output, and ``thetas`` holds the parameter value
    each row was simulated with (the last two as column vectors).
    """
    runs_per_theta = n_samples // n_thetas
    theta_grid = np.linspace(-1.0, -0.4, n_thetas)

    xs, scores, thetas = [], [], []
    for offset, theta in enumerate(theta_grid):
        x, _, t_xz, _ = galton_rvs(
            theta, n_runs=runs_per_theta, random_state=random_state + offset
        )
        xs.append(x)
        scores.append(t_xz)
        thetas.append(np.full(runs_per_theta, theta))

    one_hot_x = to_categorical(np.array(xs).reshape(-1, 1), num_classes=n_nails)
    scores = np.array(scores).reshape(-1, 1)
    thetas = np.array(thetas).reshape(-1, 1)

    # Apply one common random permutation so that the rows stay aligned.
    rng = check_random_state(random_state)
    n_total = len(one_hot_x)
    order = rng.choice(n_total, n_total, replace=False)

    return one_hot_x[order], scores[order], thetas[order]

In [None]:
# Training set for the density models: one-hot x, t_xz and the matching thetas.
x, t_xz, theta = draw_nde(n_samples=10000)

In [None]:
# Network dimensions shared by the model builders below.
hidden_size = 20     # units in each tanh hidden layer
n_outputs = n_nails  # width of the per-class output layer and of the one-hot x input
n_features = 1       # declared output shape of the gradient (Lambda) layers

In [None]:
from keras.optimizers import Adam
from keras.layers import Input, Dense, Lambda, dot
from keras.models import Model
from keras.utils import to_categorical
import keras.backend as K


def make_nde(lr=0.001):
    """Build and compile the neural density estimator.

    The network maps a scalar theta through one tanh hidden layer of
    ``hidden_size`` units to a softmax over ``n_outputs`` classes. It is
    compiled with categorical cross-entropy and Adam at learning rate ``lr``.
    """
    theta_in = Input(shape=(1,))
    hidden = Dense(hidden_size, activation="tanh")(theta_in)
    density = Dense(n_outputs, activation="softmax")(hidden)

    nde_model = Model(inputs=[theta_in], outputs=[density])
    nde_model.compile(loss=["categorical_crossentropy"], optimizer=Adam(lr=lr))
    return nde_model


def make_scandal(lr=0.001):
    """Build and compile the SCANDAL model (density network plus score output).

    Inputs are ``[theta, x]`` where ``x`` has ``n_outputs`` columns. The first
    output is a softmax over ``n_outputs`` classes computed from theta alone;
    the second is the gradient with respect to theta of the log of the dot
    product between ``x`` and that softmax. The outputs are trained with
    categorical cross-entropy and mean squared error respectively, using Adam
    at learning rate ``lr``.
    """
    theta_in = Input(shape=(1,))
    hidden = Dense(hidden_size, activation="tanh")(theta_in)
    density = Dense(n_outputs, activation="softmax")(hidden)

    x_in = Input(shape=(n_outputs,))
    # Dot product of x with the predicted density, row by row.
    prob_of_x = dot([x_in, density], axes=1)

    def score_wrt_theta(tensors):
        # tensors = [dot(x, density), theta]; differentiate the log of the
        # first with respect to the second.
        return K.gradients(K.log(tensors[0]), [tensors[1]])[0]

    score = Lambda(score_wrt_theta, output_shape=(n_features,))(
        [prob_of_x, theta_in]
    )

    scandal_model = Model(inputs=[theta_in, x_in], outputs=[density, score])
    scandal_model.compile(
        loss=["categorical_crossentropy", "mse"], optimizer=Adam(lr=lr)
    )
    return scandal_model

In [None]:
# Fit the density estimator: theta is the network input, one-hot x the target.
nde = make_nde()
nde.fit(x=theta, y=x, batch_size=32, epochs=50, verbose=0)

In [None]:
# Fit SCANDAL on both targets: the one-hot x and t_xz.
scandal = make_scandal()
scandal.fit(
    x=[theta, x],
    y=[x, t_xz],
    batch_size=32,
    epochs=50,
)

In [None]:
# Compare the histogram estimate with both learned densities at theta_0.
nail_axis = range(n_nails)
nde_density = nde.predict(np.array([theta_0])).ravel()
# Only the first SCANDAL output (the density) is plotted; an all-zero x is fed
# as the second input.
scandal_density = scandal.predict(
    [np.array([[theta_0]]), np.zeros((1, n_nails))]
)[0][0]

plt.step(nail_axis, p_estimated_0, label="histogram")
plt.step(nail_axis, nde_density, label="nde")
plt.step(nail_axis, scandal_density, label="scandal")
plt.legend()
plt.title(r"$\theta=%.2f$" % theta_0)
plt.show()

In [None]:
# Compare the histogram estimate with both learned densities at theta_1.
nail_axis = range(n_nails)
nde_density = nde.predict(np.array([theta_1])).ravel()
# Only the first SCANDAL output (the density) is plotted; an all-zero x is fed
# as the second input.
scandal_density = scandal.predict(
    [np.array([[theta_1]]), np.zeros((1, n_nails))]
)[0][0]

plt.step(nail_axis, p_estimated_1, label="histogram")
plt.step(nail_axis, nde_density, label="nde")
plt.step(nail_axis, scandal_density, label="scandal")
plt.legend()
plt.title(r"$\theta=%.2f$" % theta_1)
plt.show()

# CARL

In [None]:
def make_carl(lr=0.001):
    """Build and compile the CARL classifier with a per-class ratio head.

    A scalar theta is mapped through one tanh hidden layer to ``n_outputs``
    linear outputs, which are exponentiated. The dot product of these values
    with the ``x`` input (``n_outputs`` columns) gives ``r``, and the model
    output is ``1 / (1 + r)``. The model takes ``[theta0, x]`` and is compiled
    with binary cross-entropy and Adam at learning rate ``lr``.
    """
    theta_in = Input(shape=(1,))

    hidden = Dense(hidden_size, activation="tanh")(theta_in)
    log_ratios = Dense(n_outputs, activation="linear")(hidden)
    ratios = Lambda(lambda log_r: K.exp(log_r))(log_ratios)

    x_in = Input(shape=(n_outputs,))
    ratio_of_x = dot([x_in, ratios], axes=1)

    # Turn the ratio into the classifier output s = 1 / (1 + r).
    s_hat = Lambda(lambda r: 1. / (1. + r))(ratio_of_x)

    carl_model = Model(inputs=[theta_in, x_in], outputs=[s_hat])
    carl_model.compile(loss=["binary_crossentropy"], optimizer=Adam(lr=lr))
    return carl_model


def make_carl_alt(lr=0.001):
    """Build and compile the CARL classifier that takes x as a single column.

    The inputs ``[theta0, x]`` (one column each) are concatenated and passed
    through two tanh hidden layers of ``hidden_size`` units to a single linear
    output, which is exponentiated to give ``r``. The model output is
    ``1 / (1 + r)``. The model is compiled with binary cross-entropy and Adam
    at learning rate ``lr``.
    """
    # Imported here because the notebook only imports `concatenate` in a later
    # cell; without this, calling the builder before that cell has run raises
    # a NameError on a fresh kernel.
    from keras.layers import concatenate

    theta0 = Input(shape=(1,))
    x = Input(shape=(1,))

    h = Dense(hidden_size, activation="tanh")(
        concatenate([theta0, x])
    )
    h = Dense(hidden_size, activation="tanh")(h)
    log_rhat = Dense(1, activation="linear")(h)
    rhat = Lambda(lambda exp: K.exp(exp))(log_rhat)
    # Classifier output s = 1 / (1 + r).
    shat = Lambda(lambda r: 1. / (1. + r))(rhat)

    model = Model(inputs=[theta0, x], outputs=[shat])
    opt = Adam(lr=lr)
    model.compile(loss=["binary_crossentropy"], optimizer=opt)

    return model

In [None]:
def log_r_from_s(s, epsilon=1.e-6):
    """Return ``log((1 - s + epsilon) / (s + epsilon))`` element-wise.

    ``epsilon`` is added to both numerator and denominator, which keeps the
    result finite when ``s`` is exactly 0 or 1.
    """
    numerator = 1. - s + epsilon
    denominator = s + epsilon
    return np.log(numerator / denominator)

In [None]:
def draw_ratio(n_samples, n_thetas=10, random_state=0,
               draw_from_theta0=True, draw_from_theta1=True, categorical_x=True):
    """Simulate a training set for the likelihood-ratio estimators.

    For each of ``n_thetas`` values ``theta`` evenly spaced on [-1.0, -0.4],
    runs are simulated with ``galton_rvs_ratio`` from ``theta`` (label 0)
    and/or from the module-level ``theta_1`` (label 1), depending on the two
    ``draw_from_*`` flags. ``n_samples`` is split evenly across the theta
    values and the enabled sources.

    Previously the simulator was always called with the module-level
    ``theta_0`` and one fixed seed, so every iteration returned the same data
    under a different ``theta`` label. The loop value is now the numerator
    parameter, and integer seeds are offset per iteration as in ``draw_nde``.
    # NOTE(review): assumes the theta grid was meant to parameterise the
    # numerator hypothesis -- confirm against the intended experiment.

    Parameters
    ----------
    n_samples : int
        Total number of runs requested.
    n_thetas : int
        Number of grid values for the varying parameter.
    random_state : int, RandomState instance or None
        Seed forwarded to ``galton_rvs_ratio``. Integer seeds are offset by
        the grid index; any other value is forwarded unchanged.
    draw_from_theta0, draw_from_theta1 : bool
        Which of the two sources to simulate from.
    categorical_x : bool
        If True, one-hot encode x over ``n_nails`` classes; otherwise x is
        returned as a single column.

    Returns
    -------
    tuple ``(all_x, all_theta, all_y, all_r_xz, all_t_xz_0)`` of row-aligned
    arrays: x, the grid value for each row, the source label (0 or 1),
    ``exp(log_p_xz_0 - log_p_xz_1)``, and ``t_xz_0`` as unpacked from
    ``galton_rvs_ratio``.

    Raises
    ------
    ValueError
        If both ``draw_from_theta0`` and ``draw_from_theta1`` are False
        (this used to surface as a ZeroDivisionError).
    """
    n_sources = int(draw_from_theta0) + int(draw_from_theta1)
    if n_sources == 0:
        raise ValueError(
            "At least one of draw_from_theta0 and draw_from_theta1 must be True."
        )
    n_traces_per_theta = n_samples // n_thetas // n_sources

    all_x = []
    all_theta = []
    all_y = []
    all_r_xz = []
    all_t_xz_0 = []

    for k, theta in enumerate(np.linspace(-1.0, -0.4, n_thetas)):
        # Use a different seed for each grid value so the draws are not copies
        # of one another; non-integer random states are forwarded untouched.
        if isinstance(random_state, (int, np.integer)):
            seed = random_state + k
        else:
            seed = random_state

        # Draw according to the varying parameter (numerator), label 0
        if draw_from_theta0:
            x, log_p_xz_0, log_p_xz_1, t_xz_0, _, _ = galton_rvs_ratio(
                theta, theta_1, n_runs=n_traces_per_theta, random_state=seed
            )
            all_x.append(x)
            all_y += [0] * len(x)
            all_r_xz.append(log_p_xz_0 - log_p_xz_1)
            all_t_xz_0.append(t_xz_0)
            all_theta += [theta] * len(x)

        # Draw according to theta_1 (denominator), label 1; the outputs come
        # back in swapped order because the argument order is swapped.
        if draw_from_theta1:
            x, log_p_xz_1, log_p_xz_0, _, t_xz_0, _ = galton_rvs_ratio(
                theta_1, theta, n_runs=n_traces_per_theta, random_state=seed
            )
            all_x.append(x)
            all_y += [1] * len(x)
            all_r_xz.append(log_p_xz_0 - log_p_xz_1)
            all_t_xz_0.append(t_xz_0)
            all_theta += [theta] * len(x)

    all_x = np.array(all_x).reshape(-1, 1)
    if categorical_x:
        all_x = to_categorical(all_x, num_classes=n_nails)
    all_theta = np.array(all_theta).reshape(-1, 1)
    all_y = np.array(all_y)
    all_r_xz = np.array(all_r_xz).reshape(-1, 1)
    # Convert the log-ratio to a ratio.
    all_r_xz = np.exp(all_r_xz)
    all_t_xz_0 = np.array(all_t_xz_0).reshape(-1, 1)

    return all_x, all_theta, all_y, all_r_xz, all_t_xz_0

In [None]:
# Calibration
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.isotonic import IsotonicRegression

def make_calibrated_carl_alt(theta, x, y, random_state=None):
    """Train an ensemble of isotonic-calibrated ``make_carl_alt`` classifiers.

    Three stratified shuffle splits (80% train / 20% calibration) are drawn.
    On each split a fresh classifier is trained on the training part and an
    isotonic regression is fitted on the calibration part, mapping the
    classifier output to the labels.

    Parameters
    ----------
    theta, x, y : arrays indexable by integer index arrays
        Classifier inputs (``theta``, ``x``) and labels (``y``).
    random_state : int, RandomState instance or None
        Forwarded to ``StratifiedShuffleSplit`` so that the splits can be
        reproduced. The default (None) keeps the original unseeded behaviour.

    Returns
    -------
    callable ``f(theta_, x_)`` returning the calibrated classifier output
    averaged over the ensemble, with the same shape as ``x_``.
    """
    cv = StratifiedShuffleSplit(n_splits=3, test_size=0.2,
                                random_state=random_state)

    classifiers = []
    calibrators = []

    for train_idx, calibration_idx in cv.split(x, y):
        # Train classifier
        clf = make_carl_alt()
        clf.fit([theta[train_idx], x[train_idx]],
                y[train_idx],
                batch_size=32, epochs=50, verbose=0)
        classifiers.append(clf)

        # Isotonic calibration
        scores = clf.predict([theta[calibration_idx], x[calibration_idx]])
        cal = IsotonicRegression(y_min=None, y_max=None,
                                 increasing=True, out_of_bounds='clip')
        cal.fit(scores.flatten(), y[calibration_idx].flatten())
        # Keep the fitted regressor itself. A lambda closing over `cal` would
        # look the name up at call time, so every calibrator would silently
        # use the regression fitted on the last split.
        calibrators.append(cal)

    def calibrated_carl_prediction(theta_, x_):
        # The builtin float replaces the np.float alias, which recent NumPy
        # releases no longer provide.
        x_ = x_.astype(float)
        s = np.zeros_like(x_)
        for clf, cal in zip(classifiers, calibrators):
            s += cal.transform(clf.predict([theta_, x_]).flatten())
        s /= len(classifiers)
        return s

    return calibrated_carl_prediction

In [None]:
# Labelled training sets for the classifier-based estimators: one with one-hot
# encoded x, one with x kept as a single column.
all_x, all_theta, all_y, _, _ = draw_ratio(n_samples=5000)
all_x_alt, all_theta_alt, all_y_alt, _, _ = draw_ratio(
    n_samples=5000, categorical_x=False
)

# Column-wise concatenation of the parameters and the observations.
all_theta_x = np.hstack((all_theta, all_x))
all_theta_x_alt = np.hstack((all_theta_alt, all_x_alt))


In [None]:
# Train the CARL classifier on the one-hot training set.
carl = make_carl()
carl.fit(x=[all_theta, all_x], y=all_y, batch_size=32, epochs=50, verbose=0)

In [None]:
# Train the single-column-x CARL classifier on the matching training set.
carl_alt = make_carl_alt()
carl_alt.fit(
    x=[all_theta_alt, all_x_alt],
    y=all_y_alt,
    batch_size=32,
    epochs=50,
    verbose=0,
)

In [None]:
# Ensemble of calibrated classifiers trained on the single-column-x set.
carl_calibrated = make_calibrated_carl_alt(
    theta=all_theta_alt, x=all_x_alt, y=all_y_alt
)

In [None]:
# The comparison drops the first and last five positions.
inner = slice(5, -5)
inner_positions = range(5, n_nails - 5)

# Reference log-ratio from the histogram estimates.
log_r_true = np.log(p_estimated_0[inner]) - np.log(p_estimated_1[inner])

# Evaluation inputs: every position, as one-hot rows and as a plain vector,
# each paired with theta_0.
x_test_alt = np.arange(n_nails)
x_test = to_categorical(x_test_alt, num_classes=n_nails)
theta0_test = theta_0 * np.ones((n_nails, 1))

log_r_carl = log_r_from_s(carl.predict([theta0_test, x_test])[inner])
log_r_carl_alt = log_r_from_s(carl_alt.predict([theta0_test, x_test_alt])[inner])
log_r_carl_calibrated = log_r_from_s(
    carl_calibrated(theta0_test, x_test_alt)[inner]
)

for curve, curve_label in [
    (log_r_true, "Truth"),
    (log_r_carl, "CARL"),
    (log_r_carl_alt, "CARL (old architecture)"),
    (log_r_carl_calibrated, "Calibrated CARL"),
]:
    plt.step(inner_positions, curve, label=curve_label)
plt.legend()
plt.show()

# ROLR + RASCAL

In [None]:
# Training sets for the ratio-regression estimators, drawn from theta_1 only:
# one with one-hot encoded x, one with x kept as a single column.
all_x, all_thetas, _, all_r_xz, all_t_xz_0 = draw_ratio(
    n_samples=5000, draw_from_theta0=False
)
all_x_alt, all_thetas_alt, _, all_r_xz_alt, all_t_xz_0_alt = draw_ratio(
    n_samples=5000, draw_from_theta0=False, categorical_x=False
)


In [None]:
from keras.losses import mean_squared_error
from keras.layers import concatenate


def make_rolr(lr=0.001):
    """Build and compile the ROLR ratio regressor with a per-class ratio head.

    A scalar theta is mapped through one tanh hidden layer to ``n_outputs``
    linear outputs, which are exponentiated. The model output is the dot
    product of these values with the ``x`` input (``n_outputs`` columns). The
    model takes ``[theta0, x]`` and is compiled with mean squared error and
    Adam at learning rate ``lr``.
    """
    theta_in = Input(shape=(1,))

    hidden = Dense(hidden_size, activation="tanh")(theta_in)
    log_ratios = Dense(n_outputs, activation="linear")(hidden)
    ratios = Lambda(lambda log_r: K.exp(log_r))(log_ratios)

    x_in = Input(shape=(n_outputs,))
    ratio_of_x = dot([x_in, ratios], axes=1)

    rolr_model = Model(inputs=[theta_in, x_in], outputs=[ratio_of_x])
    rolr_model.compile(loss=["mse"], optimizer=Adam(lr=lr))
    return rolr_model


def make_rascal(lr=0.001):
    """Build and compile the RASCAL model (ratio output plus score output).

    The ratio branch is the same as in ``make_rolr``: theta goes through one
    tanh hidden layer to ``n_outputs`` linear outputs, which are exponentiated
    and dotted with ``x``. The second output is the gradient with respect to
    theta of the dot product between ``x`` and the un-exponentiated linear
    outputs. Both outputs use mean squared error, weighted 1.0 and 0.5, with
    Adam at learning rate ``lr``.
    """
    theta_in = Input(shape=(1,))

    hidden = Dense(hidden_size, activation="tanh")(theta_in)
    log_ratios = Dense(n_outputs, activation="linear")(hidden)
    ratios = Lambda(lambda log_r: K.exp(log_r))(log_ratios)

    x_in = Input(shape=(n_outputs,))
    ratio_of_x = dot([x_in, ratios], axes=1)

    log_ratio_of_x = dot([x_in, log_ratios], axes=1)

    def gradient_wrt_theta(tensors):
        # tensors = [log-ratio for x, theta]; differentiate the first with
        # respect to the second.
        return K.gradients(tensors[0], [tensors[1]])[0]

    score = Lambda(gradient_wrt_theta, output_shape=(n_features,))(
        [log_ratio_of_x, theta_in]
    )

    rascal_model = Model(inputs=[theta_in, x_in], outputs=[ratio_of_x, score])
    rascal_model.compile(
        loss=["mse", "mse"], loss_weights=[1.0, 0.5], optimizer=Adam(lr=lr)
    )
    return rascal_model


def make_rolr_alt(lr=0.001):
    """Build and compile the ROLR regressor that takes x as a single column.

    The inputs ``[theta0, x]`` (one column each) are concatenated and passed
    through two tanh hidden layers of ``hidden_size`` units to a single linear
    output, whose exponential is the model output. The model is compiled with
    mean squared error and Adam at learning rate ``lr``.
    """
    theta_in = Input(shape=(1,))
    x_in = Input(shape=(1,))

    joined = concatenate([theta_in, x_in])
    hidden = Dense(hidden_size, activation="tanh")(joined)
    hidden = Dense(hidden_size, activation="tanh")(hidden)
    log_ratio = Dense(1, activation="linear")(hidden)
    ratio = Lambda(lambda log_r: K.exp(log_r))(log_ratio)

    rolr_model = Model(inputs=[theta_in, x_in], outputs=[ratio])
    rolr_model.compile(loss=["mse"], optimizer=Adam(lr=lr))
    return rolr_model


def make_rascal_alt(lr=0.001):
    """Build and compile the RASCAL model that takes x as a single column.

    The inputs ``[theta0, x]`` (one column each) are concatenated and passed
    through two tanh hidden layers of ``hidden_size`` units to a single linear
    output. The first model output is its exponential; the second is the
    gradient of the linear output with respect to theta. Both outputs use mean
    squared error, weighted 1.0 and 0.5, with Adam at learning rate ``lr``.
    """
    theta_in = Input(shape=(1,))
    x_in = Input(shape=(1,))

    joined = concatenate([theta_in, x_in])
    hidden = Dense(hidden_size, activation="tanh")(joined)
    hidden = Dense(hidden_size, activation="tanh")(hidden)
    log_ratio = Dense(1, activation="linear")(hidden)
    ratio = Lambda(lambda log_r: K.exp(log_r))(log_ratio)

    def gradient_wrt_theta(tensors):
        # tensors = [log-ratio, theta]; differentiate the first with respect
        # to the second.
        return K.gradients(tensors[0], [tensors[1]])[0]

    score = Lambda(gradient_wrt_theta, output_shape=(n_features,))(
        [log_ratio, theta_in]
    )

    rascal_model = Model(inputs=[theta_in, x_in], outputs=[ratio, score])
    rascal_model.compile(
        loss=["mse", "mse"], loss_weights=[1.0, 0.5], optimizer=Adam(lr=lr)
    )
    return rascal_model

In [None]:
# Regress the ratio on the one-hot training set.
rolr = make_rolr()
rolr.fit(x=[all_thetas, all_x], y=all_r_xz, batch_size=32, epochs=50, verbose=0)

In [None]:
# Regress both the ratio and t_xz_0 on the one-hot training set.
rascal = make_rascal()
rascal.fit(
    x=[all_thetas, all_x],
    y=[all_r_xz, all_t_xz_0],
    batch_size=32,
    epochs=50,
    verbose=0,
)

In [None]:
# Train the single-column-x ROLR model on the targets that were drawn together
# with its inputs (the *_alt set), as the other *_alt fits do, rather than on
# the targets of the one-hot set.
rolr_alt = make_rolr_alt()
rolr_alt.fit([all_thetas_alt, all_x_alt],
             all_r_xz_alt,
             batch_size=32, epochs=50, verbose=0)

In [None]:
# Regress both the ratio and t_xz_0 on the single-column-x training set.
rascal_alt = make_rascal_alt()
rascal_alt.fit(
    x=[all_thetas_alt, all_x_alt],
    y=[all_r_xz_alt, all_t_xz_0_alt],
    batch_size=32,
    epochs=50,
    verbose=0,
)

In [None]:
# The comparison drops the first and last five positions.
inner = slice(5, -5)
inner_positions = range(5, n_nails - 5)

# Reference ratio from the histogram estimates.
r_true = np.exp(np.log(p_estimated_0[inner]) - np.log(p_estimated_1[inner]))

# Evaluation inputs: every position, as one-hot rows and as a plain vector,
# each paired with theta_0.
x_test_alt = np.arange(n_nails)
x_test = to_categorical(x_test_alt, num_classes=n_nails)
theta0_test = theta_0 * np.ones(n_nails)

# The RASCAL models have two outputs; index 0 is the ratio.
r_rolr = rolr.predict([theta0_test, x_test])[inner]
r_rascal = rascal.predict([theta0_test, x_test])[0][inner]
r_rolr_alt = rolr_alt.predict([theta0_test, x_test_alt])[inner]
r_rascal_alt = rascal_alt.predict([theta0_test, x_test_alt])[0][inner]

for ratio_curve, curve_label in [
    (r_true, "Truth"),
    (r_rolr, "ROLR"),
    (r_rascal, "RASCAL"),
    (r_rolr_alt, "ROLR (old architecture)"),
    (r_rascal_alt, "RASCAL (old architecture)"),
]:
    plt.step(inner_positions, ratio_curve, label=curve_label)
plt.legend()
plt.show()

---

# Model comparison

In [181]:
def compute_mse(p_train, p_estimated):
    """Weighted squared error between two log-ratios built from densities.

    Both arguments are indexed as ``[0]`` (numerator density) and ``[1]``
    (denominator density). The log-ratio ``log p[0] - log p[1]`` is formed for
    each, their difference is squared and summed with weights
    ``p_estimated[1]``. Entries whose difference is infinite or NaN (for
    example from empty bins) contribute zero.
    """
    log_ratio_model = np.log(p_train[0]) - np.log(p_train[1])
    log_ratio_reference = np.log(p_estimated[0]) - np.log(p_estimated[1])

    residual = log_ratio_model - log_ratio_reference
    # Ignore entries where either log-ratio is undefined.
    residual[~np.isfinite(residual)] = 0.

    return np.dot(residual ** 2, p_estimated[1])


def compute_mse_ratio(ratio, p_estimated, log=True):
    """Weighted squared error between an estimated ratio and a reference.

    The reference log-ratio is ``log p_estimated[0] - log p_estimated[1]``.
    ``ratio`` is compared to it after taking its logarithm when ``log`` is
    True, or as given when ``log`` is False. The squared differences are
    summed with weights ``p_estimated[1]``; entries whose difference is
    infinite or NaN contribute zero.
    """
    log_ratio_reference = np.log(p_estimated[0]) - np.log(p_estimated[1])
    log_ratio_model = np.log(ratio) if log else ratio

    residual = log_ratio_model - log_ratio_reference
    # Ignore entries where either log-ratio is undefined.
    residual[~np.isfinite(residual)] = 0.

    return np.dot(residual ** 2, p_estimated[1])

In [None]:
# Reference densities stacked as rows: row 0 for theta_0, row 1 for theta_1.
p_estimated = np.array([p_estimated_0, p_estimated_1])

# Training-set sizes swept by the comparison.
samples_size = [100, 200, 500, 1000, 2000, 5000, 10000]

# Evaluation inputs: every position, as a plain vector and as one-hot rows,
# each paired with theta_0.
x_test_alt = np.arange(n_nails)
x_test = to_categorical(x_test_alt, num_classes=n_nails)
theta0_test = theta_0 * np.ones(n_nails)

# One error list per method; one entry is appended per training-set size.
mses_nde, mses_scandal = [], []
mses_carl, mses_carl_alt, mses_carl_calibrated = [], [], []
mses_rolr, mses_rolr_alt = [], []
mses_rascal, mses_rascal_alt = [], []

# Keyword arguments shared by every fit in the sweep.
fit_options = dict(batch_size=32, epochs=50, verbose=0)

# For each training-set size, retrain every estimator from scratch and record
# its error against the histogram-based reference.
for n in samples_size:
    print("{0} samples".format(n))

    # NDE - SCANDAL
    # The sampler for the density models is `draw_nde`; the previously used
    # name `draw` is not defined anywhere in the notebook.
    x, t_xz, theta = draw_nde(n)
    nde = make_nde()
    nde.fit(theta, x, **fit_options)
    scandal = make_scandal()
    scandal.fit([theta, x], [x, t_xz], **fit_options)
    # Evaluate both density models at theta_0 and theta_1. An explicit (2, 1)
    # array is passed so that the call does not depend on how Keras interprets
    # a bare Python list of floats.
    thetas_eval = np.array([[theta_0], [theta_1]])
    nde_pred = nde.predict(thetas_eval)
    # Index 0 selects the density output of the two-output SCANDAL model.
    scandal_pred = scandal.predict(
        [thetas_eval, np.zeros((2, n_nails))]
    )[0]
    mse_nde = compute_mse(nde_pred, p_estimated)
    mse_scandal = compute_mse(scandal_pred, p_estimated)

    # CARL
    all_x, all_thetas, all_y, _, _ = draw_ratio(n)
    carl = make_carl()
    carl.fit([all_thetas, all_x], all_y, **fit_options)
    carl_pred = np.exp(log_r_from_s(
        carl.predict([theta0_test, x_test])[5:-5]
    ))
    mse_carl = compute_mse_ratio(
        carl_pred.flatten(),
        p_estimated[:, 5:-5],
        log=True
    )

    # CARL old architecture
    all_x_alt, all_thetas_alt, all_y_alt, _, _ = draw_ratio(n, categorical_x=False)
    carl_alt = make_carl_alt()
    carl_alt.fit([all_thetas_alt, all_x_alt], all_y_alt, **fit_options)
    carl_alt_pred = np.exp(log_r_from_s(
        carl_alt.predict([theta0_test, x_test_alt])[5:-5]
    ))
    mse_carl_alt = compute_mse_ratio(
        carl_alt_pred.flatten(),
        p_estimated[:, 5:-5],
        log=True
    )

    # CARL old architecture calibrated
    carl_calibrated = make_calibrated_carl_alt(
        all_thetas_alt,
        all_x_alt,
        all_y_alt
    )
    carl_calibrated_pred = np.exp(log_r_from_s(
        carl_calibrated(theta0_test, x_test_alt)[5:-5]
    ))
    mse_carl_calibrated = compute_mse_ratio(
        carl_calibrated_pred.flatten(),
        p_estimated[:, 5:-5],
        log=True
    )

    # ROLR
    all_x, all_thetas, _, all_r_xz, all_t_xz_0 = draw_ratio(n, draw_from_theta0=False)
    rolr = make_rolr()
    rolr.fit([all_thetas, all_x], all_r_xz, **fit_options)
    rolr_pred = rolr.predict(
        [theta0_test, x_test]
    )[5:-5]
    mse_rolr = compute_mse_ratio(
        rolr_pred.flatten(),
        p_estimated[:, 5:-5],
        log=True
    )

    # RASCAL (same training set as ROLR; output 0 is the ratio)
    rascal = make_rascal()
    rascal.fit([all_thetas, all_x], [all_r_xz, all_t_xz_0], **fit_options)
    rascal_pred = rascal.predict(
        [theta0_test, x_test]
    )[0][5:-5]
    mse_rascal = compute_mse_ratio(
        rascal_pred.flatten(),
        p_estimated[:, 5:-5],
        log=True
    )

    # ROLR old architecture
    all_x_alt, all_thetas_alt, _, all_r_xz_alt, all_t_xz_0_alt = draw_ratio(
        n, categorical_x=False, draw_from_theta0=False
    )
    rolr_alt = make_rolr_alt()
    rolr_alt.fit([all_thetas_alt, all_x_alt], all_r_xz_alt, **fit_options)
    rolr_alt_pred = rolr_alt.predict(
        [theta0_test, x_test_alt]
    )[5:-5]
    mse_rolr_alt = compute_mse_ratio(
        rolr_alt_pred.flatten(),
        p_estimated[:, 5:-5],
        log=True
    )

    # RASCAL old architecture (same training set as ROLR old architecture)
    rascal_alt = make_rascal_alt()
    rascal_alt.fit(
        [all_thetas_alt, all_x_alt],
        [all_r_xz_alt, all_t_xz_0_alt],
        **fit_options
    )
    rascal_alt_pred = rascal_alt.predict(
        [theta0_test, x_test_alt]
    )[0][5:-5]
    mse_rascal_alt = compute_mse_ratio(
        rascal_alt_pred.flatten(),
        p_estimated[:, 5:-5],
        log=True
    )

    # Save
    mses_nde.append(mse_nde)
    mses_rascal.append(mse_rascal)
    mses_rascal_alt.append(mse_rascal_alt)
    mses_carl.append(mse_carl)
    mses_carl_alt.append(mse_carl_alt)
    mses_carl_calibrated.append(mse_carl_calibrated)
    mses_rolr.append(mse_rolr)
    mses_rolr_alt.append(mse_rolr_alt)
    mses_scandal.append(mse_scandal)

In [None]:
from utils import calculate_height, adjust_margins

In [None]:
# Per-curve plot styling; entry s is used for the s-th plotted method.
colors = [
    'orange', 'darkgreen', 'red', 'mediumblue', '#be96ff', '#CC002E',
    '0.65', 'C0', 'C1', 'C2', 'C3',
]
n_styles = len(colors)

# Six explicit line styles followed by solid lines for the remaining entries.
linestyles = [':', '--', ':', '-.', '-.', '--', '-', '-', '-', '-', '-']
linewidths = [1.5 for _ in range(n_styles)]
markers = ['o' for _ in range(n_styles)]

In [None]:
# Convert every accumulated error list to a NumPy array for plotting.
(
    mses_nde, mses_scandal,
    mses_rolr, mses_rolr_alt,
    mses_rascal, mses_rascal_alt,
    mses_carl, mses_carl_alt, mses_carl_calibrated,
) = (
    np.array(errors)
    for errors in (
        mses_nde, mses_scandal,
        mses_rolr, mses_rolr_alt,
        mses_rascal, mses_rascal_alt,
        mses_carl, mses_carl_alt, mses_carl_calibrated,
    )
)

In [None]:
# Error of every method as a function of the training-set size (log-log).
#fig = plt.figure(figsize=(4.5,calculate_height(1,4.5,extra_top_space=False)))
fig = plt.figure(figsize=(6., 6.))
ax = plt.gca()

labels = ['NDE', 'CARL raw', 'CARL raw old', 'CARL calibrated old',
          'SCANDAL', 'ROLR', 'ROLR old', 'RASCAL', 'RASCAL old']
mses = [mses_nde, mses_carl, mses_carl_alt, mses_carl_calibrated,
        mses_scandal, mses_rolr, mses_rolr_alt, mses_rascal, mses_rascal_alt]

# zip stops after the nine methods even though the style lists are longer.
for mse_curve, curve_label, color, lw, ls, marker in zip(
    mses, labels, colors, linewidths, linestyles, markers
):
    plt.plot(samples_size, mse_curve, ms=4., marker=marker, color=color, lw=lw, ls=ls)
    # Empty proxy line: the legend entry shows the line style without a marker.
    plt.plot([], [], color=color, lw=lw, ls=ls, label=curve_label)

# NOTE(review): newer matplotlib releases replaced `nonposx` / `nonposy` with
# `nonpositive` -- confirm against the installed version.
ax.set_xscale("log", nonposx='clip')
ax.set_yscale("log", nonposy='clip')
plt.xlabel(r"Training sample size")
plt.ylabel(r'$E_{x\sim p_1(x)} [(\log r(x|\theta_0, \theta_1) - \log \hat{r}(x|\theta_0, \theta_1))^2]$')
plt.legend(frameon=False)

plt.tight_layout()
#adjust_margins(1,4.5,extra_top_space=False)
plt.savefig('plinko_models_comparison.pdf')