# Reproducing Experiments for ETIC

Lang Liu

03/03/2022

## Import Packages

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to the
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import numpy.random as npr
import ot
import seaborn as sns

from ind_tests import AdaptiveETICTest, ETICTest, HSICTest, L1Test, MutualInfoTest
from ind_tests import get_random_feature
from sklearn.decomposition import PCA
from urllib.request import urlretrieve
from utils import generate_data, load_data, median_dist

# Color cycle of the active matplotlib style; indexed per curve by the plot helpers.
COLORS = plt.rcParams['axes.prop_cycle'].by_key()['color']

# Global figure styling shared by every plot in this notebook.
mpl.rcParams.update({
    'lines.linewidth': 3,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'axes.labelsize': 18,
    'legend.fontsize': 18,
    'axes.titlesize': 18,
    'lines.markersize': 7.5,
})

# Per-curve line styles and markers (indexed in parallel with COLORS).
LINESTYLE = ['-', '--', '-.', (0, (1, 1)), '-', '--']
MARKER = ['8', 's', '', '', '^', '8']
# Panel titles used by plot_synthtic, in panel order.
TEST = ['ETIC', 'ETIC-RF', 'HSIC']

## Functions and Constants

In [None]:
# Number of permutations in the permutation test.
NPERMS = 2
# Significance level.
ALPHA = 0.05
# Number of repetitions.
NREPS = 2
# Hyperparameters in the synthetic examples.
SYN_REGS = [0.25, 0.5, 1.0, 2.0, 4.0]
# Hyperparameters for the Bonferroni-type adaptive ETIC test.
BON_REGS = [0.25, 4.0]
# Hyperparameters in the real example: 12 evenly spaced values, rounded to 2 decimals.
REAL_REGS = np.linspace(0.25, 4, num=12).round(decimals=2)

In [None]:
def cost_mat(x, y, weight=1.0):
    """Return the pairwise `ot.dist(x, y)` matrix divided by `weight`."""
    pairwise = ot.dist(x, y)
    return pairwise / weight


def gram_mat(x, y, kpar):
    """Return the kernel matrix exp(-ot.dist(x, y) / kpar)."""
    scaled = ot.dist(x, y) / kpar
    return np.exp(-scaled)

In [None]:
def run_etic(X, Y, regs, cost_mat):
    """Run the ETIC test on (X, Y) and return its decision.

    Only ``regs[0]`` (for X) and ``regs[1]`` (for Y) are used; each one
    multiplies the corresponding value returned by ``median_dist`` to form
    the weight passed to ``cost_mat``.
    """
    x_scale, y_scale = median_dist(X, Y)
    costs = (
        cost_mat(X, X, regs[0]*x_scale),
        cost_mat(Y, Y, regs[1]*y_scale),
    )
    # The regularization is already folded into the costs, so the test uses 1.0.
    return ETICTest(1.0).decision(*costs, ALPHA, NPERMS)


# Adaptive ETIC
def run_aetic_test(X, Y, regs, cost_mat):
    """Run the adaptive ETIC test on (X, Y) and return its decision.

    The costs are weighted by the values from ``median_dist`` only; the
    whole list ``regs`` is handed to ``AdaptiveETICTest``.
    """
    x_scale, y_scale = median_dist(X, Y)
    costs = (cost_mat(X, X, x_scale), cost_mat(Y, Y, y_scale))
    return AdaptiveETICTest(regs).decision(*costs, ALPHA, NPERMS)


# Bonferroni adaptive ETIC
def run_baetic_test(X, Y, regs, cost_mat):
    """Run one ETIC test per pair in regs x regs and combine the decisions.

    Each test is run at level ``ALPHA / len(regs)**2``; the maximum of the
    individual decisions is returned (presumably "reject if any test
    rejects", assuming 0/1 decisions -- confirm against ETICTest).
    """
    ntests = len(regs)**2
    x_scale, y_scale = median_dist(X, Y)
    base_xcost = cost_mat(X, X, x_scale)
    base_ycost = cost_mat(Y, Y, y_scale)
    tester = ETICTest(1.0)
    outcomes = [
        tester.decision(base_xcost/xeps, base_ycost/yeps, ALPHA/ntests, NPERMS)
        for xeps in regs
        for yeps in regs
    ]
    return np.max(outcomes)


def run_eticrf(X, Y, regs, nfeat, npc=None):
    """Run the ETIC test with random features and return its decision.

    When ``npc`` is truthy, X and Y are each reduced to ``npc`` principal
    components first (the PCA is refit separately on X and on Y).
    ``regs[0]`` / ``regs[1]`` multiply the ``median_dist`` values for X / Y.
    """
    if npc:
        reducer = PCA(n_components=npc)
        X, Y = reducer.fit_transform(X), reducer.fit_transform(Y)
    x_scale, y_scale = median_dist(X, Y)
    features = (
        get_random_feature(X, nfeat, regs[0]*x_scale),
        get_random_feature(Y, nfeat, regs[1]*y_scale),
    )
    return ETICTest(1.0).decision_with_rf(*features, ALPHA, NPERMS)


def run_hsic(X, Y, gram_mat, kpars):
    """Run the HSIC test on (X, Y) and return its decision.

    The kernel parameters are the ``median_dist`` values for X and Y,
    multiplied by ``kpars[0]`` and ``kpars[1]`` respectively.
    """
    x_scale, y_scale = median_dist(X, Y)
    grams = (
        gram_mat(X, X, x_scale*kpars[0]),
        gram_mat(Y, Y, y_scale*kpars[1]),
    )
    return HSICTest().decision(*grams, ALPHA, NPERMS)


def run_info(X, Y, nparts, test):
    """Run a partition-based independence test on (X, Y) and return its decision.

    Parameters
    ----------
    X, Y : samples forwarded unchanged to the test's ``decision`` method.
    nparts : forwarded to ``decision`` as the ``nparts`` keyword.
    test : ``'l1'`` selects ``L1Test``; ``'mi'`` selects ``MutualInfoTest``.

    Raises
    ------
    ValueError
        If ``test`` is neither ``'l1'`` nor ``'mi'``.
    """
    if test == 'l1':
        mytest = L1Test()
    elif test == 'mi':
        mytest = MutualInfoTest()
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(f"unknown test {test!r}; expected 'l1' or 'mi'")
    return mytest.decision(X, Y, ALPHA, NPERMS, nparts=nparts)

In [None]:
def plot_synthtic(x, powers, xlabel, save=False, fname=None):
    """Plot power curves for several tests side by side, one panel per test.

    Parameters
    ----------
    x : values on the horizontal axis, shared by all panels.
    powers : one entry per panel; ``powers[i][j]`` is the curve drawn for
        ``SYN_REGS[j]`` in panel ``i``. Panel ``i`` is titled ``TEST[i]``.
    xlabel : label of the horizontal axis of every panel.
    save : if true, the figure is written to ``fname``.
    fname : output path, used only when ``save`` is true.
    """
    npanels = len(powers)
    # squeeze=False keeps `axes` indexable even when there is a single panel.
    fig, axes = plt.subplots(
        nrows=1, ncols=npanels, sharey=True, squeeze=False)
    axes = axes[0]
    fig.set_figheight(4)
    fig.set_figwidth(4*npanels-1.3)
    axes[0].set_ylabel('Power')

    for i, power in enumerate(powers):
        for j, par in enumerate(SYN_REGS):
            axes[i].plot(
                x, power[j], label=f'r = {par}',
                color=COLORS[j], linestyle=LINESTYLE[j], marker=MARKER[j])

        axes[i].set_xlabel(xlabel)
        axes[i].set_title(TEST[i])

    # One shared legend below the panels, built from the first panel's curves.
    handles, labels = axes[0].get_legend_handles_labels()
    fig.tight_layout(rect=[0, 0.08, 1, 1])  # L, B, R, T
    lgd = fig.legend(
        handles, labels, loc='lower center',
        bbox_to_anchor=(0.5, -0.02), ncol=5)

    if save:
        fig.savefig(fname, bbox_extra_artists=[lgd], bbox_inches='tight')
        
        
def plot_synthetic_aeot(xs, powers, xlabels, label, save=False, fname=None):
    """Plot fixed-hyperparameter ETIC curves together with one extra curve.

    Parameters
    ----------
    xs : one sequence of horizontal-axis values per panel.
    powers : one entry per panel; rows ``0..len(SYN_REGS)-1`` are the curves
        for each value in ``SYN_REGS`` and row ``len(SYN_REGS)`` is the
        extra curve.
    xlabels : one horizontal-axis label per panel.
    label : legend label of the extra curve.
    save : if true, the figure is written to ``fname``.
    fname : output path, used only when ``save`` is true.
    """
    # squeeze=False keeps `axes` indexable even when there is a single panel.
    fig, axes = plt.subplots(
        nrows=1, ncols=len(xs), sharey=False, squeeze=False)
    axes = axes[0]
    fig.set_figheight(4)
    fig.set_figwidth(4*len(xs)+1.0)
    axes[0].set_ylabel('Power')

    # Index of the extra curve: the row right after the SYN_REGS curves.
    extra = len(SYN_REGS)
    for i, power in enumerate(powers):
        for j, par in enumerate(SYN_REGS):
            axes[i].plot(
                xs[i], power[j], label=f'r = {par}',
                color=COLORS[j], linestyle=LINESTYLE[j], marker=MARKER[j])

        axes[i].plot(
            xs[i], power[extra], label=label,
            color=COLORS[extra], linestyle=LINESTYLE[extra],
            marker=MARKER[extra])

        axes[i].set_xlabel(xlabels[i])

    # One shared legend to the right, built from the first panel's curves.
    handles, labels = axes[0].get_legend_handles_labels()
    fig.tight_layout(rect=[0, 0.0, 1, 1])  # L, B, R, T
    lgd = fig.legend(
        handles, labels, loc='center right',
        bbox_to_anchor=(1.27, 0.5))

    if save:
        fig.savefig(fname, bbox_extra_artists=[lgd], bbox_inches='tight')
        

def plot_synthtic_info(xs, powers, xlabels, labels, save=False, fname=None):
    """Plot one labelled power curve per test in each panel.

    Parameters
    ----------
    xs : one sequence of horizontal-axis values per panel.
    powers : one entry per panel; ``powers[i][j]`` is the curve labelled
        ``labels[j]`` in panel ``i``.
    xlabels : one horizontal-axis label per panel.
    labels : legend labels, one per curve.
    save : if true, the figure is written to ``fname``.
    fname : output path, used only when ``save`` is true.
    """
    # squeeze=False keeps `axes` indexable even when there is a single panel.
    fig, axes = plt.subplots(
        nrows=1, ncols=len(xs), sharey=False, squeeze=False)
    axes = axes[0]
    fig.set_figheight(4)
    fig.set_figwidth(4*len(xs)+1)
    axes[0].set_ylabel('Power')

    for i, power in enumerate(powers):
        for j, label in enumerate(labels):
            axes[i].plot(
                xs[i], power[j], label=label,
                color=COLORS[j], linestyle=LINESTYLE[j], marker=MARKER[j])

        axes[i].set_xlabel(xlabels[i])

    # One shared legend to the right, built from the first panel's curves.
    # Separate names avoid shadowing the `labels` parameter.
    handles, legend_labels = axes[0].get_legend_handles_labels()
    fig.tight_layout(rect=[0, 0.0, 1, 1])  # L, B, R, T
    lgd = fig.legend(
        handles, legend_labels, loc='center right',
        bbox_to_anchor=(1.24, 0.5))

    if save:
        fig.savefig(fname, bbox_extra_artists=[lgd], bbox_inches='tight')
        
        
def plot_real(powers, titles, save=False, fname=None):
    """Draw one heatmap per power matrix, sharing a single color scale.

    Parameters
    ----------
    powers : sequence of 2-D arrays, one per panel.
    titles : one title per panel.
    save : if true, the figure is written to ``fname``.
    fname : output path, used only when ``save`` is true.
    """
    # squeeze=False keeps `axes` indexable even when there is a single panel.
    fig, axes = plt.subplots(1, len(powers), sharey=True, squeeze=False)
    axes = axes[0]
    fig.set_figheight(4.7)
    fig.set_figwidth(5*len(powers))
    # Dedicated axes on the right edge for the shared color bar.
    cbar_ax = fig.add_axes([.90, .09, .02, .82])

    # Common limits so that colors are comparable across panels.
    vmin = np.min(powers)
    vmax = np.max(powers)
    for ax, power, title in zip(axes, powers, titles):
        sns.heatmap(power, ax=ax, vmin=vmin, vmax=vmax, cbar_ax=cbar_ax)
        ax.set_title(title)

    fig.tight_layout(rect=[0, 0, .9, 1])
    if save:
        fig.savefig(fname, bbox_inches='tight')

## Linear Dependency Model

Consider a simple linear dependency model, i.e.,
$$
X \sim \mathcal{N}_d(0, I_d) \quad \text{and} \quad Y = X_1 + Z,
$$
where $X_1$ is the first coordinate of $X$ and $Z \sim \mathcal{N}(0, 1)$ is independent of $X$.
We fix the sample size $n = 50$ and vary $d \in \{1, 2, \dots, 10\}$.

In [None]:
# Linear dependency experiment: fixed sample size, varying dimension.
n, nfeat = 50, 100  # sample size; number of random features for ETIC-RF
ndims = 10
dims = range(1, ndims + 1)
# Distribution name and parameter forwarded to generate_data.
dist, par = 'normal-linear', 0.0

We first run the ETIC test.

In [None]:
# Test decisions indexed by (hyperparameter, dimension, repetition).
decisions = np.zeros((len(SYN_REGS), ndims, NREPS))
for rep in range(NREPS):
    npr.seed(rep)  # one seed per repetition
    for i, reg in enumerate(SYN_REGS):
        for j, d in enumerate(dims):
            X, Y = generate_data(n, [d, 1], dist, par)
            decisions[i, j, rep] = run_etic(X, Y, [reg, reg], cost_mat)

# Average over repetitions to get the empirical power.
etic = decisions.mean(axis=2)

We then run the ETIC-RF test.

In [None]:
# Test decisions indexed by (hyperparameter, dimension, repetition).
decisions = np.zeros((len(SYN_REGS), ndims, NREPS))
for rep in range(NREPS):
    npr.seed(rep)  # one seed per repetition
    for i, reg in enumerate(SYN_REGS):
        for j, d in enumerate(dims):
            X, Y = generate_data(n, [d, 1], dist, par)
            decisions[i, j, rep] = run_eticrf(X, Y, [reg, reg], nfeat)

# Average over repetitions to get the empirical power.
etic_rf = decisions.mean(axis=2)

Finally, we run the HSIC test.

In [None]:
# Test decisions indexed by (hyperparameter, dimension, repetition).
decisions = np.zeros((len(SYN_REGS), ndims, NREPS))
for rep in range(NREPS):
    npr.seed(rep)  # one seed per repetition
    for i, reg in enumerate(SYN_REGS):
        for j, d in enumerate(dims):
            X, Y = generate_data(n, [d, 1], dist, par)
            decisions[i, j, rep] = run_hsic(X, Y, gram_mat, [reg, reg])

# Average over repetitions to get the empirical power.
hsic = decisions.mean(axis=2)

Plot the results (cf. Figure 1 in the paper).

In [None]:
# One panel per test, power against the dimension.
plot_synthtic(x=dims, powers=[etic, etic_rf, hsic], xlabel='Dimension')

We also run the two adaptive ETIC tests.

In [None]:
# Adaptive ETIC over all hyperparameters in SYN_REGS.
# run_etic would only read SYN_REGS[0] and SYN_REGS[1], so the adaptive
# runner must be used here.
aetic = np.zeros((1, ndims, NREPS))
for rep in range(NREPS):
    npr.seed(rep)
    for j, d in enumerate(dims):
        X, Y = generate_data(n, [d, 1], dist, par)
        aetic[0, j, rep] = run_aetic_test(X, Y, SYN_REGS, cost_mat)

aetic = np.mean(aetic, axis=2)
# Stack below the fixed-hyperparameter ETIC curves for plot_synthetic_aeot.
linear_aetic = np.concatenate([etic, aetic])

In [None]:
# Bonferroni-type adaptive ETIC over the hyperparameters in BON_REGS.
# run_etic would run a single test with (BON_REGS[0], BON_REGS[1]) and no
# Bonferroni correction, so the Bonferroni runner must be used here.
baetic = np.zeros((1, ndims, NREPS))
for rep in range(NREPS):
    npr.seed(rep)
    for j, d in enumerate(dims):
        X, Y = generate_data(n, [d, 1], dist, par)
        baetic[0, j, rep] = run_baetic_test(X, Y, BON_REGS, cost_mat)

baetic = np.mean(baetic, axis=2)
# Stack below the fixed-hyperparameter ETIC curves for plot_synthetic_aeot.
linear_baetic = np.concatenate([etic, baetic])

## Gaussian Sign Model

Consider a Gaussian sign model, i.e.,
$$
X \sim \mathcal{N}_d(0, I_d) \quad \text{and} \quad Y = \lvert Z \rvert \prod_{i=1}^d \text{sgn}(X_i),
$$
where $\text{sgn}(\cdot)$ is the sign function and $Z \sim \mathcal{N}(0, 1)$ is independent of $X$.
We fix $d = 3$ and vary $n \in [100, 500]$.

In [None]:
# Gaussian sign experiment: fixed dimension, varying sample size.
d, nfeat = 3, 100  # dimension of X; number of random features for ETIC-RF
nsizes = 10
# Evenly spaced sample sizes from 100 to 500, truncated to integers.
ns = np.linspace(100, 500, num=nsizes, dtype=int)
# Distribution name and parameter forwarded to generate_data.
dist, par = 'normal-sign', 0.0

We first run the ETIC test.

In [None]:
# Test decisions indexed by (hyperparameter, sample size, repetition).
decisions = np.zeros((len(SYN_REGS), nsizes, NREPS))
for rep in range(NREPS):
    npr.seed(rep)  # one seed per repetition
    for i, reg in enumerate(SYN_REGS):
        for j, n in enumerate(ns):
            X, Y = generate_data(n, [d, 1], dist, par)
            decisions[i, j, rep] = run_etic(X, Y, [reg, reg], cost_mat)

# Average over repetitions to get the empirical power.
etic = decisions.mean(axis=2)

We then run the ETIC-RF test.

In [None]:
# Test decisions indexed by (hyperparameter, sample size, repetition).
decisions = np.zeros((len(SYN_REGS), nsizes, NREPS))
for rep in range(NREPS):
    npr.seed(rep)  # one seed per repetition
    for i, reg in enumerate(SYN_REGS):
        for j, n in enumerate(ns):
            X, Y = generate_data(n, [d, 1], dist, par)
            decisions[i, j, rep] = run_eticrf(X, Y, [reg, reg], nfeat)

# Average over repetitions to get the empirical power.
etic_rf = decisions.mean(axis=2)

Finally, we run the HSIC test.

In [None]:
# Test decisions indexed by (hyperparameter, sample size, repetition).
decisions = np.zeros((len(SYN_REGS), nsizes, NREPS))
for rep in range(NREPS):
    npr.seed(rep)  # one seed per repetition
    for i, reg in enumerate(SYN_REGS):
        for j, n in enumerate(ns):
            X, Y = generate_data(n, [d, 1], dist, par)
            decisions[i, j, rep] = run_hsic(X, Y, gram_mat, [reg, reg])

# Average over repetitions to get the empirical power.
hsic = decisions.mean(axis=2)

Plot the results (cf. Figure 2 in the paper).

In [None]:
# One panel per test, power against the sample size.
plot_synthtic(x=ns, powers=[etic, etic_rf, hsic], xlabel='Sample size')

We also run the $L_1$ test and the Log-likelihood test.

In [None]:
nparts = 3

# Run the L1 test first, then the log-likelihood ('mi') test; each one
# restarts from the same per-repetition seeds.
info_power = {}
for test in ('l1', 'mi'):
    decisions = np.zeros((1, nsizes, NREPS))
    for rep in range(NREPS):
        npr.seed(rep)
        for j, n in enumerate(ns):
            X, Y = generate_data(n, [d, 1], dist, par)
            decisions[0, j, rep] = run_info(X, Y, nparts, test)
    info_power[test] = np.mean(decisions, axis=2)

l1 = info_power['l1']
mi = info_power['mi']

# Rows: ETIC and ETIC-RF at the first hyperparameter, then L1, then log-likelihood.
sign = np.concatenate([etic[0:1], etic_rf[0:1], l1, mi])

## Subspace Dependency Model

We construct our data by the following steps:

1. Generate $n$ i.i.d. copies of two independent random variables, each distributed as $0.5\mathcal{N}(0.98, 0.04) + 0.5\mathcal{N}(-0.98, 0.04)$.
2. Mix the two random variables by a rotation matrix parametrized by $\theta \in [0, \pi/4]$.
3. Append $\mathcal{N}_{d-1}(0, I_{d-1})$ to each of the two mixtures.
4. Multiply each vector by an independent random $d$-dimensional orthogonal matrix.

We fix $n = 64$, $d = 2$ and vary $\theta \in [0, \pi/4]$.

In [None]:
# Subspace dependency experiment: fixed sample size and dimension, varying angle.
n, d = 64, 2
nfeat = 100  # number of random features for ETIC-RF
npars = 12
# Fractions of pi/4; multiplied by pi/4 in the loops to get the rotation angle.
pars = np.linspace(0, 1, num=npars)
# Distribution name forwarded to generate_data.
dist = 'g'

We first run the ETIC test.

In [None]:
# Test decisions indexed by (hyperparameter, angle, repetition).
decisions = np.zeros((len(SYN_REGS), npars, NREPS))
for rep in range(NREPS):
    npr.seed(rep)  # one seed per repetition
    for i, reg in enumerate(SYN_REGS):
        for j, par in enumerate(pars*np.pi/4):
            X, Y = generate_data(n, [d, d], dist, par)
            decisions[i, j, rep] = run_etic(X, Y, [reg, reg], cost_mat)

# Average over repetitions to get the empirical power.
etic = decisions.mean(axis=2)

We then run the ETIC-RF test.

In [None]:
# Test decisions indexed by (hyperparameter, angle, repetition).
decisions = np.zeros((len(SYN_REGS), npars, NREPS))
for rep in range(NREPS):
    npr.seed(rep)  # one seed per repetition
    for i, reg in enumerate(SYN_REGS):
        for j, par in enumerate(pars*np.pi/4):
            X, Y = generate_data(n, [d, d], dist, par)
            decisions[i, j, rep] = run_eticrf(X, Y, [reg, reg], nfeat)

# Average over repetitions to get the empirical power.
etic_rf = decisions.mean(axis=2)

Finally, we run the HSIC test.

In [None]:
# Test decisions indexed by (hyperparameter, angle, repetition).
decisions = np.zeros((len(SYN_REGS), npars, NREPS))
for rep in range(NREPS):
    npr.seed(rep)  # one seed per repetition
    for i, reg in enumerate(SYN_REGS):
        for j, par in enumerate(pars*np.pi/4):
            X, Y = generate_data(n, [d, d], dist, par)
            decisions[i, j, rep] = run_hsic(X, Y, gram_mat, [reg, reg])

# Average over repetitions to get the empirical power.
hsic = decisions.mean(axis=2)

Plot the results (cf. Figure 3 in the paper).

In [None]:
# One panel per test, power against the (normalized) rotation angle.
plot_synthtic(x=pars, powers=[etic, etic_rf, hsic], xlabel=r'$\theta$')

We also run the two adaptive ETIC tests.

In [None]:
# Adaptive ETIC over all hyperparameters in SYN_REGS.
# run_etic would only read SYN_REGS[0] and SYN_REGS[1], so the adaptive
# runner must be used here.
aetic = np.zeros((1, npars, NREPS))
for rep in range(NREPS):
    npr.seed(rep)
    for j, par in enumerate(pars*np.pi/4):
        X, Y = generate_data(n, [d, d], dist, par)
        aetic[0, j, rep] = run_aetic_test(X, Y, SYN_REGS, cost_mat)

aetic = np.mean(aetic, axis=2)
# Stack below the fixed-hyperparameter ETIC curves for plot_synthetic_aeot.
subspace_aetic = np.concatenate([etic, aetic])

In [None]:
# Bonferroni-type adaptive ETIC over the hyperparameters in BON_REGS.
# run_etic would run a single test with (BON_REGS[0], BON_REGS[1]) and no
# Bonferroni correction, so the Bonferroni runner must be used here.
baetic = np.zeros((1, npars, NREPS))
for rep in range(NREPS):
    npr.seed(rep)
    for j, par in enumerate(pars*np.pi/4):
        X, Y = generate_data(n, [d, d], dist, par)
        baetic[0, j, rep] = run_baetic_test(X, Y, BON_REGS, cost_mat)

baetic = np.mean(baetic, axis=2)
# Stack below the fixed-hyperparameter ETIC curves for plot_synthetic_aeot.
subspace_baetic = np.concatenate([etic, baetic])

Plot the results of the adaptive ETIC test (cf. Figure 7 in the paper).

In [None]:
# Left panel: linear model; right panel: subspace model.
plot_synthetic_aeot(
    xs=[dims, pars],
    powers=[linear_aetic, subspace_aetic],
    xlabels=['Dimension', r'$\theta$'],
    label='Adaptive',
)

Plot the results of the Bonferroni adaptive ETIC test (cf. Figure 8 in the paper).

In [None]:
# Left panel: linear model; right panel: subspace model.
plot_synthetic_aeot(
    xs=[dims, pars],
    powers=[linear_baetic, subspace_baetic],
    xlabels=['Dimension', r'$\theta$'],
    label='Bonferroni',
)

Moreover, we run the $L_1$ test and the Log-likelihood test.

In [None]:
nparts = 3

# Run the L1 test first, then the log-likelihood ('mi') test; each one
# restarts from the same per-repetition seeds.
info_power = {}
for test in ('l1', 'mi'):
    decisions = np.zeros((1, npars, NREPS))
    for rep in range(NREPS):
        npr.seed(rep)
        for j, par in enumerate(pars*np.pi/4):
            X, Y = generate_data(n, [d, d], dist, par)
            decisions[0, j, rep] = run_info(X, Y, nparts, test)
    info_power[test] = np.mean(decisions, axis=2)

l1 = info_power['l1']
mi = info_power['mi']

# Rows: ETIC and ETIC-RF at the first hyperparameter, then L1, then log-likelihood.
subspace = np.concatenate([etic[0:1], etic_rf[0:1], l1, mi])

Plot the results of the baseline tests.

In [None]:
# Left panel: Gaussian sign model; right panel: subspace model.
plot_synthtic_info(
    xs=[ns, pars],
    powers=[sign, subspace],
    xlabels=['Sample size', r'$\theta$'],
    labels=['ETIC', 'ETIC-RF', 'L1', 'Log-lik'],
)

## Bilingual Text

We now investigate the performance of the ETIC test on bilingual data.
Our dataset is taken from the parallel European Parliament corpus [1] which consists of a large number of documents of the same content in different languages.

We randomly select $n = 64$ English documents and a paragraph in each document from the corpus.
We then

1. pair each paragraph with the corresponding paragraph in French to form the dependent sample;
2. pair each paragraph with a random paragraph in the same document in French to form the partially dependent sample;
3. pair each paragraph with a random paragraph in French to form the independent sample.

We use LaBSE [2] to embed all the paragraphs into a common feature embedding space of dimension 768 and provide the embeddings on this [website](https://sites.stat.washington.edu/people/liu16/etic/en-fr-embed.zip).

In [None]:
import zipfile

# download
urlretrieve('https://sites.stat.washington.edu/people/liu16/etic/en-fr-embed.zip', 'en-fr-embed.zip')
# unzip with the standard library: no external `unzip` binary is needed,
# re-running the cell never prompts, and a bad archive raises an error
# instead of being silently ignored
with zipfile.ZipFile('en-fr-embed.zip') as archive:
    archive.extractall()
# load one file per repetition
data = [load_data(f'en-fr-embed/size64-part{i}.txt') for i in range(1, NREPS+1)]

We first run the ETIC test.

In [None]:
# Test decisions indexed by (x hyperparameter, y hyperparameter, repetition, sample).
nregs = len(REAL_REGS)
decisions = np.zeros((nregs, nregs, NREPS, len(data[0])))
for rep in range(NREPS):
    samples = data[rep]
    for i, reg1 in enumerate(REAL_REGS):
        for j, reg2 in enumerate(REAL_REGS):
            npr.seed(rep)  # same seed for every hyperparameter pair
            decisions[i, j, rep] = np.array(
                [run_etic(X, Y, [reg1, reg2], cost_mat) for X, Y in samples])

# Average over repetitions to get the empirical power.
etic = decisions.mean(axis=2)

We then run the HSIC test.

In [None]:
# Test decisions indexed by (x hyperparameter, y hyperparameter, repetition, sample).
nregs = len(REAL_REGS)
decisions = np.zeros((nregs, nregs, NREPS, len(data[0])))
for rep in range(NREPS):
    samples = data[rep]
    for i, reg1 in enumerate(REAL_REGS):
        for j, reg2 in enumerate(REAL_REGS):
            npr.seed(rep)  # same seed for every hyperparameter pair
            decisions[i, j, rep] = np.array(
                [run_hsic(X, Y, gram_mat, [reg1, reg2]) for X, Y in samples])

# Average over repetitions to get the empirical power.
hsic = decisions.mean(axis=2)

Print the results on the dependent sample.

In [None]:
# Power of each test on the dependent sample (index 0 of the last axis).
for title, power in (('ETIC', etic), ('HSIC', hsic)):
    print(title)
    print(power[:, :, 0])

Print the results on the independent sample.

In [None]:
# Power of each test on the independent sample (index 2 of the last axis).
for title, power in (('ETIC', etic), ('HSIC', hsic)):
    print(title)
    print(power[:, :, 2])

Plot the results on the partially dependent sample (cf. Figure 4 in the paper).

In [None]:
# Heatmaps on the partially dependent sample (index 1 of the last axis).
plot_real(powers=[etic[:, :, 1], hsic[:, :, 1]], titles=['ETIC', 'HSIC'])

For the ETIC-RF test, we reduce the dimension by the principal component analysis before we run the test.
We experiment with different numbers of random features $p$ and principal components $d'$ and examine their effect on the performance of the ETIC-RF test.

We fix $p = 700$ and consider $d' \in \{10, 20\}$.

In [None]:
def _eticrf_power_grid(nfeat, npc):
    """Return the ETIC-RF power over the REAL_REGS x REAL_REGS grid.

    The result is indexed by (x hyperparameter, y hyperparameter, sample),
    averaged over the NREPS repetitions.
    """
    nregs = len(REAL_REGS)
    decisions = np.zeros((nregs, nregs, NREPS, len(data[0])))
    for rep in range(NREPS):
        for i, reg1 in enumerate(REAL_REGS):
            for j, reg2 in enumerate(REAL_REGS):
                npr.seed(rep)  # same seed for every hyperparameter pair
                decisions[i, j, rep] = np.array(
                    [run_eticrf(X, Y, [reg1, reg2], nfeat, npc) for (X, Y) in data[rep]])
    return np.mean(decisions, axis=2)


nfeat = 700

npc = 10
etic_rf_1 = _eticrf_power_grid(nfeat, npc)

npc = 20
etic_rf_2 = _eticrf_power_grid(nfeat, npc)

Plot the results on the partially dependent sample (cf. Figure 5 in the paper).

In [None]:
# Heatmaps on the partially dependent sample for the two numbers of principal components.
plot_real(
    powers=[etic_rf_1[:, :, 1], etic_rf_2[:, :, 1]],
    titles=["d' = 10", "d' = 20"],
)

We fix $d' = 10$ and consider $p \in \{700, 1500\}$.

In [None]:
def _eticrf_power_grid(nfeat, npc):
    """Return the ETIC-RF power over the REAL_REGS x REAL_REGS grid.

    The result is indexed by (x hyperparameter, y hyperparameter, sample),
    averaged over the NREPS repetitions.
    """
    nregs = len(REAL_REGS)
    decisions = np.zeros((nregs, nregs, NREPS, len(data[0])))
    for rep in range(NREPS):
        for i, reg1 in enumerate(REAL_REGS):
            for j, reg2 in enumerate(REAL_REGS):
                npr.seed(rep)  # same seed for every hyperparameter pair
                decisions[i, j, rep] = np.array(
                    [run_eticrf(X, Y, [reg1, reg2], nfeat, npc) for (X, Y) in data[rep]])
    return np.mean(decisions, axis=2)


npc = 10

nfeat = 700
etic_rf_1 = _eticrf_power_grid(nfeat, npc)

nfeat = 1500
etic_rf_2 = _eticrf_power_grid(nfeat, npc)

Plot the results on the partially dependent sample (cf. Figure 6 in the paper).

In [None]:
# Heatmaps on the partially dependent sample for the two numbers of random features.
plot_real(
    powers=[etic_rf_1[:, :, 1], etic_rf_2[:, :, 1]],
    titles=['p = 700', 'p = 1500'],
)

### References

[1] P. Koehn. Europarl: A parallel corpus for statistical machine translation. In *Proceedings of Machine Translation Summit*, 2005.

[2] F. Feng, Y. Yang, D. Cer, N. Arivazhagan, and W. Wang. Language-agnostic BERT sentence embedding. *ArXiv Preprint*, 2020.