# Visualizing cross-lingual textual similarity 

In [None]:
!pip install sentence_transformers datasets transformers umap-learn

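Tatoeba is a collection of sentences and their translations. Here we load the Russian–English sentence pairs (the `tatoeba.rus` configuration of the XTREME benchmark, `validation` split).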

In [None]:
from datasets import load_dataset
import pandas as pd

# Fetch the "tatoeba.rus" configuration of XTREME (validation split only).
data = load_dataset(
    "xtreme",
    "tatoeba.rus",
    split="validation",
)

# Preview only the two text columns, side by side.
text_columns = ["source_sentence", "target_sentence"]
pd.DataFrame(data)[text_columns]

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for every `encode` call in this notebook.
model_name = "stsb-xlm-r-multilingual"
model = SentenceTransformer(model_name)

In [None]:
K = 30  # number of sentence pairs to embed and plot

# Sources first, then targets: rows i and i + K of `emb` are the two sides of
# sentence pair i.
first_sources = data["source_sentence"][:K]
first_targets = data["target_sentence"][:K]
emb = model.encode(first_sources + first_targets)

# (number of embedded sentences, length of one embedding vector)
len(emb), len(emb[0])

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import umap
import pylab

# Project the sentence embeddings down to 2-D; the fixed random_state keeps
# the layout reproducible between runs.
X = umap.UMAP(n_components=2, random_state=42).fit_transform(emb)
idx = np.arange(len(emb))

# `emb` holds the source sentences followed by the same number of target
# sentences, so rows i and i + n_pairs are the two sides of pair i. Taking the
# count from `emb` (rather than from K) keeps the pairing in range even when K
# is larger than the dataset or was edited after `emb` was computed.
n_pairs = len(emb) // 2

fig, ax = plt.subplots(figsize=(12, 12))
ax.set_facecolor("whitesmoke")
ax.set_title("UMAP projection of paired sentence embeddings")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")

# One colour per pair, shared by both labels of that pair.
cm = plt.get_cmap("prism")
colors = [cm(i / n_pairs) for i in range(n_pairs)]

# Source-side labels, each joined to its translation by a dotted black line.
# The lines are what set the axis limits: annotations alone do not autoscale.
for i in range(n_pairs):
    j = i + n_pairs
    ax.annotate("RUS-" + str(i), (X[i, 0], X[i, 1]), c=colors[i])
    ax.plot((X[i, 0], X[j, 0]), (X[i, 1], X[j, 1]), "k:")

# Target-side labels, coloured like their source sentence.
for i in range(n_pairs):
    j = i + n_pairs
    ax.annotate("EN-" + str(i), (X[j, 0], X[j, 1]), c=colors[i])

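Let us now compare all sentence pairs in the dataset by computing the cosine similarity between the embedding of each source sentence and that of its translation.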

In [None]:
# Embed every sentence of the dataset, one array per column; row i of each
# array belongs to sentence pair i.
source_emb, target_emb = (
    model.encode(data[column])
    for column in ("source_sentence", "target_sentence")
)

In [None]:
from scipy import spatial
from matplotlib import pyplot

# Cosine similarity (1 - cosine distance) between each source embedding and
# the embedding of its translation.
sims = []
for source_vec, target_vec in zip(source_emb, target_emb):
    sims.append(1 - spatial.distance.cosine(source_vec, target_vec))

# The histogram is restricted to the interval [0.8, 1]; similarities outside
# that range are not shown.
pyplot.hist(
    sims,
    bins=100,
    range=(0.8, 1),
)
pyplot.show()

In [None]:
# Mean and standard deviation (NumPy default, ddof=0) of the pair similarities.
sims_array = np.asarray(sims)
sims_array.mean(), sims_array.std()