### Cross-lingual sentence similarity using TensorFlow

Based on https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/cross_lingual_similarity_with_tf_hub_multilingual_universal_encoder.ipynb#scrollTo=MSeY-MUQo2Ha

In [2]:
%load_ext autoreload
%autoreload 2

In [None]:
%%capture
#@title Setup Environment
# Install the latest Tensorflow version.
#!pip install "tensorflow-text==2.8.*"
#!pip install bokeh
#!pip install simpleneighbors[annoy]
#!pip install tqdm

In [5]:
#@title Setup common imports and functions
import bokeh
import bokeh.models
import bokeh.plotting
import numpy as np
import os
import pandas as pd
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer
import sklearn.metrics.pairwise

#from simpleneighbors import SimpleNeighbors
from tqdm import tqdm
from tqdm import trange

def visualize_similarity(embeddings_1, embeddings_2, labels_1, labels_2,
                         plot_title,
                         plot_width=1200, plot_height=600,
                         xaxis_font_size='12pt', yaxis_font_size='12pt'):

    assert len(embeddings_1) == len(labels_1)
    assert len(embeddings_2) == len(labels_2)

    # arccos based text similarity (Yang et al. 2019; Cer et al. 2019)
    sim = 1 - np.arccos(
        sklearn.metrics.pairwise.cosine_similarity(embeddings_1,
                                                   embeddings_2))/np.pi

    embeddings_1_col, embeddings_2_col, sim_col = [], [], []
    for i in range(len(embeddings_1)):
        for j in range(len(embeddings_2)):
            embeddings_1_col.append(labels_1[i])
            embeddings_2_col.append(labels_2[j])
            sim_col.append(sim[i][j])
    df = pd.DataFrame(zip(embeddings_1_col, embeddings_2_col, sim_col),
                      columns=['embeddings_1', 'embeddings_2', 'sim'])

    mapper = bokeh.models.LinearColorMapper(
        palette=[*reversed(bokeh.palettes.YlOrRd[9])], low=df.sim.min(),
        high=df.sim.max())

    p = bokeh.plotting.figure(title=plot_title, x_range=labels_1,
                              x_axis_location="above",
                              y_range=[*reversed(labels_2)],
                              plot_width=plot_width, plot_height=plot_height,
                              tools="save",toolbar_location='below', tooltips=[
            ('pair', '@embeddings_1 ||| @embeddings_2'),
            ('sim', '@sim')])
    p.rect(x="embeddings_1", y="embeddings_2", width=1, height=1, source=df,
           fill_color={'field': 'sim', 'transform': mapper}, line_color=None)

    p.title.text_font_size = '12pt'
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 16
    p.xaxis.major_label_text_font_size = xaxis_font_size
    p.xaxis.major_label_orientation = 0.25 * np.pi
    p.yaxis.major_label_text_font_size = yaxis_font_size
    p.min_border_right = 300

    bokeh.io.output_notebook()
    bokeh.io.show(p)

In [67]:
tqdm.pandas()

In [71]:
# The 16-language multilingual module is the default but feel free
# to pick others from the list and compare the results.
#module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3'
#@param ['https://tfhub.dev/google/universal-sentence-encoder-multilingual/3', 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3']

module_url = "https://tfhub.dev/google/LaBSE/2"

model = hub.load(module_url)

def embed_text(input):
    return model(input)

In [75]:
model("Testing 123")

ValueError: Could not find matching concrete function to call loaded from the SavedModel. Got:
  Positional arguments (3 total):
    * Testing 123
    * False
    * None
  Keyword arguments: {}

 Expected these arguments to match one of the following 4 option(s):

Option 1:
  Positional arguments (3 total):
    * {'input_type_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_type_ids'), 'input_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_mask'), 'input_word_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_word_ids')}
    * True
    * None
  Keyword arguments: {}

Option 2:
  Positional arguments (3 total):
    * {'input_type_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_type_ids'), 'input_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_mask'), 'input_word_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_word_ids')}
    * False
    * None
  Keyword arguments: {}

Option 3:
  Positional arguments (3 total):
    * {'input_type_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/input_type_ids'), 'input_word_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/input_word_ids'), 'input_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/input_mask')}
    * True
    * None
  Keyword arguments: {}

Option 4:
  Positional arguments (3 total):
    * {'input_type_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/input_type_ids'), 'input_word_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/input_word_ids'), 'input_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/input_mask')}
    * False
    * None
  Keyword arguments: {}

In [72]:
# Some texts of different lengths in different languages.
arabic_sentences = ['كلب', 'الجراء لطيفة.', 'أستمتع بالمشي لمسافات طويلة على طول الشاطئ مع كلبي.']
chinese_sentences = ['狗', '小狗很好。', '我喜欢和我的狗一起沿着海滩散步。']
english_sentences = ['dog', 'Puppies are nice.', 'I enjoy taking long walks along the beach with my dog.']
french_sentences = ['chien', 'Les chiots sont gentils.', 'J\'aime faire de longues promenades sur la plage avec mon chien.']
german_sentences = ['Hund', 'Welpen sind nett.', 'Ich genieße lange Spaziergänge am Strand entlang mit meinem Hund.']
italian_sentences = ['cane', 'I cuccioli sono carini.', 'Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane.']
japanese_sentences = ['犬', '子犬はいいです', '私は犬と一緒にビーチを散歩するのが好きです']
korean_sentences = ['개', '강아지가 좋다.', '나는 나의 개와 해변을 따라 길게 산책하는 것을 즐긴다.']
russian_sentences = ['собака', 'Милые щенки.', 'Мне нравится подолгу гулять по пляжу со своей собакой.']
spanish_sentences = ['perro', 'Los cachorros son agradables.', 'Disfruto de dar largos paseos por la playa con mi perro.']

# Multilingual example
multilingual_example = ["Willkommen zu einfachen, aber", "verrassend krachtige", "multilingüe", "compréhension du langage naturel", "модели.", "大家是什么意思" , "보다 중요한", ".اللغة التي يتحدثونها"]
multilingual_example_in_en =  ["Welcome to simple yet", "surprisingly powerful", "multilingual", "natural language understanding", "models.", "What people mean", "matters more than", "the language they speak."]

In [73]:
# Compute embeddings.
ar_result = embed_text(arabic_sentences)
en_result = embed_text(english_sentences)
es_result = embed_text(spanish_sentences)
de_result = embed_text(german_sentences)
fr_result = embed_text(french_sentences)
it_result = embed_text(italian_sentences)
ja_result = embed_text(japanese_sentences)
ko_result = embed_text(korean_sentences)
ru_result = embed_text(russian_sentences)
zh_result = embed_text(chinese_sentences)

multilingual_result = embed_text(multilingual_example)
multilingual_in_en_result = embed_text(multilingual_example_in_en)

ValueError: Could not find matching concrete function to call loaded from the SavedModel. Got:
  Positional arguments (3 total):
    * ['كلب', 'الجراء لطيفة.', 'أستمتع بالمشي لمسافات طويلة على طول الشاطئ مع كلبي.']
    * False
    * None
  Keyword arguments: {}

 Expected these arguments to match one of the following 4 option(s):

Option 1:
  Positional arguments (3 total):
    * {'input_type_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_type_ids'), 'input_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_mask'), 'input_word_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_word_ids')}
    * True
    * None
  Keyword arguments: {}

Option 2:
  Positional arguments (3 total):
    * {'input_type_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_type_ids'), 'input_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_mask'), 'input_word_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='input_word_ids')}
    * False
    * None
  Keyword arguments: {}

Option 3:
  Positional arguments (3 total):
    * {'input_type_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/input_type_ids'), 'input_word_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/input_word_ids'), 'input_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/input_mask')}
    * True
    * None
  Keyword arguments: {}

Option 4:
  Positional arguments (3 total):
    * {'input_type_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/input_type_ids'), 'input_word_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/input_word_ids'), 'input_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/input_mask')}
    * False
    * None
  Keyword arguments: {}

In [None]:
visualize_similarity(multilingual_in_en_result, multilingual_result,
                     multilingual_example_in_en, multilingual_example,
                     "Multilingual Universal Sentence Encoder for Semantic Retrieval (Yang et al., 2019)")

In [23]:
#visualize_similarity(zh_result, ko_result, chinese_sentences, korean_sentences, 'Chinese-Korean Similarity')

In [11]:
# So now try it with aligned texts

In [12]:
aligned_path = "../Texts_Aligned/de_tge.en_fowkes/"

In [15]:
cur_fpath = os.path.join(aligned_path, "ch01.align.txt")

In [20]:
cur_df = pd.read_csv(cur_fpath, delimiter='\t', header=None, names=['de','en','alignment_id'])

In [43]:
cur_df.fillna("", inplace=True)

In [45]:
cur_df.iloc[0]

de                                                    
en                         Chapter 1: The Commodity 1.
alignment_id    ch01_clean.de_tge-ch01_clean.en_fowkes
Name: 0, dtype: object

In [46]:
de_result = embed_text(cur_df.iloc[2]['de'])
en_result = embed_text(cur_df.iloc[2]['en'])

In [36]:
def compute_sim(l1_tensor, l2_tensor):
    # arccos based text similarity (Yang et al. 2019; Cer et al. 2019)
    sim = 1 - np.arccos(sklearn.metrics.pairwise.cosine_similarity(l1_tensor, l2_tensor))/np.pi
    return float(sim)

In [48]:
compute_row_sim = lambda row: compute_sim(embed_text(row['de']), embed_text(row['en']))

In [49]:
cur_df['sim'] = cur_df.apply(compute_row_sim, axis=1)

In [50]:
cur_df

Unnamed: 0,de,en,alignment_id,sim
0,,Chapter 1: The Commodity 1.,ch01_clean.de_tge-ch01_clean.en_fowkes,0.497484
1,ERSTES KAPITEL Die Ware 1. Die zwei Faktoren d...,THE TWO FACTORS OF THE COMMODITY: USE-VALUE AN...,ch01_clean.de_tge-ch01_clean.en_fowkes,0.757847
2,Unsere Untersuchung beginnt daher mit der Anal...,Our investigation therefore begins with the an...,ch01_clean.de_tge-ch01_clean.en_fowkes,0.859049
3,"Die Ware ist zunächst ein äußerer Gegenstand, ...","The commodity is, first of all, an external ob...",ch01_clean.de_tge-ch01_clean.en_fowkes,0.812498
4,"Die Natur dieser Bedürfnisse, ob sie z.B. dem ...","The nature of these needs, whether they arise,...",ch01_clean.de_tge-ch01_clean.en_fowkes,0.753188
...,...,...,...,...
613,Ein Mensch oder ein Gemeinwesen ist reich; ein...,"A man or a community is rich, a pearl or a dia...",ch01_clean.de_tge-ch01_clean.en_fowkes,0.813036
614,Bisher hat noch kein Chemiker Tauschwert in Pe...,So far no chemist has ever discovered exchange...,ch01_clean.de_tge-ch01_clean.en_fowkes,0.734134
615,Die ökonomischen Entdecker dieser chemischen S...,The economists who have discovered this chemic...,ch01_clean.de_tge-ch01_clean.en_fowkes,0.762967
616,"Was sie hierin bestätigt, ist der sonderbare U...",What confirms them in this view is the peculia...,ch01_clean.de_tge-ch01_clean.en_fowkes,0.748863


In [51]:
cur_df['sim'].mean()

0.7231409977940679

In [58]:
v2_fpath = "../Texts_Aligned/de_tge.en_aveling/ch01.align.txt"

In [61]:
v2_df = pd.read_csv(v2_fpath, delimiter='\t', header=None, names=['de','en','alignment_id'])

In [64]:
v2_df.fillna("", inplace=True)

In [68]:
v2_df['sim'] = v2_df.progress_apply(compute_row_sim, axis=1)

100%|██████████| 625/625 [00:08<00:00, 77.74it/s]


In [69]:
v2_df

Unnamed: 0,de,en,alignment_id,sim
0,ERSTES KAPITEL Die Ware 1. Die zwei Faktoren d...,Chapter 1: Commodities Section 1: The Two Fact...,ch01_clean.de_tge-ch01_clean.en_aveling,0.799426
1,Unsere Untersuchung beginnt daher mit der Anal...,Our investigation must therefore begin with th...,ch01_clean.de_tge-ch01_clean.en_aveling,0.805118
2,"Die Ware ist zunachst ein ausserer Gegenstand,...","A commodity is, in the first place, an object ...",ch01_clean.de_tge-ch01_clean.en_aveling,0.742811
3,"Die Natur dieser Bedurfnisse, ob sie z.B. dem ...","The nature of such wants, whether, for instanc...",ch01_clean.de_tge-ch01_clean.en_aveling,0.732976
4,"Es handelt sich hier auch nicht darum, wie die...",Neither are we here concerned to know how the ...,ch01_clean.de_tge-ch01_clean.en_aveling,0.685437
...,...,...,...,...
620,Eine Perle oder ein Diamant hat Wert als Perle...,A pearl or a diamond is valuable as a pearl or...,ch01_clean.de_tge-ch01_clean.en_aveling,0.789187
621,Bisher hat noch kein Chemiker Tauschwert in Pe...,So far no chemist has ever discovered exchange...,ch01_clean.de_tge-ch01_clean.en_aveling,0.732880
622,Die okonomischen Entdecker dieser chemischen S...,The economic discoverers of this chemical elem...,ch01_clean.de_tge-ch01_clean.en_aveling,0.699186
623,"Was sie hierin bestatigt, ist der sonderbare U...","What confirms them in this view, is the peculi...",ch01_clean.de_tge-ch01_clean.en_aveling,0.733851


In [70]:
v2_df['sim'].mean()

0.7033427904129028