## Reference: Introduction to Google's Universal Sentence Encoder — https://heartbeat.fritz.ai/introduction-to-googles-universal-sentence-encoder-a-state-of-art-model-377c025afaca

In [12]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy.spatial

In [13]:
# Fetch the pretrained Universal Sentence Encoder (version 4) from TF Hub.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)

# Report which module URL the encoder was loaded from.
load_message = "Universal sentence encoder loaded from {}".format(module_url)
print(load_message)

Universal sentence encoder loaded from https://tfhub.dev/google/universal-sentence-encoder/4


## get embeddings for the sentence

In [None]:
def embed(input):
  """Run the module-level `model` on `input` and return its output.

  Args:
    input: Batch of sentences (the notebook passes a list of strings).

  Returns:
    Whatever `model(input)` produces; the notebook indexes it per sentence.
  """
  embeddings = model(input)
  return embeddings

In [None]:
# Embed a single example sentence (wrapped in a list, i.e. a batch of one).
sentence = "This is an example code to get embeddings of a sentence"
message_embeddings = embed([sentence])

# Length of the embedding produced for that one sentence.
embedding_size = len(message_embeddings[0])

print("Message: {}".format(sentence))
print("Embedding size: {}".format(embedding_size))

Message: This is an example code to get embeddings of a sentence
Embedding size: 512


## measuring textual similarity

In [26]:
def similarity_measure(messages_, embed_fn=None):
  """Print and return the cosine similarity of the first two messages.

  Only `messages_[0]` and `messages_[1]` are compared; any further entries are
  embedded but otherwise ignored.

  Args:
    messages_: Sequence of at least two sentences.
    embed_fn: Optional callable mapping the sequence of sentences to one
      embedding per sentence. When omitted, the module-level `embed` is looked
      up at call time, so the result depends on whatever `embed` is currently
      bound to in the notebook.

  Returns:
    The similarity (1 - cosine distance) as a Python float.

  Raises:
    ValueError: If fewer than two messages are supplied.
  """
  if len(messages_) < 2:
    raise ValueError(
        "similarity_measure needs at least two messages, got {}".format(
            len(messages_)))

  # Resolve the encoder lazily so an explicit `embed_fn` never touches globals.
  encoder = embed if embed_fn is None else embed_fn
  message_embeddings_ = encoder(messages_)

  # cdist returns a 1x1 matrix here; `[0]` keeps a one-element row so the
  # printed text stays in the same `[score]` form as before.
  distance1 = scipy.spatial.distance.cdist(
      [message_embeddings_[0]], [message_embeddings_[1]], "cosine")[0]
  similarity = 1 - distance1
  print("Similarity Score: {}".format(similarity))
  return float(similarity[0])

In [None]:
# Two differently worded versions of the same question.
messages = ["how old are you?", "what is your age?"]
similarity_measure(messages)

Similarity Score: [0.80158677]


## spaCy similarity comparison — reference: https://amitness.com/2020/06/universal-sentence-encoder/

In [1]:
# Shell escape: ask spaCy's CLI to download the `en_core_web_md` model package,
# which a later cell imports as a Python module.
!python -m spacy download en_core_web_md

['Collecting en_core_web_md==2.2.5',
 '\x1b[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4MB)',
 '',
 '\x1b[K     |                                | 10kB 15.7MB/s eta 0:00:07',
 '\x1b[K     |                                | 20kB 14.0MB/s eta 0:00:07',
 '\x1b[K     |                                | 30kB 11.9MB/s eta 0:00:09',
 '\x1b[K     |                                | 40kB 9.0MB/s eta 0:00:11',
 '\x1b[K     |                                | 51kB 7.8MB/s eta 0:00:13',
 '\x1b[K     |                                | 61kB 7.7MB/s eta 0:00:13',
 '\x1b[K     |                                | 71kB 7.2MB/s eta 0:00:14',
 '\x1b[K     |                                | 81kB 7.8MB/s eta 0:00:13',
 '\x1b[K     |                                | 92kB 8.4MB/s eta 0:00:12',
 '\x1b[K     |                                | 102kB 7.4MB/s eta 0:00:13',
 '\x1b[K     |                                | 112kB 7.4MB

In [3]:
# Load the spaCy pipeline from the `en_core_web_md` model package; `nlp` is the
# callable used below to turn text into objects that expose `.similarity()`.
import en_core_web_md
nlp = en_core_web_md.load()

In [4]:
# Compare a three-word sentence with its own first word.
sentence_doc = nlp("It is cool")
single_word_doc = nlp("It")
sentence_doc.similarity(single_word_doc)

0.8813888193696543

In [5]:
# Same three words in a different order: a statement versus a question.
statement_doc = nlp('this is cool')
question_doc = nlp('is this cool?')
statement_doc.similarity(question_doc)

0.9578698295673016

# Multilingual Universal Sentence Encoder (USE) in TensorFlow, as described in the paper

https://arxiv.org/pdf/1907.04307.pdf

How to use the multilingual USE model in TensorFlow 2:

https://stackoverflow.com/a/58792904

In [10]:
# Shell escape: install both SentencePiece packages in one pip invocation.
# NOTE(review): versions are unpinned; the recorded run resolved
# sentencepiece 0.1.95 and tf-sentencepiece 0.1.90.
!pip install sentencepiece tf-sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 7.4MB/s 
[?25hCollecting tf-sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/16/a5/16123d662ebeb087552c39c895e9ec6239fb828e236d95fdf67b20907b27/tf_sentencepiece-0.1.90-py2.py3-none-manylinux1_x86_64.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 27.8MB/s 
[?25hInstalling collected packages: sentencepiece, tf-sentencepiece
Successfully installed sentencepiece-0.1.95 tf-sentencepiece-0.1.90


In [11]:
# NOTE(review): the recorded output of this cell is `NotFoundError`. A later
# cell imports `tensorflow_text` before loading a multilingual encoder and runs
# successfully — presumably this load needs the same import; confirm before
# re-running.
import tensorflow_hub as hub

# Rebinds the module-level `model` to the multilingual encoder (version 1) if
# the load succeeds.
model = hub.load(
    "https://tfhub.dev/google/universal-sentence-encoder-multilingual/1")

# Redefines `embed`, replacing any earlier definition of the same name.
def embed(input):
  """Return `model(input)` using the module-level `model`."""
  return model(input)

# Greetings in several languages, embedded as one batch.
# NOTE(review): the last literal (Arabic) looks garbled — verify it against the
# intended text.
multilingual_embeddings = embed(
    ["Hola Mundo!", "Bonjour le monde!", "Ciao mondo!",
    "Hello World!", "Hallo Welt!", "Hallo Wereld!",
     "你好世界!", "Привет, мир!", "!لمAلعAب Abحrم("])

NotFoundError: ignored

In [16]:
!pip3 install tensorflow_text>=2.0.0rc0

In [19]:
# Shell escapes: install the two SentencePiece packages, one pip call each.
# NOTE(review): an earlier `!pip install` cell in this notebook already
# installs the same two packages; versions are unpinned here as well.
!pip3 install sentencepiece
!pip3 install tf-sentencepiece



In [22]:
import tensorflow_hub as hub
import numpy as np
import tensorflow_text

# Parallel example sentences of increasing length in English, Italian and
# Japanese.
english_sentences = [
    "dog",
    "Puppies are nice.",
    "I enjoy taking long walks along the beach with my dog.",
]
italian_sentences = [
    "cane",
    "I cuccioli sono carini.",
    "Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane.",
]
japanese_sentences = [
    "犬",
    "子犬はいいです",
    "私は犬と一緒にビーチを散歩するのが好きです",
]

# NOTE: this rebinds the name `embed` to the loaded multilingual model object,
# so anything that looks up `embed` afterwards uses this model.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

# Embed each language's batch with the same model, in en / it / ja order.
en_result, it_result, ja_result = (
    embed(batch)
    for batch in (english_sentences, italian_sentences, japanese_sentences)
)

# Pairwise inner products: English sentences index the rows, the other
# language indexes the columns. Higher score indicates greater similarity.
similarity_matrix_it = np.inner(en_result, it_result)
similarity_matrix_ja = np.inner(en_result, ja_result)






In [23]:
# Display the English-vs-Italian scores: entry [i, j] is the inner product of
# English sentence i's embedding with Italian sentence j's embedding.
similarity_matrix_it

array([[0.9578781 , 0.33086258, 0.30247933],
       [0.38761467, 0.7339295 , 0.24814026],
       [0.2361753 , 0.21800485, 0.928301  ]], dtype=float32)

In [24]:
# Display the English-vs-Japanese scores: entry [i, j] is the inner product of
# English sentence i's embedding with Japanese sentence j's embedding.
similarity_matrix_ja

array([[0.9171357 , 0.511527  , 0.31587186],
       [0.44313598, 0.658635  , 0.30921304],
       [0.26650527, 0.25377446, 0.76729906]], dtype=float32)

In [28]:
# Cross-lingual pair: one English sentence and one Japanese sentence.
messages = ["Puppies are nice", "子犬はいいです"]
similarity_measure(messages)

Similarity Score: [0.67488011]
