<a href="https://colab.research.google.com/github/korakot/pythainlp_workshop/blob/master/notebooks/USE_encoder_Thai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Google เขา [แจก](http://ai.googleblog.com/2019/07/multilingual-universal-sentence-encoder.html) มา 3 เดือนแล้ว. เพิ่งจะได้ลองใช้จริงๆ

ทำตาม [tfhub](https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/1) และ [notebook](https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/cross_lingual_similarity_with_tf_hub_multilingual_universal_encoder.ipynb)

ใช้ GPU runtime

## Install, import

In [18]:
# sentencepiece works with TensorFlow 1.14 (tried 1.15 and the error message said so),
# so the preinstalled tensorflow is removed and tensorflow-gpu 1.14.0 is installed instead.
!pip uninstall --quiet --yes tensorflow
!pip install --quiet tensorflow-gpu==1.14.0
!pip install --quiet tf-sentencepiece
# !pip install --quiet tensorflow-hub
# !pip install --quiet bokeh
# !pip install --quiet simpleneighbors
# !pip install --quiet tqdm



In [19]:
%tensorflow_version 1.x
import tensorflow as tf
import tensorflow_hub as hub
import tf_sentencepiece
tf.__version__

'1.14.0'

In [20]:
# Load the encoder from TF Hub; this "large" variant is probably the best one.
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/1'  
g = tf.Graph()
with g.as_default():
  # Batch of raw strings; shape [None] leaves the batch size open.
  text_input = tf.placeholder(dtype=tf.string, shape=[None])
  multiling_embed = hub.Module(module_url)
  # Output tensor of the module applied to the placeholder; later cells fetch it via session.run.
  embed = multiling_embed(text_input)
  # Single op that runs both the global-variable and the table initializers.
  init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
g.finalize()
# The session is kept open at module level and reused by every later cell.
session = tf.Session(graph=g)
session.run(init_op)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


## Encode

In [21]:
# Smoke test: feed one sentence as a one-element batch and take the first row of the result.
sent = 'Puppies are nice.'
vec = session.run(embed, feed_dict={text_input: [sent]})[0]
print(len(vec))
# Show the first three components as the cell output.
vec[:3]

512


array([ 0.00475384,  0.04097135, -0.00650338], dtype=float32)

In [0]:
def encode(text):
  """Embed a single string and return its vector.

  `text` is fed as a one-element batch through the module-level `session`,
  `embed` and `text_input`; the first (only) row of the result is returned.
  """
  batch = session.run(embed, feed_dict={text_input: [text]})
  return batch[0]

In [23]:
vec = encode('สวัสดีครับ สบายดีไหม')
print(len(vec), vec.dtype, vec[:3])

512 float32 [-0.00248025  0.04347089 -0.0217911 ]


## Visualize

In [0]:
# Parallel example sentences: th_sentences[i] is the Thai counterpart of en_sentences[i].
en_sentences = ['dog', 'Puppies are nice.', 
                     'I enjoy taking long walks along the beach with my dog.',
                     'I have dropped my phone in water.']
th_sentences    = ['หมา', 'ลูกหมาน่ารัก',
                     'ฉันชอบการจูงหมาของฉันไปเดินเล่นที่ชายหาด.',
                     'ฉันทำมือถือตกน้ำ']

In [0]:
# Embed each language's sentence list in one batched run (one result row per sentence).
en_result = session.run(embed, feed_dict={text_input: en_sentences})
th_result = session.run(embed, feed_dict={text_input: th_sentences})

In [0]:
import numpy as np
import pandas as pd
import sklearn.metrics.pairwise
import bokeh
import bokeh.models
import bokeh.plotting

In [0]:
#@title visualize_similarity()
def visualize_similarity(embeddings_1, embeddings_2, labels_1, labels_2,
                         plot_title,
                         plot_width=1200, plot_height=600,
                         xaxis_font_size='12pt', yaxis_font_size='12pt'):
  """Render a Bokeh heatmap of pairwise similarity between two embedding sets.

  Args:
    embeddings_1: Embeddings shown along the x axis, one row per label in
      `labels_1`.
    embeddings_2: Embeddings shown along the y axis, one row per label in
      `labels_2`.
    labels_1: Labels for `embeddings_1`; used as the x-axis categories.
    labels_2: Labels for `embeddings_2`; used (reversed) as the y-axis
      categories.
    plot_title: Title of the figure.
    plot_width: Figure width passed to `bokeh.plotting.figure`.
    plot_height: Figure height passed to `bokeh.plotting.figure`.
    xaxis_font_size: Font size of the x-axis labels.
    yaxis_font_size: Font size of the y-axis labels.

  Returns:
    None. The figure is displayed inline via `bokeh.io.show`.

  Raises:
    AssertionError: If a label list and its embedding set differ in length.
  """
  assert len(embeddings_1) == len(labels_1)
  assert len(embeddings_2) == len(labels_2)

  # arccos based text similarity (Yang et al. 2019; Cer et al. 2019)
  cosine = sklearn.metrics.pairwise.cosine_similarity(embeddings_1,
                                                      embeddings_2)
  # Rounding can push the cosine slightly outside [-1, 1] (e.g. for nearly
  # identical vectors), where arccos returns NaN; clip before taking arccos.
  sim = 1 - np.arccos(np.clip(cosine, -1.0, 1.0)) / np.pi

  # One row per (label_1, label_2) pair, in row-major order of `sim`.
  pairs = [(label_1, label_2, sim[i][j])
           for i, label_1 in enumerate(labels_1)
           for j, label_2 in enumerate(labels_2)]
  df = pd.DataFrame(pairs, columns=['embeddings_1', 'embeddings_2', 'sim'])

  # Colour scale spans the observed similarity range, using the reversed
  # YlOrRd[9] palette.
  mapper = bokeh.models.LinearColorMapper(
      palette=list(reversed(bokeh.palettes.YlOrRd[9])), low=df.sim.min(),
      high=df.sim.max())

  p = bokeh.plotting.figure(title=plot_title, x_range=labels_1,
                            x_axis_location="above",
                            y_range=list(reversed(labels_2)),
                            plot_width=plot_width, plot_height=plot_height,
                            tools="save", toolbar_location='below', tooltips=[
                                ('pair', '@embeddings_1 ||| @embeddings_2'),
                                ('sim', '@sim')])
  # One unit square per label pair, filled according to its similarity.
  p.rect(x="embeddings_1", y="embeddings_2", width=1, height=1, source=df,
         fill_color={'field': 'sim', 'transform': mapper}, line_color=None)

  p.title.text_font_size = '12pt'
  p.axis.axis_line_color = None
  p.axis.major_tick_line_color = None
  p.axis.major_label_standoff = 16
  p.xaxis.major_label_text_font_size = xaxis_font_size
  p.xaxis.major_label_orientation = 0.25 * np.pi
  p.yaxis.major_label_text_font_size = yaxis_font_size
  p.min_border_right = 300

  bokeh.io.output_notebook()
  bokeh.io.show(p)

In [28]:
visualize_similarity(en_result, th_result, en_sentences, th_sentences, 'English-Thai Similarity')

### ลองใช้ tensorboard projector บ้าง

In [0]:
# Projector inputs: English rows followed by Thai rows, with the sentence
# labels concatenated in the same order.
vectors = np.concatenate((en_result, th_result))
metadata = en_sentences + th_sentences

In [0]:
# avoid bug
# Replaces tf.io.gfile with TensorBoard's stub implementation for the rest of the kernel session.
# NOTE(review): presumably a workaround for add_embedding failing against the real
# tf.io.gfile; the recorded output of the next cell still shows an AttributeError,
# so confirm this patch is sufficient for the installed versions.
import tensorflow as tf
import tensorboard as tb
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

In [31]:
# Write the sentence vectors and their labels for the TensorBoard projector.
# SummaryWriter() is created with no arguments; presumably it logs under ./runs,
# which is the directory the %tensorboard cell reads (--logdir=runs).
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()
try:
  writer.add_embedding(vectors, metadata)
finally:
  # Close the writer even when add_embedding raises, so it is never left open.
  writer.close()

AttributeError: ignored

In [0]:
%load_ext tensorboard

In [0]:
# Open the menu at the top right of the cell and choose "View output fullscreen" for a clearer view.
%tensorboard --logdir=runs

## Sentence Piece

In [0]:
embed

In [0]:
mod = multiling_embed

In [0]:
list(mod.variable_map.keys())

In [0]:
text_input

In [0]:
mod.variable_map['Embeddings/sharded_0']