##### Copyright 2018 The TensorFlow Hub Authors.

Licensed under the Apache License, Version 2.0 (the "License");

In [None]:
# Copyright 2018 The TensorFlow Hub Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Universal Sentence Encoder


<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/semantic_similarity_with_tf_hub_universal_encoder.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/hub/blob/master/examples/colab/semantic_similarity_with_tf_hub_universal_encoder.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/hub/examples/colab/semantic_similarity_with_tf_hub_universal_encoder.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
  <td>
    <a href="https://tfhub.dev/s?q=google%2Funiversal-sentence-encoder%2F4%20OR%20google%2Funiversal-sentence-encoder-large%2F5"><img src="https://www.tensorflow.org/images/hub_logo_32px.png" />See TF Hub models</a>
  </td>
</table>

This notebook illustrates how to access the Universal Sentence Encoder and use it for sentence similarity and sentence classification tasks.

The Universal Sentence Encoder makes getting sentence-level embeddings as easy as it has historically been to look up the embeddings for individual words. The sentence embeddings can then be trivially used to compute sentence-level meaning similarity as well as to enable better performance on downstream classification tasks using less supervised training data.


## Setup

This section sets up the environment for access to the Universal Sentence Encoder on TF Hub and provides examples of applying the encoder to words, sentences, and paragraphs.

In [None]:
# %%capture
# !pip3 install seaborn

More detailed information about installing TensorFlow can be found at [https://www.tensorflow.org/install/](https://www.tensorflow.org/install/).

In [None]:
#@title Load the Universal Sentence Encoder's TF Hub module
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

# Colab form field: URL of the Universal Sentence Encoder variant to load.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
# Load the encoder from TF Hub into the global `model`, which `embed` calls.
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  """Run the globally loaded hub `model` on `input` and return its output."""
  embeddings = model(input)
  return embeddings

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns



In [None]:
#@title Compute a representation for each message, showing various lengths supported.
# Three inputs of increasing length: a single word, a sentence and a short paragraph.
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the embedding will be.")
messages = [word, sentence, paragraph]

# Reduce logging output.
logging.set_verbosity(logging.ERROR)

# Encode all three messages with a single call to the encoder.
message_embeddings = embed(messages)

# For each message, print its text, the embedding length and the first three
# embedding components.
for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
  print("Message: {}".format(messages[i]))
  print("Embedding size: {}".format(len(message_embedding)))
  message_embedding_snippet = ", ".join(
      (str(x) for x in message_embedding[:3]))
  print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

In [None]:
np.inner(message_embeddings, message_embeddings)

# Semantic Textual Similarity Task Example

The embeddings produced by the Universal Sentence Encoder are approximately normalized. The semantic similarity of two sentences can be trivially computed as the inner product of the encodings.

In [None]:
def plot_similarity(labels, features, rotation, print_labels=True):
  """Draw a heatmap of the pairwise inner products of `features`.

  Args:
    labels: Texts belonging to the embeddings. Each tick label is the first
      25 characters of the text, followed by '/' and the full text length.
    features: Embeddings; the plotted matrix is np.inner(features, features).
    rotation: Rotation applied to the x-axis tick labels.
    print_labels: Whether each cell is annotated with its value
      (formatted with '.1f').
  """
  similarity = np.inner(features, features)

  tick_labels = []
  for text in labels:
    tick_labels.append(text[:25] + '/' + str(len(text)))

  # Global seaborn settings: figure size first, then font scale.
  sns.set(rc = {'figure.figsize':(20,12)})
  sns.set(font_scale=1.2)

  # The colour scale is pinned to [0, 1].
  heatmap_options = dict(
      xticklabels=tick_labels,
      yticklabels=tick_labels,
      vmin=0,
      vmax=1,
      annot=print_labels,
      fmt='.1f',
      cmap="YlOrRd")
  axes = sns.heatmap(similarity, **heatmap_options)
  axes.set_xticklabels(tick_labels, rotation=rotation)
  axes.set_title("Semantic Textual Similarity")

def run_and_plot(messages_):
  """Embed `messages_` via `embed` and plot their similarity heatmap.

  The x-axis labels are rotated by 90.
  """
  plot_similarity(messages_, embed(messages_), 90)

In [None]:
messages = [
# Six short sentences whose pairwise similarity is plotted below.
"What is a color or a friend.",
"This house has a big room and some small rooms.",
"My father was a rolling stone.",
"My father is a good guy.",
"When I get older.",
"When I was younger."
]

run_and_plot(messages)

## Evaluation: STS (Semantic Textual Similarity) Benchmark

The [**STS Benchmark**](https://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) provides an intrinsic evaluation of the degree to which similarity scores computed using sentence embeddings align with human judgements. The benchmark requires systems to return similarity scores for a diverse selection of sentence pairs. [Pearson correlation](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient) is then used to evaluate the quality of the machine similarity scores against human judgements.

### Download data

In [None]:
# import pandas
# import scipy
# import math
# import csv

# sts_dataset = tf.keras.utils.get_file(
#     fname="Stsbenchmark.tar.gz",
#     origin="http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz",
#     extract=True)
# sts_dev = pandas.read_table(
#     os.path.join(os.path.dirname(sts_dataset), "stsbenchmark", "sts-dev.csv"),
#     error_bad_lines=False,
#     skip_blank_lines=True,
#     usecols=[4, 5, 6],
#     names=["sim", "sent_1", "sent_2"])
# sts_test = pandas.read_table(
#     os.path.join(
#         os.path.dirname(sts_dataset), "stsbenchmark", "sts-test.csv"),
#     error_bad_lines=False,
#     quoting=csv.QUOTE_NONE,
#     skip_blank_lines=True,
#     usecols=[4, 5, 6],
#     names=["sim", "sent_1", "sent_2"])
# # cleanup some NaN values in sts_dev
# sts_dev = sts_dev[[isinstance(s, str) for s in sts_dev['sent_2']]]

### Evaluate Sentence Embeddings

In [None]:
# sts_data = sts_dev #@param ["sts_dev", "sts_test"] {type:"raw"}

# def run_sts_benchmark(batch):
#   sts_encode1 = tf.nn.l2_normalize(embed(tf.constant(batch['sent_1'].tolist())), axis=1)
#   sts_encode2 = tf.nn.l2_normalize(embed(tf.constant(batch['sent_2'].tolist())), axis=1)
#   cosine_similarities = tf.reduce_sum(tf.multiply(sts_encode1, sts_encode2), axis=1)
#   clip_cosine_similarities = tf.clip_by_value(cosine_similarities, -1.0, 1.0)
#   scores = 1.0 - tf.acos(clip_cosine_similarities) / math.pi
#   """Returns the similarity scores"""
#   return scores

# dev_scores = sts_data['sim'].tolist()
# scores = []
# for batch in np.array_split(sts_data, 10):
#   scores.extend(run_sts_benchmark(batch))

# pearson_correlation = scipy.stats.pearsonr(scores, dev_scores)
# print('Pearson correlation coefficient = {0}\np-value = {1}'.format(
#     pearson_correlation[0], pearson_correlation[1]))

In [None]:
# !pip3 install transformers

In [None]:
from transformers import AutoTokenizer, AutoModel

# Candidate Hugging Face checkpoints; exactly one `checkpoint` assignment is
# left active, the others are kept for quick switching.
#checkpoint = "allenai/longformer-base-4096"
#checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
#checkpoint = "dmis-lab/biobert-large-cased-v1.1"
#checkpoint = "mrm8488/longformer-base-4096-finetuned-squadv2"
#checkpoint = "johngiorgi/declutr-small"
checkpoint = "johngiorgi/declutr-base"
#checkpoint = "johngiorgi/declutr-sci-base"
#checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# NOTE(review): this rebinds the global `model` that `embed()` calls, so once
# this cell has run, `embed()` / `run_and_plot()` no longer reach the TF Hub
# encoder loaded earlier.
model = AutoModel.from_pretrained(checkpoint)

In [None]:
text = "my father was a Rolling Stone"*100
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

In [None]:
outputs = model(**inputs)

In [None]:
outputs.keys()

In [None]:
outputs['last_hidden_state'][:,7,:].shape #, outputs['pooler_output'].shape, 

In [None]:
#outputs['last_hidden_state'][:,-1,:] == outputs.pooler_output

In [None]:
import torch

def embed_trsf(messages):
    """Embed `messages` with the global Hugging Face `tokenizer` and `model`.

    The model's `pooler_output` is used when the output provides one;
    otherwise the last hidden state at the first token position is used.
    The chosen tensor is normalized with `torch.nn.functional.normalize`
    (default arguments) and returned as a numpy array.

    Args:
        messages: Text or list of texts passed to `tokenizer` (with padding
            and truncation enabled).

    Returns:
        numpy.ndarray holding the normalized embeddings.
    """
    inputs = tokenizer(messages, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Pick the pooled output explicitly instead of relying on a bare `except`,
    # which would also hide unrelated failures (and KeyboardInterrupt).
    # The attribute may be missing or None depending on the model.
    pooled = getattr(outputs, "pooler_output", None)
    if pooled is None:
        pooled = outputs.last_hidden_state[:, 0, :]
    return torch.nn.functional.normalize(pooled.detach()).numpy()

def run_and_plot_trsf(messages_):
  """Embed `messages_` via `embed_trsf` and plot their similarity heatmap.

  The x-axis labels are rotated by 90.
  """
  plot_similarity(messages_, embed_trsf(messages_), 90)

In [None]:
m = ["We found him on the floor in a critical state"]

inputs = tokenizer(m, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

outputs.pooler_output

In [None]:
messages = [
"What is a color or a friend.",
"This house has a big room and some small rooms.",
"My father was a rolling stone.",
"My father is a good guy.",
"When I get older.",
"When I was younger."
]

run_and_plot_trsf(messages)

In [None]:
from sentence_transformers import SentenceTransformer
# Alternative, kept for quick switching:
#model = SentenceTransformer('all-MiniLM-L6-v2')
# NOTE(review): rebinds the global `model` once more; from here on
# `model.encode(...)` is what `run_and_plot_se` relies on.
model = SentenceTransformer('all-mpnet-base-v2')

# Sentences we would like to encode
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']

# Sentences are encoded by calling model.encode()
sentence_embeddings = model.encode(sentences)


In [None]:
def run_and_plot_se(messages_, print_labels=True):
    """Encode `messages_` with the global `model` and plot their similarity.

    Args:
        messages_: Texts handed to `model.encode` and used as heatmap labels.
        print_labels: Forwarded to `plot_similarity`; whether cells are
            annotated with their values.
    """
    plot_similarity(messages_, model.encode(messages_), 90, print_labels)

In [None]:
messages = [
"What is a color or a friend.",
"This house has a big room and some small rooms.",
"My father was a rolling stone.",
"My father is a good guy.",
"When I get older.",
"When I was younger."
]

run_and_plot_se(messages)

In [None]:
messages = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'The girl is carrying a baby.',
          'A man is riding a horse.',
          'A woman is playing violin.',
          'Two men pushed carts through the woods.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.'
          ]
run_and_plot_se(messages)

In [None]:
model

In [None]:
from sentence_transformers import SentenceTransformer, models
import torch.nn as nn
# Build a SentenceTransformer by hand: transformer -> pooling -> dense -> normalize.
#word_embedding_model = models.Transformer('bert-base-uncased')
word_embedding_model = models.Transformer('jamesmullenbach/CLIP_DNote_BERT_Context')

# Register two extra special tokens and resize the embedding matrix to the
# new tokenizer length.
tokens = ["[DOC]", "[QRY]"]
word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
# Dense projection to 1024 features with a Tanh activation. It is constructed
# here, not loaded from a checkpoint.
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=1024, activation_function=nn.Tanh())
normalizer = models.Normalize()
# NOTE(review): rebinds the global `model` used by `run_and_plot_se`.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model, normalizer])

# Sentences we would like to encode
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']

# Sentences are encoded by calling model.encode()
sentence_embeddings = model.encode(sentences)

In [None]:
sentence_embeddings.shape

In [None]:
model

In [None]:
import pickle

# NOTE(review): absolute, machine-specific path; adjust it to your local copy
# of the data before running.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that come from a trusted source.
# Open through a context manager so the file handle is closed after loading.
with open('/Users/jplasser/Documents/AI Master/WS2021/MastersThesis/code.nosync/CNEP/src/data/mimic3/full_train_data_unique.pickle', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [None]:
notes = df['notes']
messages = []
no_messages = 50  # number of notes included in the heatmap

# Take the first 200 characters of each of the first `no_messages` notes.
# NOTE(review): every entry is wrapped in double quotes with a trailing comma;
# those characters are part of the encoded text and of the heatmap labels.
# Confirm this is intended.
for i in range(no_messages):
    messages.append(f'"{notes[i][:200]}",')
    
# Cell annotations are switched off for this plot.
run_and_plot_se(messages, print_labels=False)

In [None]:
# experimental: take the modules of all-mpnet-base-v2 apart and rebuild the
# pipeline with an additional Dense projection (1024 features, Tanh) in front
# of the final module.
model = SentenceTransformer('all-mpnet-base-v2')
# Modules '0'..'2' of the loaded model; presumably transformer, pooling and
# normalization, going by the names they are bound to. TODO confirm.
word_embedding_model = model._modules['0']
pooling_model = model._modules['1']
normalize_model = model._modules['2']

#pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=1024, activation_function=nn.Tanh())
#normalizer = models.Normalize()
# Use `normalize_model` extracted above. The previous version referenced
# `normalizer`, which is only defined if another cell happened to run first.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model, normalize_model])

In [None]:
model = SentenceTransformer('../models/pretrained_sentence_transformer')

In [None]:
model

In [None]:
model._modules['3']

In [None]:
pretrained_model="abc"

In [None]:
pretrained_model

In [None]:
# for m in word_embedding_model.modules():
#     print(m)

In [None]:
model

In [None]:
model.save('pretrained_sentence_transformer')

# SentenceTransformer Model used in Master Thesis Code

In [None]:
from sentence_transformers import SentenceTransformer, models
from torch import nn

# Pipeline: transformer (max_seq_length=384) -> pooling -> dense (1024, Tanh) -> normalize.
# 'microsoft/mpnet-base'
word_embedding_model = models.Transformer('sentence-transformers/all-mpnet-base-v2', max_seq_length=384)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=1024,
                           activation_function=nn.Tanh())

normalize_model = models.Normalize()
# NOTE(review): rebinds the global `model`.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model, normalize_model])

# freeze transformer layers of the model
# Only the parameters of the first module's `auto_model` are frozen; the other
# modules are left untouched.
auto_model = model._first_module().auto_model
for param in auto_model.parameters():
    param.requires_grad = False
    

In [None]:
from sentence_transformers import SentenceTransformer, models
from torch import nn

# clinical BERT model embeddings
# Pipeline: transformer (max_seq_length=384) -> pooling -> dense (15000, ReLU)
# -> dense (1024, ReLU) -> normalize.
word_embedding_model = models.Transformer('emilyalsentzer/Bio_ClinicalBERT', max_seq_length=384)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=15000,
                           activation_function=nn.ReLU())
# `in_features` must match the `out_features` of the first dense layer.
dense_model2 = models.Dense(in_features=15000, out_features=1024,
                           activation_function=nn.ReLU())

normalize_model = models.Normalize()

# NOTE(review): rebinds the global `model`.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model, dense_model2, normalize_model])

# freeze transformer layers of the model
# Only the parameters of the first module's `auto_model` are frozen; the other
# modules are left untouched.
auto_model = model._first_module().auto_model
for param in auto_model.parameters():
    param.requires_grad = False
    


In [None]:
model

In [None]:
model.save('pretrained_sentence_transformer_clinicalBertEmbeds')

In [None]:
model.load('pretrained_sentence_transformer_3')

In [None]:
model = SentenceTransformer('pretrained_sentence_transformer')

In [None]:
model

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings, weighting each token by the attention mask.

    Args:
        model_output: Model output whose first element holds the token
            embeddings.
        attention_mask: Mask that is broadcast over the embedding dimension;
            positions with value 0 do not contribute to the average.

    Returns:
        Tensor of the masked sums along dim 1 divided by the mask totals
        (clamped to at least 1e-9 to avoid division by zero).
    """
    hidden_states = model_output[0]
    # Expand the mask to the embedding shape so it can weight every component.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(1)
    weight_total = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / weight_total


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub.
# Bound to dedicated names so that the globals `model` and `tokenizer` are not
# overwritten: `run_and_plot_se` calls `model.encode(...)`, and a later cell
# still uses it, so `model` must stay a SentenceTransformer rather than
# become a plain AutoModel.
mpnet_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
mpnet_model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# Tokenize sentences
encoded_input = mpnet_tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = mpnet_model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)

In [None]:
sentence_embeddings.shape

In [None]:
np.inner(sentence_embeddings, sentence_embeddings)

In [None]:
sentence_embeddings @ sentence_embeddings.T

In [None]:
# Sample roughly 20 evenly spaced notes (the last element is excluded by the
# `:-1` stop).
all_notes = df['notes']
# Guard the stride: with fewer than 20 notes `len // 20` is 0, and a slice
# step of 0 raises ValueError.
sample_step = max(1, len(all_notes) // 20)
# Materialize as a plain list. If `all_notes` is a pandas Series, a strided
# slice keeps the original (non-contiguous) index labels, which can break
# positional lookups downstream. Presumably `model.encode` indexes its input
# by position; TODO confirm.
messages = list(all_notes[:-1:sample_step])
run_and_plot_se(messages)