In [None]:
%%capture
!pip install tika
!pip install -U spacy
!python -m spacy download xx_sent_ud_sm
!pip install sentence-transformers
!pip install onnxruntime

In [None]:
import pprint
import multiprocessing
from pathlib import Path

import torch
import torch.onnx as onnx_pt
import transformers

import numpy as np
import onnxruntime as rt

from termcolor import colored
from sentence_transformers import SentenceTransformer
from transformers import convert_graph_to_onnx

# Pretty-printer with a 4-space indent, used to inspect nested structures below.
pp = pprint.PrettyPrinter(indent=4)
# NOTE: this rebinds the name `pprint` from the module to the bound method
# `pp.pprint`; from here on `pprint(obj)` pretty-prints `obj` directly.
pprint = pp.pprint

In [None]:
# Report the hardware visible to this kernel (CUDA devices and CPU cores), in green.
print(colored(f"GPU Count: {torch.cuda.device_count()}", "green"))
print(colored(f"CORE Count: {multiprocessing.cpu_count()}", "green"))

[32mGPU Count: 0[0m
[32mCORE Count: 2[0m


In [None]:
# Short example sentence used for the sanity checks and timings in this notebook.
span = "I am a span. A short span, but nonetheless a span"

# Checkpoint name; a plain string literal (it has no placeholders, so no f-prefix).
model_name = "paraphrase-multilingual-MiniLM-L12-v2"
# Fully qualified model id under the `sentence-transformers` namespace.
model_access = f"sentence-transformers/{model_name}"

In [None]:
# Reference model: load the checkpoint through the sentence-transformers library on CPU.
# NOTE: `SentenceTransformer` must be the class imported from `sentence_transformers`
# here. A later cell defines a class with the same name, so re-running this cell
# after that definition would resolve to the later class instead.
model_raw = SentenceTransformer(model_name, device="cpu")

Downloading:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

In [None]:
# Load the same checkpoint through plain `transformers`: the model and its fast
# tokenizer are both fetched from `model_access` and wrapped in a
# feature-extraction pipeline.
model_pipeline = transformers.FeatureExtractionPipeline(
    model=transformers.AutoModel.from_pretrained(model_access),
    tokenizer=transformers.AutoTokenizer.from_pretrained(model_access, use_fast=True),
    framework="pt",  # PyTorch backend
    device=-1  # presumably CPU (transformers pipeline convention) — TODO confirm
)

# Bare expression: shows the pipeline's repr as the cell output.
model_pipeline

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/449M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

<transformers.pipelines.feature_extraction.FeatureExtractionPipeline at 0x7f1095678190>

In [None]:
# Keep a handle on the model configuration owned by the pipeline and print it
# to inspect the architecture hyper-parameters.
config = model_pipeline.model.config
print(config)

# Same for the tokenizer; later cells reuse this `tokenizer` object.
tokenizer = model_pipeline.tokenizer
print(tokenizer)

BertConfig {
  "_name_or_path": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 250037
}

PreTrainedTokenizerFast(name_or_path='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', vocab_size=250002, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<p

In [None]:
# Persist the tokenizer files under a directory named after the model id.
# The path is derived from `model_access` instead of repeating the literal, so it
# stays correct if the configured model changes.
tokenizer.save_pretrained(f"./{model_access}")

('./sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/tokenizer_config.json',
 './sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/special_tokens_map.json',
 './sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/unigram.json',
 './sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/added_tokens.json',
 './sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/tokenizer.json')

In [None]:
# Collect what the ONNX export needs from the pipeline, with gradient tracking disabled.
with torch.no_grad():
    # Returns the input names, output names, their dynamic axes and the example
    # tokens used for the inference.
    input_names, output_names, dynamic_axes, tokens = convert_graph_to_onnx.infer_shapes(
        model_pipeline, 
        "pt"
    )
    # Produces the inputs re-ordered for the wrapped model.
    # NOTE: the recorded log reports the generated order as
    # ['input_ids', 'attention_mask', 'token_type_ids'], which differs from the
    # order of `input_names` (['input_ids', 'token_type_ids', 'attention_mask']).
    ordered_input_names, model_args = convert_graph_to_onnx.ensure_valid_input(
        model_pipeline.model, tokens, input_names
    )

Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch', 1: 'sequence'}
Found output output_1 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']


In [None]:
# Dump everything gathered for the export so names, axes and example tensors
# can be inspected before they are modified.
pprint(input_names)
pprint(output_names)
pprint(dynamic_axes)
pprint(tokens)
pprint(ordered_input_names)
pprint(model_args)

['input_ids', 'token_type_ids', 'attention_mask']
['output_0', 'output_1']
{   'attention_mask': {0: 'batch', 1: 'sequence'},
    'input_ids': {0: 'batch', 1: 'sequence'},
    'output_0': {0: 'batch', 1: 'sequence'},
    'output_1': {0: 'batch'},
    'token_type_ids': {0: 'batch', 1: 'sequence'}}
{   'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]),
    'input_ids': tensor([[     0,   3293,     83,     10, 121413, 140992,      2]]),
    'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]])}
['input_ids', 'attention_mask', 'token_type_ids']
(   tensor([[     0,   3293,     83,     10, 121413, 140992,      2]]),
    tensor([[1, 1, 1, 1, 1, 1, 1]]),
    tensor([[0, 0, 0, 0, 0, 0, 0]]))


In [None]:
# Drop the dynamic-axis entries of the two pipeline outputs that are not exported.
# `pop(..., None)` keeps the cell idempotent: re-running it after the keys are
# gone does not raise KeyError (a bare `del` would).
dynamic_axes.pop("output_0", None)  # Delete unused output
dynamic_axes.pop("output_1", None)  # Delete unused output

# The exported graph exposes a single output, the pooled sentence embedding,
# whose only dynamic dimension is the batch axis.
output_names = ["sentence_embedding"]
dynamic_axes["sentence_embedding"] = {0: 'batch'}

# Check that everything worked
pprint(output_names)
pprint(dynamic_axes)

['sentence_embedding']
{   'attention_mask': {0: 'batch', 1: 'sequence'},
    'input_ids': {0: 'batch', 1: 'sequence'},
    'sentence_embedding': {0: 'batch'},
    'token_type_ids': {0: 'batch', 1: 'sequence'}}


In [None]:
class SentenceTransformer(transformers.BertModel):
  """BERT encoder followed by attention-masked mean pooling.

  The forward pass returns one pooled vector per input sequence instead of the
  usual BERT outputs, so the exported graph has a single output.

  NOTE: this class reuses the name of `sentence_transformers.SentenceTransformer`
  imported at the top of the notebook. Once this cell has run, that name refers
  to this class and the imported one is no longer reachable through it.
  """

  def __init__(self, config):
    super().__init__(config)

    # Identity layer applied to the pooled vector; it leaves the values unchanged.
    self.sentence_embedding = torch.nn.Identity()

  def forward(self, input_ids, token_type_ids, attention_mask):
    """Return the mean of the token embeddings, ignoring masked positions.

    Args:
      input_ids: token ids passed to the underlying BERT forward.
      token_type_ids: segment ids passed to the underlying BERT forward.
      attention_mask: mask passed to BERT and reused as pooling weights
        (non-zero entries take part in the mean).

    Returns:
      The token embeddings summed over dimension 1 and divided by the
      corresponding mask sum.
    """
    # First element of the BERT output: the per-token embeddings.
    token_embeddings = super().forward(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids
    )[0]
    # Broadcast the mask to the embedding shape and cast it to float so that the
    # clamp below operates on a floating-point tensor (the 1e-9 floor is
    # meaningless for an integer mask).
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Floor the denominator to avoid dividing by zero for an all-zero mask.
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return self.sentence_embedding(sum_embeddings / sum_mask)

In [None]:
# `from_pretrained` is a classmethod, so call it on the class itself.
# Instantiating `SentenceTransformer(config=config)` first would build a complete,
# randomly initialised model only to throw it away.
model = SentenceTransformer.from_pretrained(model_access)

In [None]:
# Sanity check: the custom mean-pooling model must reproduce the embedding that
# the reference sentence-transformers model returns for `span`
# (absolute tolerance 1e-6).
assert np.allclose(
    model_raw.encode(span),
    # squeeze() removes size-1 dimensions; detach().numpy() converts the tensor to NumPy.
    model(**tokenizer(span, return_tensors="pt")).squeeze().detach().numpy(),
    atol=1e-6,
)

In [None]:
# The ONNX file is written to `<model_name>/<model_name>.onnx`.
outdir = Path(model_name)
output = outdir / f"{model_name}.onnx"
outdir.mkdir(parents=True, exist_ok=True)

# Export only once; an existing file is reused as-is.
if output.exists():
    print(f"Model {model_name} exists. Skipping creation")
else:
    print(f"Saving to {output}")
    # This is essentially a copy of transformers.convert_graph_to_onnx.convert
    torch.onnx.export(
        model,
        # Example inputs are passed positionally, so they must follow the order of
        # the model's `forward(input_ids, token_type_ids, attention_mask)`, which
        # is also the order of `input_names`. `model_args` is ordered
        # (input_ids, attention_mask, token_type_ids) and would swap the mask and
        # the token types during tracing, so the tuple is rebuilt from `tokens`.
        tuple(tokens[name] for name in input_names),
        f=output.as_posix(),
        input_names=input_names,
        output_names=output_names,
        dynamic_axes=dynamic_axes,
        do_constant_folding=True,
        # `use_external_data_format` and `enable_onnx_checker` are intentionally
        # omitted: they were passed with their default values and no longer exist
        # in recent PyTorch releases.
        opset_version=12,
    )

Saving to paraphrase-multilingual-MiniLM-L12-v2/paraphrase-multilingual-MiniLM-L12-v2.onnx




In [None]:
!pip install onnx

Collecting onnx
  Downloading onnx-1.11.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 14.9 MB/s 
Installing collected packages: onnx
Successfully installed onnx-1.11.0


In [None]:
import onnx as onnx_orig

In [None]:
# Load the exported file with the `onnx` package and run its model checker;
# `check_model` raises if the model is invalid, so reaching the print means it passed.
onnx_model = onnx_orig.load(output)
onnx_orig.checker.check_model(onnx_model)
print('The model is checked!')

The model is checked!


### Quantization and Model Optimization

In [None]:
def quantize_onnx_model(onnx_model_path, quantized_model_path):
    """Write a dynamically quantized copy of an ONNX model.

    Args:
        onnx_model_path: path of the ONNX model to quantize.
        quantized_model_path: path the quantized model is written to.

    Returns:
        None. The quantized model is written to `quantized_model_path` and a
        confirmation line is printed.
    """
    # Function-scope import: keeps this ONNX Runtime `quantize_dynamic` local.
    # The notebook also imports a different `quantize_dynamic` from
    # `torch.quantization` at cell level.
    from onnxruntime.quantization import quantize_dynamic, QuantType

    # The quantizer reads the model from disk itself, so the model is not loaded
    # here beforehand (the previous extra `onnx.load` result was never used).
    quantize_dynamic(onnx_model_path,
                     quantized_model_path,
                     weight_type=QuantType.QInt8)

    print(f"quantized model saved to:{quantized_model_path}")

# Quantize the exported model; the result is written to 'bert.quant.onnx',
# a path relative to the current working directory.
quantize_onnx_model(output, 'bert.quant.onnx')

Ignore MatMul due to non constant B: /[MatMul_70]
Ignore MatMul due to non constant B: /[MatMul_75]
Ignore MatMul due to non constant B: /[MatMul_164]
Ignore MatMul due to non constant B: /[MatMul_169]
Ignore MatMul due to non constant B: /[MatMul_258]
Ignore MatMul due to non constant B: /[MatMul_263]
Ignore MatMul due to non constant B: /[MatMul_352]
Ignore MatMul due to non constant B: /[MatMul_357]
Ignore MatMul due to non constant B: /[MatMul_446]
Ignore MatMul due to non constant B: /[MatMul_451]
Ignore MatMul due to non constant B: /[MatMul_540]
Ignore MatMul due to non constant B: /[MatMul_545]
Ignore MatMul due to non constant B: /[MatMul_634]
Ignore MatMul due to non constant B: /[MatMul_639]
Ignore MatMul due to non constant B: /[MatMul_728]
Ignore MatMul due to non constant B: /[MatMul_733]
Ignore MatMul due to non constant B: /[MatMul_822]
Ignore MatMul due to non constant B: /[MatMul_827]
Ignore MatMul due to non constant B: /[MatMul_916]
Ignore MatMul due to non constant

# Inference Runtime

In [None]:
!pip install onnxruntime



In [None]:
from onnxruntime import InferenceSession, SessionOptions

In [None]:
# ONNX Runtime session for the exported (non-quantized) model, created with an
# unmodified `SessionOptions` object. `opt` is reused for the quantized session.
opt = rt.SessionOptions()
sess = rt.InferenceSession(str(output), opt) # Loads the model

In [None]:
# Load the quantized model from the same relative path it was written to
# ('bert.quant.onnx') instead of a Colab-specific absolute path, so the cell also
# works when the working directory is not /content.
q_output = 'bert.quant.onnx'
sess_qt = rt.InferenceSession(str(q_output), opt) # Loads the model

In [None]:
# Tokenize the example sentence and turn every field into a 2-D array
# (adds the batch dimension) keyed by input name, as expected by `sess.run`.
model_input = tokenizer.encode_plus(span)
model_input = {name : np.atleast_2d(value) for name, value in model_input.items()}
# `None` requests all outputs of the ONNX graph.
onnx_result = sess.run(None, model_input)

# The ONNX output must match both the reference model and the custom PyTorch model.
# NOTE(review): the recorded output of this cell is a NameError, i.e. one of the
# names used here was not defined in the kernel when it was last run — every name
# is defined by an earlier cell, so run the notebook top to bottom.
assert np.allclose(model_raw.encode(span), onnx_result, atol=1e-6)
assert np.allclose(model(**tokenizer(span, return_tensors="pt")).squeeze().detach().numpy(), onnx_result, atol=1e-6)

NameError: ignored

In [None]:
# Tokenize the example sentence again, keeping the tokenizer's native return object.
model_input = tokenizer(span)

In [None]:
# Inspect the type returned by the tokenizer call.
type(model_input)

transformers.tokenization_utils_base.BatchEncoding

In [None]:
%%timeit -n 200
# Timed statement: end-to-end encoding of `span` with the reference model.
model_raw.encode(span)

200 loops, best of 5: 34.9 ms per loop


In [None]:
%%timeit -n 200
# Timed statements: tokenization plus one ONNX Runtime inference for `span`.
# The cell runs under %%timeit, so the names assigned here stay local to the
# timing run.
encoded_span = tokenizer.encode_plus(span)
onnx_inputs = {
    field: np.atleast_2d(ids)  # add the batch dimension
    for field, ids in encoded_span.items()
}
timed_result = sess.run(None, onnx_inputs)

200 loops, best of 5: 18.2 ms per loop


## Testing a large number of sentences

In [None]:
import pandas as pd

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (uses the `google.colab` helper imported
# above); the dataset path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the pickled sentence dataset on the mounted Drive.
# NOTE: despite the `_df` suffix this variable holds a path string, not a DataFrame.
java_cvs_df = ('/content/drive/MyDrive/artemis-raw-cvs'
              '/sentence-embedding/java-senten-dataset-long-nltk.pkl')

In [None]:
# Load the dataset. SECURITY: unpickling can execute arbitrary code, so only
# load pickle files from a trusted source.
java_sentences_df = pd.read_pickle(java_cvs_df)
# Preview the first rows.
# NOTE: the recorded preview contains personal data (names taken from CVs);
# clear or redact the output before sharing this notebook.
java_sentences_df.head(5)

Unnamed: 0,filename,raw-content,file-language,norm-content,clean-content,senten-content,senten-content-nltk
0,Resume_Daniel_Jorge,Microsoft Word - Resume Daniel Jorge.docx\n\n\...,en,Microsoft Word - Resume Daniel Jorge.docx\n\n\...,Microsoft Word Resume Daniel Daniel Jorge de S...,[Microsoft Word Resume Daniel Daniel Jorge de ...,[Microsoft Word Resume Daniel Daniel Jorge de ...
1,Resume_JoseFabian,Copy of Resume - Jose Fabian\n\n\nJose Fabian\...,en,Copy of Resume - Jose Fabian\n\n\nJose Fabian\...,Copy of Resume Jose Fabian Jose Fabian 7+ year...,[Copy of Resume Jose Fabian Jose Fabian 7+ yea...,[Copy of Resume Jose Fabian Jose Fabian 7+ yea...
2,SMBS_Francisco_Briceno,Francisco Briceno\n\n\n \nFRANCISCO \nBRICEÑO ...,es,Francisco Briceno\n\n\n \nFRANCISCO \nBRICEÑO...,Francisco Briceno FRANCISCO BRICEÑO Java Deve...,[Francisco Briceno FRANCISCO BRICEÑO Java Dev...,[Francisco Briceno FRANCISCO BRICEÑO Java Dev...
3,SAULPONCE_RESUME,SAUL VLADIMIR PONCE OSORIO\nResidencial Bosque...,en,SAUL VLADIMIR PONCE OSORIO\nResidencial Bosque...,SAUL VLADIMIR PONCE OSORIO Residencial Bosques...,[SAUL VLADIMIR PONCE OSORIO Residencial Bosque...,[SAUL VLADIMIR PONCE OSORIO Residencial Bosque...
4,Curriculum_Vitae_-_Jose_Navarrete_en_ingles,"José Ignacio navARRETE ARMIJOS, 24 Years\n\n\n...",en,"José Ignacio navARRETE ARMIJOS, 24 Years\n\n\...","José Ignacio navARRETE ARMIJOS, 24 Years +593...","[José Ignacio navARRETE ARMIJOS, 24 Years +59...","[José Ignacio navARRETE ARMIJOS, 24 Years +59..."


### Separating sentences to make a list with all of them

In [None]:
def separating_sentences_with_cv(cv_sentences, column='senten-content'):
  """Pair every sentence of one CV row with the label of that row.

  Intended for `DataFrame.apply(..., axis=1)`, where each row is passed in turn.

  Args:
    cv_sentences: one row; it must expose a `.name` attribute (the row label)
      and support item access by column name.
    column: name of the column holding the iterable of sentences. Defaults to
      'senten-content'.

  Returns:
    A list of `(cv_sentences.name, sentence)` tuples, one per sentence, in the
    order the sentences appear in the column.
  """
  row_label = cv_sentences.name
  return [(row_label, sentence) for sentence in cv_sentences[column]]

In [None]:
# Apply the helper row by row (axis=1): the result holds, for each CV, the list
# of (row label, sentence) pairs produced for it.
java_sentences_list = java_sentences_df.apply(
  separating_sentences_with_cv, axis=1
)

# Number of entries in the result (one per row of the DataFrame).
len(java_sentences_list)

171

In [None]:
# Flatten the per-CV lists into a single list of (cv label, sentence) pairs.
java_sentences_onelist = [
    (cv_label, sentence_text)
    for cv_pairs in java_sentences_list
    for cv_label, sentence_text in cv_pairs
]
# Display the total number of pairs and the size of one pair.
len(java_sentences_onelist), len(java_sentences_onelist[1])

(6542, 2)

In [None]:
# Flatten again, this time keeping only the sentence text of every pair.
java_sentences_onlylist = [
    sentence_text
    for cv_pairs in java_sentences_list
    for _cv_label, sentence_text in cv_pairs
]
# Display the total number of sentences.
len(java_sentences_onlylist)

6542

In [None]:
# Spot-check one sentence from the flattened list.
java_sentences_onlylist[10]

'It was designed for safety of owners using biometric control and registration of guest and owners.'

### Producing embeddings

In [None]:
def _text_length(text):
  """
  Return a length measure for `text`, used to order inputs by size.

  - dict: the length of its first value.
  - object without `__len__`: 1.
  - empty sequence, or sequence whose first item is an int: `len(text)`.
  - anything else (e.g. a string or a sequence of strings/lists): the sum of
    the lengths of its items.
  """
  # Mapping input: measure the first value only.
  if isinstance(text, dict):
    first_value = next(iter(text.values()))
    return len(first_value)

  # Unsized objects count as a single unit.
  if not hasattr(text, '__len__'):
    return 1

  size = len(text)
  # Empty input or a flat list of ints: the container length is the answer.
  if size == 0 or isinstance(text[0], int):
    return size

  # Nested/sized items: add up the individual lengths.
  return sum(len(item) for item in text)

In [None]:
# Order the sentences from longest to shortest: argsort over the negated lengths.
# `length_sorted_idx[i]` is the position in `java_sentences_onlylist` of the i-th
# entry of `sentences_sorted`, so anything computed from `sentences_sorted` is in
# this sorted order, not in the original one.
length_sorted_idx = np.argsort([-_text_length(sen) for sen in java_sentences_onlylist])
sentences_sorted = [java_sentences_onlylist[idx] for idx in length_sorted_idx]

In [None]:
# Tokenize the whole sorted corpus in one call: pad to a common length, truncate
# to at most 128 tokens and return NumPy arrays.
tokens_java = tokenizer(
    sentences_sorted,
    padding=True,
    truncation='longest_first',
    return_tensors="np",
    max_length=128
)

In [None]:
# Number of tokenized sequences (one per sentence).
len(tokens_java['input_ids'])

6542

In [None]:
# Embed the length-sorted sentences with the ONNX model, 32 sentences at a time.
batch_size = 32

# Collected embeddings, one array per sentence, in the order of `sentences_sorted`.
all_embedings = []

# The loop is bounded by the list that is actually sliced below, so the batches
# always cover exactly `sentences_sorted`.
for start_index in range(0, len(sentences_sorted), batch_size):
  sentences_batch = sentences_sorted[start_index:start_index+batch_size]
  # Pad within the batch and truncate to at most 128 tokens; NumPy output feeds
  # ONNX Runtime directly.
  tokens_batch_java = tokenizer(
    sentences_batch,
    padding=True,
    truncation='longest_first',
    return_tensors="np",
    max_length=128
  )
  model_input = {name : np.atleast_2d(value) for name, value in tokens_batch_java.items()}
  # `None` requests all graph outputs; the first one is taken as the batch embeddings.
  outputs = sess.run(None, model_input)[0]

  all_embedings.extend(outputs)

In [None]:
# Embed the length-sorted sentences with the quantized ONNX model, 32 at a time.
batch_size = 32

# Collected embeddings, one array per sentence, in the order of `sentences_sorted`.
all_embedings_q = []

# The loop is bounded by the list that is actually sliced below, so the batches
# always cover exactly `sentences_sorted`.
for start_index in range(0, len(sentences_sorted), batch_size):
  sentences_batch = sentences_sorted[start_index:start_index+batch_size]
  # Pad within the batch and truncate to at most 128 tokens; NumPy output feeds
  # ONNX Runtime directly.
  tokens_batch_java = tokenizer(
    sentences_batch,
    padding=True,
    truncation='longest_first',
    return_tensors="np",
    max_length=128
  )
  model_input = {name : np.atleast_2d(value) for name, value in tokens_batch_java.items()}
  # `None` requests all graph outputs; the first one is taken as the batch embeddings.
  outputs = sess_qt.run(None, model_input)[0]

  all_embedings_q.extend(outputs)

In [None]:
# Number of embeddings produced by the quantized session (one per sentence).
len(all_embedings_q)

6542

In [None]:
# Shape of a single embedding vector.
all_embedings_q[0].shape

(384,)

In [None]:
# Reference embeddings for the length-sorted sentences (same order as the ONNX runs).
embeds_sorted_raw = model_raw.encode(sentences_sorted)

In [None]:
# Reference embeddings for the sentences in their original (unsorted) order.
embeds_raw = model_raw.encode(java_sentences_onlylist)

In [None]:
import torch

In [None]:
from torch.nn import Embedding, Linear
from torch.quantization import quantize_dynamic
import os

In [None]:
# Dynamically quantize the reference model with PyTorch (`quantize_dynamic` here
# is the one imported from `torch.quantization`). The set lists the module types
# requested for quantization; weights are stored as qint8.
# NOTE(review): the size comparison recorded further down shows only a ~1.16x
# reduction, which suggests the large embedding table may not actually be
# quantized — confirm.
q_model_f = quantize_dynamic(
    model_raw,
    {Linear, Embedding},
    dtype=torch.qint8
)

In [None]:
def print_size_of_model(model, label=""):
    """Print and return the on-disk size of a model's serialized state dict.

    Args:
        model: object exposing `state_dict()`; the state dict is serialized
            with `torch.save`.
        label: free-text tag included in the printed line.

    Returns:
        Size in bytes of the serialized state dict.
    """
    # Function-scope import so the cell does not depend on another cell for it.
    import tempfile

    # Serialize into a private temporary directory: nothing named "temp.p" in the
    # working directory can be overwritten, and the file is removed even if
    # saving or measuring fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        checkpoint_path = os.path.join(tmp_dir, "temp.p")
        torch.save(model.state_dict(), checkpoint_path)
        size = os.path.getsize(checkpoint_path)
    print("model: ",label,' \t','Size (KB):', size/1e3)
    return size

In [None]:
# Compare the serialized sizes of the original and the quantized model.
# `f` and `q` hold the sizes in bytes; their ratio is reported with two decimals.
f=print_size_of_model(model_raw,"fp32")
q=print_size_of_model(q_model_f,"int8")
print("{0:.2f} times smaller".format(f/q))

model:  fp32  	 Size (KB): 470700.081
model:  int8  	 Size (KB): 406615.693
1.16 times smaller


In [None]:
# Encode every sentence with the dynamically quantized PyTorch model, showing a
# progress bar and requesting the result as a tensor.
sentence_embeddings_java_list = q_model_f.encode(
    java_sentences_onlylist,
    show_progress_bar=True,
    convert_to_tensor=True
)

Batches:   0%|          | 0/205 [00:00<?, ?it/s]

In [None]:
# Same encoding with the quantized model, this time with the default `encode` options.
sentence_embeddings_java_list_f = q_model_f.encode(
    java_sentences_onlylist,
)

In [None]:
pip install scikit-learn

