In [None]:
# default_exp sentence_transformers

# Sentence Transformers

> This module implements and tests text representations built with transformers:
>
> - BERT
> 
>
> Maintained by @danaderp

In [None]:
# export
# Imports
import numpy as np
from gensim import corpora

In [None]:
#! pip install -U sentence-transformers
#! pip install --upgrade pip
#!pip install transformers
#!pip install tensorflow
#!pip install grpcio

Requirement already up-to-date: sentence-transformers in /usr/local/lib/python3.6/dist-packages (0.3.7.1)


In [None]:
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.models import KeyedVectors

In [None]:
import gensim.downloader as api

In [None]:
# export
# Imports
import gensim
import pandas as pd
from itertools import product 
from random import sample 
import functools 
import os



In [None]:
import tensorflow as tf

In [None]:
hello = tf.constant("hello TensorFlow!")

# Dummy Example

In [None]:
from sentence_transformers import SentenceTransformer

AttributeError: module 'tensorflow.python.keras.api._v2.keras.activations' has no attribute 'swish'

In [None]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

## Soft Cosine Measure basics
Soft Cosine Measure (SCM) is a method that allows us to assess the similarity between two documents in a meaningful way, even when they have no words in common. It uses a measure of similarity between words, which can be derived [2] using word2vec [4] vector embeddings of words. It has been shown to outperform many of the state-of-the-art methods in the semantic text similarity task in the context of community question answering [2].

In [None]:
# Initialize logging.
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

To use SCM, we first need word embeddings; below we load them from a previously trained Word2Vec model (`path_to_trained_model` in the parameters).

In [None]:
def default_params():
    """Build the default configuration dictionary for the 'libest' system.

    Returns:
        dict: a newly created mapping on every call, holding the keys
        "system", "path_to_trained_model", "source_path", "target_path",
        "system_path", "saving_path" and "names" (in that order).
    """
    settings = {}
    settings["system"] = 'libest'
    # Location of the Word2Vec model file loaded later in the notebook.
    settings["path_to_trained_model"] = 'test_data/models/word2vec_libest.model'
    # CSV testbed files; these are absolute paths in the original environment.
    settings["source_path"] = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-req].csv'
    settings["target_path"] = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-tc].csv'
    settings["system_path"] = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-all].csv'
    settings["saving_path"] = 'test_data/'
    settings["names"] = ['Source','Target','Linked?']
    return settings

In [None]:
params = default_params()
params

{'system': 'libest',
 'path_to_trained_model': 'test_data/models/word2vec_libest.model',
 'source_path': '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-req].csv',
 'target_path': '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-tc].csv',
 'system_path': '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-all].csv',
 'saving_path': 'test_data/',
 'names': ['Source', 'Target', 'Linked?']}

In [None]:
# Source and target artifacts are read with the same layout:
# no header row, space-separated, two columns named 'ids' and 'text'.
df_source, df_target = (
    pd.read_csv(params[path_key], names=['ids', 'text'], header=None, sep=' ')
    for path_key in ('source_path', 'target_path')
)

In [None]:
df_corpus = pd.read_csv(params['system_path'], names=['ids', 'text'], header=0, sep=',')

In [None]:
# Tokenize every corpus text on whitespace; each document becomes a list of tokens.
documents = [corpus_text.split() for corpus_text in df_corpus['text']]
# Display the tokens of the first document.
documents[0]

['unit',
 'test',
 'user',
 'stori',
 'server',
 'simpl',
 'enrol',
 'august',
 'copyright',
 'cisco',
 'system',
 'inc',
 'right',
 'reserv',
 'includ',
 'stdio',
 'ifndef',
 'win',
 'includ',
 'unistd',
 'endif',
 'includ',
 'est',
 'includ',
 'curl',
 'curl',
 'includ',
 'curl',
 'util',
 'includ',
 'test',
 'util',
 'includ',
 'server',
 'includ',
 'openssl',
 'ssl',
 'ifdef',
 'cunit',
 'includ',
 'cunit',
 'basic',
 'includ',
 'cunit',
 'autom',
 'endif',
 'ifndef',
 'win',
 'static',
 'char',
 'test',
 'outfil',
 'filenam',
 'max',
 'test',
 'hdr',
 'defin',
 'cacert',
 'est',
 'cacert',
 'crt',
 'defin',
 'explicit_cert',
 'us903',
 'cert',
 'pem',
 'defin',
 'us903_explicit_key',
 'us903',
 'key',
 'pem',
 'defin',
 'us903_cacert',
 'est',
 'cacert',
 'crt',
 'defin',
 'us903_trusted_cert',
 'trustedcert',
 'crt',
 'defin',
 'est',
 'privat',
 'estservercertandkey',
 'pem',
 'els',
 'static',
 'char',
 'test5_outfil',
 'filename_max',
 'us903',
 'test5',
 'hdr',
 'defin',
 'us

In [None]:
documents[80]

['libest',
 'test',
 'api',
 'void',
 'est',
 'log',
 'est',
 'log',
 'level',
 'lvl',
 'char',
 'format',
 'libest',
 'test',
 'api',
 'void',
 'est',
 'log',
 'backtrac',
 'void',
 'est',
 'char',
 'est',
 'get',
 'tls',
 'uid',
 'ssl',
 'ssl',
 'int',
 'client',
 'est',
 'libest',
 'test',
 'api',
 'est',
 'error',
 'est',
 'load',
 'cert',
 'est',
 'ctx',
 'ctx',
 'unsign',
 'char',
 'raw',
 'int',
 'size',
 'est',
 'libest',
 'test',
 'api',
 'est',
 'error',
 'est',
 'load',
 'trust',
 'cert',
 'est',
 'ctx',
 'ctx',
 'unsign',
 'char',
 'cert',
 'int',
 'cert',
 'len',
 'est',
 'void',
 'est',
 'log',
 'est',
 'log',
 'level',
 'lvl',
 'char',
 'format',
 'est',
 'libest',
 'test',
 'api',
 'void',
 'est',
 'log',
 'version',
 'void',
 'est',
 'void',
 'est',
 'hex',
 'str',
 'char',
 'dst',
 'unsign',
 'char',
 'src',
 'int',
 'len',
 'est',
 'int',
 'est',
 'base',
 'encod',
 'const',
 'char',
 'src',
 'int',
 'actual',
 'src',
 'len',
 'char',
 'dst',
 'int',
 'max',
 'dst',


In [None]:
len(df_source['text'].values[0].split())

255

In [None]:
# This data is already preprocessed, so each text is only split on whitespace.
# sentence_1 and sentence_2 are both taken from source row 0; sentence_3 from row 1.
sentence_1, sentence_2, sentence_3 = (
    df_source['text'].values[row].split() for row in (0, 0, 1)
)

In [None]:
sentence_1

['requir',
 'http',
 'uri',
 'control',
 'est',
 'server',
 'must',
 'support',
 'use',
 'path',
 'prefix',
 'well',
 'known',
 'defin',
 'rfc',
 'regist',
 'name',
 'est',
 'thus',
 'valid',
 'est',
 'server',
 'uri',
 'path',
 'begin',
 'https',
 'www',
 'exampl',
 'com',
 'well',
 'known',
 'est',
 'est',
 'oper',
 'indic',
 'path',
 'suffix',
 'indic',
 'intend',
 'oper',
 'oper',
 'correspond',
 'uri',
 'oper',
 'oper',
 'path',
 'detail',
 'distribut',
 'cacert',
 'section',
 'certif',
 'must',
 'enrol',
 'simpleenrol',
 'section',
 'client',
 'must',
 'enrol',
 'simplereenrol',
 'section',
 'client',
 'must',
 'full',
 'cmc',
 'option',
 'fullcmc',
 'section',
 'server',
 'side',
 'key',
 'serverkeygen',
 'section',
 'generat',
 'option',
 'csr',
 'attribut',
 'csrattr',
 'section',
 'option',
 'figur',
 'oper',
 'path',
 'figur',
 'append',
 'path',
 'prefix',
 'form',
 'uri',
 'use',
 'http',
 'get',
 'post',
 'perform',
 'desir',
 'est',
 'oper',
 'exampl',
 'valid',
 'uri',


In [None]:
#[Step 0] After preprocessing and loading the model
new_model = gensim.models.Word2Vec.load( params['path_to_trained_model'] )

2020-06-09 20:42:44,094 : INFO : loading Word2Vec object from test_data/models/word2vec_libest.model
2020-06-09 20:42:44,201 : INFO : loading wv recursively from test_data/models/word2vec_libest.model.wv.* with mmap=None
2020-06-09 20:42:44,202 : INFO : setting ignored attribute vectors_norm to None
2020-06-09 20:42:44,202 : INFO : loading vocabulary recursively from test_data/models/word2vec_libest.model.vocabulary.* with mmap=None
2020-06-09 20:42:44,203 : INFO : loading trainables recursively from test_data/models/word2vec_libest.model.trainables.* with mmap=None
2020-06-09 20:42:44,203 : INFO : setting ignored attribute cum_table to None
2020-06-09 20:42:44,204 : INFO : loaded test_data/models/word2vec_libest.model


In [None]:
new_model.wv

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f4ee4009978>

In [None]:
#[Step 1] Prepare a dictionary and a corpus.
#documents = [sentence_1, sentence_2, sentence_3]
dictionary = corpora.Dictionary(documents)

2020-06-10 15:45:00,938 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-06-10 15:45:01,002 : INFO : built Dictionary(6957 unique tokens: ['");', '"../../', '("\\', '();', ')))']...) from 87 documents (total 88944 corpus positions)


In [None]:
dictionary[2]

'("\\'

In [None]:
#[Step 2] Convert the selected corpus documents (indices 0, 1 and 80) into bag-of-words vectors.
# https://en.wikipedia.org/wiki/Bag-of-words_model
# Note: this rebinds sentence_1, sentence_2 and sentence_3, which earlier held token lists.
sentence_1, sentence_2, sentence_3 = (
    dictionary.doc2bow(documents[doc_index]) for doc_index in (0, 1, 80)
)

In [None]:
len(sentence_1)

747

In [None]:
sentence_1[2] #One bag-of-words entry: a (token id, count) pair giving how often that token occurs in the document

(2, 2)

In [None]:
# Inspect the word vectors of the loaded Word2Vec model; these are the
# embeddings passed to WordEmbeddingSimilarityIndex in Step 3.
# (The previous name `w2v_model` is never defined in this notebook and
# raised NameError on a fresh kernel.)
new_model.wv

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f4ea7f27ac8>

In [None]:
#[Step 3] We will use the embeddings to construct a term similarity matrix that will be used by the inner_product method.
#WordEmbeddingSimilarityIndex Computes cosine similarities between word embeddings and retrieves the closest word embeddings by cosine similarity 
#for a given word embedding.
similarity_index = WordEmbeddingSimilarityIndex(new_model.wv)
#Build a term similarity matrix and compute the Soft Cosine Measure.
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)

2020-06-10 15:46:36,910 : INFO : constructing a sparse term similarity matrix using <gensim.models.keyedvectors.WordEmbeddingSimilarityIndex object at 0x7f4e989a3fd0>
2020-06-10 15:46:36,912 : INFO : iterating over columns in dictionary order
2020-06-10 15:46:36,922 : INFO : PROGRESS: at 0.01% columns (1 / 6957, 0.014374% density, 0.014374% projected density)
2020-06-10 15:46:40,690 : INFO : PROGRESS: at 14.39% columns (1001 / 6957, 0.315672% density, 2.108413% projected density)
2020-06-10 15:46:43,414 : INFO : PROGRESS: at 28.76% columns (2001 / 6957, 0.524049% density, 1.786393% projected density)
2020-06-10 15:46:45,194 : INFO : PROGRESS: at 43.14% columns (3001 / 6957, 0.644822% density, 1.475896% projected density)
2020-06-10 15:46:46,851 : INFO : PROGRESS: at 57.51% columns (4001 / 6957, 0.754211% density, 1.300814% projected density)
2020-06-10 15:46:47,933 : INFO : PROGRESS: at 71.88% columns (5001 / 6957, 0.806662% density, 1.116543% projected density)
2020-06-10 15:46:49,278

In [None]:
#[Step 4]Let's compute SCM using the inner_product method.
#Return the inner product(s) between real vectors / corpora vec1 and vec2 expressed in a non-orthogonal normalized basis,
#where the dot product between the basis vectors is given by the sparse term similarity matrix.
scm_similarity = similarity_matrix.inner_product(sentence_1, sentence_2, normalized=True)
print('similarity = %.4f' % scm_similarity)

similarity = 0.6976


In [None]:
#Computing the similarity of unrelated sentences
scm_similarity = similarity_matrix.inner_product(sentence_1, sentence_3, normalized=True)
print('similarity = %.4f' % scm_similarity)

similarity = 0.4969


In [None]:
! nbdev_build_docs