In [0]:
!pip install sentence_transformers

Collecting sentence_transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b9/46/b7d6c37d92d1bd65319220beabe4df845434930e3f30e42d3cfaecb74dc4/sentence-transformers-0.2.6.1.tar.gz (55kB)
[K     |████████████████████████████████| 61kB 4.1MB/s 
[?25hCollecting transformers>=2.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/12/b5/ac41e3e95205ebf53439e4dd087c58e9fd371fd8e3724f2b9b4cdb8282e5/transformers-2.10.0-py3-none-any.whl (660kB)
[K     |████████████████████████████████| 665kB 12.9MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 23.7MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.spatial
import tensorflow_hub as hub
from nltk.metrics import precision, recall, f_measure
from sentence_transformers import SentenceTransformer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


### Examples

In [0]:
# Two closely worded sentences used as a first bag-of-words example.
sentence_a = 'Iterations can group work packages with the same due dates.'
sentence_b = 'Iterations can group your work packages by due dates.'
corpus = [sentence_a, sentence_b]

# Fit the vocabulary on both sentences and build their token-count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [0]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the count vectors of the two sentences (rows 0 and 1 of X).
bow_similarity = cosine_similarity(X[0], X[1])
print(bow_similarity)

[[0.73786479]]


In [0]:
from sklearn.metrics.pairwise import cosine_similarity

# Two sentences about the file information reported for a symbolic link.
sentence_a = 'Causes the file information and file type evaluated for each symbolic link to be those of the file referenced by the link, and not the link itself. See NOTES.'
sentence_b = 'when showing file information for a symbolic link, show information  for  the file the link references rather than for the link itself.'
corpus = [sentence_a, sentence_b]

# Token-count vectors for both sentences.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Bag-of-words cosine similarity of the pair.
bow_similarity = cosine_similarity(X[0], X[1])
print(bow_similarity)

[[0.6991387]]


In [0]:
# Universal Sentence Encoder v4 from TF Hub; the loaded object is called directly on a list of strings.
embedder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

corpus = ['Causes the file information and file type evaluated for each symbolic link to be those of the file referenced by the link, and not the link itself. See NOTES.']
corpus_embeddings = embedder(corpus)

queries = ['when showing file information for a symbolic link, show information  for  the file the link references rather than for the link itself.']
query_embeddings = embedder(queries)

# cdist yields cosine distance, so `distances` ends up holding similarity (1 - distance).
for query, query_embedding in zip(queries, query_embeddings):
    cosine_dist = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")
    distances = 1 - cosine_dist[0]
# Printed after the loop: only the last query's scores are shown (there is a single query here).
print(distances)

[0.72122288]


In [0]:
def cos_sim(input_vectors):
    """Return the cosine-similarity matrix of ``input_vectors``.

    Thin wrapper that passes ``input_vectors`` as the only argument to
    sklearn's ``cosine_similarity`` and returns its result unchanged.
    """
    return cosine_similarity(input_vectors)

In [0]:
# Both symbolic-link sentences go into one corpus so a single model call embeds them together.
sentence_a = 'Causes the file information and file type evaluated for each symbolic link to be those of the file referenced by the link, and not the link itself. See NOTES.'
sentence_b = 'when showing file information for a symbolic link, show information  for  the file the link references rather than for the link itself.'
corpus = [sentence_a, sentence_b]

# Universal Sentence Encoder v4 from TF Hub, applied to the whole corpus.
embedder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
corpus_embeddings = embedder(corpus)


In [0]:
# Convert the embeddings to a NumPy array before handing them to cos_sim.
embedding_matrix = np.array(corpus_embeddings)
similarity_matrix = cos_sim(embedding_matrix)
# Bare last expression so the notebook displays the matrix.
similarity_matrix

array([[0.9999999 , 0.72122276],
       [0.72122276, 0.99999994]], dtype=float32)

In [0]:
# Score the symbolic-link sentence pair with the 'roberta-large-nli-stsb-mean-tokens' model.
embedder = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

corpus = ['Causes the file information and file type evaluated for each symbolic link to be those of the file referenced by the link, and not the link itself. See NOTES.']
corpus_embeddings = embedder.encode(corpus)

queries = ['when showing file information for a symbolic link, show information  for  the file the link references rather than for the link itself.']
query_embeddings = embedder.encode(queries)

# cdist yields cosine distance, so `distances` ends up holding similarity (1 - distance).
for query, query_embedding in zip(queries, query_embeddings):
    cosine_dist = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")
    distances = 1 - cosine_dist[0]
# Printed after the loop: only the last query's scores are shown (there is a single query here).
print(distances)

[0.82735966]


In [0]:
# Score the symbolic-link sentence pair with the 'bert-large-nli-stsb-mean-tokens' model.
embedder = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

corpus = ['Causes the file information and file type evaluated for each symbolic link to be those of the file referenced by the link, and not the link itself. See NOTES.']
corpus_embeddings = embedder.encode(corpus)

queries = ['when showing file information for a symbolic link, show information  for  the file the link references rather than for the link itself.']
query_embeddings = embedder.encode(queries)

# cdist yields cosine distance, so `distances` ends up holding similarity (1 - distance).
for query, query_embedding in zip(queries, query_embeddings):
    cosine_dist = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")
    distances = 1 - cosine_dist[0]
# Printed after the loop: only the last query's scores are shown (there is a single query here).
print(distances)

[0.85496401]


In [0]:
# Load the RoBERTa sentence-embedding model once; the following cells reuse `embedder`.
ROBERTA_MODEL_NAME = 'roberta-large-nli-stsb-mean-tokens'
embedder = SentenceTransformer(ROBERTA_MODEL_NAME)

In [0]:
# Encode the reference sentence ...
corpus = [
    'Iterations can group work packages with the same due dates.',
]
corpus_embeddings = embedder.encode(corpus)

# ... and the sentence it is compared against.
queries = [
    'Iterations can group your work packages by due dates.',
]
query_embeddings = embedder.encode(queries)


In [0]:
# Score every query against the corpus; cdist yields cosine distance, so 1 - distance
# is the similarity stored in `distances` (overwritten on each iteration).
for query, query_embedding in zip(queries, query_embeddings):
    cosine_dist = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")
    distances = 1 - cosine_dist[0]

In [0]:
# Show the similarity scores left in `distances` by the preceding loop cell.
print(distances)

[0.87100628]


In [0]:
# Bag-of-words baseline for two sentences that differ only in their opening clause.
sentence_a = 'Adds to errors if this method: is not public, or takes parameters, or returns something other than void, or is static (given isStatic is false), or is not static (given isStatic is true).'
sentence_b = 'Adds to errors if any method in this class is annotated with annotation, but: is not public, or takes parameters, or returns something other than void, or is static (given isStatic is false), or is not static (given isStatic is true).'
corpus = [sentence_a, sentence_b]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

bow_similarity = cosine_similarity(X[0], X[1])
print(bow_similarity)

[[0.95700119]]


In [0]:
# Score the "Adds to errors ..." sentence pair with the 'roberta-large-nli-stsb-mean-tokens' model.
embedder = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

corpus = ['Adds to errors if this method: is not public, or takes parameters, or returns something other than void, or is static (given isStatic is false), or is not static (given isStatic is true).']
corpus_embeddings = embedder.encode(corpus)

queries = ['Adds to errors if any method in this class is annotated with annotation, but: is not public, or takes parameters, or returns something other than void, or is static (given isStatic is false), or is not static (given isStatic is true).']
query_embeddings = embedder.encode(queries)

# cdist yields cosine distance, so `distances` ends up holding similarity (1 - distance).
for query, query_embedding in zip(queries, query_embeddings):
    cosine_dist = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")
    distances = 1 - cosine_dist[0]
print(distances)
# Scores for this pair: BERT 0.882, CountVectorizer 0.957, RoBERTa 0.891

[0.89121865]


In [0]:
# Bag-of-words baseline for two multi-sentence help passages about iterations.
passage_a = 'Adding work packages to an iteration.Iterations can group work packages with the same due dates. At least one iteration must exist. Group all work packages due for the same milestone into one iteration. In My Products, expand the desired product. Click the desired release. Click Iterations. Select the box around the iteration to which the work package belongs. Work packages that are not yet added to an iteration are shown in Unscheduled. All work packages in the selected iteration appear. Drag and drop the work package into the desired iteration. The work package is now part of the selected iteration.'
passage_b = 'Adding an iteration. Iterations can group your work packages by due dates. Know the due date for your iteration. Use iterations to easily track a group of work packages with similar due dates. In My Products, expand the desired product. Click the desired release. Click Iterations. Click Create new iteration. Enter a name and description for the iteration. Set the start and end dates. Click Save. The iteration is set and ready for work packages. If you need to edit an iteration, click the iteration name. The iteration details window opens for editing.'
corpus = [passage_a, passage_b]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

bow_similarity = cosine_similarity(X[0], X[1])
print(bow_similarity)

[[0.77136617]]


In [0]:
# Universal Sentence Encoder comparison of the two long help passages.
#
# The model is loaded explicitly in this cell. In a top-to-bottom run `embedder` would still be
# the RoBERTa SentenceTransformer assigned in an earlier cell, which this notebook only ever
# uses through `.encode(...)`, whereas this cell calls `embedder(...)` directly and its recorded
# score (0.7116) is the Universal Sentence Encoder result. Loading the model here removes the
# dependence on out-of-order execution.
embedder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

corpus = ['Adding work packages to an iteration.Iterations can group work packages with the same due dates. At least one iteration must exist. Group all work packages due for the same milestone into one iteration. In My Products, expand the desired product. Click the desired release. Click Iterations. Select the box around the iteration to which the work package belongs. Work packages that are not yet added to an iteration are shown in Unscheduled. All work packages in the selected iteration appear. Drag and drop the work package into the desired iteration. The work package is now part of the selected iteration']
corpus_embeddings = embedder(corpus)
queries = ['Adding an iteration. Iterations can group your work packages by due dates. Know the due date for your iteration. Use iterations to easily track a group of work packages with similar due dates. In My Products, expand the desired product. Click the desired release. Click Iterations. Click Create new iteration. Enter a name and description for the iteration. Set the start and end dates. Click Save. The iteration is set and ready for work packages. If you need to edit an iteration, click the iteration name. The iteration details window opens for editing.']
query_embeddings = embedder(queries)

# cdist yields cosine distance, so `distances` ends up holding similarity (1 - distance).
for query, query_embedding in zip(queries, query_embeddings):
    distances = 1 - scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]
print(distances)

[0.71163601]


In [0]:
# Google (Universal Sentence Encoder) 0.71, RoBERTa 0.846, BERT 0.866