# BERT For Measuring Text Similarity


https://towardsdatascience.com/bert-for-measuring-text-similarity-eec91c6bf9e1

In [None]:
import json
import torch
import scipy
import numpy as np
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

In [None]:
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell."
]

themes = ['fish', 'dream', 'toilet', 'friend']
corpus = [" ".join(themes)] + sentences

model = SentenceTransformer('bert-base-nli-mean-tokens')
sentence_embeddings = model.encode(corpus)

cs = cosine_similarity(
    [sentence_embeddings[0]],
    sentence_embeddings[1:]
)

In [None]:
asort = np.argsort(cs)+1
asort[0][::-1]

In [None]:
corpus[2]

# MR testing

In [None]:
import json
import torch
import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

In [5]:
import json
import numpy as np
from glob import glob
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# corpi = glob("./corpus*txt")
# with open(corpi[1], 'r') as file:
#         f = json.load(file)
#         themes = [v['name'] for k, v in f[0].items()]
#         corpus = f[1:]

# for c in corpus:
#     guid = c['guid']
#     content = c['content']
#     for t in themes:
#         tc = [t] + content
#         model = SentenceTransformer('bert-base-nli-mean-tokens')
#         sentence_embeddings = model.encode(tc)
#         sims = cosine_similarity([sentence_embeddings[0]],sentence_embeddings[1:])[0]
#         asort = np.argsort(cs)+1
#         asort = asort[0][::-1]
#         for x in asort[:5]:
#             print(f"{t.upper()}:")
#             print(f"{tc[x]}")
#             #print(f"{t.upper()}: {tc[x]}")

In [6]:
corpi = glob("./corpus*txt")
with open(corpi[1], 'r') as file:
        f = json.load(file)
        themes = [v['name'] for k, v in f[0].items()]
        corpus = f[1:]

themes = " ".join(themes)
print(f"THEMES: {themes}")
print("------------------------------------------------------------------------------------------------------------------")
print()


for c in corpus:
    guid = c['guid']
    content = c['content']
    tc = [themes] + content
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    sentence_embeddings = model.encode(tc)
    sims = cosine_similarity([sentence_embeddings[0]],sentence_embeddings[1:])[0]
    asort = np.argsort(sims)+1
    asort = asort[::-1]
    for x in asort[:5]:
        print(f"{tc[x]}")
        print()

THEMES: big data chris9ne z data center data lake data management et cetera point of view public cloud use cases
------------------------------------------------------------------------------------------------------------------

A typical  example, a uAlity company; collecAng data the whole IOT big data story.

Right here  we're actually talking about taking enterprise funcAonality like the storage  system or the NAS system and bring that into the cloud.

One thing we believe that customers are interested in adopAng is the  preservaAon of the enterprise style of IT management while borrowing  resources and services from the cloud.

I  you are spinning oﬀ a media and you want some VSP capability, do it in  the cloud and when you see it actually takes oﬀ, and the load is increasing,  move it on Prim and move it even into the enterprise environment where  you have all the stuﬀ ...

You have just made the SAP HANA a cloud business which is having  a self service, orchestration, automation,

In [None]:
import numpy as np
from collections import defaultdict
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


### Critical operations for the system

def sentencesims(my_themes, corpus, n=5):
    themes = " ".join([t for t in my_themes['themes'].keys()])
    themes = " ".join(set(themes.split()))
    for c in corpus:
        guid = c['guid']
        content = c['content']
        tc = [themes] + content
        model = SentenceTransformer('bert-base-nli-mean-tokens')
        sentence_embeddings = model.encode(tc)
        sims = cosine_similarity([sentence_embeddings[0]],sentence_embeddings[1:])[0]
        asort = np.argsort(sims)+1
        asort = asort[::-1]
        for x in asort[:n]:
            print(f"GUID {guid}: {tc[x]}")
            print()
            
sentencesims(my_themes, corpus)

In [18]:
corpus[0]['content'][2]

'!'

# NLPIA

In [162]:
import copy
import nltk
import torch
import torch.nn as nn
import numpy as np
from collections import Counter, OrderedDict
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

In [39]:
def cosine_sim(vec1, vec2):
    """ Let's convert our dictionaries to lists for easier matching."""
    vec1 = [val for val in vec1.values()]
    vec2 = [val for val in vec2.values()]
    dot_prod = 0
    for i, v in enumerate(vec1):
        dot_prod += v * vec2[i]
        mag_1 = np.math.sqrt(sum([x**2 for x in vec1]))
        mag_2 = np.math.sqrt(sum([x**2 for x in vec2]))
    return dot_prod / (mag_1 * mag_2)

In [11]:
docs = ["The faster Harry got to the store, the faster and faster Harry would get home."]
docs.append("Harry is hairy and faster than Jill.")
docs.append("Jill is not as hairy as Harry.")
docs

['The faster Harry got to the store, the faster and faster Harry would get home.',
 'Harry is hairy and faster than Jill.',
 'Jill is not as hairy as Harry.']

In [37]:
doc_tokens = []
for doc in docs:
    doc_tokens += [sorted(tokenizer.tokenize(doc.lower()))]
all_doc_tokens = sum(doc_tokens, [])
lexicon = sorted(set(all_doc_tokens))
zero_vector = OrderedDict((token, 0) for token in lexicon)

doc_vectors = []
for doc in docs:
    vec = copy.copy(zero_vector)
    tokens = tokenizer.tokenize(doc.lower())
    token_counts = Counter(tokens)
    for key, value in token_counts.items():
        vec[key] = value / len(lexicon)
    doc_vectors.append(vec)

In [42]:
document_tfidf_vectors = []
for doc in docs:
    vec = copy.copy(zero_vector)
    tokens = tokenizer.tokenize(doc.lower())
    token_counts = Counter(tokens)
    for key, value in token_counts.items():
        docs_containing_key = 0
        for _doc in docs:
            if key in _doc:
                docs_containing_key += 1
        tf = value / len(lexicon)
        if docs_containing_key:
            idf = len(docs) / docs_containing_key
        else:
            idf = 0
        vec[key] = tf*idf
    document_tfidf_vectors.append(vec)

In [46]:
query = "How long does it take to get to the store?"
query_vec = copy.copy(zero_vector)

tokens = tokenizer.tokenize(query.lower())
token_counts = Counter(tokens)

for key, value in token_counts.items():
    docs_containing_key = 0
    for doc in docs:
        if key in doc.lower():
            docs_containing_key += 1
    if docs_containing_key == 0:
        continue
    tf = value/len(tokens)
    idf = len(docs)/docs_containing_key
    query_vec[key] = tf*idf

In [48]:
cosine_sim(query_vec, document_tfidf_vectors[0])

0.6132857433407973

In [49]:
cosine_sim(query_vec, document_tfidf_vectors[1])

0.0

In [50]:
cosine_sim(query_vec, document_tfidf_vectors[2])

0.0

In [112]:
# https://medium.com/ai%C2%B3-theory-practice-business/use-gpu-in-your-pytorch-code-676a67faed09

if torch.cuda.is_available():  
    dev = "cuda:0" 
else:  
    dev = "cpu" 

device = torch.device(dev)
cuda = torch.device('cuda')     # Default CUDA device
cuda0 = torch.device('cuda:0')
cuda1 = torch.device('cuda:1')

In [152]:
input1 = torch.randn(100, 128, device=cuda1)
input2 = torch.randn(100, 128, device=cuda1)
cos = nn.CosineSimilarity(dim=0, eps=1e-6)
output = cos(input1, input2)
output

tensor([ 0.0218, -0.0120,  0.0751, -0.0462, -0.0030,  0.0359, -0.0107,  0.1124,
         0.0466,  0.0317, -0.0743, -0.0534, -0.0541,  0.0668, -0.0301,  0.0067,
        -0.0880,  0.0521,  0.1038, -0.0142, -0.1874,  0.0231, -0.0638,  0.0549,
        -0.2302,  0.1356,  0.0156,  0.1173, -0.0147, -0.0610, -0.0421, -0.0905,
         0.0730, -0.0375,  0.1820,  0.0050,  0.1159,  0.1038, -0.0736, -0.0991,
        -0.0556, -0.1296, -0.0726, -0.0030, -0.0003, -0.0639, -0.0581, -0.0035,
         0.0170,  0.0156,  0.0198,  0.1042,  0.1606, -0.1383, -0.0982, -0.0272,
         0.0352,  0.1743,  0.1269, -0.1168,  0.0923,  0.0318,  0.0286, -0.0811,
        -0.1772,  0.1352,  0.0039, -0.1444,  0.0541,  0.1122, -0.0457, -0.0635,
         0.0620,  0.0261, -0.0147,  0.1097, -0.0705,  0.0329, -0.0510, -0.0074,
        -0.1664, -0.1363, -0.0201, -0.0019, -0.0978, -0.0119,  0.0870, -0.1468,
        -0.0423,  0.0140,  0.1048, -0.1867, -0.2359,  0.0283,  0.0401, -0.1466,
         0.0552, -0.0212, -0.0469,  0.02

In [156]:
# cosine_sim(query_vec, document_tfidf_vectors[0])
query = torch.tensor(list(query_vec.values()), device=cuda1)
tfidf_vec = torch.tensor(list(document_tfidf_vectors[0].values()), device=cuda1)
cos = nn.CosineSimilarity(dim=0, eps=1e-6)
output = cos(query, tfidf_vec)
output


tensor(0.6133, device='cuda:1')

In [167]:
corpi = glob("./corpus*txt")
with open(corpi[1], 'r') as file:
        f = json.load(file)

corpus = f[2]['content']

document_tfidf_vectors = []
for doc in corpus:
    vec = copy.copy(zero_vector)
    tokens = tokenizer.tokenize(doc.lower())
    token_counts = Counter(tokens)
    for key, value in token_counts.items():
        docs_containing_key = 0
        for _doc in corpus:
            if key in _doc:
                docs_containing_key += 1
        tf = value / len(lexicon)
        if docs_containing_key:
            idf = len(docs) / docs_containing_key
        else:
            idf = 0
        vec[key] = tf*idf
    document_tfidf_vectors.append(vec)