# 1.1 Locality Sensitive Hashing (LSH) Algorithm (First Layer)
https://nbviewer.org/github/bassimeledath/quora_profile/blob/master/questions_analysis.ipynb

https://www.pinecone.io/learn/locality-sensitive-hashing/

In [84]:
import numpy as np
import pandas as pd
import re
import time
from datasketch import MinHash, MinHashLSHForest

In [85]:
#Preprocess will split a string of text into individual tokens/shingles based on whitespace.
def preprocess(text):
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    tokens = text
    # Removing punctuations in string
    # Using loop + punctuation string
    for ele in tokens:
        if ele in punc:
            tokens = tokens.replace(ele, "")
    tokens = text.lower()
    tokens = tokens.split()
    return tokens

In [86]:
#Number of Permutations
permutations = 128

#Number of Recommendations to return
num_recommendations = 5

In [87]:
def get_forest(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for text in data['text']:
        tokens = preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [88]:
def predict(text, database, perms, num_results, forest):
    start_time = time.time()
    
    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    result={}
    result = database.iloc[idx_array]['ParagraphP']
    
    print('It took %s seconds to query forest.' %(time.time()-start_time))
    
    return result

In [89]:
db = pd.read_csv('/Users/ibrahim/Desktop/paragraphP.csv')#We have 67 Sentences
db['text'] = db['ParagraphP']
forest = get_forest(db, permutations)

It took 0.023745059967041016 seconds to build forest.


In [99]:
num_recommendations = 10
query = "t is related  O auspicious King that there was a merchant of the merchants who had much wealth and business in various cities Now on a day he mounted horse and went forth to re cover monies in certain towns and the heat sore oppressed him so he sat beneath a tree and putting his hand into his saddle bags took thence some broken bread and dry dates and began to break his fast When he had ended eating the dates he threw away the stones with force and lo an Ifrit appeared huge of stature and brandishing a drawn sword wherewith he approached the mer chant and said Stand up that I may slay thee even as thou slewest my son Asked the merchant How have I slain thy son and he answered When thou atest dates and threwest away the stones they struck my son full in the breast as he was walking by so that he died forthwith F40 Quoth the merchant Verily from Allah we proceeded and unto Allah are we re turning There is no Majesty and there is no Might save in Allah the Glorious the Great If I slew thy son I slew him by chance medley".lower()
result = predict(query, db, permutations, num_recommendations, forest)
print('\n Top similar sentences \n', result)

It took 0.007851839065551758 seconds to query forest.

 Top similar sentences 
 0     have heard, O fortunate king, that a wealthy ...
1    When he had finished, the ‘ifrit said: ‘Stop t...
2    He took his seat by the merchant’s side and pr...
Name: ParagraphP, dtype: object


In [100]:
#Create a list of candiates to be taken to the next layer which is Cosine Simialrity
candidates=[]
candidates=result.values[:]
print(candidates)

[' have heard, O fortunate king, that a wealthy merchant, who had many dealings throughout the lands, rode out one day to settle a matter of business in one of them. When it became hot, he sat down under a tree and put his hand in his saddlebag, from which he took out a piece of bread and a date. He ate and when he had finished with the date he threw away its stone, at which a huge ‘ifrit appeared, with a drawn sword in his hand. This ‘ifrit came up to the merchant and said: ‘Get up so that I can kill you as you killed my son.’ ‘How did I kill your son?’ asked the merchant, and the ‘ifrit told him: ‘When you ate that date and threw away the stone, it struck my son in the chest as he was walking, and he died instantly.’ ‘We belong to God and to Him do we return,’ recited the merchant, adding: ‘There is no might and no power except with God, the Exalted, the Omnipotent. If I killed him, this was by accident, so please forgive me.’ ‘I must kill you,’ insisted the ‘ifrit, and he dragged of

# 1.2 Cosine Similarity (The second layer)

https://towardsdatascience.com/how-to-rank-text-content-by-semantic-similarity-4d2419a84c32

In [101]:
from re import sub
from gensim.utils import simple_preprocess

#query_string = 'fruit and vegetables'
#documents = ['cars drive on the road', 'tomatoes are actually fruit']

stopwords = []

# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
def preprocess(doc):
    # Tokenize, clean up input document string
    doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
    doc = sub(r'<[^<>]+(>|$)', " ", doc)
    doc = sub(r'\[img_assist[^]]*?\]', " ", doc)
    doc = sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token ", doc)
    return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf")) if token not in stopwords]

# Preprocess the documents, including the query string
corpus = [preprocess(document) for document in candidates]
query = preprocess(query)

In [102]:
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

# Load the model: this is a big file, can take a while to download and open
glove = api.load("glove-wiki-gigaword-50")    
similarity_index = WordEmbeddingSimilarityIndex(glove)

# Build the term dictionary, TF-idf model
dictionary = Dictionary(corpus+[query])
tfidf = TfidfModel(dictionary=dictionary)

# Create the term similarity matrix.  
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

100%|████████████████████████████████████████| 329/329 [00:02<00:00, 126.75it/s]


In [103]:
# Compute Soft Cosine Measure between the query and the documents.
# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
query_tf = tfidf[dictionary.doc2bow(query)]

index = SoftCosineSimilarity(
            tfidf[[dictionary.doc2bow(document) for document in corpus]],
            similarity_matrix)

doc_similarity_scores = index[query_tf] 


# Output the sorted similarity scores and documents
NW_Candiadates=[]
sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
for idx in sorted_indexes:
    if doc_similarity_scores[idx]>0.700:
        print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {candidates[idx]}')
        NW_Candiadates.append(candidates[idx])

0 	 0.785 	  have heard, O fortunate king, that a wealthy merchant, who had many dealings throughout the lands, rode out one day to settle a matter of business in one of them. When it became hot, he sat down under a tree and put his hand in his saddlebag, from which he took out a piece of bread and a date. He ate and when he had finished with the date he threw away its stone, at which a huge ‘ifrit appeared, with a drawn sword in his hand. This ‘ifrit came up to the merchant and said: ‘Get up so that I can kill you as you killed my son.’ ‘How did I kill your son?’ asked the merchant, and the ‘ifrit told him: ‘When you ate that date and threw away the stone, it struck my son in the chest as he was walking, and he died instantly.’ ‘We belong to God and to Him do we return,’ recited the merchant, adding: ‘There is no might and no power except with God, the Exalted, the Omnipotent. If I killed him, this was by accident, so please forgive me.’ ‘I must kill you,’ insisted the ‘ifrit, and he 

In [104]:
print(NW_Candiadates)

[' have heard, O fortunate king, that a wealthy merchant, who had many dealings throughout the lands, rode out one day to settle a matter of business in one of them. When it became hot, he sat down under a tree and put his hand in his saddlebag, from which he took out a piece of bread and a date. He ate and when he had finished with the date he threw away its stone, at which a huge ‘ifrit appeared, with a drawn sword in his hand. This ‘ifrit came up to the merchant and said: ‘Get up so that I can kill you as you killed my son.’ ‘How did I kill your son?’ asked the merchant, and the ‘ifrit told him: ‘When you ate that date and threw away the stone, it struck my son in the chest as he was walking, and he died instantly.’ ‘We belong to God and to Him do we return,’ recited the merchant, adding: ‘There is no might and no power except with God, the Exalted, the Omnipotent. If I killed him, this was by accident, so please forgive me.’ ‘I must kill you,’ insisted the ‘ifrit, and he dragged of

# 1.3 Needleman Wunch (The Last Layer)

In [105]:
from minineedle import needle, core

#query= 'it hath reached me, o auspicious king, that there was a fisher man well stricken in years who had a wife and three children, and withal was of poor condition.'

threshold = 10.0
R = {} # Dicitonary to Save query, target sentence, precent identity and the actual aligmnet

for i in NW_Candiadates:
    alignment = needle.NeedlemanWunsch(query, i)
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query:\n ',max_value[1],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
#print('\n',max_value[3])

Query:
  ['t', 'is', 'related', 'o', 'auspicious', 'king', 'that', 'there', 'was', 'a', 'merchant', 'of', 'the', 'merchants', 'who', 'had', 'much', 'wealth', 'and', 'business', 'in', 'various', 'cities', 'now', 'on', 'a', 'day', 'he', 'mounted', 'horse', 'and', 'went', 'forth', 'to', 're', 'cover', 'monies', 'in', 'certain', 'towns', 'and', 'the', 'heat', 'sore', 'oppressed', 'him', 'so', 'he', 'sat', 'beneath', 'a', 'tree', 'and', 'putting', 'his', 'hand', 'into', 'his', 'saddle', 'bags', 'took', 'thence', 'some', 'broken', 'bread', 'and', 'dry', 'dates', 'and', 'began', 'to', 'break', 'his', 'fast', 'when', 'he', 'had', 'ended', 'eating', 'the', 'dates', 'he', 'threw', 'away', 'the', 'stones', 'with', 'force', 'and', 'lo', 'an', 'ifrit', 'appeared', 'huge', 'of', 'stature', 'and', 'brandishing', 'a', 'drawn', 'sword', 'wherewith', 'he', 'approached', 'the', 'mer', 'chant', 'and', 'said', 'stand', 'up', 'that', 'i', 'may', 'slay', 'thee', 'even', 'as', 'thou', 'slewest', 'my', 'son', 

# First Filter (Cosine Similarity)
https://www.machinelearningplus.com/nlp/cosine-similarity/

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Create the Document Term Matrix
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(candidates)

# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
doc_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=[candidates])
df

Unnamed: 0,after,all,almighty,an,and,another,are,asaf,asked,back,...,which,who,whom,wife,wise,with,worked,would,you,yourself
"this, however, is not more surprising than the tale of the fisherman.’ when the king asked what that was, she went on:\n\ni have heard, o fortunate king, that there once was a poor, elderly fisherman with a wife and three children, who was in the habit of casting his net exactly four times each day.",0,0,0,0,1,0,0,0,1,0,...,0,1,0,1,0,1,0,0,0,0
"he took one end of it to the shore and fixed it to a peg that he drove in there, after which he stripped and dived into the sea beside it, where he continued tugging until he managed to get it up.",1,0,0,0,2,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
"he climbed out delightedly, put his clothes back on and went up to the net, only to find that what was in it was a dead donkey, and that the donkey had made a hole in the net.",0,0,0,0,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
"then, when he opened it up, he found in it a brass bottle with a lead seal, imprinted with the inscription of our master solomon, the son of david, on both of whom be peace.",0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,2,0,0,0,0
"i’ll open it up and have a look before selling it.’ he took out a knife and worked on the lead until he had removed it from the bottle, which he then put down on the ground, shaking it in order to pour out its contents.",0,0,0,0,2,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
"when it had all come out, it collected and solidified; a tremor ran through it and it became an ‘ifrit with his head in the clouds and his feet on the earth.",0,1,0,1,3,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
"he encouraged himself, saying that almighty god would show favour and reciting:\n\nwhen you are faced with hardship, clothe yourself\n\nin noble patience; that is more resolute.",0,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,1,1,1
"invoking the name of god, he made another cast, waited until the net had settled, and found it heavier and more difficult to move than before.",0,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"solomon sent his vizier, asaf, to fetch me to him under duress, and i was forced to go with him in a state of humiliation to stand before solomon.",0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
how many wise men lie hidden in the earth!,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [81]:
# Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

print(cosine_similarity(df, df))

[[1.         0.19885368 0.48418203 0.3138032  0.23692416 0.26440136
  0.21025857 0.31008684 0.15540357 0.18871284]
 [0.19885368 1.         0.44003599 0.4322906  0.62523197 0.40708658
  0.20382711 0.44088265 0.26782228 0.10976426]
 [0.48418203 0.44003599 1.         0.27288841 0.48156157 0.42217828
  0.22333125 0.43915503 0.30567718 0.2227177 ]
 [0.3138032  0.4322906  0.27288841 1.         0.45782444 0.34061366
  0.16251869 0.31957418 0.17795362 0.14586499]
 [0.23692416 0.62523197 0.48156157 0.45782444 1.         0.5429066
  0.12951987 0.44570018 0.14182079 0.15499685]
 [0.26440136 0.40708658 0.42217828 0.34061366 0.5429066  1.
  0.1530433  0.3761774  0.14663103 0.18314742]
 [0.21025857 0.20382711 0.22333125 0.16251869 0.12951987 0.1530433
  1.         0.16951588 0.08495482 0.06189845]
 [0.31008684 0.44088265 0.43915503 0.31957418 0.44570018 0.3761774
  0.16951588 1.         0.22273842 0.12171612]
 [0.15540357 0.26782228 0.30567718 0.17795362 0.14182079 0.14663103
  0.08495482 0.22273842

In [64]:
query_term_matrix = sparse_matrix.todense()
df_query = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=[query])

df_query

ValueError: Shape of passed values is (10, 174), indices imply (1, 174)

# NW to test our candidates

In [56]:
from minineedle import needle, core

query= 'it hath reached me, o auspicious king, that there was a fisher man well stricken in years who had a wife and three children, and withal was of poor condition.'

threshold = 10.0
R = {} # Dicitonary to Save query, target sentence, precent identity and the actula aligmnet

for i in candidates:
    alignment = needle.NeedlemanWunsch(query.lower(), i.lower())
    x = alignment.get_identity()
    y = alignment 
    R[i]=[x , query, i, y]
max_value = max(R.values())
print('Query:\n ',max_value[1],'\n')
print('Target Sentence: \n',max_value[2],'\n')
print('Perecent Identity using NW:',max_value[0])
print('\n',max_value[3])

Query:
  it hath reached me, o auspicious king, that there was a fisher man well stricken in years who had a wife and three children, and withal was of poor condition. 

Target Sentence: 
 he climbed out delightedly, put his clothes back on and went up to the net, only to find that what was in it was a dead donkey, and that the donkey had made a hole in the net. 

Perecent Identity using NW: 37.19

 Alignment of Query and Target Sentence is:
	-----i---t hath re-ach-ed me, --o auspicio--us ---k-in--g, ---t--hat- the-re-- -was -a fisher -man well --stricken in years who -had a wife-- and three children, and withal wa-s of poor condit-ion--.
	he climbed out- delighted-ly, put his- clothes back on and went up to the net, only to fi--nd that what was- i---n it -wa-s --a dead -donkey, and th-at the d-on--key ---had made -a hole -in- the net.



# LSH Algorithm 4

In [155]:
#Preprocess will split a string of text into individual tokens/shingles based on whitespace.
def preprocess(text):
    tokens = text.lower()
    tokens = tokens.split()
    return tokens

a = 'It is related, O auspicious King, that there was a merchant of the merchants who had much wealth, and business in various cities'
b = 'I have heard, O fortunate king, that a wealthy merchant, who had many dealings throughout the lands, rode out one day to settle a matter of business in one of them.'
c = "When it became hot, he sat down under a tree and put his hand in his saddlebag, from which he took out a piece of bread and a date"



In [156]:
#Steps 2-7
def shingle(text: str, k: int):
    shingle_set = []
    for i in range(len(text) - k+1):
        shingle_set.append(text[i:i+k])
    return set(shingle_set)

In [157]:
k = 2
a = shingle(a, k)
b = shingle(b, k)
c = shingle(c, k)
print(a)

{'ed', 'ch', 'th', 'au', 'nt', ' h', 'ea', 'ic', 'ng', 'g,', 'h,', 'sp', 'Ki', 'va', 'O ', ' m', 'ts', ' w', 'me', 'ho', 'ha', 'he', 'te', 'ie', ' a', 't ', 'ri', 'mu', 'we', 'f ', 'of', 'la', ' b', 'h ', 'ti', ' t', 'ci', 'si', 'es', 'bu', 'an', 'is', 'ar', 'uc', 'It', 'o ', 'e ', 'pi', 'el', ', ', 're', 'as', ' O', ' K', 'io', 's ', 'n ', 'it', 'in', ' o', 'ad', 'wh', 'al', 'us', 'ss', ' i', 'ne', ' c', ' v', 'd ', 'ou', ' r', 'er', 'wa', 'at', 'rc', 'nd', 'lt', 'a ', 'd,'}


In [158]:
vocab = list(a.union(b).union(c))

In [159]:
a_1hot = [1 if x in a else 0 for x in vocab]
b_1hot = [1 if x in b else 0 for x in vocab]
c_1hot = [1 if x in c else 0 for x in vocab]

In [160]:
#Step 8-9
hash_ex = list(range(1, len(vocab)+1))
print(hash_ex)  # we haven't shuffled yet

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154]


In [161]:
#Step 8-9
from random import shuffle

shuffle(hash_ex)
print(hash_ex)

[135, 66, 120, 9, 148, 152, 57, 108, 111, 14, 43, 67, 106, 144, 96, 154, 73, 28, 107, 93, 121, 97, 42, 113, 112, 143, 77, 92, 44, 55, 72, 87, 74, 60, 4, 3, 1, 139, 137, 17, 6, 24, 71, 80, 61, 15, 102, 86, 101, 35, 23, 142, 153, 45, 146, 13, 124, 76, 119, 53, 91, 56, 21, 70, 98, 52, 62, 12, 122, 54, 10, 123, 65, 90, 140, 78, 89, 85, 8, 103, 131, 18, 2, 7, 58, 141, 26, 34, 132, 95, 129, 99, 46, 5, 51, 150, 116, 110, 104, 63, 11, 25, 88, 117, 126, 48, 134, 118, 138, 16, 39, 49, 84, 105, 127, 27, 36, 100, 82, 128, 79, 109, 59, 29, 69, 115, 30, 22, 20, 136, 33, 31, 145, 81, 75, 130, 32, 41, 50, 125, 94, 47, 38, 37, 83, 19, 40, 149, 151, 64, 114, 133, 147, 68]


In [162]:
print(f"7 -> {hash_ex.index(7)}") # note that value 7 can be found at index 1 in hash_ex

7 -> 83


In [163]:
for i in range(1, 5):
    print(f"{i} -> {hash_ex.index(i)}")

1 -> 36
2 -> 82
3 -> 35
4 -> 34


In [164]:
for i in range(1, len(vocab)+1):
    idx = hash_ex.index(i)
    signature_val = a_1hot[idx]
    print(f"{i} -> {idx} -> {signature_val}")
    if signature_val == 1:
        print('match!')
        break

1 -> 36 -> 1
match!


In [165]:
#7-9
def create_hash_func(size: int):
    # function for creating the hash vector/function
    hash_ex = list(range(1, len(vocab)+1))
    shuffle(hash_ex)
    return hash_ex

def build_minhash_func(vocab_size: int, nbits: int):
    # function for building multiple minhash vectors
    hashes = []
    for _ in range(nbits):
        hashes.append(create_hash_func(vocab_size))
    return hashes

# we create 20 minhash vectors
minhash_func = build_minhash_func(len(vocab), 20)

In [166]:
def create_hash(vector: list):
    # use this function for creating our signatures (eg the matching)
    signature = []
    for func in minhash_func:
        for i in range(1, len(vocab)+1):
            idx = func.index(i)
            signature_val = vector[idx]
            if signature_val == 1:
                signature.append(i)
                break
    return signature

In [167]:
# now create signatures
a_sig = create_hash(a_1hot)
b_sig = create_hash(b_1hot)
c_sig = create_hash(c_1hot)

print(a_sig)
print(b_sig)
#print(c_sig)

[4, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1]
[1, 3, 1, 1, 1, 3, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2]


# Cosine Similarity using 3 ways 
https://danielcaraway.github.io/html/sklearn_cosine_similarity.html

In [168]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# calculate cosine similarity between [X] and [Y,Z]
# sending input as arrays would allow for calculating both cosine_sim(X,Y) and cosine_sim (X,Y)
cos_sim = cosine_similarity([a_sig], [b_sig,c_sig])
print(cos_sim)

# calculate the entire cosie similarity matrix among X, Y, and Z
cos_sim = cosine_similarity([a_sig, b_sig, c_sig])
print(cos_sim)
print()

[[0.74921534 0.84714052]]
[[1.         0.74921534 0.84714052]
 [0.74921534 1.         0.7675329 ]
 [0.84714052 0.7675329  1.        ]]



In [169]:
from scipy import spatial

cos_sim = 1 - spatial.distance.cosine(b_sig, c_sig)
print(cos_sim)

0.7675329008014159


In [170]:
from numpy import dot
from numpy.linalg import norm

cos_sim = dot(a_sig,c_sig) / (norm(a_sig)*norm(c_sig))
print(cos_sim)

0.8471405189362209


# LSH itself

In [171]:
def split_vector(signature, b):
    assert len(signature) % b == 0
    r = int(len(signature) / b)
    # code splitting signature in b parts
    subvecs = []
    for i in range(0, len(signature), r):
        subvecs.append(signature[i : i+r])
    return subvecs

In [172]:
band_a = split_vector(a_sig, 10)
band_b = split_vector(b_sig, 10)
band_b

[[1, 3],
 [1, 1],
 [1, 3],
 [2, 1],
 [2, 1],
 [1, 2],
 [1, 1],
 [1, 2],
 [2, 1],
 [1, 2]]

In [173]:
band_c = split_vector(c_sig, 10)
band_c

[[3, 2],
 [2, 3],
 [2, 2],
 [1, 1],
 [2, 4],
 [3, 1],
 [1, 3],
 [1, 1],
 [1, 1],
 [2, 1]]

In [178]:
for a_rows, b_rows in zip(band_a, band_b):
    if a_rows == b_rows:
        print(f"Candidate pair: {a_rows} == {b_rows}")
        # we only need one band to match
        break

Candidate pair: [1, 2] == [1, 2]


In [183]:
print([a_rows])

[[1, 2]]


In [179]:
for a_rows, c_rows in zip(band_a, band_c):
    if a_rows == b_rows:
        print(f"Candidate pair: {a_rows} == {c_rows}")
        # we only need one band to match
        break

Candidate pair: [1, 2] == [3, 1]


In [180]:
for b_rows, c_rows in zip(band_b, band_c):
    if a_rows == c_rows:
        print(f"Candidate pair: {b_rows} == {c_rows}")
        # we only need one band to match
        break

# Another Resource


In [2]:
pip install snapy

Note: you may need to restart the kernel to use updated packages.


In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
 
import argparse
 
from snapy import MinHash, LSH 
SEED = 3
 
 
def load_content(sentence_file):
    """Load input file with sentences to build LSH.
 
    Args:
        sentence_file (str): Path to input with txt file with sentences to Build LSH.
 
    Returns:
        dict: Dict with strings and version of string in lower case and without comma.
 
    """
    sentences = {}
    with open(sentence_file) as content:
        for line in content:
            line = line.strip()
            line_clean = line.replace(",", "")
            line_clean = line_clean.lower()
            sentences[line_clean] = line
 
    return sentences
 
 
def create_lsh(content, no_of_bands, n_permutations, n_gram):
    """Create Minhash and Locality Sensitive Hashing (LSH) to detect near duplicate texts.
 
    Args:
        content (list): List with string to build LSH.
        no_of_bands (int): Number of bands to break minhash signature into before hashing into buckets.
        n_permutations (int): Number of permutations used to create minhash signatures used in LSH model.
        n_gram (int): Size of each overlapping text shingle to break text into prior to hashing.
        no_of_bands(int): Number of bands to break minhash signature into before hashing into buckets.
 
    Returns:
        class 'snapy.lsh.LSH':  Snapy LSH object.
 
    """
    labels = range(len(content))
 
    # Create MinHash object.
    minhash = MinHash(content, n_gram=n_gram, permutations=n_permutations, hash_bits=64, seed=SEED)
 
    # Create LSH model.
    lsh = LSH(minhash, labels, no_of_bands=no_of_bands)
 
    return lsh
 
 
def find_near_duplicate(query_sentences, sentences, min_jaccard_value, no_of_bands, n_permutations, n_gram):
    """Using LSH object finds the near duplicate strings.
 
    Args:
        query_sentences (dict): Dict with query strings and version of string in lower case and without comma.
        sentences (dict): Dict with target strings and version of string in lower case and without comma.
        min_jaccard_value (float): Minimum value for the Jaccard Distance.
        no_of_bands (int): Number of bands to break minhash signature into before hashing into buckets.
        n_permutations (int): Number of permutations used to create minhash signatures used in LSH model.
        n_gram (int): Size of each overlapping text shingle to break text into prior to hashing.
 
    """
    content = list(query_sentences.keys()) + list(sentences.keys())
    lsh = create_lsh(content, no_of_bands, n_permutations, n_gram)
 
    # Query to find near duplicates the string in `search`
    closest_results = lsh.query(0, min_jaccard=min_jaccard_value)
 
    for index_query, search_string in enumerate(query_sentences):
        print("{} QUERY: {}".format(index_query + 1, query_sentences[search_string]))
        for content_index in closest_results:
            result = content[content_index]
            print(sentences[result])
        print()
 
 
def parse_args():
    """Parse args entered by the user.
 
    Returns:
        argparse.Namespace: Parsed arguments.
 
    """
    parser = argparse.ArgumentParser(
        description="Detect near duplicate texts using Minhash and Locality Sensitive Hashing.",
        epilog="example > python3 find_near_duplicate.py  -q INPUT -t TARGERS")
    parser.add_argument("-q", "--query", help="Path to file with sentences to query", required=True)
    parser.add_argument("-t", "--targets", help="Path to file with sentences be matched against", required=True)
    parser.add_argument("-g", "--n_gram", help="Size of each overlapping text shingle to break text into "
                                               "prior to hashing", default=9)
    parser.add_argument("-p", "--n_permutations", help="Number of permutations used to create minhash signatures used "
                                                       "in LSH model.", default=100)
    parser.add_argument("-j", "--min_jaccard", help="Jaccard similarity threshold texts have to exceed to be "
                                                          "returned as similar.", default=0.25)
    parser.add_argument("-b", "--no_of_bands", help="Number of bands to break minhash signature into "
                                                    "before hashing into buckets..", default=50)
    return parser.parse_args()
 
 
def main():
    args = parse_args()
 
    query = args.query
    targets = args.targets
    min_jaccard_value = float(args.min_jaccard)
    n_gram = int(args.n_gram)
    n_permutations = int(args.n_permutations)
    no_of_bands = int(args.no_of_bands)
 
    # load sentences from file
    query_sentences = load_content(query)
    targets_sentences = load_content(targets)
 
    # find near duplicate sequences to `search_string`
    find_near_duplicate(query_sentences, targets_sentences, min_jaccard_value, no_of_bands, n_permutations, n_gram)
 
 
if __name__ == "__main__":
    main()

ImportError: cannot import name 'MinHash' from 'snapy' (/Users/ibrahim/opt/anaconda3/lib/python3.9/site-packages/snapy/__init__.py)

# Test NW with Lemmatization+ POS+ Synonyms

In [106]:
from minineedle import needle, core

seq1='Here we are!'.lower().split()
seq2='We are here!'.lower().split()

alignment = needle.NeedlemanWunsch(seq1, seq2)

alignment.align()

print('Score',alignment.get_score())

print('Identity',alignment.get_identity())

print(alignment)

Score -2
Identity 25.0
Alignment of Query and Target Sentence is:
	herewe-are!
	-wearehere!



In [75]:
from minineedle import needle, core

seq1='This is a good night'.lower()
seq2='This is a good day'.lower()

alignment = needle.NeedlemanWunsch(seq1, seq2)

alignment.align()

print('Score',alignment.get_score())

print('Identity',alignment.get_identity())

print(alignment)

Score 10
Identity 75.0
Alignment of SEQUENCE 1 and SEQUENCE 2:
	this is a good night
	this is a good --day



In [76]:
# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

documents=[seq1,seq2]
# Create the Document Term Matrix
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(documents)

# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
doc_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=['seq1', 'seq2'])
df

Unnamed: 0,day,good,is,night,this
seq1,0,1,1,1,1
seq2,1,1,1,0,1


In [77]:
# Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(df, df))

[[1.   0.75]
 [0.75 1.  ]]


# Get Synonyms

In [43]:
import nltk
from nltk.corpus import wordnet
synonyms = []
antonyms = []
  
for syn in wordnet.synsets("cemetery"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
  
print(set(synonyms))

{'necropolis', 'graveyard', 'burial_site', 'cemetery', 'burial_ground', 'memorial_park', 'burying_ground'}


# Test for Lemmatization and POS
### https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

In [2]:
# Lemmatize with POS Tag
from nltk.corpus import wordnet
import nltk
from nltk.stem import WordNetLemmatizer 


def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


# 1. Init Lemmatizer
lemmatizer = WordNetLemmatizer()

# 2. Lemmatize Single Word with the appropriate POS tag
word = 'asked'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

# 3. Lemmatize a Sentence with the appropriate POS tag
sentence = "I saw Susie sitting in a shoe shine shop. Where Susie sits Susie shines, and where Susie shines Susie sits"

sentence2 = ' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)]) 
print(sentence2)
#> ['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']


ask
I saw Susie sit in a shoe shine shop . Where Susie sits Susie shine , and where Susie shine Susie sits


In [62]:
def are_synonyms(x,y):# To check for synonyms between seq1 and seq2 return (True or False) 
             synonyms2 = []
             for syn in wordnet.synsets(y):
                 for lm in syn.lemmas():
                       synonyms2.append(lm.name())
             synonyms=set(synonyms2)
             if x in synonyms:
                  return True
             else:
                  return False

In [67]:
are_synonyms('was','was')

False

In [3]:
# Lemmatize with POS Tag
from nltk.corpus import wordnet
import nltk
from nltk.stem import WordNetLemmatizer 


def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    

    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)



# 1. Init Lemmatizer
lemmatizer = WordNetLemmatizer()

# 2. Lemmatize Single Word with the appropriate POS tag


x = lemmatizer.lemmatize(word, get_wordnet_pos(word))
y = lemmatizer.lemmatize(word, get_wordnet_pos(word))
print(x)
print(y)

ask
ask


In [8]:
from nltk.corpus import wordnet
import nltk
from nltk.stem import WordNetLemmatizer 

def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
    

        tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)
    
def checklemma_pos(word1,word2):

# 1. Init Lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    
    x = lemmatizer.lemmatize(word1, get_wordnet_pos(word1))
    y = lemmatizer.lemmatize(word2, get_wordnet_pos(word2))
    if x == y:
        return True
    else:
        return False

In [11]:
checklemma_pos('dead','die')

False

In [28]:
# Lemmatize with POS Tag
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


# 1. Init Lemmatizer
lemmatizer = WordNetLemmatizer()

# 2. Lemmatize Single Word with the appropriate POS tag
word = 'where'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

# 3. Lemmatize a Sentence with the appropriate POS tag
sentence = "I saw Susie sitting in a shoe shine shop. Where Susie sits Susie shines, and where Susie shines Susie sits"
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])
#> ['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']

where
['I', 'saw', 'Susie', 'sit', 'in', 'a', 'shoe', 'shine', 'shop', '.', 'Where', 'Susie', 'sits', 'Susie', 'shine', ',', 'and', 'where', 'Susie', 'shine', 'Susie', 'sits']


In [7]:
from nltk import pos_tag, word_tokenize
pos_tag(word_tokenize(""))

[('The', 'DT'), ('rest', 'NN'), ('of', 'IN'), ('life', 'NN')]

In [14]:
import nltk
from nltk import word_tokenize
sentence = "There is a stubbornness about me that never can bear to be frightened at the will of others. My courage always rises at every attempt to intimidate me."
print (nltk.pos_tag(word_tokenize(sentence)))

[('There', 'EX'), ('is', 'VBZ'), ('a', 'DT'), ('stubbornness', 'NN'), ('about', 'IN'), ('me', 'PRP'), ('that', 'IN'), ('never', 'RB'), ('can', 'MD'), ('bear', 'VB'), ('to', 'TO'), ('be', 'VB'), ('frightened', 'VBN'), ('at', 'IN'), ('the', 'DT'), ('will', 'MD'), ('of', 'IN'), ('others', 'NNS'), ('.', '.'), ('My', 'PRP$'), ('courage', 'NN'), ('always', 'RB'), ('rises', 'VBZ'), ('at', 'IN'), ('every', 'DT'), ('attempt', 'NN'), ('to', 'TO'), ('intimidate', 'VB'), ('me', 'PRP'), ('.', '.')]
