In [32]:
import re
from collections import defaultdict
from itertools import combinations
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Characteristic Matrix
In natural language processing, a characteristic matrix (also known as a term-document matrix) is a matrix that represents the presence or absence of each term (word) in each document in a collection. The rows of the matrix correspond to the terms, and the columns correspond to the documents. Each element in the matrix represents whether the corresponding term appears in the corresponding document. Typically, the values in the matrix are binary, indicating presence or absence, but they can also be weighted to represent the importance of each term in each document. Characteristic matrices are commonly used as a basis for various text analysis tasks such as information retrieval, topic modeling, and text classification.




In natural language processing, a k-shingle (also known as k-gram) refers to a sequence of k contiguous tokens (usually words) from a document. For example, if we have the sentence "The quick brown fox jumps over the lazy dog", a 3-shingle of this sentence would be "The quick brown", "quick brown fox", "brown fox jumps", and so on.

K-shingles are often used in text similarity and plagiarism detection algorithms, as they can help identify similar phrases or sentences that share common subsequences of words. The Jaccard similarity metric, for instance, can be used to compare the set of k-shingles between two documents to measure their similarity.

In [33]:
def get_shingles(text, k):
    # split the text into words
    words = re.findall(r'\w+', text)

    # create a set of k-grams
    shingles = set()
    for i in range(len(words) - k + 1):
        shingle = " ".join(words[i:i+k])
        shingles.add(shingle)

    return shingles

# Jaccard Similarity
Jaccard similarity is a measure of similarity between two sets of elements. It is defined as the size of the intersection divided by the size of the union of the sets. In language processing, Jaccard similarity is often used to compare the similarity between two documents by considering the set of words present in each document. The Jaccard similarity between two documents is the ratio of the number of common words to the total number of distinct words in both documents. This measure ranges from 0 (no overlap) to 1 (identical documents). Jaccard similarity is widely used in applications such as document clustering, search engines, and recommendation systems.

The Jaccard similarity measures the similarity between two sets of data to see which members are shared and distinct. The Jaccard similarity is calculated by dividing the number of observations in both sets by the number of observations in either set.

In [80]:
# define a function to calculate the Jaccard similarity of two sets
def get_jaccard_similarity(s1, s2):
    # calculate the Jaccard index
    non_zero = np.where(s1 + s2 > 0)[0]
    set_intersect = np.sum(s1[non_zero] & s2[non_zero])
    set_union = len(non_zero)
    return set_intersect / set_union


# Minhashing 

Minhashing is a technique used in language processing for efficiently estimating the similarity between two large sets of data, such as documents or web pages. The basic idea of minhashing is to represent each set as a signature, which is a much shorter sequence of values that still captures the most important characteristics of the original set.

To create the signature, minhashing uses a set of hash functions to map the elements of each set to a smaller range of values. The signature for a set is then constructed by selecting the minimum hash value among all the elements in the set for each hash function. The resulting signature is typically much shorter than the original set, which makes it more efficient to compare signatures of different sets to estimate their similarity.

One common application of minhashing is in the construction of Locality-Sensitive Hashing (LSH) algorithms, which are used to quickly find similar pairs of documents or other data in large datasets. By dividing the signature space into a set of "buckets" using a hash function, LSH algorithms can efficiently identify pairs of signatures that are likely to belong to similar sets, allowing for further processing to determine the actual degree of similarity.

The MinHash algorithm will provide us with a fast approximation to the Jaccard Similarity between two sets.

For each set in our data, we are going to calculate a MinHash signature. The MinHash signatures will all have a fixed length, independent of the size of the set. 

Minhashing involves compressing the large sets of unique shingles into a much smaller representation called “signatures”.
We then use these signatures to measure the similarity between documents.
Although it is impossible for these signatures to give the exact similarity measure, the estimates are pretty close.
The larger the number of signatures chosen, the more accurate the estimate is.

More info here: https://www.youtube.com/watch?v=R-iFka68ZwM


In [35]:
# define a function to get the min-hash values for a set of shingles
def get_min_hash(shingles, num_hashes):
    # create a list of hash functions
    hash_functions = [hashlib.sha1, hashlib.md5, hashlib.sha256]

    # create a dictionary of min-hash values
    min_hashes = defaultdict(lambda: float("inf"))
    for shingle in shingles:
        # hash the shingle using each of the hash functions
        for i, hf in enumerate(hash_functions):
            hash_value = int(hf(shingle.encode()).hexdigest(), 16)

            # update the min-hash value for this hash function
            if hash_value < min_hashes[i]:
                min_hashes[i] = hash_value

    return [min_hashes[i] for i in range(num_hashes)]

# Locality Sensitivity Hashing (LSH)

Locality-sensitive hashing (LSH) is a technique used in natural language processing (NLP) to find similar documents efficiently. It is a technique that involves hashing high-dimensional vectors such that similar vectors are mapped to the same hash buckets with high probability. This allows for the efficient retrieval of similar documents by querying only a small subset of the hash buckets, rather than the entire collection of documents.

In LSH, a set of hash functions are generated such that each function maps a high-dimensional vector to a low-dimensional hash value. The idea is to partition the vectors into hash buckets based on their hash values. Similar vectors are more likely to be hashed to the same bucket, allowing for efficient retrieval of similar documents. The number of hash functions and the number of buckets can be tuned to achieve a desired trade-off between accuracy and speed.

LSH has a wide range of applications in NLP, including near-duplicate detection, plagiarism detection, and document clustering. It is particularly useful in scenarios where the size of the document collection is large and a brute-force search is not feasible.



While the information necessary to compute the similarity between documents has been compressed from the original sparse characteristic matrix into a much smaller signature matrix, but the underlying problem or need to perform pairwise comparisons on all the documents still exists.


The concept for locality-sensitive hashing (LSH) is that given the signature matrix of size n (row count), we will partition it into b bands, resulting in each band with r rows. This is equivalent to the simple math formula — n = br, thus when we are doing the partition, we have to be sure that the b we choose is divisible by n.

Locality-Sensitive Hashing (LSH)
Locality-Sensitive Hashing (LSH) is a technique for approximate nearest neighbor search, which is the problem of finding the data point in a dataset that is most similar to a given query point. LSH works by dividing the data points into multiple hash tables, where each table is designed to capture the similarity between points.

To use LSH for text similarity, we can first represent each piece of text as a set of shingles. Then, we can use a hash function to map each shingle to a bucket in a hash table. The hash function should be designed such that shingles that are similar are more likely to be mapped to the same bucket.

For example, let's say we have the following two pieces of text:

To use LSH for text similarity, we can first represent each piece of text as a set of shingles. Then, we can use a hash function to map each shingle to a bucket in a hash table. The hash function should be designed such that shingles that are similar are more likely to be mapped to the same bucket.

For example, let's say we have the following two pieces of text:

"The cat sat on the mat"
"The dog lay on the rug"

We can generate their shingles and map them to buckets using a hash function:

"The cat sat" => h("The cat sat") => bucket 1

"cat sat on" => h("cat sat on") => bucket 2

"sat on the" => h("sat on the") => bucket 3

"on the mat" => h("on the mat") => bucket 4

As you can see, the shingles "The cat sat" and "The dog lay" are mapped to the same bucket (bucket 1), which indicates that they are similar. Similarly, the shingles "cat sat on" and "dog lay on" are mapped to the same bucket (bucket 2), which indicates that they are also similar.


In [36]:
# define a function to calculate the LSH similarity of two min-hash values
def get_lsh_similarity(m1, m2, num_bands, band_size):
    # split the min-hash values into bands
    bands1 = [m1[i:i+band_size] for i in range(0, len(m1), band_size)]
    bands2 = [m2[i:i+band_size] for i in range(0, len(m2), band_size)]

    # create a set of hash values for each band
    band_hashes1 = [hash(tuple(band)) for band in bands1]
    band_hashes2 = [hash(tuple(band)) for band in bands2]

    # calculate the number of bands that have the same hash value
    num_common_bands = len(set(band_hashes1) & set(band_hashes2))

    # return the LSH similarity
    return num_common_bands / num_bands

In [37]:
# define the main function
def main():
    # create some sample text
    text1 = "I like apples"
    text2 = "I like bananas"
    text

In [38]:
# import the necessary libraries
import re
import hashlib
from collections import defaultdict
from itertools import combinations

In [43]:
import nltk

text = "The cat sat on the mat"
shingles = nltk.ngrams(text.split(), 3)
print(list(shingles))  # ["The cat sat", "cat sat on", "sat on the", "on the mat"]

[('The', 'cat', 'sat'), ('cat', 'sat', 'on'), ('sat', 'on', 'the'), ('on', 'the', 'mat')]


In [47]:
import hashlib

def hash_shingle(shingle):
    # Create a hash object
    h = hashlib.sha1()
    
    # Update the hash object with the shingle
    h.update(shingle.encode("utf-8"))
    
    # Return the hash value modulo 10 (to map it to a bucket)
    return int(h.hexdigest(), 16) % 10

shingles = ["The cat sat", "cat sat on", "sat on the", "on the mat"]

for shingle in shingles:
    print(f"{shingle}: {hash_shingle(shingle)}")

The cat sat: 3
cat sat on: 6
sat on the: 2
on the mat: 1


In [44]:
import hashlib
import nltk

def hash_shingle(shingle):
    # Split the shingle into individual words
    #words = shingle.split()
    
    # Create a hash object
    h = hashlib.sha1()
    
    # Update the hash object with the concatenated hashes of each word in the shingle
    for word in shingle:
        h.update(hashlib.sha1(word.encode("utf-8")).hexdigest().encode("utf-8"))
    
    # Return the hash value modulo 10 (to map it to a bucket)
    return int(h.hexdigest(), 16) % 10

# Define the two pieces of text
text1 = ("The cat sat on the mat")
text2 = ("The dog lay on the rug")

# Generate the shingles for each piece of text
shingles1 = nltk.ngrams(text1.split(), 3)
shingles2 = nltk.ngrams(text2.split(), 3)

# Print the shingles and their corresponding hash values
print("Shingles for text1:")
for shingle in shingles1:
    print(f"{shingle}: {hash_shingle(shingle)}")

print("\nShingles for text2:")
for shingle in shingles2:
    print(f"{shingle}: {hash_shingle(shingle)}")

Shingles for text1:
('The', 'cat', 'sat'): 1
('cat', 'sat', 'on'): 7
('sat', 'on', 'the'): 0
('on', 'the', 'mat'): 9

Shingles for text2:
('The', 'dog', 'lay'): 1
('dog', 'lay', 'on'): 7
('lay', 'on', 'the'): 4
('on', 'the', 'rug'): 9


In [45]:
 
#Here is an example of how LSH can be used in practice to find similar documents:

#Suppose we have a dataset of documents, where each document is a string of text. 
#We want to use LSH to find the documents that are most similar to a given query document. 
#Here is how we can do this in Python:

import hashlib
import nltk



def hash_shingle(shingle):
    # Split the shingle into individual words
    print(shingle)
    #words = shingle.split()
    
    # Create a hash object
    h = hashlib.sha1()
    
    # Update the hash object with the concatenated hashes of each word in the shingle
    for word in shingle:
        h.update(hashlib.sha1(word.encode("utf-8")).hexdigest().encode("utf-8"))
    
    # Return the hash value modulo 10 (to map it to a bucket)
    return int(h.hexdigest(), 16) % 10

# Define the dataset of documents
documents = [
    "The cat sat on the mat",
    "The dog lay on the rug",
    "The mouse ran up the clock",
    "The rain fell from the sky",
    "The sun shone down on the earth"
]

# Define the query document
query_document = "The bird sang in the tree"

# Generate the shingles for each document
shingles_by_doc = {}
for document in documents:
    shingles_by_doc[document] = set(nltk.ngrams(document.split(), 3))

# Generate the shingles for the query document
query_shingles = set(nltk.ngrams(query_document.split(), 3))

# Create a hash table for each possible hash value
hash_tables = [[] for _ in range(10)]

# Map each shingle in each document to a hash table
for document, shingles in shingles_by_doc.items():
    for shingle in shingles:
        hash_tables[hash_shingle(shingle)].append(document)

# Find the documents that have at least one shingle in common with the query document
similar_documents = set()
for shingle in query_shingles:
    similar_documents.update(hash_tables[hash_shingle(shingle)])

# Print the similar documents
print(f"Similar documents to {query_document}:")
for document in similar_documents:
    print(document)

('on', 'the', 'mat')
('sat', 'on', 'the')
('The', 'cat', 'sat')
('cat', 'sat', 'on')
('The', 'dog', 'lay')
('dog', 'lay', 'on')
('lay', 'on', 'the')
('on', 'the', 'rug')
('up', 'the', 'clock')
('ran', 'up', 'the')
('The', 'mouse', 'ran')
('mouse', 'ran', 'up')
('rain', 'fell', 'from')
('from', 'the', 'sky')
('fell', 'from', 'the')
('The', 'rain', 'fell')
('on', 'the', 'earth')
('shone', 'down', 'on')
('sun', 'shone', 'down')
('The', 'sun', 'shone')
('down', 'on', 'the')
('sang', 'in', 'the')
('bird', 'sang', 'in')
('in', 'the', 'tree')
('The', 'bird', 'sang')
Similar documents to The bird sang in the tree:
The mouse ran up the clock
The sun shone down on the earth
The cat sat on the mat
The dog lay on the rug
The rain fell from the sky


# Working copy

In [116]:
import numpy as np
import itertools
import random
documents = ["The cat sat on the mat",
    "The dog lay on the rug",
    "The mouse ran up the clock",
    "The rain fell from the sky",
    "The sun shone down on the earth"]
# define k for k-shingles
k = 3

# define number of hash functions
n_hash = 100

# define number of bands
n_bands = 50

# define number of rows per band
n_rows = int(n_hash / n_bands)

# define prime number for hashing
p = 4294967311

# function to generate k-shingles
def generate_k_shingles(text, k):
    shingles = set()
    text = text.replace(' ', '')
    for i in range(len(text) - k + 1):
        shingles.add(text[i:i+k])
    return shingles

# create a set of all shingles
all_shingles = set()
for doc in documents:
    doc_shingles = generate_k_shingles(doc, k)
    all_shingles = all_shingles.union(doc_shingles)
all_shingles = sorted(list(all_shingles))

# create a characteristic matrix
char_matrix = np.zeros((len(all_shingles), len(documents)), dtype=np.int8)
for j, doc in enumerate(documents):
    doc_shingles = generate_k_shingles(doc, k)
    for i, shingle in enumerate(all_shingles):
        if shingle in doc_shingles:
            char_matrix[i,j] = 1

# generate hash functions
hash_funcs = []
for i in range(n_hash):
    a = random.randint(0, p-1)
    b = random.randint(0, p-1)
    hash_funcs.append((a,b))

# create minhash signature for each document
minhash_sig = np.zeros((n_hash, len(documents)), dtype=np.int64)
for j in range(len(documents)):
    for i in range(n_hash):
        minhash_sig[i,j] = p+1
        for shingle in all_shingles:
            if char_matrix[all_shingles.index(shingle), j] == 1:
                h = (hash_funcs[i][0] * hash(shingle) + hash_funcs[i][1]) % p
                if h < minhash_sig[i,j]:
                    minhash_sig[i,j] = h

# apply LSH to minhash signatures
candidates = set()
for band in range(n_bands):
    band_sig = minhash_sig[band*n_rows:(band+1)*n_rows, :]
    buckets = {}
    for j in range(len(documents)):
        bucket_key = hash(tuple(band_sig[:,j]))
        if bucket_key not in buckets:
            buckets[bucket_key] = []
        buckets[bucket_key].append(j)
    for key in buckets:
        if len(buckets[key]) > 1:
            pairs = itertools.combinations(buckets[key], 2)
            for pair in pairs:
                candidates.add(pair)

# calculate Jaccard similarity for candidate pairs
for pair in candidates:
    doc1 = pair[0]
    doc2 = pair[1]
    intersection = np.sum(char_matrix[:,doc1] & char_matrix[:,doc2])
    union = np.sum(char_matrix[:,doc1] | char_matrix[:,doc2])
    similarity = intersection / union
    print(f"Similarity between Document {doc1+1} and Document {doc2+1}: {similarity:.2f}")

Similarity between Document 1 and Document 2: 0.15
Similarity between Document 2 and Document 5: 0.15
Similarity between Document 3 and Document 4: 0.09
Similarity between Document 1 and Document 5: 0.12
Similarity between Document 1 and Document 3: 0.13
