In [1]:
from numpy import unique
from pandas import read_csv
# Remote location of the BBC text CSV (presumably a news-article corpus; verify the columns after loading).
url = 'https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv'
# Download the CSV over the network and parse it with pandas; requires internet access.
df = read_csv(url)

A class **Shingling** that constructs k-shingles of a given length k (e.g., 10) from a given document, computes a hash value for each unique shingle, and represents the document in the form of an ordered set of its hashed k-shingles.

In [7]:
import random, math
class Shingling:
    """Build the set of character k-shingles of a document and hash them to integers.

    Attributes:
        text: The document, exactly as passed to the constructor. Either a
            plain string or an iterable of lines (e.g. an open text file or
            an ``io.StringIO``).
        k: Shingle length in characters.
    """

    # Modulus applied to every shingle hash.
    _HASH_MODULUS = 1039205197
    # Coefficients of the hash ``(a * x + b) % _HASH_MODULUS``. They are drawn
    # once, when the class is defined, so that every document is hashed with
    # the same function and the resulting sets can be compared with each
    # other. ``a`` starts at 1 because a == 0 would map every shingle to ``b``.
    _HASH_A = random.randint(1, 100)
    _HASH_B = random.randint(0, 100)

    def __init__(self, input_text, input_k):
        """Store the document and the shingle length.

        Args:
            input_text: A string, or an iterable yielding the lines of the
                document.
            input_k: Number of characters per shingle.
        """
        self.text = input_text
        self.k = input_k

    def shingles(self):
        """Return the set of unique k-shingles of the whole document.

        Shingles are taken line by line (line terminators included) and the
        shingles of all lines are merged. Lines shorter than ``k`` contribute
        nothing. Note that a file-like ``text`` is consumed by this call.

        Returns:
            set[str]: The unique shingles; empty if the document has no line
            of at least ``k`` characters.
        """
        if isinstance(self.text, str):
            # Iterating a str directly would yield single characters; wrap it
            # so that it is split into lines like any other text stream.
            import io
            lines = io.StringIO(self.text)
        else:
            lines = self.text

        found = set()
        for line in lines:
            found.update(line[i:i + self.k]
                         for i in range(len(line) - self.k + 1))
        return found

    def ordered_hash(self, input_sh):
        """Hash every shingle to an integer and return the set of hash values.

        Each shingle is UTF-8 encoded, read as a little-endian integer ``x``
        and mapped to ``(a * x + b) % _HASH_MODULUS`` using the class-level
        coefficients.

        Args:
            input_sh: Iterable of shingle strings.

        Returns:
            set[int]: The hash values. A Python ``set`` has no defined order;
            apply ``sorted()`` to the result when an ordering is needed.
        """
        a = self._HASH_A
        b = self._HASH_B
        modulus = self._HASH_MODULUS
        return {(a * int.from_bytes(x.encode(), 'little') + b) % modulus
                for x in input_sh}

In [8]:
import io
# Wrap the sample string in a StringIO so it can be iterated line by line, like an open text file.
sample_text = io.StringIO("This is a sample text. It is a ordinary string but simulated to act as the contents of a file")
# Construct 10-shingles
sh = Shingling(sample_text,10) 
shingles = sh.shingles() 
# Hash the 10-shingles to integers; the result is a Python set, which is reused by the MinHashing cell below.
ordered_set = sh.ordered_hash(shingles) 

A class **MinHashing** that builds a minHash signature (in the form of a vector or a set) of a given length n from a given set of integers (a set of hashed shingles).

In [11]:
class MinHashing:
    """Build a minHash signature of length n from a set of integers.

    Position ``i`` of the signature is the minimum of
    ``((a_i * v + b_i) % _MODULUS) % _BUCKETS`` over all values ``v`` of the
    set. The ``(a_i, b_i)`` pairs are shared by all instances, so signatures
    of different sets are built with the same hash functions and can be
    compared position by position.

    Attributes:
        s: The input collection of integers (e.g. hashed shingles).
        n: Length of the signature.
    """

    _MODULUS = 1039205197
    _BUCKETS = 2**16 - 1
    # Shared (a, b) coefficient pairs, one per signature position. The list
    # is filled lazily and only ever grows, so position i always uses the
    # same hash function for the lifetime of the class.
    _coefficients = []

    def __init__(self, input_set, input_n):
        """Store the input set and the requested signature length.

        Args:
            input_set: Sized iterable of integers.
            input_n: Number of hash functions, i.e. signature length.
        """
        self.s = input_set
        self.n = input_n

    @classmethod
    def _hash_coefficients(cls, count):
        """Return the first ``count`` shared (a, b) pairs, drawing new ones if needed."""
        while len(cls._coefficients) < count:
            # a starts at 1: a == 0 would hash every value to b.
            cls._coefficients.append((random.randint(1, 2**16 - 1),
                                      random.randint(0, 2**16 - 1)))
        return cls._coefficients[:count]

    def hashing(self):
        """Compute the minHash signature of the stored set.

        Returns:
            list[int]: ``n`` minimum hash values; an empty list if ``n <= 0``.

        Raises:
            ValueError: If the set is empty while ``n`` is positive, because
                a minimum over no values is undefined.
        """
        if self.n <= 0:
            return []
        values = list(self.s)
        if not values:
            raise ValueError(
                "cannot build a minHash signature from an empty set")
        return [
            min(((a * value + b) % self._MODULUS) % self._BUCKETS
                for value in values)
            for a, b in self._hash_coefficients(self.n)
        ]

In [13]:
# Build a minHash signature of length 5 from the hashed shingles of the sample text.
mh = MinHashing(ordered_set,5)
# Last expression of the cell, so the signature is shown as the cell output.
mh.hashing()

[1040, 634, 1524, 54, 389]