## DOCUMENT SIMILARITY SEARCH USING PANDAS ON SINGLE MACHINE

In [2]:
import pandas as pd

In [48]:
# Loading dataset
file_path = "./dataset/corpus.jsonl"

chunk_size = 500000

dataset = pd.read_json(file_path, lines=True, chunksize=chunk_size)
dataset = next(dataset)

SAMPLE_SIZE = 25000

# Sample 25,000 samples to run comparision
df = dataset.sample(SAMPLE_SIZE)
df


Unnamed: 0,_id,title,text,metadata
144549,340516,Airbus A318,The Airbus A318 is the smallest member of the ...,{'url': 'https://en.wikipedia.org/wiki?curid=3...
20389,42636,Hubert Humphrey,"Hubert Horatio Humphrey Jr. (May 27, 1911Janua...",{'url': 'https://en.wikipedia.org/wiki?curid=4...
188936,489792,Griqua people,"The Griqua ( ; Afrikaans ""Griekwa"", sometimes ...",{'url': 'https://en.wikipedia.org/wiki?curid=4...
233441,659051,Shawn Wayans,"Shawn Mathis Wayans (born January 19, 1971) is...",{'url': 'https://en.wikipedia.org/wiki?curid=6...
490609,1929054,Protestant Reformers,"Protestant Reformers were those theologians, c...",{'url': 'https://en.wikipedia.org/wiki?curid=1...
...,...,...,...,...
210221,572178,Point Reyes,Point Reyes is a prominent cape and popular No...,{'url': 'https://en.wikipedia.org/wiki?curid=5...
186066,479936,Rime dictionary,"A rime dictionary, rhyme dictionary, or rime b...",{'url': 'https://en.wikipedia.org/wiki?curid=4...
443092,1681226,Michael Rapaport,"Michael David Rapaport (born March 20, 1970) i...",{'url': 'https://en.wikipedia.org/wiki?curid=1...
9161,19653,May 31,May 31 is the day of the year in the Gregorian...,{'url': 'https://en.wikipedia.org/wiki?curid=1...


#### Compare between LSH and Bruteforce

In [49]:
from utils.utilities import Shingling, MinHashing, LSH

SHINGLING_SIZE = 5
SIGNATURE_SIZE = 100
BAND_NUM = 10
THRESHHOLD = 0.5

shingling = Shingling(SHINGLING_SIZE)
min_hash = MinHashing(SIGNATURE_SIZE)
lsh = LSH(THRESHHOLD)

In [50]:
import time 

start_time = time.time()
shingling_list = []

for index, row in df.iterrows():
    doc = row["title"] + " " + row["text"]
    shinglings = shingling.get_hashed_shingles(shingling.get_shingles(doc, words=True))
    shingling_list.append(shinglings)

signature_matrix = min_hash.compute_signature_matrix(shingling_list)
lsh_similar_itemset = lsh.get_similar_items(signature_matrix, BAND_NUM, SIGNATURE_SIZE)
end_time = time.time()
print(f"Time execution: {end_time - start_time} (s)")
print("SIMILARITY DOCUMENT PAIRS: ", len(lsh_similar_itemset))


Time execution: 74.08294320106506 (s)
SIMILARITY DOCUMENT PAIRS:  144


In [54]:
from tqdm import tqdm

def jaccard_similarity(set1: set, set2: set):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    if union > 0 :
        return intersection/union
    else:
        return 0
                
start_time = time.time()
brute_force_similar_items = set()

for i in tqdm(range(0, SAMPLE_SIZE - 1)):
    for j in range(i + 1, SAMPLE_SIZE):
        similarity = jaccard_similarity(set(shingling_list[i]), set(shingling_list[j]))
        if similarity >= THRESHHOLD:
            brute_force_similar_items.add((i, j))

end_time = time.time()
print(f"Time execution: {end_time - start_time} (s)")
print("SIMILARITY DOCUMENT PAIRS: ", len(lsh_similar_itemset))

  0%|          | 0/24999 [00:00<?, ?it/s]

100%|██████████| 24999/24999 [25:22<00:00, 16.42it/s] 

Time execution: 1522.1483767032623 (s)
SIMILARITY DOCUMENT PAIRS:  144



