This notebook aims to find near duplicates using the SimHash technique, and organizing them in an M-Tree

In [21]:
# import similarities.mtree
import importcsv
from similarities import libsim, mtree
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
print('Importing csv file...........')
small_dataset = importcsv.opencsv('processedspeech.csv') 

Importing csv file...........


Κρατάω μόνο τα processed speeches

In [3]:
speeches = small_dataset['processed_speeches']
# speeches = small_dataset['speech']

In [4]:
print(speeches)
print(type(speeches[0]))
# speeches.head()

0         ΠΑΡΑΚΑΛΕΙΤΑ ΚΥΡΙ ΓΡΑΜΜΑΤΕ ΣΥΝΟΔΕΥΣ ΙΕΡ ΣΥΝΟΔ Α...
1         ΥΠΑΡΧ ΕΚ ΚΥΡΙ ΣΥΝΑΔΕΛΦ ΨΗΦΙΣ ΚΗΡΥΣΣΕΤΑ ΠΕΡΑΙΩΜ...
2         ΚΥΡΙ ΒΟΥΛΕΥΤ ΤΙΜ ΑΝΑΚΟΙΝΩΣ ΣΩΜ ΒΟΥΛΕΥΤ ΕΒΡ ΥΠΟ...
3         ΥΠΑΡΧ ΕΚ ΚΥΡΙ ΣΥΝΑΔΕΛΦ ΕΨΗΦΙΣ ΚΗΡΥΣΣΕΤΑ ΠΕΡΑΙΩ...
4         ΚΥΡΙ ΣΥΝΑΔΕΛΦ ΤΙΜ ΑΝΑΚΟΙΝΩΣ ΑΠΟΤΕΛΕΣΜ ΔΙΕΞΑΧΘΕ...
                                ...                        
362625    Π ΤΕΘ ΠΕΡΙΟΡΙΣΜ ΠΑΜ ευρωπαϊκη ΕΝΩΣ ΖΗΤΑΜ ΛΕΜ ...
362626    ΥΠΗΡΧΕΕΜ ΑΝΑΠΤΥΞΙΑΚ ΥΠΟΥΡΓΕΙ ΥΠΟΥΡΓΕΙ ΚΑΝ ΚΟΙΝ...
362627    ΕΥΧΑΡΙΣΤΟΥΜ ΥΠΟΥΡΓ ΚΗΡΥΣΣΕΤΑ ΠΕΡΑΙΩΜΕΝ ΣΥΖΗΤΗΣ...
362628    ΟΛΟΚΛΗΡΩΣ ΨΗΦΟΦΟΡΙ ΗΛΕΚΤΡΟΝΙΚ ΣΥΣΤΗΜ ΣΧΕΔΙ ΝΟΜ...
362629    ΣΥΝΑΙΝΕΣ ΣΩΜΑΤ ΩΡ 1949΄ ΛΥΕΤΑ ΣΥΝΕΔΡΙΑΣ ΔΕΥΤΕΡ...
Name: processed_speeches, Length: 362630, dtype: object
<class 'str'>


We need to define the weights of each word for all speeches, and keep them in a dictionary.
The dictionary is of the form {word: weight}

### Find word weights

In [5]:
# This shall come in handy
# https://www.reddit.com/r/LanguageTechnology/comments/dugjis/approximate_string_matching_with_simhash_and_a/
# def tokenizer(word):
#     return word.split()
# del vectorizer, X
# speeches = speeches.to_list()
vectorizer = TfidfVectorizer(encoding='utf-8', lowercase=False, )
X = vectorizer.fit_transform(speeches.values)

In [6]:
names = vectorizer.get_feature_names_out()
index = 10550

print(type(names))


<class 'numpy.ndarray'>


We need word weights for this to work

In [7]:
max_idf = len(names) -1
weights = {name: (idx/max_idf) for idx, name in enumerate(names)} # normalized weights
weights

{'00': 0.0,
 '000': 1.245913404034766e-06,
 '0000': 2.491826808069532e-06,
 '000000': 3.7377402121042977e-06,
 '0000000': 4.983653616139064e-06,
 '000000000': 6.22956702017383e-06,
 '0000000395': 7.475480424208595e-06,
 '0000006': 8.721393828243361e-06,
 '00001': 9.967307232278128e-06,
 '0000145': 1.1213220636312894e-05,
 '00003': 1.245913404034766e-05,
 '00005': 1.3705047444382426e-05,
 '00006': 1.495096084841719e-05,
 '00007': 1.619687425245196e-05,
 '0000τοσο': 1.7442787656486722e-05,
 '0001': 1.868870106052149e-05,
 '00010': 1.9934614464556255e-05,
 '00012': 2.1180527868591022e-05,
 '00018': 2.242644127262579e-05,
 '0001α': 2.3672354676660552e-05,
 '0002': 2.491826808069532e-05,
 '00020': 2.6164181484730085e-05,
 '000200': 2.7410094888764852e-05,
 '000250': 2.865600829279962e-05,
 '0002γραφημα': 2.990192169683438e-05,
 '0002θεωρουμε': 3.114783510086915e-05,
 '0003': 3.239374850490392e-05,
 '00036': 3.363966190893868e-05,
 '0003επιλεξει': 3.4885575312973445e-05,
 '0004': 3.613148871

Acquire the signature for one document

In [15]:
import importlib
importlib.reload(libsim)

sig = libsim.getSignatureOfSpeech(speeches[0], weights)
print('signature of first document:',bin(sig))

signature of first document: 0b1000011000110010010010111001011010011001001011000111000110010111


### Create M-Tree strucure
This data structure will hold all the signatures, based on their Hamming Distance.
This way, we can find the relevant ones

In [16]:
# The following list is an array of signatures in integer form 
# signatures = [libsim.getSignatureOfSpeech(speech, weights) for speech in speeches]
signatures = {idx: libsim.getSignatureOfSpeech(speech, weights) for idx, speech in enumerate(speeches)}
# this took 14 minutes to complete

In [17]:
# run the following code? protection against Run-All
if 1 == 0: # change this if statement
    import pickle
    filename = 'signatures_processed_speech.ser'
    with open(filename, 'wb') as fo:
        pickle.dump(signatures, fo)


In [18]:
# run the following code? protection against Run-All
if 1 == 0: # change this if statement
    import pickle
    filename = 'signatures_processed_speech.ser'
    with open(filename, 'rb') as fo:
        signatures = pickle.load(fo) # this is a dictionary of {indexOfSpeech: signatureOfSpeech}

In [20]:
def hamming_distance(a: int, b:int):
    """
    This function returns the hamming distance between two integers.
    The hamming distance is defined as the total number of 1s, after
    the XOR operation between two binary numbers
    """
    return int.bit_count(a ^ b) # requires python 3.10

#### The magic number 3
If the distance between two document signatures is greater than 3, the documents are not near duplicates. This is taken as-is, no further investigation is needed. (Apostolos Papadopoulos)

In [28]:
tree = mtree.MTree(hamming_distance, max_node_size=10)
tree.add_all(signatures)
# takes 30 seconds to build the tree

<map at 0x7fb718c51900>

In [40]:
m = tree.search(signatures[-1], 3) # this returns a map object

Let's see if a query is possible

In [41]:
list(m)

[9289731456576002547, 9432509604114806263, 10874859952015151254]

### We have a tree of signatures
Now all the signatures are stored in a dictionary, indexed based on speech