In [52]:
import numpy as np
import pandas as pd
import csv
from nltk import ngrams
from sklearn.metrics import jaccard_score
import glob
from collections import defaultdict
from itertools import combinations

In [2]:
def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [3]:
corpus = []

file_list = glob.glob("data/corpus-20090418/*.txt")
for file_path in file_list:
    with open(file_path, encoding="utf8", errors='ignore') as file_input:
        doc = file_input.read()
        doc = doc.replace('\n', ' ')
        doc = doc.replace('  ', ' ')
        corpus.append(doc)

In [22]:
nrow = len(corpus)
k = 7
list_kShingles = []
list_hashed = []
for i in range(nrow):
    tokens = corpus[i].split()
    list_kShingles.append([shingle for shingle in ngrams(tokens, k)])
    list_hashed.append([hash(shingle) for shingle in ngrams(tokens, k)])

In [23]:
jac_sim = pd.DataFrame(columns=['Doc1', 'Doc2', 'Jaccard_Score'])
for i in range(nrow):
    for j in range(nrow):
        if i != j:
            jac_sim = jac_sim.append({'Doc1': 'Doc'+str(i), 'Doc2': 'Doc'+str(j), 'Jaccard_Score': jaccard_similarity(list_hashed[i], list_hashed[j])}, ignore_index=True)

In [24]:
jac_sim

Unnamed: 0,Doc1,Doc2,Jaccard_Score
0,Doc0,Doc1,0.000000
1,Doc0,Doc2,0.000000
2,Doc0,Doc3,0.000000
3,Doc0,Doc4,0.000000
4,Doc0,Doc5,0.000000
5,Doc0,Doc6,0.000000
6,Doc0,Doc7,0.000000
7,Doc0,Doc8,0.000000
8,Doc0,Doc9,0.000000
9,Doc0,Doc10,0.000000


In [25]:
corpus[17]

'An algebraic model for representing text documents and any objects in general is known by the name Vector space model. It represents these as vectors of identifiers, index terms are one illustration of these. The Vector Space model was first used in the SMART Information Retrieval System, and it is utilised variously in indexing, information filtering, indexing and information retrieval. A document has representation as a vector. Every dimension is precisely related to a separate term. The way in which term is defined depends entirely on the application: typically ‘terms’ are either single words, keywords or longer phrases. The dimensionality of the vector is the number of words in the vocabulary, if it is the words that are chose to be the terms. So the same rule applies with keywords and indeed longer phrases. If a term occurs in the document, its value in the vector is non-zero. Several different ways of computing these values, additionally known as (term) weights, have been develo

In [26]:
corpus[35]

'Inheritance is a method of forming new classes using predefined classes. The new classes are called derived classes and they inherit the behaviours and attributes of the base classes. It was intended to allow existing code to be used again with minimal or no alteration. It also offers support for representation by categorization in computer languages; this is a powerful mechanism of information processing, vital to human learning by means of generalization and cognitive economy. Inheritance is occasionally referred to as generalization due to the fact that is-a relationships represent a hierarchy between classes of objects. Inheritance has the advantage of reducing the complexity of a program since modules with very similar interfaces can share lots of code. Due to this, inheritance has another view called polymorphism, where many sections of code are being controlled by some shared control code. Inheritance is normally achieved by overriding one or more methods exposed by ancestor, o

In [27]:
global_hash = list(set(per_hash for per_doc in list_hashed for per_hash in per_doc))

In [28]:
matrix_input = {}
for i in range(len(list_hashed)):
    matrix_input['Doc'+str(i)] = [ 1 if single_hash in list_hashed[i] else 0 for single_hash in global_hash ]

In [29]:
chr_matrix = pd.DataFrame(matrix_input, index=global_hash).reset_index(drop=True)

In [30]:
chr_matrix

Unnamed: 0,Doc0,Doc1,Doc2,Doc3,Doc4,Doc5,Doc6,Doc7,Doc8,Doc9,...,Doc90,Doc91,Doc92,Doc93,Doc94,Doc95,Doc96,Doc97,Doc98,Doc99
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
chr_matrix.shape[0]

16528

In [31]:
signature_num = 4
nrow_matrix = chr_matrix.shape[0]
prime = 13887
np.random.seed(12345)
coeff_a = np.random.choice(nrow_matrix, size=signature_num, replace=False)
coeff_b = np.random.choice(nrow_matrix, size=signature_num, replace=False)

In [32]:
coeff_a

array([9933, 4831, 6642, 1745])

In [33]:
coeff_b

array([9624, 5402, 8772, 8704])

In [34]:
matrix_permutation = pd.DataFrame(columns=['Hash'+str(j) for j in range(signature_num)])
for i in range(nrow_matrix):
    dict_hash = {}
    for j in range(signature_num):
        dict_hash['Hash'+str(j)] = ( coeff_a[j] * i + coeff_b[j] ) % prime
    matrix_permutation = matrix_permutation.append(dict_hash, ignore_index=True)

In [35]:
matrix_permutation

Unnamed: 0,Hash0,Hash1,Hash2,Hash3
0,9624,5402,8772,8704
1,5670,10233,1527,10449
2,1716,1177,8169,12194
3,11649,6008,924,52
4,7695,10839,7566,1797
5,3741,1783,321,3542
6,13674,6614,6963,5287
7,9720,11445,13605,7032
8,5766,2389,6360,8777
9,1812,7220,13002,10522


In [36]:
matrix_signature = pd.DataFrame(columns=['Doc'+str(j) for j in range(nrow)])
for i in range(signature_num):
    dict_doc = {}
    idx = list(matrix_permutation[['Hash'+str(i)]].values.ravel())
    for j in range(nrow):
        dict_doc['Doc'+str(j)] = np.where(chr_matrix.reindex(idx)[['Doc'+str(j)]] == 1)[0].min()
    matrix_signature = matrix_signature.append(dict_doc, ignore_index=True)

In [37]:
matrix_signature

Unnamed: 0,Doc0,Doc1,Doc2,Doc3,Doc4,Doc5,Doc6,Doc7,Doc8,Doc9,...,Doc90,Doc91,Doc92,Doc93,Doc94,Doc95,Doc96,Doc97,Doc98,Doc99
0,109,123,3,79,58,26,1,24,39,46,...,107,21,41,12,43,122,102,41,0,33
1,59,105,80,60,209,133,51,4,6,14,...,88,27,82,35,163,7,46,2,21,15
2,8,108,266,161,30,34,123,87,0,45,...,24,102,136,93,105,79,47,87,71,2
3,3,148,203,52,154,2,44,21,53,113,...,209,33,170,35,74,41,33,61,20,113


In [38]:
sign_sim = pd.DataFrame(columns=['Doc1', 'Doc2', 'Signature_Score'])
for i in range(nrow):
    for j in range(nrow):
        if i != j:
            sign_sim = sign_sim.append({'Doc1': 'Doc'+str(i), 'Doc2': 'Doc'+str(j), 'Signature_Score': matrix_signature.loc[:,'Doc'+str(i)].eq(matrix_signature.loc[:,'Doc'+str(j)]).sum()/signature_num}, ignore_index=True)
            

In [39]:
sign_sim

Unnamed: 0,Doc1,Doc2,Signature_Score
0,Doc0,Doc1,0.00
1,Doc0,Doc2,0.00
2,Doc0,Doc3,0.00
3,Doc0,Doc4,0.00
4,Doc0,Doc5,0.00
5,Doc0,Doc6,0.00
6,Doc0,Doc7,0.00
7,Doc0,Doc8,0.00
8,Doc0,Doc9,0.00
9,Doc0,Doc10,0.00


In [49]:
results = pd.merge(jac_sim, sign_sim, on=['Doc1','Doc2']).sort_values(by=['Jaccard_Score'], ascending=False)
results[results['Jaccard_Score']>0.3]

Unnamed: 0,Doc1,Doc2,Jaccard_Score,Signature_Score
9425,Doc95,Doc20,0.878689,1.0
2074,Doc20,Doc95,0.878689,1.0
9485,Doc95,Doc80,0.790625,0.75
8014,Doc80,Doc95,0.790625,0.75
7940,Doc80,Doc20,0.683891,0.75
2059,Doc20,Doc80,0.683891,0.75
9765,Doc98,Doc63,0.619048,1.0
6334,Doc63,Doc98,0.619048,1.0
9785,Doc98,Doc83,0.493438,0.75
8314,Doc83,Doc98,0.493438,0.75


In [55]:
def get_candidates(mtrx_signature: pd.DataFrame, b, r):
    """
    Args:
        sig_mat (str) : Signature matrix generated using the LSH family
        b (int) : Number of bands
        r (int) : Number of rows per band
    Returns:
        set : The set of candidate pairs
    """
    num_cols = len(mtrx_signature.columns) - 1
    candidates = set()
    for i in range(0, b):
        bucket = defaultdict(list)
        for j in range(1, num_cols):
            a = mtrx_signature.iloc[i * r : (i + 1) * r, j + 1]
            col = ''.join(map(str, a))
            bucket[col].append(j)
        for values in bucket.values():
            candidates.update(combinations(values, 2))
    return candidates

In [67]:
candidates = get_candidates(matrix_signature, 250, 20)
candidates

{(32, 54),
 (50, 96),
 (14, 74),
 (21, 28),
 (4, 36),
 (39, 70),
 (8, 63),
 (7, 25),
 (63, 76),
 (48, 86),
 (11, 90),
 (29, 44),
 (33, 41),
 (16, 47),
 (72, 92),
 (73, 82),
 (54, 92),
 (1, 64),
 (2, 78),
 (41, 57),
 (78, 86),
 (12, 59),
 (52, 98),
 (15, 30),
 (17, 64),
 (19, 91),
 (20, 75),
 (76, 88),
 (41, 74),
 (42, 88),
 (5, 84),
 (6, 98),
 (45, 61),
 (10, 97),
 (49, 58),
 (67, 80),
 (13, 20),
 (20, 58),
 (59, 97),
 (60, 81),
 (23, 95),
 (8, 87),
 (25, 49),
 (45, 78),
 (84, 95),
 (85, 87),
 (32, 77),
 (14, 77),
 (70, 94),
 (53, 62),
 (1, 40),
 (57, 59),
 (75, 97),
 (21, 37),
 (4, 35),
 (95, 98),
 (8, 38),
 (7, 22),
 (48, 77),
 (11, 83),
 (29, 37),
 (33, 34),
 (16, 38),
 (73, 91),
 (36, 41),
 (1, 89),
 (2, 73),
 (54, 87),
 (78, 89),
 (61, 63),
 (26, 67),
 (79, 85),
 (12, 50),
 (17, 57),
 (19, 96),
 (20, 82),
 (37, 54),
 (76, 87),
 (42, 67),
 (5, 93),
 (77, 95),
 (45, 54),
 (66, 93),
 (49, 51),
 (67, 89),
 (35, 96),
 (3, 11),
 (23, 84),
 (8, 94),
 (25, 58),
 (45, 71),
 (32, 68),
 (33,

In [68]:
dir = "C:/Users/user/Documents/GitHub/id2222-data-mining-advanced/assignment-1/data"

for pair in candidates:
    print(ind_list.iloc[pair[0] - 1, 2], ind_list.iloc[pair[1] - 1, 2])

corpus-20090418\g1pB_taskb.txt corpus-20090418\g2pC_taskd.txt
corpus-20090418\g2pB_taske.txt corpus-20090418\orig_taska.txt
corpus-20090418\g0pC_taskd.txt corpus-20090418\g3pC_taskd.txt
corpus-20090418\g0pE_taska.txt corpus-20090418\g1pA_taskc.txt
corpus-20090418\g0pA_taskd.txt corpus-20090418\g1pD_taska.txt
corpus-20090418\g1pD_taskd.txt corpus-20090418\g3pB_taske.txt
corpus-20090418\g0pB_taskc.txt corpus-20090418\g3pA_taskc.txt
corpus-20090418\g0pB_taskb.txt corpus-20090418\g0pE_taske.txt
corpus-20090418\g3pA_taskc.txt corpus-20090418\g4pB_taska.txt
corpus-20090418\g2pB_taskc.txt corpus-20090418\g4pD_taska.txt
corpus-20090418\g0pC_taska.txt corpus-20090418\g4pD_taske.txt
corpus-20090418\g1pA_taskd.txt corpus-20090418\g2pA_taskd.txt
corpus-20090418\g1pB_taskc.txt corpus-20090418\g2pA_taska.txt
corpus-20090418\g0pD_taska.txt corpus-20090418\g2pB_taskb.txt
corpus-20090418\g3pC_taskb.txt corpus-20090418\g4pE_taskb.txt
corpus-20090418\g3pC_taskc.txt corpus-20090418\g4pC_taskb.txt
corpus-2

corpus-20090418\g3pA_taska.txt corpus-20090418\g3pC_taskc.txt
corpus-20090418\g1pA_taska.txt corpus-20090418\g4pE_taskc.txt
corpus-20090418\g0pB_taskd.txt corpus-20090418\g1pB_taske.txt
corpus-20090418\g1pA_taskb.txt corpus-20090418\g4pD_taskd.txt
corpus-20090418\g1pA_taske.txt corpus-20090418\g2pC_taskd.txt
corpus-20090418\g4pD_taskc.txt corpus-20090418\g4pD_taskd.txt
corpus-20090418\g2pC_taska.txt corpus-20090418\g4pB_taskd.txt
corpus-20090418\g1pB_taskd.txt corpus-20090418\g2pA_taske.txt
corpus-20090418\g0pD_taskb.txt corpus-20090418\g1pB_taske.txt
corpus-20090418\g2pC_taskb.txt corpus-20090418\g3pC_taska.txt
corpus-20090418\g2pA_taskb.txt corpus-20090418\g3pA_taska.txt
corpus-20090418\g0pB_taska.txt corpus-20090418\g4pB_taskd.txt
corpus-20090418\g3pB_taskb.txt corpus-20090418\g4pB_taskd.txt
corpus-20090418\g1pA_taske.txt corpus-20090418\g3pA_taske.txt
corpus-20090418\g0pC_taskc.txt corpus-20090418\g2pC_taske.txt
corpus-20090418\g1pB_taska.txt corpus-20090418\g4pE_taskc.txt
corpus-2

corpus-20090418\g0pE_taskd.txt corpus-20090418\g4pE_taskc.txt
corpus-20090418\g0pE_taskc.txt corpus-20090418\g3pA_taskc.txt
corpus-20090418\g0pB_taska.txt corpus-20090418\g1pA_taskd.txt
corpus-20090418\g0pE_taske.txt corpus-20090418\g4pC_taska.txt
corpus-20090418\g4pC_taska.txt corpus-20090418\g4pE_taskb.txt
corpus-20090418\g1pA_taskc.txt corpus-20090418\g2pA_taskb.txt
corpus-20090418\g4pC_taskb.txt corpus-20090418\orig_taskc.txt
corpus-20090418\g1pB_taskb.txt corpus-20090418\g2pA_taske.txt
corpus-20090418\g0pC_taskc.txt corpus-20090418\g3pB_taskd.txt
corpus-20090418\g1pB_taska.txt corpus-20090418\g2pB_taskb.txt
corpus-20090418\g1pB_taske.txt corpus-20090418\g1pD_taske.txt
corpus-20090418\g1pD_taskc.txt corpus-20090418\g4pD_taskb.txt
corpus-20090418\g1pD_taskd.txt corpus-20090418\g4pE_taske.txt
corpus-20090418\g0pA_taskd.txt corpus-20090418\g3pB_taskb.txt
corpus-20090418\g1pD_taske.txt corpus-20090418\g3pA_taska.txt
corpus-20090418\g0pB_taskb.txt corpus-20090418\g2pC_taskd.txt
corpus-2

corpus-20090418\g1pA_taske.txt corpus-20090418\g2pB_taske.txt
corpus-20090418\g2pC_taska.txt corpus-20090418\g3pC_taske.txt
corpus-20090418\g0pD_taskb.txt corpus-20090418\g2pB_taskb.txt
corpus-20090418\g2pA_taskb.txt corpus-20090418\g2pB_taskd.txt
corpus-20090418\g0pA_taske.txt corpus-20090418\g3pC_taske.txt
corpus-20090418\g0pB_taska.txt corpus-20090418\g3pC_taske.txt
corpus-20090418\g3pB_taskb.txt corpus-20090418\g3pC_taske.txt
corpus-20090418\g1pA_taske.txt corpus-20090418\g4pE_taskc.txt
corpus-20090418\g0pC_taskc.txt corpus-20090418\g2pE_taskd.txt
corpus-20090418\g1pB_taska.txt corpus-20090418\g4pC_taska.txt
corpus-20090418\g2pC_taske.txt corpus-20090418\g4pB_taskd.txt
corpus-20090418\g3pA_taske.txt corpus-20090418\orig_taskb.txt
corpus-20090418\g1pB_taskb.txt corpus-20090418\g4pD_taska.txt
corpus-20090418\g1pB_taskc.txt corpus-20090418\g4pD_taskc.txt
corpus-20090418\g0pC_taskd.txt corpus-20090418\g2pA_taskb.txt
corpus-20090418\g3pB_taske.txt corpus-20090418\g3pC_taska.txt
corpus-2

corpus-20090418\g0pA_taskc.txt corpus-20090418\g4pB_taskd.txt
corpus-20090418\g0pA_taskd.txt corpus-20090418\g3pC_taska.txt
corpus-20090418\g1pD_taske.txt corpus-20090418\g2pE_taskb.txt
corpus-20090418\g0pB_taskb.txt corpus-20090418\g2pE_taskc.txt
corpus-20090418\g3pA_taske.txt corpus-20090418\g4pD_taskb.txt
corpus-20090418\g1pA_taskc.txt corpus-20090418\g4pE_taskc.txt
corpus-20090418\g0pC_taska.txt corpus-20090418\g3pA_taskc.txt
corpus-20090418\g1pA_taskd.txt corpus-20090418\g3pC_taskc.txt
corpus-20090418\g0pC_taskd.txt corpus-20090418\g0pE_taskd.txt
corpus-20090418\g2pC_taskc.txt corpus-20090418\g3pB_taskb.txt
corpus-20090418\g1pD_taskd.txt corpus-20090418\g2pB_taskc.txt
corpus-20090418\g2pA_taskd.txt corpus-20090418\g3pA_taska.txt
corpus-20090418\g0pB_taskb.txt corpus-20090418\g3pC_taske.txt
corpus-20090418\g2pB_taska.txt corpus-20090418\orig_taskc.txt
corpus-20090418\g2pB_taskb.txt corpus-20090418\g4pB_taske.txt
corpus-20090418\g0pC_taskb.txt corpus-20090418\g0pE_taskb.txt
corpus-2