In [1]:
import textract
import os
import pickle
from pathlib import Path
import numpy as np
import pandas as pd
# Base directory from which the pickle output paths are built further down.
root = Path(".")


In [2]:
# Paths of all documents found under Docs/Auto and Docs/Property.
# The paths are assembled with pathlib instead of hard-coded backslashes so
# that the cell also runs on non-Windows machines; entries stay plain strings.
files = list()
for category in ["Auto", "Property"]:
    cur_dir = Path("Docs") / category
    for file in os.listdir(cur_dir):
        files.append(str(cur_dir / file))
# The sorted order determines the order in which the documents are read.
files.sort()

In [3]:
def normalize_text(text):
    """Return ``text`` lower-cased with non-alphanumerics collapsed to spaces.

    Every alphanumeric character is kept in lower case. Every other character
    becomes a space, and a run of consecutive spaces is reduced to one space.
    A leading or trailing space is kept (at most one each).
    """
    pieces = []
    for ch in text:
        if ch.isalnum():
            pieces.append(ch.lower())
        elif not pieces or pieces[-1] != ' ':
            # First separator after a kept character (or at the very start).
            pieces.append(' ')
    return ''.join(pieces)


# One normalized string per document whose extracted text is non-empty.
documents = []
for file in files:
    raw_text = textract.process(file).decode("utf8")
    # NOTE(review): files that yield no text are left out, so positions in
    # `documents` can drift from positions in `files` -- confirm this is intended.
    if raw_text:
        documents.append(normalize_text(raw_text))

## Choosing the shingle length K

In [4]:
def get_shingle_set(documents, k):
    """Collect the character k-shingles of a collection of documents.

    Args:
        documents: Sequence of strings. The position of a string in the
            sequence is used as its document id.
        k: Shingle length in characters; must be at least 1.

    Returns:
        A tuple ``(shingles, shingle_doc_id)`` where ``shingles`` is the set
        of distinct length-``k`` substrings and ``shingle_doc_id`` maps each
        shingle to the ascending list of ids of the documents containing it.
        Each id is listed once, however often the shingle occurs in that
        document.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    shingle_doc_id = dict()

    for doc_id, text in enumerate(documents):
        # Documents shorter than k produce an empty range and no shingles.
        for start in range(len(text) - k + 1):
            doc_ids = shingle_doc_id.setdefault(text[start : start + k], [])
            # Documents are visited in increasing id order, so checking the
            # last stored id is enough to keep each id only once.
            if not doc_ids or doc_ids[-1] != doc_id:
                doc_ids.append(doc_id)

    # The dictionary keys are exactly the distinct shingles.
    return (set(shingle_doc_id), shingle_doc_id)

In [5]:
# Print the number of distinct shingles for each candidate shingle length.
for k in range(2, 15):
    shingle_count = len(get_shingle_set(documents, k)[0])
    print(k, shingle_count)

2 782
3 5546
4 21106
5 54265
6 107153
7 176014
8 256246
9 340657
10 423212
11 502426
12 576688
13 645218
14 707339


Based on the counts above, we take K = 4 (21,106 distinct shingles).

In [6]:
# Shingle length used for the rest of the notebook.
K = 4
shingle_set, shingle_doc_id = get_shingle_set(documents, K)
# Sorted list, so that every shingle has a fixed position.
shingles = sorted(shingle_set)

In [7]:
# All-zero matrix with one row per shingle and one column per document.
matrix_shape = (len(shingles), len(documents))
incident_matrix = np.zeros(matrix_shape)
incident_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
# Row index of each shingle: its position in the sorted shingle list.
shingle_id = {shingle: row for row, shingle in enumerate(shingles)}

# Set the cell to 1 for every document that contains the shingle; all the
# columns of one row are assigned in a single indexed assignment.
for shingle, doc_ids in shingle_doc_id.items():
    incident_matrix[shingle_id[shingle], doc_ids] = 1

incident_matrix

array([[1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
# Persist the incidence matrix, the sorted shingle list and the
# shingle -> row index mapping as separate pickle files.
output_dir = root / "Pickled_files"
# Create the target directory on a fresh checkout instead of failing in open().
output_dir.mkdir(parents=True, exist_ok=True)

for file_name, payload in [
    ("Incident_Matrix", incident_matrix),
    ("Shingles", shingles),
    ("Shingle_id", shingle_id),
]:
    my_path = output_dir / file_name
    # The context manager closes the file even if pickling raises.
    with open(my_path, 'wb') as dbfile:
        pickle.dump(payload, dbfile)