In [1]:
import nltk
import textract
from nltk.corpus import *
from nltk.stem.porter import *
import os
from Constants import *
import pickle
from nltk.tokenize import sent_tokenize, word_tokenize
from pathlib import Path


In [2]:
# ! Uncomment in first run 
# nltk.download('punkt')
# nltk.download('words')
# nltk.download('stopwords')

In [3]:
# stemming porter object
stemmer = PorterStemmer()
root = Path(".")


In [4]:
# list of all doc names
files = list()
for dir in [r"\Auto", r"\Property"]:
    cur_dir = r".\Docs" + dir
    for file in os.listdir(cur_dir):
        cur_path = r".\Docs" + dir + "\\" + file
        files.append(cur_path)
files.sort()

In [5]:
docs = list()
for x in range(len(files)):
    for i in sent_tokenize(textract.process(files[x]).decode("utf8")): 
        docs.append((i, x))

In [6]:
# set of Stop words
stop_words = set(stopwords.words('english'))

"""
{
   key : string (normalized)
   value: list of ("doc index", file_index, frequency) 
}
"""
inverted_idx = dict()

# list of string modified document
documents = list()

count_id = 0

def process(doc_index):
    """
    Reads file, tokenize it, normalizes it and builds the inverted index
    """

    result = 0
    global count_id
    text = docs[doc_index][0]
    file_index = docs[doc_index][1]
    if doc_index + 1 < len(docs):
        if docs[doc_index + 1][1] == docs[doc_index][1]:
            text += docs[doc_index + 1][0]
            if doc_index + 2 < len(docs):
                if docs[doc_index + 2][1] == docs[doc_index][1]:
                    text += docs[doc_index + 2][0]
                else:
                    result = -1
        else:
            result = -2

    tokens = nltk.tokenize.word_tokenize(str(text))

    new_token = list()
    for i in tokens:
        new_token.append(i.lower())
    tokens = new_token

    curr_str = ""
    normalised_word_freq = dict()
    for j in range(len(tokens)):
        curr_str += tokens[j] + " "

        normal = stemmer.stem(tokens[j].lower())
        if normalised_word_freq.get(normal) != None:
            normalised_word_freq[normal] += 1
        else:
            normalised_word_freq[normal] = 1 

    documents.append((curr_str, files[file_index]))
    
    visited = set()
    for j in range(len(tokens)):
        normalised_word = stemmer.stem(tokens[j].lower())
        if tokens[j].lower() not in stop_words and normalised_word not in visited:
            visited.add(normalised_word)

            if inverted_idx.get(normalised_word) != None:
                inverted_idx[normalised_word].append((count_id, file_index, normalised_word_freq[normalised_word]))
            else:
                inverted_idx[normalised_word] = [(count_id, file_index, normalised_word_freq[normalised_word])]
    count_id += 1

    return result

i = 0
while i < len(docs):
    x = process(i)
    i += 3
    i += x

for x in inverted_idx:
    inverted_idx[x] = sorted(inverted_idx[x], key=lambda y: -y[2])

# documents

In [7]:
my_path = root / "Pickled_files" / "Inverted_index"
dbfile = open(my_path, 'wb')
pickle.dump(inverted_idx, dbfile) 
dbfile.close()

my_path = root / "Pickled_files" / "Documents"
dbfile = open(my_path, 'wb')
pickle.dump(documents, dbfile) 
dbfile.close()

my_path = root / "Pickled_files" / "Files"
dbfile = open(my_path, 'wb')
pickle.dump(files, dbfile) 
dbfile.close()

In [8]:
my_path = root / "Pickled_files" / "Inverted_index"
dbfile = open(my_path, 'rb')     
inverted_idx = pickle.load(dbfile)
dbfile.close()

my_path = root / "Pickled_files" / "Documents"
dbfile = open(my_path, 'rb')     
documents = pickle.load(dbfile)
dbfile.close()

my_path = root / "Pickled_files" / "Files"
dbfile = open(my_path, 'rb')     
files = pickle.load(dbfile)
dbfile.close()

In [9]:
def query(query_str):
    """
    Normalize query string and search in inverted index and retervive doc
    """

    query_str = stemmer.stem(query_str.lower())
    if inverted_idx.get(query_str) == None:
        return "Not found kill yourself"
    else:
        ans = []
        for i in range(min(len(inverted_idx[query_str]), 3)):
            ans.append((documents[inverted_idx[query_str][i][0]], files[inverted_idx[query_str][i][1]]))
        return ans

print(query("deductible"))
# for i in ["ontario", "Milind", "illegal", "Canadian", "dollar", "TerrIbLe", "PoliCy", "Tyre", "Mobile", "Motor", "LMAO", "Induction", "Proof"]:
    # print(i, query(i))


[(('deductible amounts 10.1 despite anything in this contract , the insurer shall be liable only for amounts in excess of the applicable deductible amount , if any , mentioned in this contract ; and any provision in this contract relating to an obligation of the insurer to pay an amount or to repair , rebuild or replace property that is damaged or lost shall be satisfied by paying the amount determined by deducting any applicable deductible amount from , the amount the insured would otherwise be entitled to recover , or the cost of repairing , rebuilding or replacing the property.deemed deductible amount ( 2 ) for the purposes of sub condition ( 1 ) , an amount that an insurer is not liable to pay by reason of subsection 261 ( 1 ) or ( 1.1 ) or 263 ( 5.1 ) or ( 5.2.1 ) of the insurance act shall be deemed to be a deductible amount under this contract.termination 11 . ', '.\\Docs\\Auto\\1215E.2.docx'), '.\\Docs\\Auto\\1215E.2.docx'), (('deductible we will pay only when a loss covered un