In [10]:
import nltk
import textract
from nltk.corpus import *
from nltk.stem.porter import *
import os
from Constants import *
import pickle
from nltk.tokenize import sent_tokenize, word_tokenize
from pathlib import Path


In [11]:
# ! Uncomment in first run 
# nltk.download('punkt')
# nltk.download('words')
# nltk.download('stopwords')

In [12]:
# stemming porter object
stemmer = PorterStemmer()
root = Path(".")


In [13]:
# list of all doc names
files = list()
for dir in [r"\Auto", r"\Property"]:
    cur_dir = r".\Docs" + dir
    for file in os.listdir(cur_dir):
        cur_path = r".\Docs" + dir + "\\" + file
        files.append(cur_path)
files.sort()

In [14]:
docs = list()
for x in range(len(files)):
    for i in sent_tokenize(textract.process(files[x]).decode("utf8")): 
        docs.append((i, x))

In [15]:
# set of Stop words
stop_words = set(stopwords.words('english'))

"""
{
   key : string (normalized)
   value: list of ("doc index", file_index, frequency) 
}
"""
inverted_idx = dict()

# list of string modified document
documents = list()

count_id = 0

def process(doc_index):
    """
    Reads file, tokenize it, normalizes it and builds the inverted index
    """

    result = doc_index + 1
    global count_id
    text = docs[doc_index][0]
    file_index = docs[doc_index][1]
    while doc_index + 1 < len(docs):
        doc_index += 1
        if docs[doc_index][1] == docs[doc_index - 1][1] and len(text + docs[doc_index][0]) <= 500:
            text += docs[doc_index][0]
        else:
            result = doc_index
            break

    tokens = nltk.tokenize.word_tokenize(str(text))

    new_token = list()
    for i in tokens:
        new_token.append(i.lower())
    tokens = new_token

    curr_str = ""
    normalised_word_freq = dict()
    for j in range(len(tokens)):
        curr_str += tokens[j] + " "

        normal = stemmer.stem(tokens[j].lower())
        if normalised_word_freq.get(normal) != None:
            normalised_word_freq[normal] += 1
        else:
            normalised_word_freq[normal] = 1 

    documents.append((curr_str, files[file_index]))
    
    visited = set()
    for j in range(len(tokens)):
        normalised_word = stemmer.stem(tokens[j].lower())
        if tokens[j].lower() not in stop_words and normalised_word not in visited:
            visited.add(normalised_word)

            if inverted_idx.get(normalised_word) != None:
                inverted_idx[normalised_word].append((count_id, file_index, normalised_word_freq[normalised_word]))
            else:
                inverted_idx[normalised_word] = [(count_id, file_index, normalised_word_freq[normalised_word])]
    count_id += 1

    return result

i = 0
while i < len(docs):
    i = process(i)

for x in inverted_idx:
    inverted_idx[x] = sorted(inverted_idx[x], key=lambda y: -y[2])

documents

[("ontario automobile policy ( oap 1 ) owner ’ s policy approved by the superintendent of financial services for use as the standard owner 's policy on or after june 1 , 2016 about this policy this is your automobile insurance policy.it is written in easy to understand language.please read it carefully so you know your rights and obligations and the rights and obligations of your insurance company . ",
  '.\\Docs\\Auto\\1215E.2.docx'),
 ('here is a summary of each section of the policy.for details of each coverage and the conditions that apply , consult the appropriate sections of the policy.section 1 - introduction contains information that applies to the entire policy.in order to understand what is covered and what is not covered by each coverage , you should read sections 1 and 2 and the entire section of the policy that deals with the specific coverage . ',
  '.\\Docs\\Auto\\1215E.2.docx'),
 ('section 2 - what automobiles are covered explains what coverages are available to a descr

In [16]:
my_path = root / "Pickled_files" / "Inverted_index"
dbfile = open(my_path, 'wb')
pickle.dump(inverted_idx, dbfile) 
dbfile.close()

my_path = root / "Pickled_files" / "Documents"
dbfile = open(my_path, 'wb')
pickle.dump(documents, dbfile) 
dbfile.close()

my_path = root / "Pickled_files" / "Files"
dbfile = open(my_path, 'wb')
pickle.dump(files, dbfile) 
dbfile.close()

In [17]:
my_path = root / "Pickled_files" / "Inverted_index"
dbfile = open(my_path, 'rb')     
inverted_idx = pickle.load(dbfile)
dbfile.close()

my_path = root / "Pickled_files" / "Documents"
dbfile = open(my_path, 'rb')     
documents = pickle.load(dbfile)
dbfile.close()

my_path = root / "Pickled_files" / "Files"
dbfile = open(my_path, 'rb')     
files = pickle.load(dbfile)
dbfile.close()

In [18]:
def query(query_str):
    """
    Normalize query string and search in inverted index and retervive doc
    """

    query_str = stemmer.stem(query_str.lower())
    if inverted_idx.get(query_str) == None:
        return "Not found kill yourself"
    else:
        ans = []
        for i in range(min(len(inverted_idx[query_str]), 3)):
            ans.append((documents[inverted_idx[query_str][i][0]][0], files[inverted_idx[query_str][i][1]]))
        return ans

print(query("deductible"))
for i in ["ontario", "Milind", "illegal", "Canadian", "dollar", "TerrIbLe", "PoliCy", "Tyre", "Mobile", "Motor", "LMAO", "Induction", "Proof"]:
    print(i, query(i))


[('the deductible the amount we pay to cover any losses may be subject to a deductible.the deductible is the amount you agree to pay toward the cost of any single claim you make under this section.the deductible , if any , is shown on the certificate of automobile insurance.you will need to make a separate claim for each incident that causes loss or damage.the deductible applies each time you make a claim and separately to each automobile that is insured . ', '.\\Docs\\Auto\\1215E.2.docx'), ('under dc-pd , we will pay $ 3,525 ( $ 3,750 -- being 75 % of $ 5,000 -- less $ 225 -- being 75 % of the dc-pd deductible ) .you have the optional collision or upset coverage and your deductible is $ 500.under the optional coverage , we will pay a further $ 1,125 ( $ 1,250 -- being 25 % of $ 5,000 -- less $ 125 -- being 25 % of the deductible ) .in sum : you receive $ 4,650.you are responsible for the deductibles totalling $ 350 . ', '.\\Docs\\Auto\\1215E.2.docx'), ('deductible amounts 10.1 despite