In [1]:
import nltk
import textract
from nltk.corpus import *
from nltk.stem.porter import *
import os
import pickle
from nltk.tokenize import sent_tokenize, word_tokenize
from pathlib import Path


In [2]:
# NOTE: uncomment the lines below on the first run to download the required NLTK data
# nltk.download('punkt')
# nltk.download('words')
# nltk.download('stopwords')

In [3]:
# stemming porter object
stemmer = PorterStemmer()
root = Path(".")


In [4]:
# list of all doc names
files = list()
for dir in [r"\Auto", r"\Property"]:
    cur_dir = r".\Docs" + dir
    for file in os.listdir(cur_dir):
        cur_path = r".\Docs" + dir + "\\" + file
        files.append(cur_path)
files.sort()

In [5]:
docs = list()
for x in range(len(files)):
    for i in sent_tokenize(textract.process(files[x]).decode("utf8")): 
        docs.append((i, x))

In [6]:
# set of Stop words
stop_words = set(stopwords.words('english'))

"""
{
   key : string (normalized)
   value: list of ("doc index", file_index, frequency) 
}
"""
inverted_idx = dict()

# list of string modified document
documents = list()

count_id = 0

def process(doc_index, max_chars=500):
    """
    Build one chunk starting at sentence ``doc_index`` and add it to the index.

    Consecutive sentences from ``docs`` are merged into a single chunk for as
    long as they come from the same file and the merged text (sentences joined
    by a single space) stays within ``max_chars`` characters.  The chunk is
    tokenized, lower-cased and stemmed; it is then recorded in ``documents``
    and every non-stop-word stem gets one posting in ``inverted_idx``.

    Parameters
    ----------
    doc_index : int
        Index into ``docs`` of the first sentence of the chunk.
    max_chars : int, optional
        Maximum length of the merged chunk text.  Defaults to 500.

    Returns
    -------
    int
        Index of the first sentence that was NOT consumed by this chunk
        (``len(docs)`` once the last sentence has been consumed).  Pass it to
        the next call to continue without overlapping chunks.

    Side effects
    ------------
    Appends ``(chunk text, file path)`` to ``documents``, appends
    ``(chunk id, file index, frequency)`` postings to ``inverted_idx`` and
    increments the global ``count_id``.
    """
    global count_id

    text, file_index = docs[doc_index]

    # Greedily absorb the following sentences.  ``next_index`` always points
    # at the first sentence that is not part of the chunk, so it is also the
    # correct return value when the loop runs off the end of ``docs``.
    next_index = doc_index + 1
    while next_index < len(docs):
        sentence, sentence_file = docs[next_index]
        # Keep a space between sentences; without it the last word of one
        # sentence and the first word of the next fuse into a single token.
        candidate = text + " " + sentence
        if sentence_file != file_index or len(candidate) > max_chars:
            break
        text = candidate
        next_index += 1

    tokens = [token.lower() for token in nltk.tokenize.word_tokenize(text)]
    # Stem every token exactly once and reuse the result below.
    stems = [stemmer.stem(token) for token in tokens]

    # Stored chunk text: lower-cased tokens, each followed by a space.
    chunk_text = "".join(token + " " for token in tokens)
    documents.append((chunk_text, files[file_index]))

    # Frequency of each stem over all tokens of the chunk (stop words included).
    stem_freq = dict()
    for stem in stems:
        stem_freq[stem] = stem_freq.get(stem, 0) + 1

    # One posting per stem, created at its first non-stop-word occurrence.
    indexed = set()
    for token, stem in zip(tokens, stems):
        if token in stop_words or stem in indexed:
            continue
        indexed.add(stem)
        inverted_idx.setdefault(stem, []).append(
            (count_id, file_index, stem_freq[stem])
        )
    count_id += 1

    return next_index

# Walk the sentence list chunk by chunk: each call to process() returns the
# index at which the following call has to start.
i = 0
while i < len(docs):
    i = process(i)

# Order every posting list by descending frequency (third tuple field);
# postings with equal frequency keep their insertion order.
for term in inverted_idx:
    inverted_idx[term].sort(key=lambda posting: posting[2], reverse=True)


In [None]:
# Persist the index artefacts as pickle files under <root>/Pickled_files.
# The target directory is created when missing, and every file is written
# inside a `with` block so its handle is closed even if pickle.dump raises.
pickle_dir = root / "Pickled_files"
pickle_dir.mkdir(parents=True, exist_ok=True)

for pickle_name, payload in (
    ("Inverted_index", inverted_idx),
    ("Documents", documents),
    ("Files", files),
):
    my_path = pickle_dir / pickle_name
    with open(my_path, 'wb') as dbfile:
        pickle.dump(payload, dbfile)