In [None]:
import nltk
import random
import json
import sys
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from pathlib import Path

# Threshold compared against sys.getsizeof(dictionary) in spimiinvert; once
# exceeded, the in-memory block is written to disk and a new one is started.
ALLOCBLOCKSIZE = 50000
# Path prefix under which block files and the merged inverted index are written.
BASEINDEXPATH = 'index/'
# Directory that create_index scans for sub-directories containing *.txt files.
BASESOURCEPATH = 'source/'

def download():
    """Download the NLTK data packages this notebook requests.

    The packages are fetched one after another through ``nltk.download``:
    'punkt', 'stopwords', 'wordnet' and 'averaged_perceptron_tagger'.
    """
    required_packages = (
        'punkt',
        'stopwords',
        'wordnet',
        'averaged_perceptron_tagger',
    )
    for package in required_packages:
        nltk.download(package)


In [None]:
def preprocess(text):
    """Convert raw text into a list of normalized, stemmed tokens.

    Pipeline: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, drop English
    stopwords, then stem each remaining token with a ``PorterStemmer``.

    Args:
        text: The raw document text.

    Returns:
        The surviving tokens, stemmed, in their original order.
    """
    porter = PorterStemmer()
    english_stops = set(stopwords.words('english'))

    # Normalize: lowercase first, then strip punctuation in a single pass.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    stemmed = []
    for token in word_tokenize(cleaned):
        if token.lower() in english_stops:
            continue  # stopword removal
        stemmed.append(porter.stem(token))

    return stemmed

In [None]:
def writeblock(dictionary):
    """Write one in-memory index block to a randomly named ``.block`` file.

    The file is created under ``BASEINDEXPATH`` and holds one line per term,
    in sorted term order, formatted as ``<term> <compact JSON postings>``.

    Args:
        dictionary: Mapping of term -> postings (the structure built by
            ``spimiinvert``: ``{docId: [positions]}``). Values must be
            JSON-serializable.

    Returns:
        The path of the block file that was written, as a string.
    """
    # On a fresh checkout the index directory may not exist yet; without this
    # open() below raises FileNotFoundError.
    Path(BASEINDEXPATH).mkdir(parents=True, exist_ok=True)

    block_name = ''.join(random.choices(string.ascii_lowercase, k=8))
    block_file = BASEINDEXPATH + block_name + '.block'

    # Plain 'w' is enough: the file is only written, never read back here.
    with open(block_file, 'w') as file:
        for term in sorted(dictionary):
            postings = json.dumps(dictionary[term], separators=(",", ":"))
            file.write('%s %s\n' % (term, postings))

    print(f"block file saved - {block_file}")
    return block_file

In [None]:
def spimiinvert(documents):
    """Build positional postings block by block and spill each block to disk.

    For every document the token positions are recorded as
    ``dictionary[term][docId] = [positions]``. After a document has been fully
    processed the block is handed to ``writeblock`` when either the size
    threshold ``ALLOCBLOCKSIZE`` is exceeded or the document is the last one.

    Args:
        documents: Mapping of docId -> list of tokens.

    Returns:
        List of block file paths in the order they were written. Empty when
        ``documents`` is empty.
    """
    dictionary = {}
    block_files = []
    last_position = len(documents) - 1

    for position, (docId, terms) in enumerate(documents.items()):
        for offset, term in enumerate(terms):
            dictionary.setdefault(term, {}).setdefault(docId, []).append(offset)

        # NOTE: sys.getsizeof is shallow - it measures the top-level dict only,
        # not the nested postings - so this is a rough proxy for memory use.
        block_size = sys.getsizeof(dictionary)

        # Detect the last document by its position in the iteration, not by
        # comparing the docId with the document count: doc ids are arbitrary
        # keys, and with non-contiguous ids the final block was never flushed.
        if position == last_position or block_size > int(ALLOCBLOCKSIZE):
            print(f'dictionary size {block_size}. Saving to disk')
            block_files.append(writeblock(dictionary))
            dictionary = {}

    return block_files

In [None]:
def mergeblock(block_files):
    """Merge block files into the final inverted index with frequencies.

    Every block line has the form ``<term> <JSON postings>``. Postings for the
    same term coming from different blocks are combined, and the result is
    written to ``BASEINDEXPATH + 'inverted_index.txt'`` with one line per
    term in sorted order::

        <term>_<total occurrences> {"<doc>_<count in doc>": [positions], ...}

    Args:
        block_files: Paths of the block files to merge, in merge order.

    Returns:
        The path of the merged index file. The file is always written; it is
        empty when ``block_files`` is empty.

    Raises:
        ValueError: If a block line has no valid JSON payload after the term
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    index_file = BASEINDEXPATH + 'inverted_index.txt'
    final_index = {}

    # The merged state lives in final_index for the whole run, so there is no
    # need to re-read the index file between blocks. The earlier read-back was
    # dead code anyway: opening with 'w+' truncated the file before reading.
    for block_file in block_files:
        with open(block_file, 'r') as block:
            for entry in block:
                entry = entry.strip()
                if not entry:
                    continue  # tolerate blank lines
                # Split on the first space only; everything after it is JSON.
                key, _, payload = entry.partition(' ')
                postings = json.loads(payload)
                if key in final_index:
                    final_index[key].update(postings)
                else:
                    final_index[key] = postings

    # Write the final index exactly once, after every block has been read
    # successfully, instead of rewriting the whole file for each block.
    Path(BASEINDEXPATH).mkdir(parents=True, exist_ok=True)
    with open(index_file, 'w') as file:
        for term in sorted(final_index):
            freqmap = {}
            total = 0
            for doc, positions in final_index[term].items():
                count = len(positions)
                total += count
                freqmap[f"{doc}_{count}"] = positions
            file.write('%s_%s %s\n' % (term, total, json.dumps(freqmap, separators=(",", ":"))))

    return index_file

In [None]:
def create_index():
    """Build the inverted index for every ``*.txt`` file under the source tree.

    Each sub-directory of ``BASESOURCEPATH`` is scanned for ``*.txt`` files.
    Every file is preprocessed into tokens and becomes one document; the
    documents are then inverted into blocks (``spimiinvert``) and merged into
    the final index (``mergeblock``).

    Document ids are sequential integers (0, 1, 2, ...) assigned per file in
    sorted traversal order.
    """
    documents = {}

    # sorted() makes the traversal - and therefore the doc ids - reproducible;
    # iterdir()/glob() give no ordering guarantee.
    for directory in sorted(Path(BASESOURCEPATH).iterdir()):
        if not directory.is_dir():
            continue  # stray files next to the source directories
        print(f"Current directory {directory.name}.")
        for text_file in sorted(directory.glob('*.txt')):
            print(f"\t Found file {text_file.name}. Preprocessing file")
            # One id per file: keying by the directory index made several
            # files in the same directory overwrite each other.
            documents[len(documents)] = preprocess(text_file.read_text())

    blocks = spimiinvert(documents)
    index_file = mergeblock(blocks)
    print(f"Index file created - {index_file}")

In [None]:
# Driver cell: fetch the NLTK data packages first, then build the inverted
# index from the files under BASESOURCEPATH.
download()
create_index()