In [3]:
import nltk
import os
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from tqdm import tqdm

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/dnlab/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:

def preprocess(doc):
    # Remove punctuations and numbers
    doc = re.sub(r'[^\w\s]', '', doc)
    doc = re.sub(r'\d+', '', doc)
    # Normalization
    doc = doc.lower()
    # Tokenization
    tokens = doc.split()
    # Stop words removal
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    ps = PorterStemmer() 
    terms = []
    for word in filtered_tokens:
        terms.append(ps.stem(word))
    doc = ' '.join(terms)
    return doc


def build_inverted_index(dir_path):
    inverted_index = {}
    doc_id = 0
    for dir, _, filename in os.walk(dir_path):
        for file in tqdm(filename):
            # The script crashes processing .lws files
            if file.endswith('.swp'):
                continue
            with open(os.path.join(dir, file), 'r', encoding='latin-1') as f:
                # print(f"processing: {f}")
                doc = f.read()
            doc = preprocess(doc)
            # Split the document into terms
            terms = doc.split()
            for term in terms:
                if term not in inverted_index:
                    inverted_index[term] = []
                if doc_id not in inverted_index[term]:
                    inverted_index[term].append(doc_id)
            doc_id += 1
    return inverted_index


In [5]:
dataset = '/home/dnlab/Desktop/stories'
inverted_index = build_inverted_index(dataset)

100%|██████████| 452/452 [00:27<00:00, 16.57it/s]
100%|██████████| 5/5 [00:00<00:00, 502.38it/s]
100%|██████████| 19/19 [00:00<00:00, 28.34it/s]


In [6]:
import json

with open('inverted_index.json', 'w') as f:
    json.dump(inverted_index, f)


In [12]:
def boolean_query_and(term1, term2, inverted_index):
    # Retrieve the document IDs for each term in the query
    docs_list1 = set(inverted_index[term1]) if term1 in inverted_index else set()
    docs_list2 = set(inverted_index[term2]) if term2 in inverted_index else set()
    # Compute the intersection of the document IDs
    result_docs = docs_list1.intersection(docs_list2)
    return result_docs

term1 = 'ben'
term2 = 'june'

and_query = boolean_query_and(term1=term1, term2=term2, inverted_index=inverted_index)
print(and_query)

{0, 298, 149, 407, 410}
