In [1]:
# !pip install pandas numpy

In [None]:
import os
from collections import defaultdict
import pandas as pd
import math

FOLDER_PATH = "./documents"  # corpus root handed to the indexing helpers; a relative path, so it resolves against the current working directory

def build_inverted_index(folder_path):
    """Build a Boolean inverted index over every file below ``folder_path``.

    Files are read as UTF-8, lower-cased and split on whitespace; each
    distinct token of a file counts once for that file. Document IDs are
    assigned starting at 1 in traversal order (directories and file names
    are both visited in sorted order so the numbering is reproducible).

    Args:
        folder_path: Root directory to scan recursively.

    Returns:
        A 3-tuple ``(inverted_index, doc_id_map, all_docs)`` where

        * ``inverted_index`` is a ``defaultdict`` mapping each token to
          ``{'docs': [int, ...], 'doc_freq': int}``; ``docs`` is ascending
          because documents are processed in increasing ID order,
        * ``doc_id_map`` maps the ID (as ``str``) to the file's path
          relative to ``folder_path`` (just the file name for files that
          sit directly in ``folder_path``),
        * ``all_docs`` is the set of all integer document IDs.

    Raises:
        UnicodeDecodeError: If a file cannot be decoded as UTF-8.
    """
    doc_tokens = {}
    doc_id_map = {}
    doc_counter = 1

    for root, dirs, files in os.walk(folder_path):
        # Sorting in place steers os.walk's descent order, which is otherwise
        # whatever the operating system returns.
        dirs.sort()
        for file_name in sorted(files):
            path = os.path.join(root, file_name)
            tokens = set()
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens.update(line.lower().split())
            doc_key = str(doc_counter)
            doc_tokens[doc_key] = tokens
            # Keep the sub-directory part: a bare file name cannot be joined
            # back onto folder_path for nested files and is ambiguous when two
            # folders contain files with the same name.
            doc_id_map[doc_key] = os.path.relpath(path, folder_path)
            doc_counter += 1

    inverted_index = defaultdict(lambda: {'docs': [], 'doc_freq': 0})

    for doc_id, words in doc_tokens.items():
        for word in words:
            posting = inverted_index[word]
            posting['docs'].append(int(doc_id))
            posting['doc_freq'] += 1

    return inverted_index, doc_id_map, set(range(1, doc_counter))

def evaluate_boolean_query(query, inverted_index, all_docs):
    """Run a whitespace-separated Boolean query and return matching doc IDs.

    The query is upper-cased before it is split, so the operators are
    recognised regardless of the case they were typed in.

    Args:
        query: Query string such as ``"stress AND NOT biology"``.
        inverted_index: Mapping of term -> ``{'docs': [...], ...}``.
        all_docs: Collection of every document ID (needed for NOT).

    Returns:
        The matching document IDs as a sorted list.
    """
    postfix_tokens = infix_to_postfix(query.upper().split())
    matches = evaluate_postfix(postfix_tokens, inverted_index, all_docs)
    ordered = list(matches)
    ordered.sort()
    return ordered


def infix_to_postfix(tokens):
    """Convert an infix Boolean token list to postfix (shunting-yard).

    Precedence is ``NOT`` > ``AND`` > ``OR``; ``AND``/``OR`` associate to the
    left and ``NOT`` is a prefix operator. Stand-alone ``(`` and ``)`` tokens
    group sub-expressions. Every other token is an operand and is emitted in
    lower case.

    Args:
        tokens: Iterable of tokens; operators must already be upper-case.

    Returns:
        List of tokens in postfix order (operands lower-cased, operators
        upper-case, no parentheses).

    Raises:
        ValueError: If the parentheses are unbalanced.
    """
    precedence = {'NOT': 3, 'AND': 2, 'OR': 1}
    output = []
    stack = []

    for token in tokens:
        if token == '(':
            stack.append(token)
        elif token == ')':
            # Flush everything back to the matching opening parenthesis.
            while stack and stack[-1] != '(':
                output.append(stack.pop())
            if not stack:
                raise ValueError("Unbalanced parentheses in query: unexpected ')'")
            stack.pop()  # discard the '(' itself
        elif token == 'NOT':
            # Prefix operator: it never forces earlier operators out.
            stack.append(token)
        elif token in precedence:
            # '(' is absent from the table, so it ranks 0 and stops the flush.
            while stack and precedence.get(stack[-1], 0) >= precedence[token]:
                output.append(stack.pop())
            stack.append(token)
        else:
            output.append(token.lower())

    while stack:
        operator = stack.pop()
        if operator == '(':
            raise ValueError("Unbalanced parentheses in query: missing ')'")
        output.append(operator)

    return output


def and_operation_with_skip(list1, list2):
    """Intersect two ascending posting lists, jumping ahead in sqrt-sized steps.

    Whenever the current element of one list is smaller than the current
    element of the other, the lagging cursor jumps ``int(sqrt(len))``
    positions at a time for as long as the landing element does not exceed
    the other list's current element, then falls back to single steps.

    Args:
        list1: Ascending list of document IDs.
        list2: Ascending list of document IDs.

    Returns:
        Ascending list of the IDs present in both inputs.
    """
    result = []
    n1, n2 = len(list1), len(list2)
    skip1 = int(math.sqrt(n1)) if n1 > 0 else 0
    skip2 = int(math.sqrt(n2)) if n2 > 0 else 0
    i, j = 0, 0

    while i < n1 and j < n2:
        left, right = list1[i], list2[j]
        if left == right:
            result.append(left)
            i += 1
            j += 1
        elif left < right:
            if skip1 > 1:
                # Each jump strictly advances i, so the loop must terminate;
                # the earlier clamped-target variant could stall on the last
                # element and spin forever.
                while i + skip1 < n1 and list1[i + skip1] <= right:
                    i += skip1
            # A jump may land exactly on `right`; leave that for the equality
            # branch of the next iteration instead of stepping over it.
            if list1[i] < right:
                i += 1
        else:
            if skip2 > 1:
                while j + skip2 < n2 and list2[j + skip2] <= left:
                    j += skip2
            if list2[j] < left:
                j += 1
    return result

def and_operation(list1, list2):
    """Intersect two ascending posting lists with a linear two-cursor merge.

    Args:
        list1: Ascending list of document IDs.
        list2: Ascending list of document IDs.

    Returns:
        List of the IDs found in both inputs, in ascending order.
    """
    common = []
    left_pos = right_pos = 0
    left_len, right_len = len(list1), len(list2)

    while left_pos < left_len and right_pos < right_len:
        left_val, right_val = list1[left_pos], list2[right_pos]
        if left_val == right_val:
            common.append(left_val)
            left_pos += 1
            right_pos += 1
            continue
        # Advance whichever cursor currently points at the smaller ID.
        if left_val < right_val:
            left_pos += 1
        else:
            right_pos += 1

    return common

def or_operation(list1, list2):
    """Union two ascending posting lists with a linear two-cursor merge.

    IDs that appear at the same time under both cursors are emitted once.

    Args:
        list1: Ascending list of document IDs.
        list2: Ascending list of document IDs.

    Returns:
        Ascending list of the IDs found in either input.
    """
    merged = []
    a_idx = b_idx = 0
    a_len, b_len = len(list1), len(list2)

    while a_idx < a_len and b_idx < b_len:
        a_val, b_val = list1[a_idx], list2[b_idx]
        if a_val == b_val:
            merged.append(a_val)
            a_idx += 1
            b_idx += 1
            continue
        if a_val < b_val:
            merged.append(a_val)
            a_idx += 1
        else:
            merged.append(b_val)
            b_idx += 1

    # At most one of the two tails is non-empty at this point.
    merged += list1[a_idx:]
    merged += list2[b_idx:]
    return merged

def evaluate_postfix(postfix, inverted_index, all_docs):
    """Evaluate a postfix Boolean expression against the inverted index.

    Operands are looked up in ``inverted_index`` (unknown terms yield an
    empty posting list); ``NOT`` complements against ``all_docs``; ``AND``
    and ``OR`` combine the two most recent operands via ``and_operation``
    and ``or_operation``.

    Args:
        postfix: Tokens in postfix order; operators are the upper-case
            strings ``'AND'``, ``'OR'`` and ``'NOT'``.
        inverted_index: Mapping of term -> dict with a ``'docs'`` list.
        all_docs: Iterable of every document ID.

    Returns:
        Ascending list of matching document IDs, or ``[]`` for an empty
        expression.

    Raises:
        IndexError: If an operator is missing an operand.
    """
    stack = []
    all_docs_list = sorted(all_docs)

    for token in postfix:
        if token == 'NOT':
            # Build the lookup set once per NOT rather than once per document.
            excluded = set(stack.pop())
            stack.append([doc for doc in all_docs_list if doc not in excluded])
        elif token == 'AND':
            right = stack.pop()
            left = stack.pop()
            stack.append(and_operation(left, right))
        elif token == 'OR':
            right = stack.pop()
            left = stack.pop()
            stack.append(or_operation(left, right))
        else:
            # Membership test first: indexing a defaultdict with an unknown
            # term would insert an empty entry into the index.
            postings = sorted(inverted_index[token]['docs']) if token in inverted_index else []
            stack.append(postings)

    return stack.pop() if stack else []


def term_document_frequency_matrix(inverted_index, doc_id_map, folder_path):
    """Build a term x document table of raw occurrence counts.

    Each file named in ``doc_id_map`` is re-read from ``folder_path`` and its
    lower-cased, whitespace-split tokens are tallied.

    Args:
        inverted_index: Mapping whose keys are the terms to use as rows.
        doc_id_map: Mapping of document ID (``str`` of 1..N) to the file
            name joined onto ``folder_path``.
        folder_path: Directory the file names are resolved against.

    Returns:
        ``pandas.DataFrame`` indexed by the sorted terms, with one column per
        document (labelled by the ``doc_id_map`` value, in ID order) holding
        how often the term occurs in that document.
    """
    vocabulary = sorted(inverted_index.keys())
    doc_numbers = range(1, len(doc_id_map) + 1)
    column_labels = [doc_id_map[str(number)] for number in doc_numbers]

    counts_by_doc = {}
    for key, name in doc_id_map.items():
        tally = {}
        with open(os.path.join(folder_path, name), 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                for token in raw_line.lower().split():
                    tally[token] = tally.get(token, 0) + 1
        counts_by_doc[int(key)] = tally

    # One row per term, one cell per document in ID order.
    rows = [
        [counts_by_doc[number].get(term, 0) for number in doc_numbers]
        for term in vocabulary
    ]
    return pd.DataFrame(rows, index=vocabulary, columns=column_labels)





In [5]:
inverted_index, doc_id_map, all_docs = build_inverted_index(FOLDER_PATH)  # index every file under FOLDER_PATH

# Term x document count table; built here but only inspected via the
# commented-out print below.
df = term_document_frequency_matrix(inverted_index, doc_id_map,folder_path=FOLDER_PATH)
# print(df.head(50))

# Sample Boolean queries; the evaluator recognises AND, OR and NOT.
queries = [
    "accelerates AND stress",
    "biology OR science",
    "NOT stress",

]

# Evaluate each query and print the sorted IDs of the matching documents.
for q in queries:
    result = evaluate_boolean_query(q, inverted_index, all_docs)
    print(f"Query: {q}\nMatched Doc IDs: {result}\n")





Query: accelerates AND stress
Matched Doc IDs: [5]

Query: biology OR science
Matched Doc IDs: [2, 3, 5]

Query: NOT stress
Matched Doc IDs: [1, 2, 3, 6]



In [None]:
# Building TF matrix with Counter and computing IDF vector
from collections import Counter
import math
import pandas as pd
import numpy as np

def build_tf_matrix(corpus, normalize=False):
    """Build a document x term frequency table.

    Args:
        corpus: List of documents, each either a string (split on
            whitespace, case preserved) or an iterable of tokens.
        normalize: When True, each count is divided by the number of tokens
            in its document (an empty document yields all zeros).

    Returns:
        ``pandas.DataFrame`` with one row per document (in input order) and
        one column per distinct token (sorted), holding raw counts or, when
        ``normalize`` is True, relative frequencies.
    """
    token_lists = [
        doc.split() if isinstance(doc, str) else list(doc)
        for doc in corpus
    ]

    # Sorted so the column order is stable from run to run.
    vocab = sorted(set().union(*token_lists))

    table = []
    for tokens in token_lists:
        counts = Counter(tokens)
        if not normalize:
            table.append([counts[term] for term in vocab])
            continue
        # `or 1` keeps an empty document from dividing by zero.
        denominator = sum(counts.values()) or 1
        table.append([counts[term] / denominator for term in vocab])

    return pd.DataFrame(table, columns=vocab)

def compute_idf(corpus, smooth=True, add_one=True, log_base=math.e):
    """Compute inverse document frequency for every term in ``corpus``.

    Args:
        corpus: List of documents, each either a string (split on
            whitespace) or an iterable of tokens.
        smooth: When True use ``log((N + 1) / (df + 1)) + 1``; when False
            use ``log(N / df)``.
        add_one: Only consulted when ``smooth`` is False; adds 1 to
            ``log(N / df)``. The smoothed formula always includes its ``+ 1``.
        log_base: Base of the logarithm (natural log by default).

    Returns:
        ``pandas.Series`` named ``"idf"``, indexed by the sorted terms.
    """
    # A set per document: a term counts once no matter how often it repeats.
    term_sets = [
        set(doc.split()) if isinstance(doc, str) else set(doc)
        for doc in corpus
    ]
    n_docs = len(term_sets)

    doc_freq = Counter()
    for term_set in term_sets:
        doc_freq.update(term_set)

    def _idf(df_t):
        # Score for a term that occurs in df_t documents.
        if smooth:
            return math.log((n_docs + 1) / (df_t + 1), log_base) + 1
        score = math.log(n_docs / df_t, log_base)
        return score + 1 if add_one else score

    terms = sorted(doc_freq)
    return pd.Series([_idf(doc_freq[term]) for term in terms], index=terms, name="idf")

# Example usage
# A tiny four-document corpus to exercise the TF and IDF helpers.
corpus = [
    "the quick brown fox",
    "jumped over the lazy dog",
    "the fox",
    "the dog dog dog",
]

# Raw counts, per-document relative frequencies, and smoothed IDF
# for the same corpus.
tf_counts = build_tf_matrix(corpus, normalize=False)
tf_normalized = build_tf_matrix(corpus, normalize=True)
idf_series = compute_idf(corpus, smooth=True)

# display() is supplied by the IPython/Jupyter session (it is not imported here).
display(tf_counts)
display(tf_normalized)
display(idf_series)

Unnamed: 0,brown,dog,fox,jumped,lazy,over,quick,the
0,1,0,1,0,0,0,1,1
1,0,1,0,1,1,1,0,1
2,0,0,1,0,0,0,0,1
3,0,3,0,0,0,0,0,1


Unnamed: 0,brown,dog,fox,jumped,lazy,over,quick,the
0,0.25,0.0,0.25,0.0,0.0,0.0,0.25,0.25
1,0.0,0.2,0.0,0.2,0.2,0.2,0.0,0.2
2,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5
3,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.25


brown     1.916291
dog       1.510826
fox       1.510826
jumped    1.916291
lazy      1.916291
over      1.916291
quick     1.916291
the       1.000000
Name: idf, dtype: float64