In [79]:
def build_inverted_index(documents):
    inverted_index = {}
    for doc_id, text in documents.items():
        for word in text.lower().split():
            if word not in inverted_index:
                inverted_index[word] = set()
            inverted_index[word].add(doc_id)
    return dict(sorted(inverted_index.items()))

In [80]:
def boolean_query(query, inverted_index, doc_ids):
    tokens = query.lower().split()
    if not tokens:
        return set()
    
    all_docs = set(doc_ids)
    
    if tokens[0] == 'not' and len(tokens) > 1:
        result = all_docs - inverted_index.get(tokens[1], set())
        i = 2
    else:
        result = inverted_index.get(tokens[0], set())
        i = 1
    
    while i < len(tokens):
        if tokens[i] == 'and' and i + 1 < len(tokens):
            if tokens[i + 1] == 'not' and i + 2 < len(tokens):
                term = all_docs - inverted_index.get(tokens[i + 2], set())
                i += 3
            else:
                term = inverted_index.get(tokens[i + 1], set())
                i += 2
            result = result & term
        elif tokens[i] == 'or' and i + 1 < len(tokens):
            if tokens[i + 1] == 'not' and i + 2 < len(tokens):
                term = all_docs - inverted_index.get(tokens[i + 2], set())
                i += 3
            else:
                term = inverted_index.get(tokens[i + 1], set())
                i += 2
            result = result | term
        else:
            i += 1
    
    return result

Problem 1

In [81]:
documents = {
    1: "SRM University AP",
    2: "AP is State"
}

inverted_index = build_inverted_index(documents)
print("Inverted Index:", inverted_index)

Inverted Index: {'ap': {1, 2}, 'is': {2}, 'srm': {1}, 'state': {2}, 'university': {1}}


In [82]:
queries = [
    "srm AND ap",
    "srm OR ap",
    "NOT state"
]

for q in queries:
    result = boolean_query(q, inverted_index, documents.keys())
    print(f"{q} -> {result}")

srm AND ap -> {1}
srm OR ap -> {1, 2}
NOT state -> {1}


Problem 2

In [83]:
documents = {
    1: "brutus and caesar",
    2: "brutus killed caesar",
    3: "capitol sees brutus"
}

inverted_index = build_inverted_index(documents)
print("Inverted Index:", inverted_index)

Inverted Index: {'and': {1}, 'brutus': {1, 2, 3}, 'caesar': {1, 2}, 'capitol': {3}, 'killed': {2}, 'sees': {3}}


In [84]:
queries = [
    "brutus AND caesar",
    "brutus OR capitol",
    "NOT killed"
]

for q in queries:
    result = boolean_query(q, inverted_index, documents.keys())
    print(f"{q} -> {result}")

brutus AND caesar -> {1, 2}
brutus OR capitol -> {1, 2, 3}
NOT killed -> {1, 3}


Problem 3

In [85]:
documents = {
    1: "new home sales top forecasts",
    2: "home sales rise in july",
    3: "increase in home sales in july",
    4: "july new home sales rise"
}

inverted_index = build_inverted_index(documents)
print("Inverted Index:", inverted_index)

Inverted Index: {'forecasts': {1}, 'home': {1, 2, 3, 4}, 'in': {2, 3}, 'increase': {3}, 'july': {2, 3, 4}, 'new': {1, 4}, 'rise': {2, 4}, 'sales': {1, 2, 3, 4}, 'top': {1}}


In [86]:
queries = [
    "top AND sales",
    "sales AND rise OR july",
    "NOT increase AND top OR sales"
]

for q in queries:
    result = boolean_query(q, inverted_index, documents.keys())
    print(f"{q} -> {result}")

top AND sales -> {1}
sales AND rise OR july -> {2, 3, 4}
NOT increase AND top OR sales -> {1, 2, 3, 4}


Problem 4

In [87]:
documents = {
    1: "breakthrough drug for schizophrenia",
    2: "new schizophrenia drug",
    3: "new approach for treatment of schizophrenia",
    4: "new hopes for schizophrenia patients"
}

inverted_index = build_inverted_index(documents)
print("Inverted Index:", inverted_index)

Inverted Index: {'approach': {3}, 'breakthrough': {1}, 'drug': {1, 2}, 'for': {1, 3, 4}, 'hopes': {4}, 'new': {2, 3, 4}, 'of': {3}, 'patients': {4}, 'schizophrenia': {1, 2, 3, 4}, 'treatment': {3}}


In [88]:
queries = [
    "schizophrenia AND drug",
    "drug OR schizophrenia NOT patients",
    "for AND NOT (drug OR approach)"
]

for q in queries:
    result = boolean_query(q, inverted_index, documents.keys())
    print(f"{q} -> {result}")

schizophrenia AND drug -> {1, 2}
drug OR schizophrenia NOT patients -> {1, 2, 3, 4}
for AND NOT (drug OR approach) -> {1, 3, 4}
