In [1]:
from collections import defaultdict

In [2]:
documents = {
    1: "Data science is an interdisciplinary field.",
    2: "Machine learning is a subset of artificial intelligence.",
    3: "Data science includes machine learning and data mining.",
    4: "Artificial intelligence is transforming the world.",
    5: "Data mining helps to extract information from data."
}

In [3]:
# Step 1: Build the inverted index
inverted_index = defaultdict(list)

for doc_id, content in documents.items():
    # Tokenize each document into words
    words = content.lower().split()
    for word in words:
        if doc_id not in inverted_index[word]:
            inverted_index[word].append(doc_id)

# Display the inverted index (optional, for reference)
print("Inverted Index:")
for word, doc_ids in inverted_index.items():
    print(f"{word}: {doc_ids}")

Inverted Index:
data: [1, 3, 5]
science: [1, 3]
is: [1, 2, 4]
an: [1]
interdisciplinary: [1]
field.: [1]
machine: [2, 3]
learning: [2, 3]
a: [2]
subset: [2]
of: [2]
artificial: [2, 4]
intelligence.: [2]
includes: [3]
and: [3]
mining.: [3]
intelligence: [4]
transforming: [4]
the: [4]
world.: [4]
mining: [5]
helps: [5]
to: [5]
extract: [5]
information: [5]
from: [5]
data.: [5]


In [4]:
def search(query):
    query_words = query.lower().split()
    result_docs = [set(inverted_index[word]) for word in query_words if word in inverted_index]
    return set.intersection(*result_docs) if result_docs else set()


In [5]:
# Step 3: Search documents based on user input query
query = input("\nEnter a search query: ")
result = search(query)

# Display search results
if result:
    print(f"\nDocuments containing '{query}': {result}")
    for doc_id in result:
        print(f"Document {doc_id}: {documents[doc_id]}")
else:
    print(f"\nNo documents found containing '{query}'.")



Enter a search query:  data



Documents containing 'data': {1, 3, 5}
Document 1: Data science is an interdisciplinary field.
Document 3: Data science includes machine learning and data mining.
Document 5: Data mining helps to extract information from data.
