**Question 3: POSITIONAL INDEX AND PHRASE QUERIES**

---



---



**MOUNT GOOGLE DRIVE**

In [None]:
# Mount Google Drive at /content/drive; the corpus folder and the pickled
# index used later in this notebook both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**LIBRARY / PACKAGE**

In [None]:
import os
import re
import string
import warnings
import pickle
import nltk

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK resources. The stopword list and the WordNet data are used
# by the query preprocessing function (stopword removal and lemmatisation).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# NOTE(review): this silences every warning for the rest of the session,
# not only NLTK's - narrow the filter if a warning ever needs to be seen.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**INPUT FOLDER**

In [None]:
# Folder holding the corpus; documents are looked up as file<doc_id>.txt inside it.
directory_path = "/content/drive/MyDrive/text_files/"

**POSITIONAL INDEX CREATION**

In [None]:
def build_positional_index(directory_path, doc_ids):
    """Build a positional index from whitespace-tokenised text files.

    Each document is read from ``<directory_path>/file<doc_id>.txt`` and split
    on whitespace. Positions are 1-based token offsets within the document.

    Args:
        directory_path: Folder containing the ``file<doc_id>.txt`` documents.
        doc_ids: Iterable of integer document ids to index.

    Returns:
        dict mapping each term to
        ``{'doc_count': <number of documents containing the term>,
           'docs': {doc_id: [position, ...]}}``.
    """
    index = {}

    for doc_id in doc_ids:
        file_path = os.path.join(directory_path, f"file{doc_id}.txt")

        if not os.path.exists(file_path):
            print(f"File file{doc_id}.txt does not exist.")
            continue

        with open(file_path, 'r') as file:
            terms = file.read().split()

        for position, term in enumerate(terms, start=1):
            entry = index.setdefault(term, {'doc_count': 0, 'docs': {}})
            positions = entry['docs'].get(doc_id)
            if positions is None:
                # First time this term is seen in this document: this is the
                # only point where the document frequency may grow. Bumping it
                # on every occurrence would count tokens, not documents.
                entry['docs'][doc_id] = [position]
                entry['doc_count'] += 1
            else:
                positions.append(position)

    return index


# range(1, 1000) covers file1.txt .. file999.txt.
positional_index = build_positional_index(directory_path, range(1, 1000))

In [None]:
# Print every term with its stored document count and per-document positions.
for key in positional_index:
    value = positional_index[key]
    print(f"{key}: {{\n    'doc_freq': {value['doc_count']},\n    'docs': {value['docs']}\n}}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
decision: {
    'doc_freq': 1,
    'docs': {680: [7]}
}
shelled: {
    'doc_freq': 1,
    'docs': {680: [9]}
}
3035: {
    'doc_freq': 1,
    'docs': {680: [10]}
}
licensed: {
    'doc_freq': 1,
    'docs': {680: [14]}
}
eat: {
    'doc_freq': 1,
    'docs': {680: [17]}
}
knocking: {
    'doc_freq': 1,
    'docs': {680: [23]}
}
auratone: {
    'doc_freq': 1,
    'docs': {681: [7]}
}
ilouds: {
    'doc_freq': 1,
    'docs': {681: [15]}
}
steroids: {
    'doc_freq': 1,
    'docs': {681: [26]}
}
pairing: {
    'doc_freq': 1,
    'docs': {681: [27]}
}
inboard: {
    'doc_freq': 1,
    'docs': {681: [28]}
}
matched: {
    'doc_freq': 2,
    'docs': {681: [30], 835: [68]}
}
loudly: {
    'doc_freq': 1,
    'docs': {681: [46]}
}
girlfriend: {
    'doc_freq': 1,
    'docs': {681: [58]}
}
transportation: {
    'doc_freq': 1,
    'docs': {682: [7]}
}
keyboardssustain: {
    'doc_freq': 1,
    'docs': {684: [39]}
}
workaround: {
   

In [None]:
# Vocabulary size: number of distinct terms in the index.
print(len(positional_index))

6471


In [None]:
# Location on Drive where the pickled positional index is written and read back.
file_path = "/content/drive/MyDrive/positional_index.pkl"

In [None]:
def dump_positional_index(file_path, positional_index):
    """Serialise ``positional_index`` with pickle and write it to ``file_path``.

    The target file is opened in binary write mode, so any existing file at
    that path is overwritten.
    """
    with open(file_path, "wb") as sink:
        pickle.Pickler(sink).dump(positional_index)

# Persist the in-memory index to the pickle file at file_path.
dump_positional_index(file_path, positional_index)

In [None]:
def load_positional_index(file_path):
    """Read a pickled positional index from ``file_path`` and return it.

    Unpickling can execute arbitrary code, so only load files that were
    produced by a trusted source.
    """
    with open(file_path, "rb") as source:
        restored_index = pickle.Unpickler(source).load()
    return restored_index

# Reload the index from the pickle file, replacing the in-memory copy.
positional_index = load_positional_index(file_path)

**QUERY HANDLING**

In [None]:
def preprocess(text):
    """Normalise a raw query into a space-separated string of lemmas.

    Steps: lowercase, replace every run of non-letter characters with a
    space, split on whitespace, drop English stopwords, lemmatise each
    remaining token with WordNet, and join the tokens with single spaces.

    Returns an empty string when no token survives.
    """
    lemmatizer = WordNetLemmatizer()
    letters_only = re.sub('[^A-Z a-z ]+', ' ', text.lower())
    excluded = set(stopwords.words("english"))

    lemmas = []
    for word in letters_only.split():
        if word in excluded or word in string.punctuation:
            continue
        lemmas.append(lemmatizer.lemmatize(word))

    return ' '.join(lemmas)

In [None]:
def retrieve_documents(queries, positional_index, preprocess_fn=None):
    """Answer phrase queries against a positional index.

    A document matches a query when the preprocessed query terms occur in it
    at consecutive positions, in order.

    Args:
        queries: Iterable of raw query strings.
        positional_index: Mapping ``term -> {'doc_count': int,
            'docs': {doc_id: [position, ...]}}``.
        preprocess_fn: Optional callable turning a raw query into a
            whitespace-separated string of terms. Defaults to the notebook's
            ``preprocess`` function.

    Returns:
        ``(results, error_messages)``. ``results`` holds exactly one list of
        matching doc ids per query, in query order, so ``results[i]`` always
        belongs to ``queries[i]``. A query that is too long, empty after
        preprocessing, or contains a term missing from the index yields an
        empty list and an entry in ``error_messages``.
    """
    if preprocess_fn is None:
        preprocess_fn = preprocess

    results = []
    error_messages = []

    for query in queries:
        terms = preprocess_fn(query).split()

        if len(terms) > 5:
            error_messages.append(f"Length of the input sequence of query '{query}' is >5.")
            results.append([])  # keep results aligned with the query numbering
            continue

        if not terms:
            error_messages.append(f"Query '{query}' has no searchable terms after preprocessing.")
            results.append([])
            continue

        # A phrase cannot match if any of its terms is absent from the index;
        # report the first such term instead of returning partial matches.
        missing_term = next((term for term in terms if term not in positional_index), None)
        if missing_term is not None:
            error_messages.append(f"Word '{missing_term}' not in dictionary for query '{query}'!")
            results.append([])
            continue

        # matches: doc_id -> positions of the most recently matched term for
        # every phrase prefix found so far in that document.
        matches = positional_index[terms[0]]['docs']
        for term in terms[1:]:
            postings = positional_index[term]['docs']
            advanced = {}
            for doc_id, previous_positions in matches.items():
                if doc_id not in postings:
                    continue
                term_positions = set(postings[doc_id])
                # Carry the position of the term just matched (p + 1), so the
                # next term is compared against its immediate predecessor.
                hits = [p + 1 for p in previous_positions if p + 1 in term_positions]
                if hits:
                    advanced[doc_id] = hits
            matches = advanced
            if not matches:
                break

        results.append(list(matches.keys()))

    return results, error_messages

**USER QUERY INPUT**

In [None]:
# Read the number of queries, then one phrase query per line.
n = int(input())
queries = []
for _ in range(n):
    queries.append(input().strip())

query_results, error_messages = retrieve_documents(queries, positional_index)

# Report the hit count and the matching file names for each returned result.
for i, result in enumerate(query_results, start=1):
    print(f"Number of documents retrieved for query {i} using positional index: {len(result)}")
    if not result:
        print("No documents found.")
        continue
    print(f"Names of documents retrieved for query {i} using positional index: ", end="")
    print(", ".join(f"file{value}.txt" for value in result))

# Problems collected while processing the queries are listed last.
for error_message in error_messages:
    print(error_message)

1
great is value
Number of documents retrieved for query 1 using positional index: 10
Names of documents retrieved for query 1 using positional index: file65.txt, file103.txt, file330.txt, file466.txt, file597.txt, file748.txt, file767.txt, file789.txt, file899.txt, file993.txt
