# **Phase 1**


In [None]:
import os
import re
import nltk
from nltk.stem import PorterStemmer

# Fetch the NLTK data packages up front; nltk.word_tokenize is called later in
# this notebook.
# NOTE(review): presumably 'punkt' / 'punkt_tab' are the tokenizer models that
# word_tokenize needs — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Locations of the stopword list and of the directory holding the corpus files
stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"
docs_path = "/content/drive/MyDrive/InfoRetrieval/ft911/"

# The stopword file is split on whitespace; duplicates collapse in the set
with open(stopwords_path, mode='r', encoding='utf-8') as stopword_file:
    stopwords = set(stopword_file.read().split())

# One Porter stemmer instance is reused for every token
word_stemmer = PorterStemmer()

# Containers filled while the corpus files are processed
processed_docs = dict()          # DOCNO -> list of stemmed tokens
FTFileMapping = dict()           # file_name -> list of DOCNOs
FileDictionary = dict()          # (file_name, local_doc_num) -> global_doc_id
DocumentToFileMapping = list()   # (FT911-<file suffix>, numeric DOCNO) pairs

# Order the corpus files by the number after the underscore, so that ft911_2
# comes before ft911_10. re.findall returns every digit run in the name
# ("ft911_3" -> ['911', '3']), so the LAST run is the file suffix; the first
# run is 911 for every file and would leave the order up to os.listdir.
file_list = sorted(
    [f for f in os.listdir(docs_path) if f.startswith("ft911_")],
    key=lambda x: int(re.findall(r'\d+', x)[-1])
)

# Walk the corpus files in order; global document IDs run on across files
global_doc_id = 1
for file_name in file_list:
    full_file_path = os.path.join(docs_path, file_name)

    # Directory entries that are not regular files are skipped
    if not os.path.isfile(full_file_path):
        continue

    print(f"Checking file: {full_file_path}")
    with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as file:
        file_content = file.read()

    # Split the file into documents. A <DOCNO> line opens a new document; every
    # later line that does not begin with "<" is collected as its text.
    parsed_docs = {}
    current_doc_id = None
    current_doc_lines = []
    for line in file_content.splitlines():
        if line.startswith("<DOCNO>"):
            # Store the document collected so far before starting the next one
            if current_doc_id is not None:
                parsed_docs[current_doc_id] = " ".join(current_doc_lines)
            current_doc_id = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
            current_doc_lines = []
            continue
        if current_doc_id and not line.startswith("<"):
            current_doc_lines.append(line.strip())
    # The final document has no following <DOCNO> line to trigger its storage
    if current_doc_id is not None:
        parsed_docs[current_doc_id] = " ".join(current_doc_lines)

    docnos_in_file = list(parsed_docs)
    print(f"Parsed documents (first 10): {docnos_in_file[:10]}")

    # file_name -> DOCNOs found in that file
    FTFileMapping[file_name] = docnos_in_file

    # (FT911-<file suffix>, part of the DOCNO after its last "-") pairs
    ft_formatted = f"FT911-{file_name.split('_')[-1]}"
    DocumentToFileMapping.extend(
        (ft_formatted, docno.split("-")[-1]) for docno in docnos_in_file
    )

    # Local numbering restarts at 1 in every file; the global counter does not
    for local_id in range(1, len(docnos_in_file) + 1):
        FileDictionary[(file_name, local_id)] = global_doc_id
        global_doc_id += 1

    # Lowercase, blank out non-letters, tokenize, drop stopwords, then stem
    for doc_id, doc_text in sorted(parsed_docs.items()):
        text = re.sub(r'[^a-zA-Z\s]', ' ', doc_text.lower())
        tokens = [tok for tok in nltk.word_tokenize(text) if tok.isalpha()]
        processed_docs[doc_id] = [
            word_stemmer.stem(tok) for tok in tokens if tok not in stopwords
        ]
# Create WordDictionary: every distinct stemmed token, taken in alphabetical
# order, receives a 1-based ID
all_words = set()
for words in processed_docs.values():
    all_words.update(words)

WordDictionary = {word: idx for idx, word in enumerate(sorted(all_words), start=1)}

# Sort FileDictionary by file, then by local document number. Plain tuple
# sorting would compare the file names as strings and put "ft911_10" ahead of
# "ft911_2", so the key uses the integer file suffix instead. The suffix is the
# last digit run in the name ("ft911_10" -> 10); the first run is always 911.
FileDictionary = dict(sorted(
    FileDictionary.items(),
    key=lambda item: (int(re.findall(r'\d+', item[0][0])[-1]), item[0][1]),
))

# Sort DocumentToFileMapping: sort FT911-14 before FT911-15
def sort_key(doc):
    """Return the numeric sort key for one DocumentToFileMapping entry.

    ``doc[0]`` is a label of the form ``<prefix>-<number>`` and ``doc[1]`` is a
    string of digits. The key is ``(int(number), int(doc[1]))``, so entries
    compare as numbers rather than as text. A label that does not split into
    exactly two parts on ``-`` raises ValueError.
    """
    _prefix, file_number = doc[0].split("-")
    file_rank = int(file_number)
    doc_rank = int(doc[1])
    return file_rank, doc_rank

# Put the (file label, DOCNO number) pairs into numeric order
DocumentToFileMapping = sorted(DocumentToFileMapping, key=sort_key)

# Output locations
word_dict_path = "/content/drive/MyDrive/InfoRetrieval/word_dictionary.txt"
file_dict_path = "/content/drive/MyDrive/InfoRetrieval/file_dictionary.txt"
ft_map_path = "/content/drive/MyDrive/InfoRetrieval/ft_file_mapping.txt"
doc_file_map_path = "/content/drive/MyDrive/InfoRetrieval/doc_to_file_mapping.txt"
parser_output_path = "/content/drive/MyDrive/InfoRetrieval/parser_output.txt"

# word_dictionary.txt: one "token<TAB>id" line per dictionary entry
with open(word_dict_path, "w") as handle:
    handle.writelines(
        f"{word}\t{word_id}\n" for word, word_id in WordDictionary.items()
    )

# file_dictionary.txt: "file name<TAB>local doc number<TAB>global id"
with open(file_dict_path, "w") as handle:
    handle.writelines(
        f"{source_name}\t{local_doc_num}\t{global_id}\n"
        for (source_name, local_doc_num), global_id in FileDictionary.items()
    )

# ft_file_mapping.txt: "file name:<TAB>comma-separated DOCNOs"
with open(ft_map_path, "w") as handle:
    handle.writelines(
        f"{source_name}:\t{', '.join(doc_ids)}\n"
        for source_name, doc_ids in FTFileMapping.items()
    )

# doc_to_file_mapping.txt: "file label<TAB>DOCNO number", in sorted order
with open(doc_file_map_path, "w") as handle:
    handle.writelines(
        f"{formatted_file}\t{numeric_docno}\n"
        for formatted_file, numeric_docno in DocumentToFileMapping
    )

# parser_output.txt: the token table (two tabs between token and id), an empty
# line, then the same "file label<TAB>DOCNO number" lines as written above
with open(parser_output_path, "w") as handle:
    handle.writelines(
        f"{word}\t\t{word_id}\n" for word, word_id in WordDictionary.items()
    )
    handle.write("\n")
    handle.writelines(
        f"{formatted_file}\t{numeric_docno}\n"
        for formatted_file, numeric_docno in DocumentToFileMapping
    )

# Logs
print(f"WordDictionary saved to {word_dict_path}")
print(f"FileDictionary saved to {file_dict_path}")
print(f"FTFileMapping saved to {ft_map_path}")
print(f"DocumentToFileMapping saved to {doc_file_map_path}")
print(f"parser_output.txt saved to {parser_output_path}")

# Show the first 20 dictionary entries (alphabetical, since IDs were assigned
# in sorted order)
print("Token and Token ID Mapping (First 20):")
first_entries = list(WordDictionary.items())[:20]
for token, token_id in first_entries:
    print(f"{token}\t{token_id}")

# Show ten pairs from the middle of the sorted mapping; the slice is simply
# empty when there are fewer than 901 entries
print("DocumentToFileMapping (Sample):")
for pair in DocumentToFileMapping[900:910]:
    print(pair)

Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_1
Parsed documents (first 10): ['FT911-1', 'FT911-2', 'FT911-3', 'FT911-4', 'FT911-5', 'FT911-6', 'FT911-7', 'FT911-8', 'FT911-9', 'FT911-10']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_2
Parsed documents (first 10): ['FT911-376', 'FT911-377', 'FT911-378', 'FT911-379', 'FT911-380', 'FT911-381', 'FT911-382', 'FT911-383', 'FT911-384', 'FT911-385']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_3
Parsed documents (first 10): ['FT911-722', 'FT911-723', 'FT911-724', 'FT911-725', 'FT911-726', 'FT911-727', 'FT911-728', 'FT911-729', 'FT911-730', 'FT911-731']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_4
Parsed documents (first 10): ['FT911-1099', 'FT911-1100', 'FT911-1101', 'FT911-1102', 'FT911-1103', 'FT911-1104', 'FT911-1105', 'FT911-1106', 'FT911-1107', 'FT911-1108']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_5
Parsed documents (first 10): ['FT911-14

# **Phase 2**

In [None]:
import os
import re
import nltk
import time
import sys
from collections import defaultdict
from nltk.stem import PorterStemmer

# Stopword list: whitespace-separated words, held in a set for membership tests
stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"
with open(stopwords_path, mode='r', encoding='utf-8') as stopword_file:
    stopwords = set(stopword_file.read().split())

# One Porter stemmer instance is reused for every token
word_stemmer = PorterStemmer()

# === Test mode with testdata.txt ===
test_file_path = "/content/drive/MyDrive/InfoRetrieval/testdata.txt"
with open(test_file_path, mode='r', encoding='utf-8') as test_file:
    content = test_file.read()

# The timer starts once the input is in memory, so reading it is not timed
start_time = time.time()

# Split the input into documents. A <DOCNO> line opens a new document; every
# later line that does not begin with "<" is collected as its text.
parsed_docs = {}
current_doc_id = None
current_doc_lines = []
for line in content.splitlines():
    if line.startswith("<DOCNO>"):
        # Store the document collected so far before starting the next one
        if current_doc_id is not None:
            parsed_docs[current_doc_id] = " ".join(current_doc_lines)
        current_doc_id = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
        current_doc_lines = []
        continue
    if current_doc_id and not line.startswith("<"):
        current_doc_lines.append(line.strip())
# The final document has no following <DOCNO> line to trigger its storage
if current_doc_id is not None:
    parsed_docs[current_doc_id] = " ".join(current_doc_lines)

# Lowercase, blank out non-letters, tokenize, drop stopwords, then stem
processed_docs = {}
for doc_id, doc_text in parsed_docs.items():
    text = re.sub(r'[^a-zA-Z\s]', ' ', doc_text.lower())
    tokens = [tok for tok in nltk.word_tokenize(text) if tok.isalpha()]
    processed_docs[doc_id] = [
        word_stemmer.stem(tok) for tok in tokens if tok not in stopwords
    ]

# Dictionary: distinct stemmed tokens in alphabetical order, numbered from 1
all_words = set()
for words in processed_docs.values():
    all_words.update(words)
WordDictionary = dict(zip(sorted(all_words), range(1, len(all_words) + 1)))

# forward_index:  doc id  -> {word id: occurrences in that document}
# inverted_index: word id -> {doc id: occurrences in that document}
forward_index = defaultdict(lambda: defaultdict(int))
inverted_index = defaultdict(lambda: defaultdict(int))

# A single pass over the tokens updates both indices together
for doc_id, words in processed_docs.items():
    for wid in (WordDictionary[token] for token in words):
        forward_index[doc_id][wid] += 1
        inverted_index[wid][doc_id] += 1

# Save forward index: one "doc id: wid:freq; wid:freq; ..." line per document
forward_index_path = "/content/drive/MyDrive/InfoRetrieval/forward_index.txt"
with open(forward_index_path, "w") as index_file:
    for doc_id in sorted(forward_index):
        entries = "; ".join(
            f"{wid}:{freq}" for wid, freq in forward_index[doc_id].items()
        )
        index_file.write(f"{doc_id}: {entries}\n")

# Save inverted index: one "wid: doc id:freq; doc id:freq; ..." line per word
inverted_index_path = "/content/drive/MyDrive/InfoRetrieval/inverted_index.txt"
with open(inverted_index_path, "w") as index_file:
    for wid in sorted(inverted_index):
        entries = "; ".join(
            f"{posting_doc}:{freq}" for posting_doc, freq in inverted_index[wid].items()
        )
        index_file.write(f"{wid}: {entries}\n")

# The timed span runs from parsing through writing both index files
end_time = time.time()
print(f"Indexing completed in {end_time - start_time:.4f} seconds.")

# Estimate memory usage (roughly)
def get_dict_size(d):
    """Roughly estimate the memory held by dictionary *d*, in bytes.

    The estimate is the shallow size of *d* plus the size of every key and
    value. sys.getsizeof does not follow references, so a value that is itself
    a dict (as in a dict of per-document counters) is measured recursively;
    otherwise its own keys and values would be left out of the total. For a
    flat dict the result equals the plain shallow sum.

    Objects referenced from several places are counted once per reference, so
    this remains an upper-leaning approximation rather than an exact figure.
    """
    total = sys.getsizeof(d)
    for key, value in d.items():
        total += sys.getsizeof(key)
        if isinstance(value, dict):
            # Covers dict subclasses such as defaultdict as well
            total += get_dict_size(value)
        else:
            total += sys.getsizeof(value)
    return total

# Size estimate of each in-memory structure
forward_size, inverted_size, word_dict_size = (
    get_dict_size(structure)
    for structure in (forward_index, inverted_index, WordDictionary)
)

# Report the combined estimate, scaled by 1024 for the KB figure
total_index_size_kb = sum((forward_size, inverted_size, word_dict_size)) / 1024
print(f"Estimated index size in memory: {total_index_size_kb:.2f} KB")

# Estimate file sizes
def file_size_in_kb(path):
    """Return the size of the file at *path* in kilobytes (bytes / 1024)."""
    size_in_bytes = os.path.getsize(path)
    return size_in_bytes / 1024

# Both sizes are measured before either is printed
forward_file_size_kb = file_size_in_kb(forward_index_path)
inverted_file_size_kb = file_size_in_kb(inverted_index_path)

print(f"forward_index.txt size: {forward_file_size_kb:.2f} KB")
print(f"inverted_index.txt size: {inverted_file_size_kb:.2f} KB")

# Interactive lookup: the typed word is trimmed and lowercased, checked against
# the stopword list, then stemmed with the same stemmer used for indexing
raw_query = input("Enter a word to look up (testdata): ")
test_query = raw_query.strip().lower()
if test_query in stopwords:
    print("This is a stopword and is ignored.")
else:
    stemmed = word_stemmer.stem(test_query)
    if stemmed not in WordDictionary:
        print("Word not found in the dictionary.")
    else:
        wid = WordDictionary[stemmed]
        postings = inverted_index[wid]
        # Same "doc id:freq" formatting as the lines of inverted_index.txt
        output = "; ".join(f"{doc_id}:{freq}" for doc_id, freq in postings.items())
        print(f"{stemmed} ({wid}): {output}")

Indexing completed in 0.0219 seconds.
Estimated index size in memory: 3.40 KB
forward_index.txt size: 0.05 KB
inverted_index.txt size: 0.06 KB
Enter a word to look up (testdata): word
Word not found in the dictionary.
