In [None]:
import os
import re
import nltk
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

*Load stopwords from file*

In [None]:
def load_stopwords(FileName):
    with open(FileName, 'r') as f:
        stop_words = set(f.read().split())
    return stop_words

Lowercase, Non-alphabetic Removal, Tokenize

In [None]:
Text="/content/drive/MyDrive/InfoRetrieval/ft911/ft911_14"
Text = Text.lower()
Text = re.sub(r'[^a-zA-Z\s]', ' ', Text)
tokens = nltk.word_tokenize(Text)
tokens = [word for word in tokens if word.isalpha()]
print(tokens)

['content', 'drive', 'mydrive', 'inforetrieval', 'ft', 'ft']


In [None]:
def tokenize(Text):
    Text = Text.lower()
    Text = re.sub(r'[^a-zA-Z\s]', ' ', Text)
    tokens = nltk.word_tokenize(Text)
    tokens = [word for word in tokens if word.isalpha()]
    return tokens

Remove stepwords, Stemming

In [None]:
def process_text(Text, Stopwords, Stemmer):
    #Tokenization
    tokens = tokenize(Text)

    #Remove stopwords
    filtered_tokens = [word for word in tokens if word not in Stopwords]

    #Stemming
    stemmed_tokens = [Stemmer.stem(word) for word in filtered_tokens]

    return stemmed_tokens

Parsing the Documents

In [None]:
def parse_trec_documents(file_content):
    # Dictionary thas stores the parsed documents
    parsed_docs = {}


    current_doc_lines = []
    current_doc_id = None


    # Split the file content into individual lines and process each line
    for line in file_content.splitlines():

        if line.startswith("<DOCNO>"):

            if current_doc_id is not None:
                parsed_docs[current_doc_id] = " ".join(current_doc_lines)


            current_doc_id = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()


            current_doc_lines = []

        elif not line.startswith("<") and current_doc_id:
            current_doc_lines.append(line.strip())

    if current_doc_id is not None:
        parsed_docs[current_doc_id] = " ".join(current_doc_lines)


    return parsed_docs

Assign IDs to words and documents

In [None]:

def create_dictionaries(parsed_docs):
    # Dictionaries for word and doc IDs
    doc_to_id_map = {}
    word_to_id_map = {}

    #Counters initialization and doc IDs
    current_doc_id = 1
    current_word_id = 1

    word_id_lines = []
    all_output_lines = []
    doc_id_lines = []

    #looping through each document and its list of words
    for doc_name, words in parsed_docs.items():
        if doc_name not in doc_to_id_map:
            doc_to_id_map[doc_name] = current_doc_id
            doc_id_lines.append(f"{doc_name}\t{current_doc_id}")
            current_doc_id += 1

        # loop through each word in the document
        for word in words:
            if word not in word_to_id_map:
                word_to_id_map[word] = current_word_id
                word_id_lines.append(f"{word}\t{current_word_id}")
                current_word_id += 1

    # Combine the word and document lines into one list
    all_output_lines.extend(word_id_lines)
    all_output_lines.append("\n")
    all_output_lines.extend(doc_id_lines)


    return word_to_id_map, doc_to_id_map, all_output_lines


Procesing files. Output generation

In [None]:
def process_documents(folder_path, stopwords):
    # Initialize the stemmer
    word_stemmer = PorterStemmer()

    # Dictionary that store processed documents
    processed_docs = {}

    # Loop through all files in the specified folder
    for file_name in os.listdir(folder_path):
        full_file_path = os.path.join(folder_path, file_name)

        if os.path.isfile(full_file_path) and file_name.startswith("ft911_"):
            with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as file:
                file_content = file.read()

                # Parse the file content into individual documents
                parsed_documents = parse_trec_documents(file_content)

                # Process each document
                for doc_id, doc_text in parsed_documents.items():
                    processed_docs[doc_id] = process_text(doc_text, stopwords, word_stemmer)

    # Create dictionaries mapping words and documents to unique IDs
    word_to_id_map, doc_to_id_map, output_lines = create_dictionaries(processed_docs)

    output_file_path = "/content/drive/MyDrive/InfoRetrieval/parser_output.txt"

    with open(output_file_path, "w") as output_file:
        for line in output_lines:
            output_file.write(line + "\n")

    # Print sample data
    print(f"The parsed data has been saved to {output_file_path}")
    print(f"Sample document IDs: {list(doc_to_id_map.items())[:15]}")
    print(f"Sample tokens: {list(word_to_id_map.items())[:20]}")




**MAIN FUNCTION**

In [None]:
# Main function
def main():
    docs_path = "/content/drive/MyDrive/InfoRetrieval/ft911/"

    stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"

    stopwords = load_stopwords(stopwords_path)

    process_documents(docs_path, stopwords)

if __name__ == "__main__":
    main()


The parsed data has been saved to /content/drive/MyDrive/InfoRetrieval/parser_output.txt
Sample document IDs: [('FT911-1', 1), ('FT911-2', 2), ('FT911-3', 3), ('FT911-4', 4), ('FT911-5', 5), ('FT911-6', 6), ('FT911-7', 7), ('FT911-8', 8), ('FT911-9', 9), ('FT911-10', 10), ('FT911-11', 11), ('FT911-12', 12), ('FT911-13', 13), ('FT911-14', 14), ('FT911-15', 15)]
Sample tokens: [('ft', 1), ('correct', 2), ('jubile', 3), ('jet', 4), ('design', 5), ('publish', 6), ('append', 7), ('articl', 8), ('frank', 9), ('fli', 10), ('shout', 11), ('sir', 12), ('whittl', 13), ('maiden', 14), ('flight', 15), ('british', 16), ('repli', 17), ('patent', 18), ('aircraft', 19), ('ga', 20)]


In [None]:
def process_documents(folder_path, stopwords):
    # Initialize the stemmer
    word_stemmer = PorterStemmer()

    # Dictionary that stores processed documents
    processed_docs = {}

    # Loop through all files in the specified folder
    for file_name in os.listdir(folder_path):
        full_file_path = os.path.join(folder_path, file_name)

        if os.path.isfile(full_file_path) and file_name.startswith("ft911_"):
            print(f"Checking file: {full_file_path}")  # Debugging line
            with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as file:
                file_content = file.read()

                # Parse the file content into individual documents
                parsed_documents = parse_trec_documents(file_content)
                print(f"Parsed documents (first 10): {list(parsed_documents.keys())[:10]}")  # Debugging line

                # Process each document
                for doc_id, doc_text in parsed_documents.items():
                    processed_docs[doc_id] = process_text(doc_text, stopwords, word_stemmer)

    # Create dictionaries mapping words and documents to unique IDs
    word_to_id_map, doc_to_id_map, output_lines = create_dictionaries(processed_docs)

    output_file_path = "/content/drive/MyDrive/InfoRetrieval/parser_output.txt"

    with open(output_file_path, "w") as output_file:
        for line in output_lines:
            output_file.write(line + "\n")

    # Print formatted token and token ID (first 100 only)
    print("Token and Token ID Mapping (First 100):")
    for token, token_id in list(word_to_id_map.items())[:100]:
        print(f"{token}\t{token_id}")

    print(f"The parsed data has been saved to {output_file_path}")
    print(f"Sample document IDs: {list(doc_to_id_map.items())[:10]}")
    print(f"Sample tokens: {list(word_to_id_map.items())[:20]}")

def main():
    docs_path = "/content/drive/MyDrive/InfoRetrieval/ft911/"
    stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"

    stopwords = load_stopwords(stopwords_path)
    process_documents(docs_path, stopwords)

if __name__ == "__main__":
    main()


Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_1
Parsed documents (first 10): ['FT911-1', 'FT911-2', 'FT911-3', 'FT911-4', 'FT911-5', 'FT911-6', 'FT911-7', 'FT911-8', 'FT911-9', 'FT911-10']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_2
Parsed documents (first 10): ['FT911-376', 'FT911-377', 'FT911-378', 'FT911-379', 'FT911-380', 'FT911-381', 'FT911-382', 'FT911-383', 'FT911-384', 'FT911-385']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_3
Parsed documents (first 10): ['FT911-722', 'FT911-723', 'FT911-724', 'FT911-725', 'FT911-726', 'FT911-727', 'FT911-728', 'FT911-729', 'FT911-730', 'FT911-731']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_4
Parsed documents (first 10): ['FT911-1099', 'FT911-1100', 'FT911-1101', 'FT911-1102', 'FT911-1103', 'FT911-1104', 'FT911-1105', 'FT911-1106', 'FT911-1107', 'FT911-1108']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_5
Parsed documents (first 10): ['FT911-14

In [None]:
import os
import re
import nltk
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Load stopwords
stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"
with open(stopwords_path, 'r') as f:
    stopwords = set(f.read().split())
    print(stopwords)

{'per', 'wherever', 'h', 'been', 'asking', 'you', 'mainly', 'usually', 'thanks', 'o', 'course', 'et', 'may', 'at', 'neither', 'perhaps', 'allow', 's', 'name', 'rather', 'anywhere', 'whence', 'elsewhere', 'aside', 'indicate', 'mostly', 'everywhere', 'right', 'qv', 'possible', 'otherwise', 'once', 'ex', 'just', 'always', 'unless', 'appreciate', 'little', 'consider', 'tell', 'thanx', 'themselves', 'went', 'until', 'must', 'followed', 'obviously', 'thus', 'ever', 'though', 'two', 'j', 'am', 'plus', 'able', 'c', 'to', 'ltd', 'mean', 'trying', 'sup', 'hereafter', 'besides', 'while', 'considering', 'rd', 'yet', 'see', 'whether', 'vs', 'its', 'her', 'exactly', 'que', 'want', 'x', 'own', 'on', 'consequently', 'doing', 'th', 'looks', 'ourselves', 'about', 'go', 'me', 'y', 'them', 't', 'anyhow', 'concerning', 'cause', 'these', 'yourself', 'm', 'last', 'if', 'wish', 'really', 'k', 'indicated', 'got', 'whatever', 'thorough', 'n', 'say', 'very', 'okay', 'myself', 'sensible', 'seeing', 'what', 'often

In [None]:
# Initialize the stemmer
word_stemmer = PorterStemmer()

In [None]:
docs_path = "/content/drive/MyDrive/InfoRetrieval/ft911/"
processed_docs = {}

In [None]:
# Process each file in the dataset
for file_name in os.listdir(docs_path):
    full_file_path = os.path.join(docs_path, file_name)

    if os.path.isfile(full_file_path) and file_name.startswith("ft911_"):
        print(f"Checking file: {full_file_path}")  # Debugging line
        with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as file:
            file_content = file.read()

            # Parse TREC documents
            parsed_docs = {}
            current_doc_lines = []
            current_doc_id = None
            for line in file_content.splitlines():
                if line.startswith("<DOCNO>"):
                    if current_doc_id is not None:
                        parsed_docs[current_doc_id] = " ".join(current_doc_lines)
                    current_doc_id = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
                    current_doc_lines = []
                elif not line.startswith("<") and current_doc_id:
                    current_doc_lines.append(line.strip())
            if current_doc_id is not None:
                parsed_docs[current_doc_id] = " ".join(current_doc_lines)

            print(f"Parsed documents (first 10): {list(parsed_docs.keys())[:10]}")  # Debugging line

            # Process text
            for doc_id, doc_text in parsed_docs.items():
                text = doc_text.lower()
                text = re.sub(r'[^a-zA-Z\s]', ' ', text)
                tokens = nltk.word_tokenize(text)
                tokens = [word for word in tokens if word.isalpha()]
                filtered_tokens = [word for word in tokens if word not in stopwords]
                stemmed_tokens = [word_stemmer.stem(word) for word in filtered_tokens]
                processed_docs[doc_id] = stemmed_tokens

Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_1
Parsed documents (first 10): ['FT911-1', 'FT911-2', 'FT911-3', 'FT911-4', 'FT911-5', 'FT911-6', 'FT911-7', 'FT911-8', 'FT911-9', 'FT911-10']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_2
Parsed documents (first 10): ['FT911-376', 'FT911-377', 'FT911-378', 'FT911-379', 'FT911-380', 'FT911-381', 'FT911-382', 'FT911-383', 'FT911-384', 'FT911-385']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_3
Parsed documents (first 10): ['FT911-722', 'FT911-723', 'FT911-724', 'FT911-725', 'FT911-726', 'FT911-727', 'FT911-728', 'FT911-729', 'FT911-730', 'FT911-731']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_4
Parsed documents (first 10): ['FT911-1099', 'FT911-1100', 'FT911-1101', 'FT911-1102', 'FT911-1103', 'FT911-1104', 'FT911-1105', 'FT911-1106', 'FT911-1107', 'FT911-1108']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_5
Parsed documents (first 10): ['FT911-14

In [None]:
# Create dictionaries mapping words and documents to unique IDs
doc_to_id_map = {}
word_to_id_map = {}
output_lines = []

current_doc_id = 1
current_word_id = 1

for doc_name, words in processed_docs.items():
    if doc_name not in doc_to_id_map:
        doc_to_id_map[doc_name] = current_doc_id
        output_lines.append(f"{doc_name}\t{current_doc_id}")
        current_doc_id += 1

    for word in words:
        if word not in word_to_id_map:
            word_to_id_map[word] = current_word_id
            output_lines.append(f"{word}\t{current_word_id}")
            current_word_id += 1

In [None]:
# Ensure consistent word order
word_to_id_map = dict(sorted(word_to_id_map.items()))

output_file_path = "/content/drive/MyDrive/InfoRetrieval/parser_output.txt"
with open(output_file_path, "w") as output_file:
    for token, token_id in list(word_to_id_map.items()):
        output_file.write(f"{token}\t{token_id}\n")

print("Token and Token ID Mapping (First 100):")
for token, token_id in list(word_to_id_map.items())[:100]:
    print(f"{token}\t{token_id}")

Token and Token ID Mapping (First 100):
aa	6530
aaa	13313
aachen	13936
aaf	8268
aah	20455
aakvaag	32703
aalborg	30434
aaron	17933
ab	1016
ababa	22651
aback	13024
abalkin	3974
abandon	390
abash	15264
abat	15396
abattoir	22113
abb	4205
abba	22179
abbacchio	18245
abbado	6922
abbatoir	20349
abbey	11270
abbot	18730
abbott	923
abbrevi	20194
abc	14815
abcc	22751
abcd	30331
abci	27009
abdali	23047
abdel	21093
abdelaziz	24529
abdic	15804
abduct	7269
abdul	6810
abdulla	12153
abdullah	20274
abe	32960
abel	7042
abela	27132
abercorn	24021
aberdeen	4924
aberdeenshir	14726
aberforth	9459
abergavenni	12926
aberr	16785
abet	20983
abey	17054
abhor	2101
abhorr	17326
abi	30652
abid	9704
abidin	28714
abil	1429
abingdon	14899
abington	24389
abingworth	27431
abitibi	20809
abitur	31992
abject	17375
abk	30859
ablaz	18224
abli	15487
ablitt	26515
ablut	10456
abn	13229
abnorm	13927
aboard	18096
abod	29376
abol	28004
abolhassan	19623
abolish	3168
abolit	1725
abolitionist	26881
abomin	32251
aborigin	13858
abort	168

In [None]:
# Create WordDictionary and FileDictionary
WordDictionary = {}
FileDictionary = {}
output_lines = []

current_doc_id = 1
current_word_id = 1

for doc_name, words in processed_docs.items():
    if doc_name not in FileDictionary:
        FileDictionary[doc_name] = current_doc_id
        output_lines.append(f"{doc_name}\t{current_doc_id}")
        current_doc_id += 1

    for word in words:
        if word not in WordDictionary:
            WordDictionary[word] = current_word_id
            output_lines.append(f"{word}\t{current_word_id}")
            current_word_id += 1

# Ensure consistent word order
WordDictionary = dict(sorted(WordDictionary.items()))
FileDictionary = dict(sorted(FileDictionary.items()))

# Save WordDictionary to file
word_dict_path = "/content/drive/MyDrive/InfoRetrieval/word_dictionary.txt"
with open(word_dict_path, "w") as word_file:
    for word, word_id in WordDictionary.items():
        word_file.write(f"{word}\t{word_id}\n")

# Save FileDictionary to file
file_dict_path = "/content/drive/MyDrive/InfoRetrieval/file_dictionary.txt"
with open(file_dict_path, "w") as file_file:
    for file_name, file_id in FileDictionary.items():
        file_file.write(f"{file_name}\t{file_id}\n")

print(f"WordDictionary has been saved to {word_dict_path}")
print(f"FileDictionary has been saved to {file_dict_path}")

# Print sample output
print("Token and Token ID Mapping (First 100):")
for token, token_id in list(WordDictionary.items())[:100]:
    print(f"{token}\t{token_id}")

print(f"Sample document IDs: {list(FileDictionary.items())[:10]}")
print(f"Sample tokens: {list(WordDictionary.items())[:20]}")

WordDictionary has been saved to /content/drive/MyDrive/InfoRetrieval/word_dictionary.txt
FileDictionary has been saved to /content/drive/MyDrive/InfoRetrieval/file_dictionary.txt
Token and Token ID Mapping (First 100):
aa	6530
aaa	13313
aachen	13936
aaf	8268
aah	20455
aakvaag	32703
aalborg	30434
aaron	17933
ab	1016
ababa	22651
aback	13024
abalkin	3974
abandon	390
abash	15264
abat	15396
abattoir	22113
abb	4205
abba	22179
abbacchio	18245
abbado	6922
abbatoir	20349
abbey	11270
abbot	18730
abbott	923
abbrevi	20194
abc	14815
abcc	22751
abcd	30331
abci	27009
abdali	23047
abdel	21093
abdelaziz	24529
abdic	15804
abduct	7269
abdul	6810
abdulla	12153
abdullah	20274
abe	32960
abel	7042
abela	27132
abercorn	24021
aberdeen	4924
aberdeenshir	14726
aberforth	9459
abergavenni	12926
aberr	16785
abet	20983
abey	17054
abhor	2101
abhorr	17326
abi	30652
abid	9704
abidin	28714
abil	1429
abingdon	14899
abington	24389
abingworth	27431
abitibi	20809
abitur	31992
abject	17375
abk	30859
ablaz	18224
abli	15487
a

In [None]:
# Ensure consistent word order
word_to_id_map = dict(sorted(word_to_id_map.items()))
print("Token and Token ID Mapping (First 100):")
for token, token_id in list(word_to_id_map.items())[:100]:
    print(f"{token}\t{token_id}")

output_file_path = "/content/drive/MyDrive/InfoRetrieval/parser_output.txt"
with open(output_file_path, "w") as output_file:
    for line in output_lines:
        output_file.write(line + "\n")

print(f"The parsed data has been saved to {output_file_path}")
print(f"Sample document IDs: {list(doc_to_id_map.items())[:10]}")
print(f"Sample tokens: {list(word_to_id_map.items())[:20]}")

Token and Token ID Mapping (First 100):
aa	6530
aaa	13313
aachen	13936
aaf	8268
aah	20455
aakvaag	32703
aalborg	30434
aaron	17933
ab	1016
ababa	22651
aback	13024
abalkin	3974
abandon	390
abash	15264
abat	15396
abattoir	22113
abb	4205
abba	22179
abbacchio	18245
abbado	6922
abbatoir	20349
abbey	11270
abbot	18730
abbott	923
abbrevi	20194
abc	14815
abcc	22751
abcd	30331
abci	27009
abdali	23047
abdel	21093
abdelaziz	24529
abdic	15804
abduct	7269
abdul	6810
abdulla	12153
abdullah	20274
abe	32960
abel	7042
abela	27132
abercorn	24021
aberdeen	4924
aberdeenshir	14726
aberforth	9459
abergavenni	12926
aberr	16785
abet	20983
abey	17054
abhor	2101
abhorr	17326
abi	30652
abid	9704
abidin	28714
abil	1429
abingdon	14899
abington	24389
abingworth	27431
abitibi	20809
abitur	31992
abject	17375
abk	30859
ablaz	18224
abli	15487
ablitt	26515
ablut	10456
abn	13229
abnorm	13927
aboard	18096
abod	29376
abol	28004
abolhassan	19623
abolish	3168
abolit	1725
abolitionist	26881
abomin	32251
aborigin	13858
abort	168

In [None]:
import os
import re
import nltk
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('punkt_tab')

# Load stopwords
stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().split())

# Initialize the stemmer
word_stemmer = PorterStemmer()

docs_path = "/content/drive/MyDrive/InfoRetrieval/ft911/"
processed_docs = {}

# Process each file in the dataset
for file_name in os.listdir(docs_path):
    full_file_path = os.path.join(docs_path, file_name)

    if os.path.isfile(full_file_path) and file_name.startswith("ft911_"):
        print(f"Checking file: {full_file_path}")  # Debugging line
        with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as file:
            file_content = file.read()

            # Parse TREC documents
            parsed_docs = {}
            current_doc_lines = []
            current_doc_id = None
            for line in file_content.splitlines():
                if line.startswith("<DOCNO>"):
                    if current_doc_id is not None:
                        parsed_docs[current_doc_id] = " ".join(current_doc_lines)
                    current_doc_id = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
                    current_doc_lines = []
                elif not line.startswith("<") and current_doc_id:
                    current_doc_lines.append(line.strip())
            if current_doc_id is not None:
                parsed_docs[current_doc_id] = " ".join(current_doc_lines)

            print(f"Parsed documents (first 10): {list(parsed_docs.keys())[:10]}")  # Debugging line

            # Process text
            for doc_id, doc_text in parsed_docs.items():
                text = doc_text.lower()
                text = re.sub(r'[^a-zA-Z\s]', ' ', text)
                tokens = nltk.word_tokenize(text)
                tokens = [word for word in tokens if word.isalpha()]
                filtered_tokens = [word for word in tokens if word not in stopwords]
                stemmed_tokens = [word_stemmer.stem(word) for word in filtered_tokens]
                processed_docs[f"{file_name}-{doc_id}"] = stemmed_tokens

# Create WordDictionary and FileDictionary
WordDictionary = {}
FileDictionary = {}
output_lines = []

current_doc_id = 1
current_word_id = 1

for doc_name, words in processed_docs.items():
    if doc_name not in FileDictionary:
        FileDictionary[doc_name] = current_doc_id
        output_lines.append(f"{doc_name}\t{current_doc_id}")
        current_doc_id += 1

    for word in words:
        if word not in WordDictionary:
            WordDictionary[word] = current_word_id
            output_lines.append(f"{word}\t{current_word_id}")
            current_word_id += 1

# Ensure consistent word order
WordDictionary = dict(sorted(WordDictionary.items()))
FileDictionary = dict(sorted(FileDictionary.items()))

# Save WordDictionary to file
word_dict_path = "/content/drive/MyDrive/InfoRetrieval/word_dictionary.txt"
with open(word_dict_path, "w") as word_file:
    for word, word_id in WordDictionary.items():
        word_file.write(f"{word}\t{word_id}\n")

# Save FileDictionary to file
file_dict_path = "/content/drive/MyDrive/InfoRetrieval/file_dictionary.txt"
with open(file_dict_path, "w") as file_file:
    for file_name, file_id in FileDictionary.items():
        file_file.write(f"{file_name}\t{file_id}\n")

print(f"WordDictionary has been saved to {word_dict_path}")
print(f"FileDictionary has been saved to {file_dict_path}")

# Print sample output
print("Token and Token ID Mapping (First 100):")
for token, token_id in list(WordDictionary.items())[:100]:
    print(f"{token}\t{token_id}")

print(f"Sample document IDs: {list(FileDictionary.items())[:10]}")
print(f"Sample tokens: {list(WordDictionary.items())[:20]}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_1
Parsed documents (first 10): ['FT911-1', 'FT911-2', 'FT911-3', 'FT911-4', 'FT911-5', 'FT911-6', 'FT911-7', 'FT911-8', 'FT911-9', 'FT911-10']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_2
Parsed documents (first 10): ['FT911-376', 'FT911-377', 'FT911-378', 'FT911-379', 'FT911-380', 'FT911-381', 'FT911-382', 'FT911-383', 'FT911-384', 'FT911-385']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_3
Parsed documents (first 10): ['FT911-722', 'FT911-723', 'FT911-724', 'FT911-725', 'FT911-726', 'FT911-727', 'FT911-728', 'FT911-729', 'FT911-730', 'FT911-731']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_4
Parsed documents (first 10): ['FT911-1099', 'FT911-1100', 'FT911-1101', 'FT911-1102', 'FT911-1103', 'FT911-1104', 'FT911-1105', 'FT911-1106', 'FT911-1107', 'FT911-1108']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_5
Parsed documents (first 10): ['FT911-14

In [None]:
import os
import re
import nltk
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('punkt_tab')

# Load stopwords
stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().split())

# Initialize the stemmer
word_stemmer = PorterStemmer()

docs_path = "/content/drive/MyDrive/InfoRetrieval/ft911/"
processed_docs = {}

# Process each file in the dataset
for file_name in os.listdir(docs_path):
    full_file_path = os.path.join(docs_path, file_name)

    if os.path.isfile(full_file_path) and file_name.startswith("ft911_"):
        print(f"Checking file: {full_file_path}")  # Debugging line
        with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as file:
            file_content = file.read()

            # Parse TREC documents
            parsed_docs = {}
            current_doc_lines = []
            current_doc_id = None
            for line in file_content.splitlines():
                if line.startswith("<DOCNO>"):
                    if current_doc_id is not None:
                        parsed_docs[current_doc_id] = " ".join(current_doc_lines)
                    current_doc_id = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
                    current_doc_lines = []
                elif not line.startswith("<") and current_doc_id:
                    current_doc_lines.append(line.strip())
            if current_doc_id is not None:
                parsed_docs[current_doc_id] = " ".join(current_doc_lines)

            print(f"Parsed documents (first 10): {list(parsed_docs.keys())[:10]}")  # Debugging line

            # Process text
            for doc_id, doc_text in parsed_docs.items():
                text = doc_text.lower()
                text = re.sub(r'[^a-zA-Z\s]', ' ', text)
                tokens = nltk.word_tokenize(text)
                tokens = [word for word in tokens if word.isalpha()]
                filtered_tokens = [word for word in tokens if word not in stopwords]
                stemmed_tokens = [word_stemmer.stem(word) for word in filtered_tokens]
                formatted_doc_id = f"{file_name.replace('_', '-')} {doc_id.split('-')[-1]}"
                processed_docs[formatted_doc_id] = stemmed_tokens

# Create WordDictionary and FileDictionary
WordDictionary = {}
FileDictionary = {}
output_lines = []

current_doc_id = 1
current_word_id = 1

for doc_name, words in processed_docs.items():
    if doc_name not in FileDictionary:
        FileDictionary[doc_name] = current_doc_id
        output_lines.append(f"{doc_name}\t{current_doc_id}")
        current_doc_id += 1

    for word in words:
        if word not in WordDictionary:
            WordDictionary[word] = current_word_id
            output_lines.append(f"{word}\t{current_word_id}")
            current_word_id += 1

# Ensure consistent word order
WordDictionary = dict(sorted(WordDictionary.items()))
FileDictionary = dict(sorted(FileDictionary.items()))

# Save WordDictionary to file
word_dict_path = "/content/drive/MyDrive/InfoRetrieval/word_dictionary.txt"
with open(word_dict_path, "w") as word_file:
    for word, word_id in WordDictionary.items():
        word_file.write(f"{word}\t{word_id}\n")

# Save FileDictionary to file
file_dict_path = "/content/drive/MyDrive/InfoRetrieval/file_dictionary.txt"
with open(file_dict_path, "w") as file_file:
    for file_name, file_id in FileDictionary.items():
        file_file.write(f"{file_name}\t{file_id}\n")

print(f"WordDictionary has been saved to {word_dict_path}")
print(f"FileDictionary has been saved to {file_dict_path}")

# Print sample output
print("Token and Token ID Mapping (First 100):")
for token, token_id in list(WordDictionary.items())[:100]:
    print(f"{token}\t{token_id}")

print(f"Sample document IDs: {list(FileDictionary.items())[:10]}")
print(f"Sample tokens: {list(WordDictionary.items())[:20]}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_1
Parsed documents (first 10): ['FT911-1', 'FT911-2', 'FT911-3', 'FT911-4', 'FT911-5', 'FT911-6', 'FT911-7', 'FT911-8', 'FT911-9', 'FT911-10']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_2
Parsed documents (first 10): ['FT911-376', 'FT911-377', 'FT911-378', 'FT911-379', 'FT911-380', 'FT911-381', 'FT911-382', 'FT911-383', 'FT911-384', 'FT911-385']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_3
Parsed documents (first 10): ['FT911-722', 'FT911-723', 'FT911-724', 'FT911-725', 'FT911-726', 'FT911-727', 'FT911-728', 'FT911-729', 'FT911-730', 'FT911-731']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_4
Parsed documents (first 10): ['FT911-1099', 'FT911-1100', 'FT911-1101', 'FT911-1102', 'FT911-1103', 'FT911-1104', 'FT911-1105', 'FT911-1106', 'FT911-1107', 'FT911-1108']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_5
Parsed documents (first 10): ['FT911-14

In [None]:
import os
import re
import nltk
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('punkt_tab')

# Load stopwords
stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().split())

# Initialize the stemmer
word_stemmer = PorterStemmer()

docs_path = "/content/drive/MyDrive/InfoRetrieval/ft911/"
processed_docs = {}

doc_id_counter = 1


# Process each file in the dataset
for file_name in sorted(os.listdir(docs_path)):
    full_file_path = os.path.join(docs_path, file_name)

    if os.path.isfile(full_file_path) and file_name.startswith("ft911_"):
        print(f"Checking file: {full_file_path}")  # Debugging line
        with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as file:
            file_content = file.read()

            # Parse TREC documents
            parsed_docs = {}
            current_doc_lines = []
            current_doc_id = None
            for line in file_content.splitlines():
                if line.startswith("<DOCNO>"):
                    if current_doc_id is not None:
                        parsed_docs[current_doc_id] = " ".join(current_doc_lines)
                    current_doc_id = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
                    current_doc_lines = []
                elif not line.startswith("<") and current_doc_id:
                    current_doc_lines.append(line.strip())
            if current_doc_id is not None:
                parsed_docs[current_doc_id] = " ".join(current_doc_lines)

            print(f"Parsed documents (first 10): {list(parsed_docs.keys())[:10]}")  # Debugging line

            # Process text
            for doc_id, doc_text in sorted(parsed_docs.items()):
                text = doc_text.lower()
                text = re.sub(r'[^a-zA-Z\s]', ' ', text)
                tokens = nltk.word_tokenize(text)
                tokens = [word for word in tokens if word.isalpha()]
                filtered_tokens = [word for word in tokens if word not in stopwords]
                stemmed_tokens = [word_stemmer.stem(word) for word in filtered_tokens]
                formatted_doc_id = f"{file_name} {doc_id.split('-')[-1]}"
                processed_docs[formatted_doc_id] = stemmed_tokens

# Create WordDictionary and FileDictionary
WordDictionary = {}
FileDictionary = {}
output_lines = []

current_word_id = 1

for doc_name, words in sorted(processed_docs.items()):
    if doc_name not in FileDictionary:
        FileDictionary[doc_name] = doc_id_counter
        output_lines.append(f"{doc_name}\t{doc_id_counter}")
        doc_id_counter += 1

    for word in words:
        if word not in WordDictionary:
            WordDictionary[word] = current_word_id
            output_lines.append(f"{word}\t{current_word_id}")
            current_word_id += 1

# Ensure consistent word order
WordDictionary = dict(sorted(WordDictionary.items()))
FileDictionary = dict(sorted(FileDictionary.items()))

# Save WordDictionary to file
word_dict_path = "/content/drive/MyDrive/InfoRetrieval/word_dictionary.txt"
with open(word_dict_path, "w") as word_file:
    for word, word_id in WordDictionary.items():
        word_file.write(f"{word}\t{word_id}\n")

# Save FileDictionary to file
file_dict_path = "/content/drive/MyDrive/InfoRetrieval/file_dictionary.txt"
with open(file_dict_path, "w") as file_file:
    for file_name, file_id in FileDictionary.items():
        file_file.write(f"{file_name}\t{file_id}\n")

print(f"WordDictionary has been saved to {word_dict_path}")
print(f"FileDictionary has been saved to {file_dict_path}")

# Print sample output
print("Token and Token ID Mapping (First 100):")
for token, token_id in list(WordDictionary.items())[:100]:
    print(f"{token}\t{token_id}")

print(f"Sample document IDs: {list(FileDictionary.items())[:10]}")
print(f"Sample tokens: {list(WordDictionary.items())[:20]}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_1
Parsed documents (first 10): ['FT911-1', 'FT911-2', 'FT911-3', 'FT911-4', 'FT911-5', 'FT911-6', 'FT911-7', 'FT911-8', 'FT911-9', 'FT911-10']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_10
Parsed documents (first 10): ['FT911-3323', 'FT911-3324', 'FT911-3325', 'FT911-3326', 'FT911-3327', 'FT911-3328', 'FT911-3329', 'FT911-3330', 'FT911-3331', 'FT911-3332']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_11
Parsed documents (first 10): ['FT911-3694', 'FT911-3695', 'FT911-3696', 'FT911-3697', 'FT911-3698', 'FT911-3699', 'FT911-3700', 'FT911-3701', 'FT911-3702', 'FT911-3703']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_12
Parsed documents (first 10): ['FT911-4016', 'FT911-4017', 'FT911-4018', 'FT911-4019', 'FT911-4020', 'FT911-4021', 'FT911-4022', 'FT911-4023', 'FT911-4024', 'FT911-4025']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_13
Parsed document

In [None]:
import os
import re
import nltk
from nltk.stem import PorterStemmer

nltk.download('punkt')

# Load stopwords
stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().split())

# Initialize the stemmer
word_stemmer = PorterStemmer()

docs_path = "/content/drive/MyDrive/InfoRetrieval/ft911/"
processed_docs = {}

# Process each file in the dataset
for file_name in sorted(os.listdir(docs_path)):  # Ensure correct file order
    full_file_path = os.path.join(docs_path, file_name)

    if os.path.isfile(full_file_path) and file_name.startswith("ft911_"):
        print(f"Checking file: {full_file_path}")  # Debugging line
        with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as file:
            file_content = file.read()

            # Parse TREC documents
            parsed_docs = {}
            current_doc_lines = []
            current_doc_id = None
            for line in file_content.splitlines():
                if line.startswith("<DOCNO>"):
                    if current_doc_id is not None:
                        parsed_docs[current_doc_id] = " ".join(current_doc_lines)
                    current_doc_id = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
                    current_doc_lines = []
                elif not line.startswith("<") and current_doc_id:
                    current_doc_lines.append(line.strip())
            if current_doc_id is not None:
                parsed_docs[current_doc_id] = " ".join(current_doc_lines)

            print(f"Parsed documents (first 10): {list(parsed_docs.keys())[:10]}")  # Debugging line

            # Process text
            doc_counter = 1  # Reset document counter per file
            for doc_id, doc_text in sorted(parsed_docs.items(), key=lambda x: int(x[0].split('-')[-1])):
                text = doc_text.lower()
                text = re.sub(r'[^a-zA-Z\s]', ' ', text)
                tokens = nltk.word_tokenize(text)
                tokens = [word for word in tokens if word.isalpha()]
                filtered_tokens = [word for word in tokens if word not in stopwords]
                stemmed_tokens = [word_stemmer.stem(word) for word in filtered_tokens]
                formatted_doc_id = f"{file_name} {doc_counter}"  # Ensure sequential numbering
                processed_docs[formatted_doc_id] = stemmed_tokens
                doc_counter += 1

# Create WordDictionary and FileDictionary
WordDictionary = {}
FileDictionary = {}
output_lines = []

current_word_id = 1
current_doc_id = 1  # Sequential document numbering

for doc_name, words in sorted(processed_docs.items(), key=lambda x: (x[0].split()[0], int(x[0].split()[1]))):
    if doc_name not in FileDictionary:
        FileDictionary[doc_name] = current_doc_id
        output_lines.append(f"{doc_name}\t{current_doc_id}")
        current_doc_id += 1

    for word in words:
        if word not in WordDictionary:
            WordDictionary[word] = current_word_id
            output_lines.append(f"{word}\t{current_word_id}")
            current_word_id += 1

# Ensure consistent word order
WordDictionary = dict(sorted(WordDictionary.items()))
FileDictionary = dict(sorted(FileDictionary.items(), key=lambda x: (x[0].split()[0], int(x[0].split()[1]))))

# Save WordDictionary to file
word_dict_path = "/content/drive/MyDrive/InfoRetrieval/word_dictionary.txt"
with open(word_dict_path, "w") as word_file:
    for word, word_id in WordDictionary.items():
        word_file.write(f"{word}\t{word_id}\n")

# Save FileDictionary to file
file_dict_path = "/content/drive/MyDrive/InfoRetrieval/file_dictionary.txt"
with open(file_dict_path, "w") as file_file:
    for file_name, file_id in FileDictionary.items():
        file_file.write(f"{file_name}\t{file_id}\n")

print(f"WordDictionary has been saved to {word_dict_path}")
print(f"FileDictionary has been saved to {file_dict_path}")

# Print sample output
print("Token and Token ID Mapping (First 100):")
for token, token_id in list(WordDictionary.items())[:100]:
    print(f"{token}\t{token_id}")

print(f"Sample document IDs: {list(FileDictionary.items())[:10]}")
print(f"Sample tokens: {list(WordDictionary.items())[:20]}")


Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_1
Parsed documents (first 10): ['FT911-1', 'FT911-2', 'FT911-3', 'FT911-4', 'FT911-5', 'FT911-6', 'FT911-7', 'FT911-8', 'FT911-9', 'FT911-10']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
import os
import re
import nltk
from nltk.stem import PorterStemmer

nltk.download('punkt')

# Load stopwords
stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().split())

# Initialize the stemmer
word_stemmer = PorterStemmer()

docs_path = "/content/drive/MyDrive/InfoRetrieval/ft911/"
processed_docs = {}
FTFileMapping = {}  # Mapping: file_name -> list of DOCNOs
FileDictionary = {}  # Mapping: (file_name, local_doc_num) -> global_doc_id
DocumentToFileMapping = []  # NEW: list of (file_name, DOCNO)

# Sort files numerically based on suffix
file_list = sorted(
    [f for f in os.listdir(docs_path) if f.startswith("ft911_")],
    key=lambda x: int(re.findall(r'\d+', x)[0])
)

# Process each file
global_doc_id = 1
for file_name in file_list:
    full_file_path = os.path.join(docs_path, file_name)

    if os.path.isfile(full_file_path):
        print(f"Checking file: {full_file_path}")
        with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as file:
            file_content = file.read()

            # Parse TREC documents
            parsed_docs = {}
            current_doc_lines = []
            current_doc_id = None
            for line in file_content.splitlines():
                if line.startswith("<DOCNO>"):
                    if current_doc_id is not None:
                        parsed_docs[current_doc_id] = " ".join(current_doc_lines)
                    current_doc_id = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
                    current_doc_lines = []
                elif not line.startswith("<") and current_doc_id:
                    current_doc_lines.append(line.strip())
            if current_doc_id is not None:
                parsed_docs[current_doc_id] = " ".join(current_doc_lines)

            print(f"Parsed documents (first 10): {list(parsed_docs.keys())[:10]}")

            # Save file -> doc list
            FTFileMapping[file_name] = list(parsed_docs.keys())

            # Save (file_name, DOCNO) mapping
            for docno in parsed_docs.keys():
                DocumentToFileMapping.append((file_name, docno))

            # Assign global and local doc IDs
            local_id = 1
            for doc_id in parsed_docs:
                FileDictionary[(file_name, local_id)] = global_doc_id
                local_id += 1
                global_doc_id += 1

            # Process text
            for doc_id, doc_text in sorted(parsed_docs.items()):
                text = doc_text.lower()
                text = re.sub(r'[^a-zA-Z\s]', ' ', text)
                tokens = nltk.word_tokenize(text)
                tokens = [word for word in tokens if word.isalpha()]
                filtered_tokens = [word for word in tokens if word not in stopwords]
                stemmed_tokens = [word_stemmer.stem(word) for word in filtered_tokens]
                processed_docs[doc_id] = stemmed_tokens

# Create WordDictionary
WordDictionary = {}
current_word_id = 1
for words in processed_docs.values():
    for word in words:
        if word not in WordDictionary:
            WordDictionary[word] = current_word_id
            current_word_id += 1

# Sort dictionaries
WordDictionary = dict(sorted(WordDictionary.items()))
FileDictionary = dict(sorted(FileDictionary.items()))

# Save WordDictionary
word_dict_path = "/content/drive/MyDrive/InfoRetrieval/word_dictionary.txt"
with open(word_dict_path, "w") as word_file:
    for word, word_id in WordDictionary.items():
        word_file.write(f"{word}\t{word_id}\n")

# Save FileDictionary (filename + local doc number)
file_dict_path = "/content/drive/MyDrive/InfoRetrieval/file_dictionary.txt"
with open(file_dict_path, "w") as file_file:
    for (file_name, local_doc_num), global_id in FileDictionary.items():
        file_file.write(f"{file_name}\t{local_doc_num}\t{global_id}\n")

# Save FTFileMapping (file -> list of DOCNOs)
ft_map_path = "/content/drive/MyDrive/InfoRetrieval/ft_file_mapping.txt"
with open(ft_map_path, "w") as map_file:
    for file, doc_ids in FTFileMapping.items():
        map_file.write(f"{file}:\t{', '.join(doc_ids)}\n")

# Save DocumentToFileMapping (each line: file_name<TAB>DOCNO)
doc_file_map_path = "/content/drive/MyDrive/InfoRetrieval/doc_to_file_mapping.txt"
with open(doc_file_map_path, "w") as out_file:
    for file_name, docno in DocumentToFileMapping:
        docno=docno.split("-")[-1]
        out_file.write(f"{file_name}\t{docno}\n")


# Logs
print(f"WordDictionary saved to {word_dict_path}")
print(f"FileDictionary saved to {file_dict_path}")
print(f"FTFileMapping saved to {ft_map_path}")
print(f"DocumentToFileMapping saved to {doc_file_map_path}")

print("Token and Token ID Mapping (First 100):")
for token, token_id in list(WordDictionary.items())[:100]:
    print(f"{token}\t{token_id}")

print("DocumentToFileMapping (First 1000):")
for entry in DocumentToFileMapping[721:1100]:
    print(entry)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_1
Parsed documents (first 10): ['FT911-1', 'FT911-2', 'FT911-3', 'FT911-4', 'FT911-5', 'FT911-6', 'FT911-7', 'FT911-8', 'FT911-9', 'FT911-10']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_2
Parsed documents (first 10): ['FT911-376', 'FT911-377', 'FT911-378', 'FT911-379', 'FT911-380', 'FT911-381', 'FT911-382', 'FT911-383', 'FT911-384', 'FT911-385']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_3
Parsed documents (first 10): ['FT911-722', 'FT911-723', 'FT911-724', 'FT911-725', 'FT911-726', 'FT911-727', 'FT911-728', 'FT911-729', 'FT911-730', 'FT911-731']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_4
Parsed documents (first 10): ['FT911-1099', 'FT911-1100', 'FT911-1101', 'FT911-1102', 'FT911-1103', 'FT911-1104', 'FT911-1105', 'FT911-1106', 'FT911-1107', 'FT911-1108']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_5
Parsed documents (first 10): ['FT911-14

In [None]:
import os
import re
import nltk
from nltk.stem import PorterStemmer

nltk.download('punkt')

# Load stopwords
stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().split())

# Initialize the stemmer
word_stemmer = PorterStemmer()

docs_path = "/content/drive/MyDrive/InfoRetrieval/ft911/"
processed_docs = {}
FTFileMapping = {}  # Mapping: file_name -> list of DOCNOs
FileDictionary = {}  # Mapping: (file_name, local_doc_num) -> global_doc_id
DocumentToFileMapping = []  # list of (file_name, numeric_DOCNO)

# Sort files numerically based on suffix
file_list = sorted(
    [f for f in os.listdir(docs_path) if f.startswith("ft911_")],
    key=lambda x: int(re.findall(r'\d+', x)[0])
)

# Process each file
global_doc_id = 1
for file_name in file_list:
    full_file_path = os.path.join(docs_path, file_name)

    if os.path.isfile(full_file_path):
        print(f"Checking file: {full_file_path}")
        with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as file:
            file_content = file.read()

            # Parse TREC documents
            parsed_docs = {}
            current_doc_lines = []
            current_doc_id = None
            for line in file_content.splitlines():
                if line.startswith("<DOCNO>"):
                    if current_doc_id is not None:
                        parsed_docs[current_doc_id] = " ".join(current_doc_lines)
                    current_doc_id = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
                    current_doc_lines = []
                elif not line.startswith("<") and current_doc_id:
                    current_doc_lines.append(line.strip())
            if current_doc_id is not None:
                parsed_docs[current_doc_id] = " ".join(current_doc_lines)

            print(f"Parsed documents (first 10): {list(parsed_docs.keys())[:10]}")

            # Save file -> DOCNOs
            FTFileMapping[file_name] = list(parsed_docs.keys())

            # Save cleaned (file_name, numeric_DOCNO)
            for docno in parsed_docs.keys():
                numeric_docno = docno.split("-")[-1]
                DocumentToFileMapping.append((file_name, numeric_docno))

            # Assign global and local doc IDs
            local_id = 1
            for doc_id in parsed_docs:
                FileDictionary[(file_name, local_id)] = global_doc_id
                local_id += 1
                global_doc_id += 1

            # Process text
            for doc_id, doc_text in sorted(parsed_docs.items()):
                text = doc_text.lower()
                text = re.sub(r'[^a-zA-Z\s]', ' ', text)
                tokens = nltk.word_tokenize(text)
                tokens = [word for word in tokens if word.isalpha()]
                filtered_tokens = [word for word in tokens if word not in stopwords]
                stemmed_tokens = [word_stemmer.stem(word) for word in filtered_tokens]
                processed_docs[doc_id] = stemmed_tokens

# Create WordDictionary
WordDictionary = {}
current_word_id = 1
for words in processed_docs.values():
    for word in words:
        if word not in WordDictionary:
            WordDictionary[word] = current_word_id
            current_word_id += 1

# Sort dictionaries
WordDictionary = dict(sorted(WordDictionary.items()))
FileDictionary = dict(sorted(FileDictionary.items()))

# Save WordDictionary
word_dict_path = "/content/drive/MyDrive/InfoRetrieval/word_dictionary.txt"
with open(word_dict_path, "w") as word_file:
    for word, word_id in WordDictionary.items():
        word_file.write(f"{word}\t{word_id}\n")

# Save FileDictionary (filename + local doc number)
file_dict_path = "/content/drive/MyDrive/InfoRetrieval/file_dictionary.txt"
with open(file_dict_path, "w") as file_file:
    for (file_name, local_doc_num), global_id in FileDictionary.items():
        file_file.write(f"{file_name}\t{local_doc_num}\t{global_id}\n")

# Save FTFileMapping (file -> list of DOCNOs)
ft_map_path = "/content/drive/MyDrive/InfoRetrieval/ft_file_mapping.txt"
with open(ft_map_path, "w") as map_file:
    for file, doc_ids in FTFileMapping.items():
        map_file.write(f"{file}:\t{', '.join(doc_ids)}\n")

# Save DocumentToFileMapping with numeric DOCNOs
doc_file_map_path = "/content/drive/MyDrive/InfoRetrieval/doc_to_file_mapping.txt"
with open(doc_file_map_path, "w") as out_file:
    for file_name, numeric_docno in DocumentToFileMapping:
        out_file.write(f"{file_name}\t{numeric_docno}\n")

# Logs
print(f"WordDictionary saved to {word_dict_path}")
print(f"FileDictionary saved to {file_dict_path}")
print(f"FTFileMapping saved to {ft_map_path}")
print(f"DocumentToFileMapping saved to {doc_file_map_path}")

print("Token and Token ID Mapping (First 100):")
for token, token_id in list(WordDictionary.items())[:100]:
    print(f"{token}\t{token_id}")

# Print cleaned DocumentToFileMapping
print("DocumentToFileMapping (First 1000):")
for file_name, numeric_docno in DocumentToFileMapping[718:1100]:
    print((file_name, numeric_docno))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_1
Parsed documents (first 10): ['FT911-1', 'FT911-2', 'FT911-3', 'FT911-4', 'FT911-5', 'FT911-6', 'FT911-7', 'FT911-8', 'FT911-9', 'FT911-10']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_2
Parsed documents (first 10): ['FT911-376', 'FT911-377', 'FT911-378', 'FT911-379', 'FT911-380', 'FT911-381', 'FT911-382', 'FT911-383', 'FT911-384', 'FT911-385']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_3
Parsed documents (first 10): ['FT911-722', 'FT911-723', 'FT911-724', 'FT911-725', 'FT911-726', 'FT911-727', 'FT911-728', 'FT911-729', 'FT911-730', 'FT911-731']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_4
Parsed documents (first 10): ['FT911-1099', 'FT911-1100', 'FT911-1101', 'FT911-1102', 'FT911-1103', 'FT911-1104', 'FT911-1105', 'FT911-1106', 'FT911-1107', 'FT911-1108']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_5
Parsed documents (first 10): ['FT911-14

In [None]:
import os
import re
import nltk
from nltk.stem import PorterStemmer

nltk.download('punkt')

# Load stopwords
stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().split())

# Initialize the stemmer
word_stemmer = PorterStemmer()

docs_path = "/content/drive/MyDrive/InfoRetrieval/ft911/"
processed_docs = {}
FTFileMapping = {}  # Mapping: file_name -> list of DOCNOs
FileDictionary = {}  # Mapping: (file_name, local_doc_num) -> global_doc_id
DocumentToFileMapping = []  # list of (file_name, numeric_DOCNO)

# Sort files numerically based on suffix
file_list = sorted(
    [f for f in os.listdir(docs_path) if f.startswith("ft911_")],
    key=lambda x: int(re.findall(r'\d+', x)[0])
)

# Process each file
global_doc_id = 1
for file_name in file_list:
    full_file_path = os.path.join(docs_path, file_name)

    if os.path.isfile(full_file_path):
        print(f"Checking file: {full_file_path}")
        with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as file:
            file_content = file.read()

            # Parse TREC documents
            parsed_docs = {}
            current_doc_lines = []
            current_doc_id = None
            for line in file_content.splitlines():
                if line.startswith("<DOCNO>"):
                    if current_doc_id is not None:
                        parsed_docs[current_doc_id] = " ".join(current_doc_lines)
                    current_doc_id = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
                    current_doc_lines = []
                elif not line.startswith("<") and current_doc_id:
                    current_doc_lines.append(line.strip())
            if current_doc_id is not None:
                parsed_docs[current_doc_id] = " ".join(current_doc_lines)

            print(f"Parsed documents (first 10): {list(parsed_docs.keys())[:10]}")

            # Save file -> DOCNOs
            FTFileMapping[file_name] = list(parsed_docs.keys())

            # Save cleaned (FT911-X, numeric_DOCNO)
            for docno in parsed_docs.keys():
                numeric_docno = docno.split("-")[-1]
                ft_formatted = f"FT911-{file_name.split('_')[-1]}"
                DocumentToFileMapping.append((ft_formatted, numeric_docno))

            # Assign global and local doc IDs
            local_id = 1
            for doc_id in parsed_docs:
                FileDictionary[(file_name, local_id)] = global_doc_id
                local_id += 1
                global_doc_id += 1

            # Process text
            for doc_id, doc_text in sorted(parsed_docs.items()):
                text = doc_text.lower()
                text = re.sub(r'[^a-zA-Z\s]', ' ', text)
                tokens = nltk.word_tokenize(text)
                tokens = [word for word in tokens if word.isalpha()]
                filtered_tokens = [word for word in tokens if word not in stopwords]
                stemmed_tokens = [word_stemmer.stem(word) for word in filtered_tokens]
                processed_docs[doc_id] = stemmed_tokens

# Create WordDictionary
WordDictionary = {}
current_word_id = 1
for words in processed_docs.values():
    for word in words:
        if word not in WordDictionary:
            WordDictionary[word] = current_word_id
            current_word_id += 1

# Sort dictionaries
WordDictionary = dict(sorted(WordDictionary.items()))
FileDictionary = dict(sorted(FileDictionary.items()))

# Save WordDictionary
word_dict_path = "/content/drive/MyDrive/InfoRetrieval/word_dictionary.txt"
with open(word_dict_path, "w") as word_file:
    for word, word_id in WordDictionary.items():
        word_file.write(f"{word}\t{word_id}\n")

# Save FileDictionary (filename + local doc number)
file_dict_path = "/content/drive/MyDrive/InfoRetrieval/file_dictionary.txt"
with open(file_dict_path, "w") as file_file:
    for (file_name, local_doc_num), global_id in FileDictionary.items():
        file_file.write(f"{file_name}\t{local_doc_num}\t{global_id}\n")

# Save FTFileMapping (file -> list of DOCNOs)
ft_map_path = "/content/drive/MyDrive/InfoRetrieval/ft_file_mapping.txt"
with open(ft_map_path, "w") as map_file:
    for file, doc_ids in FTFileMapping.items():
        map_file.write(f"{file}:\t{', '.join(doc_ids)}\n")

# Save final formatted DocumentToFileMapping
doc_file_map_path = "/content/drive/MyDrive/InfoRetrieval/doc_to_file_mapping.txt"
with open(doc_file_map_path, "w") as out_file:
    for formatted_file, numeric_docno in DocumentToFileMapping:
        out_file.write(f"{formatted_file}\t{numeric_docno}\n")

# Logs
print(f"WordDictionary saved to {word_dict_path}")
print(f"FileDictionary saved to {file_dict_path}")
print(f"FTFileMapping saved to {ft_map_path}")
print(f"DocumentToFileMapping saved to {doc_file_map_path}")

print("Token and Token ID Mapping (First 100):")
for token, token_id in list(WordDictionary.items())[:100]:
    print(f"{token}\t{token_id}")

# Print the final formatted output
print("DocumentToFileMapping (First 1000):")
for formatted_file, numeric_docno in DocumentToFileMapping[717:1100]:
    print((formatted_file, numeric_docno))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_1
Parsed documents (first 10): ['FT911-1', 'FT911-2', 'FT911-3', 'FT911-4', 'FT911-5', 'FT911-6', 'FT911-7', 'FT911-8', 'FT911-9', 'FT911-10']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_2
Parsed documents (first 10): ['FT911-376', 'FT911-377', 'FT911-378', 'FT911-379', 'FT911-380', 'FT911-381', 'FT911-382', 'FT911-383', 'FT911-384', 'FT911-385']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_3
Parsed documents (first 10): ['FT911-722', 'FT911-723', 'FT911-724', 'FT911-725', 'FT911-726', 'FT911-727', 'FT911-728', 'FT911-729', 'FT911-730', 'FT911-731']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_4
Parsed documents (first 10): ['FT911-1099', 'FT911-1100', 'FT911-1101', 'FT911-1102', 'FT911-1103', 'FT911-1104', 'FT911-1105', 'FT911-1106', 'FT911-1107', 'FT911-1108']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_5
Parsed documents (first 10): ['FT911-14

In [None]:
# Combine word_dictionary.txt and doc_to_file_mapping.txt into parser_output.txt
parser_output_path = "/content/drive/MyDrive/InfoRetrieval/parser_output.txt"
word_dict_path = "/content/drive/MyDrive/InfoRetrieval/word_dictionary.txt"
doc_file_map_path = "/content/drive/MyDrive/InfoRetrieval/doc_to_file_mapping.txt"

with open(parser_output_path, "w") as output_file:
    # Write word dictionary first
    with open(word_dict_path, "r") as word_file:
        output_file.write("### Word Dictionary ###\n")
        output_file.writelines(word_file.readlines())
        output_file.write("\n")

    # Write document-to-file mapping next
    with open(doc_file_map_path, "r") as map_file:
        output_file.write("### Document to File Mapping ###\n")
        output_file.writelines(map_file.readlines())

print(f"Combined output saved to {parser_output_path}")

Combined output saved to /content/drive/MyDrive/InfoRetrieval/parser_output.txt


In [None]:
import os
import re
import nltk
from nltk.stem import PorterStemmer

nltk.download('punkt')

# Load stopwords
stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().split())

# Initialize the stemmer
word_stemmer = PorterStemmer()

docs_path = "/content/drive/MyDrive/InfoRetrieval/ft911/"
processed_docs = {}
FTFileMapping = {}
FileDictionary = {}
DocumentToFileMapping = []

# Sort files numerically based on suffix
file_list = sorted(
    [f for f in os.listdir(docs_path) if f.startswith("ft911_")],
    key=lambda x: int(re.findall(r'\d+', x)[0])
)

# Process each file
global_doc_id = 1
for file_name in file_list:
    full_file_path = os.path.join(docs_path, file_name)

    if os.path.isfile(full_file_path):
        print(f"Checking file: {full_file_path}")
        with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as file:
            file_content = file.read()

            # Parse TREC documents
            parsed_docs = {}
            current_doc_lines = []
            current_doc_id = None
            for line in file_content.splitlines():
                if line.startswith("<DOCNO>"):
                    if current_doc_id is not None:
                        parsed_docs[current_doc_id] = " ".join(current_doc_lines)
                    current_doc_id = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
                    current_doc_lines = []
                elif not line.startswith("<") and current_doc_id:
                    current_doc_lines.append(line.strip())
            if current_doc_id is not None:
                parsed_docs[current_doc_id] = " ".join(current_doc_lines)

            print(f"Parsed documents (first 10): {list(parsed_docs.keys())[:10]}")

            # Save file -> DOCNOs
            FTFileMapping[file_name] = list(parsed_docs.keys())

            # Save cleaned (FT911-X, numeric_DOCNO)
            for docno in parsed_docs.keys():
                numeric_docno = docno.split("-")[-1]
                ft_formatted = f"FT911-{file_name.split('_')[-1]}"
                DocumentToFileMapping.append((ft_formatted, numeric_docno))

            # Assign global and local doc IDs
            local_id = 1
            for doc_id in parsed_docs:
                FileDictionary[(file_name, local_id)] = global_doc_id
                local_id += 1
                global_doc_id += 1

            # Process text
            for doc_id, doc_text in sorted(parsed_docs.items()):
                text = doc_text.lower()
                text = re.sub(r'[^a-zA-Z\s]', ' ', text)
                tokens = nltk.word_tokenize(text)
                tokens = [word for word in tokens if word.isalpha()]
                filtered_tokens = [word for word in tokens if word not in stopwords]
                stemmed_tokens = [word_stemmer.stem(word) for word in filtered_tokens]
                processed_docs[doc_id] = stemmed_tokens

# Create WordDictionary
WordDictionary = {}
current_word_id = 1
for words in processed_docs.values():
    for word in words:
        if word not in WordDictionary:
            WordDictionary[word] = current_word_id
            current_word_id += 1

# Sort dictionaries
WordDictionary = dict(sorted(WordDictionary.items()))
FileDictionary = dict(sorted(FileDictionary.items()))

# Save WordDictionary
word_dict_path = "/content/drive/MyDrive/InfoRetrieval/word_dictionary.txt"
with open(word_dict_path, "w") as word_file:
    for word, word_id in WordDictionary.items():
        word_file.write(f"{word}\t{word_id}\n")

# Save FileDictionary
file_dict_path = "/content/drive/MyDrive/InfoRetrieval/file_dictionary.txt"
with open(file_dict_path, "w") as file_file:
    for (file_name, local_doc_num), global_id in FileDictionary.items():
        file_file.write(f"{file_name}\t{local_doc_num}\t{global_id}\n")

# Save FTFileMapping
ft_map_path = "/content/drive/MyDrive/InfoRetrieval/ft_file_mapping.txt"
with open(ft_map_path, "w") as map_file:
    for file, doc_ids in FTFileMapping.items():
        map_file.write(f"{file}:\t{', '.join(doc_ids)}\n")

# ✅ Sort DocumentToFileMapping numerically by FT file number and then by docno
def sort_key(item):
    ft_number = int(item[0].split("-")[-1])
    doc_number = int(item[1])
    return (ft_number, doc_number)

DocumentToFileMapping.sort(key=sort_key)

# Save final formatted DocumentToFileMapping
doc_file_map_path = "/content/drive/MyDrive/InfoRetrieval/doc_to_file_mapping.txt"
with open(doc_file_map_path, "w") as out_file:
    for formatted_file, numeric_docno in DocumentToFileMapping:
        out_file.write(f"{formatted_file}\t{numeric_docno}\n")

# ✅ Combine into parser_output.txt
parser_output_path = "/content/drive/MyDrive/InfoRetrieval/parser_output.txt"
with open(parser_output_path, "w") as output_file:
    # Write word dictionary
    output_file.write("### token:  token ID ###\n")
    with open(word_dict_path, "r") as word_file:
        output_file.writelines(word_file.readlines())
    output_file.write("\n")

    # Write sorted document-to-file mapping
    output_file.write("### document name: doc ID ###\n")
    with open(doc_file_map_path, "r") as map_file:
        output_file.writelines(map_file.readlines())

# ✅ Logs
print(f"WordDictionary saved to {word_dict_path}")
print(f"FileDictionary saved to {file_dict_path}")
print(f"FTFileMapping saved to {ft_map_path}")
print(f"DocumentToFileMapping saved to {doc_file_map_path}")
print(f"Combined parser_output saved to {parser_output_path}")

# Optional print
print("DocumentToFileMapping (Sample):")
for entry in DocumentToFileMapping[-10:]:
    print(entry)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_1
Parsed documents (first 10): ['FT911-1', 'FT911-2', 'FT911-3', 'FT911-4', 'FT911-5', 'FT911-6', 'FT911-7', 'FT911-8', 'FT911-9', 'FT911-10']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_2
Parsed documents (first 10): ['FT911-376', 'FT911-377', 'FT911-378', 'FT911-379', 'FT911-380', 'FT911-381', 'FT911-382', 'FT911-383', 'FT911-384', 'FT911-385']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_3
Parsed documents (first 10): ['FT911-722', 'FT911-723', 'FT911-724', 'FT911-725', 'FT911-726', 'FT911-727', 'FT911-728', 'FT911-729', 'FT911-730', 'FT911-731']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_4
Parsed documents (first 10): ['FT911-1099', 'FT911-1100', 'FT911-1101', 'FT911-1102', 'FT911-1103', 'FT911-1104', 'FT911-1105', 'FT911-1106', 'FT911-1107', 'FT911-1108']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_5
Parsed documents (first 10): ['FT911-14

In [None]:
import os
import re
import nltk
from nltk.stem import PorterStemmer

nltk.download('punkt')

# Load stopwords
stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().split())

# Initialize the stemmer
word_stemmer = PorterStemmer()

docs_path = "/content/drive/MyDrive/InfoRetrieval/ft911/"
processed_docs = {}
FTFileMapping = {}  # Mapping: file_name -> list of DOCNOs
FileDictionary = {}  # Mapping: (file_name, local_doc_num) -> global_doc_id
DocumentToFileMapping = []  # list of (file_name, numeric_DOCNO)

# Sort files numerically based on suffix
file_list = sorted(
    [f for f in os.listdir(docs_path) if f.startswith("ft911_")],
    key=lambda x: int(re.findall(r'\d+', x)[0])
)

# Process each file
global_doc_id = 1
for file_name in file_list:
    full_file_path = os.path.join(docs_path, file_name)

    if os.path.isfile(full_file_path):
        print(f"Checking file: {full_file_path}")
        with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as file:
            file_content = file.read()

            # Parse TREC documents
            parsed_docs = {}
            current_doc_lines = []
            current_doc_id = None
            for line in file_content.splitlines():
                if line.startswith("<DOCNO>"):
                    if current_doc_id is not None:
                        parsed_docs[current_doc_id] = " ".join(current_doc_lines)
                    current_doc_id = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
                    current_doc_lines = []
                elif not line.startswith("<") and current_doc_id:
                    current_doc_lines.append(line.strip())
            if current_doc_id is not None:
                parsed_docs[current_doc_id] = " ".join(current_doc_lines)

            print(f"Parsed documents (first 10): {list(parsed_docs.keys())[:10]}")

            # Save file -> DOCNOs
            FTFileMapping[file_name] = list(parsed_docs.keys())

            # Save cleaned (FT911-X, numeric_DOCNO)
            for docno in parsed_docs.keys():
                numeric_docno = docno.split("-")[-1]
                ft_formatted = f"FT911-{file_name.split('_')[-1]}"
                DocumentToFileMapping.append((ft_formatted, numeric_docno))

            # Assign global and local doc IDs
            local_id = 1
            for doc_id in parsed_docs:
                FileDictionary[(file_name, local_id)] = global_doc_id
                local_id += 1
                global_doc_id += 1

            # Process text
            for doc_id, doc_text in sorted(parsed_docs.items()):
                text = doc_text.lower()
                text = re.sub(r'[^a-zA-Z\s]', ' ', text)
                tokens = nltk.word_tokenize(text)
                tokens = [word for word in tokens if word.isalpha()]
                filtered_tokens = [word for word in tokens if word not in stopwords]
                stemmed_tokens = [word_stemmer.stem(word) for word in filtered_tokens]
                processed_docs[doc_id] = stemmed_tokens
# Create WordDictionary: sort all unique words alphabetically, then assign IDs
all_words = set()
for words in processed_docs.values():
    all_words.update(words)

WordDictionary = {}
for idx, word in enumerate(sorted(all_words), start=1):
    WordDictionary[word] = idx

# Sort FileDictionary
FileDictionary = dict(sorted(FileDictionary.items()))

# Sort DocumentToFileMapping: sort FT911-14 before FT911-15
def sort_key(doc):
    prefix, number = doc[0].split("-")
    return int(number), int(doc[1])

DocumentToFileMapping = sorted(DocumentToFileMapping, key=sort_key)

# Save WordDictionary
word_dict_path = "/content/drive/MyDrive/InfoRetrieval/word_dictionary.txt"
with open(word_dict_path, "w") as word_file:
    for word, word_id in WordDictionary.items():
        word_file.write(f"{word}\t{word_id}\n")

# Save FileDictionary (filename + local doc number)
file_dict_path = "/content/drive/MyDrive/InfoRetrieval/file_dictionary.txt"
with open(file_dict_path, "w") as file_file:
    for (file_name, local_doc_num), global_id in FileDictionary.items():
        file_file.write(f"{file_name}\t{local_doc_num}\t{global_id}\n")

# Save FTFileMapping (file -> list of DOCNOs)
ft_map_path = "/content/drive/MyDrive/InfoRetrieval/ft_file_mapping.txt"
with open(ft_map_path, "w") as map_file:
    for file, doc_ids in FTFileMapping.items():
        map_file.write(f"{file}:\t{', '.join(doc_ids)}\n")

# Save final formatted DocumentToFileMapping
doc_file_map_path = "/content/drive/MyDrive/InfoRetrieval/doc_to_file_mapping.txt"
with open(doc_file_map_path, "w") as out_file:
    for formatted_file, numeric_docno in DocumentToFileMapping:
        out_file.write(f"{formatted_file}\t{numeric_docno}\n")

# Create parser_output.txt with token dictionary and document ID mapping
parser_output_path = "/content/drive/MyDrive/InfoRetrieval/parser_output.txt"
with open(parser_output_path, "w") as out_file:
    # First print word -> ID mapping
    for word, word_id in WordDictionary.items():
        out_file.write(f"{word}\t\t{word_id}\n")

    out_file.write("\n")

    # Then print document -> ID mapping
    for idx, (formatted_file, numeric_docno) in enumerate(DocumentToFileMapping, start=1):
        out_file.write(f"{formatted_file}\t{numeric_docno}\n")

# Logs
print(f"WordDictionary saved to {word_dict_path}")
print(f"FileDictionary saved to {file_dict_path}")
print(f"FTFileMapping saved to {ft_map_path}")
print(f"DocumentToFileMapping saved to {doc_file_map_path}")
print(f"parser_output.txt saved to {parser_output_path}")

# Print sample of token to ID mapping
print("Token and Token ID Mapping (First 20):")
for token, token_id in list(WordDictionary.items())[:20]:
    print(f"{token}\t{token_id}")

# Print DocumentToFileMapping like your format
print("DocumentToFileMapping (Sample):")
for pair in DocumentToFileMapping[717:1100]:
    print(pair)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_1
Parsed documents (first 10): ['FT911-1', 'FT911-2', 'FT911-3', 'FT911-4', 'FT911-5', 'FT911-6', 'FT911-7', 'FT911-8', 'FT911-9', 'FT911-10']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_2
Parsed documents (first 10): ['FT911-376', 'FT911-377', 'FT911-378', 'FT911-379', 'FT911-380', 'FT911-381', 'FT911-382', 'FT911-383', 'FT911-384', 'FT911-385']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_3
Parsed documents (first 10): ['FT911-722', 'FT911-723', 'FT911-724', 'FT911-725', 'FT911-726', 'FT911-727', 'FT911-728', 'FT911-729', 'FT911-730', 'FT911-731']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_4
Parsed documents (first 10): ['FT911-1099', 'FT911-1100', 'FT911-1101', 'FT911-1102', 'FT911-1103', 'FT911-1104', 'FT911-1105', 'FT911-1106', 'FT911-1107', 'FT911-1108']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_5
Parsed documents (first 10): ['FT911-14

In [None]:
# Load stopwords
stopwords_path = "/content/drive/MyDrive/InfoRetrieval/stopwordlist.txt"
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().split())

# Initialize the stemmer
word_stemmer = PorterStemmer()

docs_path = "/content/drive/MyDrive/InfoRetrieval/ft911/"
processed_docs = {}
FTFileMapping = {}  # Mapping: file_name -> list of DOCNOs
FileDictionary = {}  # Mapping: (file_name, local_doc_num) -> global_doc_id
DocumentToFileMapping = []  # list of (file_name, numeric_DOCNO)

# Sort files numerically based on suffix
file_list = sorted(
    [f for f in os.listdir(docs_path) if f.startswith("ft911_")],
    key=lambda x: int(re.findall(r'\d+', x)[0])
)

# Process each file
global_doc_id = 1
for file_name in file_list:
    full_file_path = os.path.join(docs_path, file_name)

    if os.path.isfile(full_file_path):
        print(f"Checking file: {full_file_path}")
        with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as file:
            file_content = file.read()

            # Parse TREC documents
            parsed_docs = {}
            current_doc_lines = []
            current_doc_id = None
            for line in file_content.splitlines():
                if line.startswith("<DOCNO>"):
                    if current_doc_id is not None:
                        parsed_docs[current_doc_id] = " ".join(current_doc_lines)
                    current_doc_id = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
                    current_doc_lines = []
                elif not line.startswith("<") and current_doc_id:
                    current_doc_lines.append(line.strip())
            if current_doc_id is not None:
                parsed_docs[current_doc_id] = " ".join(current_doc_lines)

            print(f"Parsed documents (first 10): {list(parsed_docs.keys())[:10]}")

            # Save file -> DOCNOs
            FTFileMapping[file_name] = list(parsed_docs.keys())

            # Save cleaned (FT911-X, numeric_DOCNO)
            for docno in parsed_docs.keys():
                numeric_docno = docno.split("-")[-1]
                ft_formatted = f"FT911-{file_name.split('_')[-1]}"
                DocumentToFileMapping.append((ft_formatted, numeric_docno))

            # Assign global and local doc IDs
            local_id = 1
            for doc_id in parsed_docs:
                FileDictionary[(file_name, local_id)] = global_doc_id
                local_id += 1
                global_doc_id += 1

            # Process text
            for doc_id, doc_text in sorted(parsed_docs.items()):
                text = doc_text.lower()
                text = re.sub(r'[^a-zA-Z\s]', ' ', text)
                tokens = nltk.word_tokenize(text)
                tokens = [word for word in tokens if word.isalpha()]
                filtered_tokens = [word for word in tokens if word not in stopwords]
                stemmed_tokens = [word_stemmer.stem(word) for word in filtered_tokens]
                processed_docs[doc_id] = stemmed_tokens
# Create WordDictionary: sort all unique words alphabetically, then assign IDs
all_words = set()
for words in processed_docs.values():
    all_words.update(words)

WordDictionary = {}
for idx, word in enumerate(sorted(all_words), start=1):
    WordDictionary[word] = idx

# Sort FileDictionary
FileDictionary = dict(sorted(FileDictionary.items()))

# Sort DocumentToFileMapping: sort FT911-14 before FT911-15
def sort_key(doc):
    prefix, number = doc[0].split("-")
    return int(number), int(doc[1])

DocumentToFileMapping = sorted(DocumentToFileMapping, key=sort_key)

# Save WordDictionary
word_dict_path = "/content/drive/MyDrive/InfoRetrieval/word_dictionary.txt"
with open(word_dict_path, "w") as word_file:
    for word, word_id in WordDictionary.items():
        word_file.write(f"{word}\t{word_id}\n")

# Save FileDictionary (filename + local doc number)
file_dict_path = "/content/drive/MyDrive/InfoRetrieval/file_dictionary.txt"
with open(file_dict_path, "w") as file_file:
    for (file_name, local_doc_num), global_id in FileDictionary.items():
        file_file.write(f"{file_name}\t{local_doc_num}\t{global_id}\n")

# Save FTFileMapping (file -> list of DOCNOs)
ft_map_path = "/content/drive/MyDrive/InfoRetrieval/ft_file_mapping.txt"
with open(ft_map_path, "w") as map_file:
    for file, doc_ids in FTFileMapping.items():
        map_file.write(f"{file}:\t{', '.join(doc_ids)}\n")

# Save final formatted DocumentToFileMapping
doc_file_map_path = "/content/drive/MyDrive/InfoRetrieval/doc_to_file_mapping.txt"
with open(doc_file_map_path, "w") as out_file:
    for formatted_file, numeric_docno in DocumentToFileMapping:
        out_file.write(f"{formatted_file}\t{numeric_docno}\n")

# Create parser_output.txt with token dictionary and document ID mapping
parser_output_path = "/content/drive/MyDrive/InfoRetrieval/parser_output.txt"
with open(parser_output_path, "w") as out_file:
    # First print word -> ID mapping
    for word, word_id in WordDictionary.items():
        out_file.write(f"{word}\t\t{word_id}\n")

    out_file.write("\n")

    # Then print document -> ID mapping
    for idx, (formatted_file, numeric_docno) in enumerate(DocumentToFileMapping, start=1):
        out_file.write(f"{formatted_file}\t{numeric_docno}\n")

# Logs
print(f"WordDictionary saved to {word_dict_path}")
print(f"FileDictionary saved to {file_dict_path}")
print(f"FTFileMapping saved to {ft_map_path}")
print(f"DocumentToFileMapping saved to {doc_file_map_path}")
print(f"parser_output.txt saved to {parser_output_path}")

# Print sample of token to ID mapping
print("Token and Token ID Mapping (First 20):")
for token, token_id in list(WordDictionary.items())[:20]:
    print(f"{token}\t{token_id}")

# Print DocumentToFileMapping like your format
print("DocumentToFileMapping (Sample):")
for pair in DocumentToFileMapping[900:910]:
    print(pair)

Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_1
Parsed documents (first 10): ['FT911-1', 'FT911-2', 'FT911-3', 'FT911-4', 'FT911-5', 'FT911-6', 'FT911-7', 'FT911-8', 'FT911-9', 'FT911-10']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_2
Parsed documents (first 10): ['FT911-376', 'FT911-377', 'FT911-378', 'FT911-379', 'FT911-380', 'FT911-381', 'FT911-382', 'FT911-383', 'FT911-384', 'FT911-385']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_3
Parsed documents (first 10): ['FT911-722', 'FT911-723', 'FT911-724', 'FT911-725', 'FT911-726', 'FT911-727', 'FT911-728', 'FT911-729', 'FT911-730', 'FT911-731']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_4
Parsed documents (first 10): ['FT911-1099', 'FT911-1100', 'FT911-1101', 'FT911-1102', 'FT911-1103', 'FT911-1104', 'FT911-1105', 'FT911-1106', 'FT911-1107', 'FT911-1108']
Checking file: /content/drive/MyDrive/InfoRetrieval/ft911/ft911_5
Parsed documents (first 10): ['FT911-14