In [1]:
from coll import parse_rcv_coll
from coll import BowColl
import math
import sys
import glob, os
import string
from stemming.porter2 import stem

# Resolve every input location against the directory the script was started
# from, so later relative opens and the output folder share the same root.
cwd = os.getcwd()

# Folder holding the per-query document collections, and the stop-word file
inputpath = "DataSets"
file_stop_words = "common-english-words.txt"
inputpath_files = os.path.join(cwd, inputpath)
stop_words_list = os.path.join(cwd, file_stop_words)

# Read the comma-separated stop-word list through the path built above
# (previously a second, hard-coded copy of the file name was opened).
# Each entry is stripped and lower-cased: the raw split leaves the trailing
# newline on the last word and any blank after a comma attached to the word,
# and such entries could never match a token.
with open(stop_words_list, 'r') as f:
    stop_words = {entry.strip().lower() for entry in f.read().split(',')}

# An empty file or a trailing comma yields an empty entry; it is not a word
stop_words.discard('')

# Single entry point for stemming, so the rest of the script does not call
# the stemming library directly
def stemmer(word):
    """Return the Porter2 stem of *word* as produced by ``stem``."""
    stemmed = stem(word)
    return stemmed

# Raw contents of every .xml file found anywhere below inputpath_files,
# keyed by bare file name.
# NOTE(review): two files with the same name in different subfolders
# overwrite each other here, which lowers N — confirm names are unique.
datasets = {}

for dirpath, dirnames, filenames in os.walk(inputpath_files):
    for filename in filenames:
        if not filename.endswith('.xml'):
            continue
        filepath = os.path.join(dirpath, filename)
        with open(filepath, 'r') as f:
            datasets[filename] = f.read()

# Collection size and average document length.
# NOTE(review): avgdl is measured in characters of the raw file (markup
# included), while the BM25 code further down measures a document's length
# in stemmed tokens — confirm which unit is intended.
N = len(datasets)
if N:
    avgdl = sum(len(dataset) for dataset in datasets.values()) / N
else:
    # No .xml file was found: avoid a ZeroDivisionError at this point
    avgdl = 0.0

# Define constants for BM25
# k1: term-frequency saturation; it scales the length-normalised factor K
#     and appears in the (k1 + 1) * tf / (K + tf) component
k1 = 1.2
# b: weight of the document-length ratio inside K (the remaining 1 - b is
#    the length-independent share)
b = 0.75
# k2: query-term-frequency saturation, used in (k2 + 1) * qf / (k2 + qf)
k2 = 500

# Define constants for Rocchio
# alpha: weight of the original query contribution (idf * qf)
alpha = 1
# beta: weight of the contribution from documents ranked in the top group
beta = 0.75
# gamma: weight of the contribution subtracted for the remaining documents
gamma = 0.25

# Read the queries from the file and store them in a dictionary that maps
# the text following each '<num>' tag to the list of preprocessed title terms
queries = {}

os.chdir(cwd)
with open('Queries.txt', 'r') as f:
    query_lines = f.readlines()

# Same normalisation the document text receives further down in this file:
# digits are deleted and ASCII punctuation becomes a space, so that query
# terms and document terms are comparable token for token.
_query_cleanup = str.maketrans(
    string.punctuation, ' ' * len(string.punctuation), string.digits
)

for i, num_line in enumerate(query_lines):
    if not num_line.startswith('<num>'):
        continue

    # Everything after the first six characters of the stripped '<num>' line
    query_num = num_line.strip()[6:]

    # Find the next '<title>' line; stop at the end of the file instead of
    # indexing past it when a '<num>' entry has no title.
    j = i + 1
    while j < len(query_lines) and not query_lines[j].startswith('<title>'):
        j += 1
    if j == len(query_lines):
        continue  # malformed entry without a title: nothing to store
    query_title = query_lines[j].strip()[7:]

    # Tokenize the normalised, lower-cased title into a list of words
    words = query_title.translate(_query_cleanup).lower().split()

    # Remove stop words (the previous filter only dropped tokens that were
    # a substring of string.punctuation and never consulted stop_words)
    words = [w for w in words if w not in stop_words]

    # Stem the remaining words using the Porter stemming algorithm
    words = [stemmer(w) for w in words]

    # Store the preprocessed query title in the dictionary
    queries[query_num] = words

# Process each query-subfolder pair: rank the documents of "Dataset<digits>"
# with BM25, re-score them with a Rocchio-style pseudo relevance feedback
# step, and write the final ranking to Result_Model2v1/Dataset<digits>/output.txt

# One table that deletes digits and turns ASCII punctuation into spaces
_doc_cleanup = str.maketrans(
    string.punctuation, ' ' * len(string.punctuation), string.digits
)

# Number of top-ranked BM25 documents treated as (pseudo) relevant
_top_k_relevant = 12


def _tokenize_document(path):
    """Return the stemmed, stop-word-free tokens of one document's text body.

    Only lines after a line starting with ``<text>`` and before the first
    line starting with ``</text>`` are used. ``<p>``/``</p>`` tags are
    removed, digits deleted, punctuation replaced by spaces, and the
    remaining lower-cased words are filtered against ``stop_words`` and
    stemmed with ``stemmer``.
    """
    tokens = []
    inside_text = False
    with open(path, 'r') as f:
        for line in f:
            if line.startswith("<text>"):
                inside_text = True
            elif line.startswith("</text>"):
                break
            elif inside_text:
                line = line.replace("<p>", "").replace("</p>", "")
                line = line.translate(_doc_cleanup)
                # split() already collapses any run of whitespace
                tokens.extend(
                    stemmer(w) for w in line.lower().split()
                    if w not in stop_words
                )
    return tokens


query_nums = list(queries.keys())

# Output root (relative to the current directory) and input root
main_folder = "Result_Model2v1"
main_folder_path = os.path.join(cwd, "DataSets")

for query_num in query_nums:
    query_words = queries[query_num]
    scores = {}

    # Frequency of each distinct query term, so that a repeated term is
    # scored once with its qf instead of once per occurrence
    query_tf = {}
    for word in query_words:
        query_tf[word] = query_tf.get(word, 0) + 1

    # Sanitize the query number for subfolder name
    sanitized_query_num = ''.join(c for c in query_num if c.isdigit())

    # Create the subfolder name based on the query number
    subfolder_name = f"Dataset{sanitized_query_num}"

    # Create the output folder for the query (parents are created as needed)
    output_subfolder = os.path.join(main_folder, subfolder_name)
    os.makedirs(output_subfolder, exist_ok=True)

    # Tokenize every XML document of the query's subfolder exactly once;
    # both scoring passes and the idf computation reuse these tokens.
    subfolder_path = os.path.join(main_folder_path, subfolder_name)
    doc_tokens = {}
    for filename in os.listdir(subfolder_path):
        if filename.endswith(".xml"):
            dataset_name = os.path.splitext(filename)[0]
            doc_tokens[dataset_name] = _tokenize_document(
                os.path.join(subfolder_path, filename)
            )

    # Collection statistics, all measured on the same token representation
    num_docs = len(doc_tokens)
    doc_vocab = {name: set(tokens) for name, tokens in doc_tokens.items()}
    avg_doc_len = (
        sum(len(tokens) for tokens in doc_tokens.values()) / num_docs
        if num_docs else 0
    )

    # idf of each query term: document frequency counts documents whose
    # token set contains the stemmed term (not a substring match against
    # raw markup), and the collection size counts XML documents only.
    idf_scores = {}
    for word in query_tf:
        df = sum(1 for vocab in doc_vocab.values() if word in vocab)
        idf_scores[word] = math.log10((num_docs - df + 0.5) / (df + 0.5))

    # First pass: BM25 score for every query-document pair
    for dataset_name, doc_words in doc_tokens.items():
        # Document length relative to the average, both in tokens; a
        # collection of empty documents has no meaningful ratio
        length_ratio = len(doc_words) / avg_doc_len if avg_doc_len else 0
        K = k1 * ((1 - b) + b * length_ratio)
        score = 0
        for word, qf in query_tf.items():
            tf = doc_words.count(word)
            if tf == 0:
                continue
            idf = idf_scores[word]
            score += idf * ((k1 + 1) * tf / (K + tf)) * ((k2 + 1) * qf / (k2 + qf))

        # Store the score for the query-document pair
        scores[dataset_name] = score

    # Apply Rocchio's algorithm for pseudo relevance feedback: the top
    # documents of the BM25 ranking are assumed relevant, the rest not.
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    rel_docs = [name for name, _ in sorted_scores[:_top_k_relevant]]
    non_rel_docs = [name for name, _ in sorted_scores[_top_k_relevant:]]
    rel_doc_set = set(rel_docs)
    bm25_scores = dict(sorted_scores)  # snapshot: scores is rewritten below

    for dataset_name, doc_words in doc_tokens.items():
        bm25 = bm25_scores[dataset_name]
        is_relevant = dataset_name in rel_doc_set

        # Sum the contribution of every matching query term. (Assigning
        # inside the loop kept only the last matching term.) A document
        # matching no query term keeps a score of 0, as after the BM25 pass.
        new_score = 0
        for word, qf in query_tf.items():
            tf = doc_words.count(word)
            if tf == 0:
                continue
            idf = idf_scores[word]
            # Only the group the document belongs to contributes, so the
            # size of the other (possibly empty) group is never a divisor.
            if is_relevant:
                feedback = beta * (bm25 * tf / len(rel_docs))
            else:
                feedback = -gamma * (bm25 * tf / len(non_rel_docs))
            new_score += alpha * (idf * qf) + feedback
        scores[dataset_name] = new_score

    # Save the output for the query in a file, best score first
    output_file = os.path.join(output_subfolder, "output.txt")
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    with open(output_file, "w") as f:
        for dataset_name, score in sorted_scores:
            f.write(f"{dataset_name}: {score}\n")

    print(f"Query {query_num}: Output saved in the subfolder for the query.")
    print("------------------------------")


ModuleNotFoundError: No module named 'coll'

In [None]:
# NOTE(review): `coll` is imported for parse_rcv_coll and BowColl, which
# looks like a project-local module rather than a published package —
# confirm before installing a package of the same name from an index.
pip install coll