In [5]:
import json
import math
import heapq
import pandas as pd
import numpy as np
from collections import defaultdict
from scipy.sparse import csr_matrix, vstack
from concurrent.futures import ProcessPoolExecutor
import os
import time
from datetime import datetime

# Per-process lookup tables. They start empty and are filled in by
# init_worker(); process_query_tf_idf() reads them.
GLOBAL_IDF_DICT = {}
GLOBAL_TERM_TO_INDEX = {}

def init_worker(idf_dict, term_to_index):
    """Install the IDF table and vocabulary as module-level globals.

    Intended for use as a ``ProcessPoolExecutor`` initializer so worker
    functions can read the tables without receiving them with every task.

    Args:
        idf_dict: Mapping stored in ``GLOBAL_IDF_DICT``.
        term_to_index: Mapping stored in ``GLOBAL_TERM_TO_INDEX``.
    """
    global GLOBAL_IDF_DICT, GLOBAL_TERM_TO_INDEX
    GLOBAL_IDF_DICT, GLOBAL_TERM_TO_INDEX = idf_dict, term_to_index

def log(message):
    """Print ``message`` to stdout prefixed with the current local time.

    The prefix is formatted as ``YYYY-MM-DD HH:MM:SS`` and is separated from
    the message by a single space.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"{timestamp} {message}")

def compute_smoothed_tf(tokens):
    """Return log-scaled term frequencies for a list of tokens.

    Each distinct token maps to ``1 + ln(count)``, where ``count`` is the
    number of times the token occurs in ``tokens``. Keys appear in order of
    first occurrence. An empty input yields an empty dict.
    """
    counts = {}
    for token in tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    # Sublinear scaling: a single occurrence weighs 1.0 and repeated
    # occurrences grow only logarithmically.
    return {token: 1 + math.log(count) for token, count in counts.items()}

def compute_smoothed_idf(corpus_terms_set, num_docs):
    """Return smoothed inverse document frequencies.

    Args:
        corpus_terms_set: Mapping of term -> number of documents containing it.
        num_docs: Total number of documents.

    Returns:
        dict mapping each term to ``ln((1 + num_docs) / (1 + df)) + 1``, the
        same smoothed formula scikit-learn uses.
    """
    return {
        term: math.log((1 + num_docs) / (1 + doc_freq)) + 1
        for term, doc_freq in corpus_terms_set.items()
    }

def compute_tf_idf_vector(tokens, idf_dict, term_to_index):
    """Build an L2-normalised TF-IDF row vector for one token list.

    Args:
        tokens: Tokens of the document or query.
        idf_dict: Mapping term -> IDF weight; a term missing here gets IDF 0.
        term_to_index: Mapping term -> column index. Terms absent from this
            mapping are dropped. Its length is the vector width.

    Returns:
        A ``1 x len(term_to_index)`` ``csr_matrix``. The stored values are
        divided by their Euclidean norm unless that norm is zero.
    """
    columns = []
    weights = []
    for term, tf_weight in compute_smoothed_tf(tokens).items():
        if term not in term_to_index:
            continue  # out-of-vocabulary terms have no column
        columns.append(term_to_index[term])
        weights.append(tf_weight * idf_dict.get(term, 0))

    # A single CSR row: indptr is [0, nnz].
    row = csr_matrix(
        (weights, columns, [0, len(columns)]),
        shape=(1, len(term_to_index)),
    )
    length = np.linalg.norm(row.data)
    if length > 0:
        row.data = row.data / length
    return row

def calculate_idf_from_corpus(corpus_path, vocab_output_path='vocab.json'):
    """Scan the corpus once and derive IDF weights plus a vocabulary.

    Every line of ``corpus_path`` is parsed as a JSON object whose ``'tokens'``
    entry lists that document's tokens.

    Args:
        corpus_path: Path to the JSON-lines corpus file.
        vocab_output_path: Accepted for interface compatibility but currently
            unused; this function does not write the vocabulary to disk.

    Returns:
        ``(idf_dict, term_to_index)``: the smoothed IDF per term, and a mapping
        that numbers terms in order of first appearance.
    """
    log("Starting IDF calculation from corpus...")
    doc_freq = {}
    term_to_index = {}
    num_docs = 0
    with open(corpus_path, 'r') as corpus_file:
        for idx, raw_line in enumerate(corpus_file):
            unique_terms = set(json.loads(raw_line)['tokens'])
            num_docs += 1
            # A term counts once per document, however often it repeats.
            for term in unique_terms:
                if term in doc_freq:
                    doc_freq[term] += 1
                else:
                    doc_freq[term] = 1
                    # First sighting: assign the next free column index.
                    term_to_index[term] = len(term_to_index)
            if idx > 0 and idx % 50000 == 0:
                log(f"Processed {idx} documents...")
    idf_dict = compute_smoothed_idf(doc_freq, num_docs)
    log("IDF calculation complete.")
    return idf_dict, term_to_index

def process_document_tf_idf(line, idf_dict, term_to_index):
    """Convert one corpus line into a JSON string holding its TF-IDF vector.

    Args:
        line: JSON text of an object with ``'doc_id'`` and ``'tokens'`` entries.
        idf_dict: Mapping term -> IDF weight.
        term_to_index: Mapping term -> column index.

    Returns:
        JSON text of ``{doc_id: {"indices": [...], "data": [...]}}`` listing
        the non-zero columns of the vector and their values.
    """
    record = json.loads(line)
    doc_id = record['doc_id']
    vector = compute_tf_idf_vector(record['tokens'], idf_dict, term_to_index)
    # Only the sparse coordinates are stored, as plain Python lists.
    sparse_repr = {
        "indices": vector.indices.tolist(),
        "data": vector.data.tolist(),
    }
    return json.dumps({doc_id: sparse_repr})

def wrapper_process_document_tf_idf(args):
    """Single-argument adapter around ``process_document_tf_idf``.

    ``args`` is unpacked positionally, so it is expected to be
    ``(line, idf_dict, term_to_index)``. This lets the function be used with
    ``Executor.map``, which passes one item per call.
    """
    serialized = process_document_tf_idf(*args)
    return serialized

def _process_document_line(line):
    """Vectorise one corpus line using the tables installed by init_worker."""
    return process_document_tf_idf(line, GLOBAL_IDF_DICT, GLOBAL_TERM_TO_INDEX)


def compute_tf_idf_for_documents(corpus_path, idf_dict, term_to_index, output_path='doc_tf_idf_vectors.json'):
    """Compute a TF-IDF vector for every corpus document using a process pool.

    Results are written to ``output_path`` as JSON lines, one document per
    line, in the same order as the corpus file.

    Args:
        corpus_path: Path to the JSON-lines corpus (one document per line).
        idf_dict: Mapping term -> IDF weight.
        term_to_index: Mapping term -> column index.
        output_path: Destination file for the serialized vectors.
    """
    log("Starting TF-IDF computation for documents...")
    start_time = time.time()
    with open(output_path, 'w') as output_file, open(corpus_path, 'r') as f:
        # First pass only counts lines so progress can be reported as x/total.
        total_docs = sum(1 for _ in f)
        f.seek(0)  # Rewind for the real pass
        log(f"Total documents to process: {total_docs}")
        # Hand the (potentially very large) lookup tables to each worker once
        # via the pool initializer, exactly as the query path does. Passing
        # them inside every task tuple would re-serialize them per chunk.
        with ProcessPoolExecutor(initializer=init_worker, initargs=(idf_dict, term_to_index)) as executor:
            # NOTE: Executor.map collects its input iterable up front, so the
            # file's lines are all read before results start arriving.
            results = executor.map(_process_document_line, f, chunksize=1000)
            for idx, result in enumerate(results, 1):
                output_file.write(result + '\n')
                if idx % 50000 == 0:
                    log(f"Processed {idx}/{total_docs} documents.")
    end_time = time.time()
    log(f"TF-IDF computation for documents complete. Took {end_time - start_time:.2f} seconds.")
    log(f"TF-IDF vectors saved to {output_path}.")

def process_query_tf_idf(line):
    """Convert one query line into a JSON string holding its TF-IDF vector.

    The IDF table and vocabulary are read from the module-level
    ``GLOBAL_IDF_DICT`` and ``GLOBAL_TERM_TO_INDEX``.

    Args:
        line: JSON text of an object with ``'query_id'`` and ``'tokens'``.

    Returns:
        JSON text of ``{query_id: {"indices": [...], "data": [...]}}``.
    """
    record = json.loads(line)
    query_id = record['query_id']
    vector = compute_tf_idf_vector(record['tokens'], GLOBAL_IDF_DICT, GLOBAL_TERM_TO_INDEX)
    # Only the sparse coordinates are stored, as plain Python lists.
    sparse_repr = {
        "indices": vector.indices.tolist(),
        "data": vector.data.tolist(),
    }
    return json.dumps({query_id: sparse_repr})

def wrapper_process_query_tf_idf(line):
    """Pass ``line`` straight through to ``process_query_tf_idf``.

    This is the callable handed to the process pool for query vectorisation.
    """
    serialized = process_query_tf_idf(line)
    return serialized

def compute_tf_idf_for_queries(queries_path, idf_dict, term_to_index, output_path='query_tf_idf_vectors.json'):
    """Compute a TF-IDF vector for every query using a process pool.

    Each worker receives ``idf_dict`` and ``term_to_index`` once through
    ``init_worker``. Results are written to ``output_path`` as JSON lines in
    input order.

    Args:
        queries_path: Path to the JSON-lines query file (one query per line).
        idf_dict: Mapping term -> IDF weight.
        term_to_index: Mapping term -> column index.
        output_path: Destination file for the serialized vectors.
    """
    log("Starting TF-IDF computation for queries...")
    start_time = time.time()
    with open(output_path, 'w') as sink, open(queries_path, 'r') as source:
        # First pass only counts lines so progress can be reported as x/total.
        total_queries = 0
        for _ in source:
            total_queries += 1
        source.seek(0)  # Rewind for the real pass
        log(f"Total queries to process: {total_queries}")
        pool = ProcessPoolExecutor(initializer=init_worker, initargs=(idf_dict, term_to_index))
        with pool as executor:
            idx = 0
            for serialized in executor.map(wrapper_process_query_tf_idf, source, chunksize=100):
                idx += 1
                sink.write(serialized + '\n')
                # Report every 100 queries and once more at the very end.
                if idx == total_queries or idx % 100 == 0:
                    log(f"Processed {idx}/{total_queries} queries.")
    end_time = time.time()
    log(f"TF-IDF computation for queries complete. Took {end_time - start_time:.2f} seconds.")
    log(f"TF-IDF vectors saved to {output_path}.")

def _write_postings(path, postings):
    """Write a term -> doc-id collection mapping to ``path`` as JSON lines."""
    with open(path, 'w') as out:
        for term_idx, doc_ids in postings.items():
            # Sets have no stable order; sort so the index is reproducible.
            out.write(json.dumps({term_idx: sorted(doc_ids)}) + '\n')


def build_inverted_index_on_disk(doc_tf_idf_path, output_index_path='inverted_index.json', chunk_size=50000):
    """Build a term -> document-ids inverted index from stored TF-IDF vectors.

    The input is read in chunks of ``chunk_size`` documents. Each chunk's
    partial index is written to ``temp_index_<n>.json`` in the current working
    directory; the partial files are then merged, deleted, and the result is
    written to ``output_index_path`` with one ``{term_index: [doc_id, ...]}``
    JSON object per line.

    Args:
        doc_tf_idf_path: JSON-lines file where each line is
            ``{doc_id: {"indices": [...], ...}}``.
        output_index_path: Destination of the merged index.
        chunk_size: Number of documents per partial index file (must be >= 1).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    log("Starting inverted index construction...")
    start_time = time.time()
    temp_index_files = []
    term_to_doc = defaultdict(set)
    idx = 0
    with open(doc_tf_idf_path, 'r') as f:
        for line in f:
            doc_entry = json.loads(line)
            doc_id = next(iter(doc_entry))
            for term_idx in doc_entry[doc_id]["indices"]:
                term_to_doc[term_idx].add(doc_id)
            idx += 1
            if idx % chunk_size == 0:
                # Flush the whole chunk first, and only then reset the
                # accumulator: clearing it while its items are being iterated
                # raises RuntimeError.
                temp_file_path = f"temp_index_{len(temp_index_files)}.json"
                _write_postings(temp_file_path, term_to_doc)
                temp_index_files.append(temp_file_path)
                term_to_doc.clear()
                log(f"Processed {idx} documents. Partial inverted index saved to {temp_file_path}.")

    # Flush whatever is left after the last full chunk.
    if term_to_doc:
        temp_file_path = f"temp_index_{len(temp_index_files)}.json"
        _write_postings(temp_file_path, term_to_doc)
        temp_index_files.append(temp_file_path)
        term_to_doc.clear()
        log(f"Processed {idx} documents. Final partial inverted index saved to {temp_file_path}.")

    # Merge the partial indexes. Term keys are strings from here on because
    # JSON object keys are always strings. The merged index is held in memory.
    log("Merging partial inverted indexes...")
    term_to_doc_merged = defaultdict(set)
    for temp_file_path in temp_index_files:
        with open(temp_file_path, 'r') as temp_file:
            for line in temp_file:
                for term_idx, doc_ids in json.loads(line).items():
                    term_to_doc_merged[term_idx].update(doc_ids)
        # Remove the temporary file as soon as it has been merged.
        os.remove(temp_file_path)
        log(f"Merged and removed temporary file {temp_file_path}.")

    _write_postings(output_index_path, term_to_doc_merged)

    end_time = time.time()
    log("Inverted index building complete and saved to disk.")
    log(f"Inverted index saved to {output_index_path}.")
    log(f"Inverted index construction took {end_time - start_time:.2f} seconds.")

def build_term_to_queries_mapping_top_terms(queries, K):
    """Map each term index to the queries that rank it among their top K terms.

    Args:
        queries: Mapping query_id -> single-row sparse vector exposing
            ``.indices`` (term indices) and ``.data`` (weights).
        K: Maximum number of highest-weighted terms kept per query. A value
            of 0 or less keeps no terms.

    Returns:
        ``(term_to_queries, query_top_terms)`` where ``term_to_queries`` is a
        ``defaultdict(list)`` of term index -> query ids and
        ``query_top_terms`` maps each query id to the set of its kept terms.
    """
    term_to_queries = defaultdict(list)
    query_top_terms = {}
    for query_id, query_vector in queries.items():
        indices = query_vector.indices
        data = query_vector.data
        if K <= 0:
            # A slice of [-0:] would select *every* term, so the "keep
            # nothing" case has to be handled explicitly.
            top_terms = indices[:0]
        elif len(data) > K:
            # argsort is ascending, so the last K positions carry the
            # largest weights.
            top_terms = indices[np.argsort(data)[-K:]]
        else:
            top_terms = indices
        query_top_terms[query_id] = set(top_terms)
        for term_idx in top_terms:
            term_to_queries[term_idx].append(query_id)
    return term_to_queries, query_top_terms

def retrieve_top_documents_with_limited_candidates(query_tf_idf_path, doc_tf_idf_path, index_path, term_to_index_path, top_n=10, K=5, M=200):
    """Retrieve the top N documents per query via a candidate-limited index scan.

    Stages:
      1. Load every query vector from ``query_tf_idf_path``.
      2. Keep the K highest-weighted terms of each query.
      3. Scan ``index_path`` line by line; for each term used by some query,
         add the first ``M`` document ids stored on that line to the
         candidate set of those queries.
      4. Load the vectors of all candidate documents from ``doc_tf_idf_path``.
      5. Score each query against its candidates with ``process_single_query``.

    Args:
        query_tf_idf_path: JSON-lines file of ``{query_id: {"indices", "data"}}``.
        doc_tf_idf_path: JSON-lines file of ``{doc_id: {"indices", "data"}}``.
        index_path: JSON-lines inverted index, ``{term_index: [doc_id, ...]}``.
        term_to_index_path: JSON vocabulary; only its size is used, as the
            width of every sparse vector.
        top_n: Number of documents to return per query.
        K: Number of top-weighted terms considered per query.
        M: Maximum number of document ids taken from each index line.

    Returns:
        A list of ``{'id': position, 'docids': str}`` dicts in query-file
        order. ``'id'`` is the query's 0-based position in the query file
        (not its query id) and ``'docids'`` is the string form of the ranked
        document-id list.
    """
    log("Starting retrieval of top documents for each query...")
    start_time = time.time()

    # Only the vocabulary size is needed, so the (large) mapping itself is
    # not kept alive for the rest of the function.
    with open(term_to_index_path, 'r') as f:
        vector_size = len(json.load(f))
    log("Term to index mapping loaded.")

    # Load query vectors, remembering file order for the output ids.
    queries = {}
    query_ids_list = []
    with open(query_tf_idf_path, 'r') as f:
        for line in f:
            query_entry = json.loads(line)
            query_id = list(query_entry.keys())[0]
            data = query_entry[query_id]
            indices = np.array(data['indices'])
            values = np.array(data['data'])
            queries[query_id] = csr_matrix((values, indices, [0, len(indices)]), shape=(1, vector_size))
            query_ids_list.append(query_id)
    log("Query vectors loaded.")

    # Build term_to_queries mapping considering top K terms per query
    term_to_queries, _query_top_terms = build_term_to_queries_mapping_top_terms(queries, K)
    log("Term to queries mapping for top terms created.")

    query_candidates = defaultdict(set)
    all_candidate_doc_ids = set()

    # Scan the inverted index and hand documents to the interested queries.
    with open(index_path, 'r') as index_file:
        for idx, line in enumerate(index_file, 1):
            term_entry = json.loads(line)
            term_idx_str = list(term_entry.keys())[0]
            # Index keys are JSON strings; the query side uses integers.
            relevant_queries = term_to_queries.get(int(term_idx_str), [])
            if relevant_queries:
                # Limit the number of documents per term to M
                limited_doc_ids = term_entry[term_idx_str][:M]
                # Same ids for every interested query: record them once.
                all_candidate_doc_ids.update(limited_doc_ids)
                for query_id in relevant_queries:
                    query_candidates[query_id].update(limited_doc_ids)

            if idx % 1000000 == 0:
                log(f"Processed {idx} terms from the inverted index.")

    log("Candidate documents collected for queries.")

    # Load vectors only for documents that are a candidate of some query.
    doc_vectors = {}
    with open(doc_tf_idf_path, 'r') as doc_file:
        for line in doc_file:
            doc_entry = json.loads(line)
            doc_id = list(doc_entry.keys())[0]
            if doc_id in all_candidate_doc_ids:
                data = doc_entry[doc_id]
                indices = data['indices']
                values = data['data']
                doc_vectors[doc_id] = csr_matrix((values, indices, [0, len(indices)]), shape=(1, vector_size))
    log("Document vectors for candidates loaded.")

    # Score every query against its own candidate set.
    submission_data = []
    total_queries = len(query_ids_list)

    for idx, query_id in enumerate(query_ids_list):
        candidate_doc_ids = query_candidates.get(query_id, set())
        query_vector = queries[query_id]
        result = process_single_query(query_id, query_vector, candidate_doc_ids, doc_vectors, top_n)
        submission_data.append({
            'id': idx,
            'docids': result['docids']
        })
        if (idx + 1) % 100 == 0 or (idx + 1) == total_queries:
            log(f"Processed {idx + 1}/{total_queries} queries.")

    log("Top documents retrieved for all queries.")
    end_time = time.time()
    log(f"Retrieval completed in {end_time - start_time:.2f} seconds.")
    return submission_data

def process_single_query(query_id, query_vector, candidate_doc_ids, doc_vectors, top_n):
    """Rank one query's candidate documents by dot-product similarity.

    Args:
        query_id: Identifier of the query (not used in the computation).
        query_vector: Single-row sparse query vector.
        candidate_doc_ids: Iterable of document ids to consider.
        doc_vectors: Mapping doc_id -> single-row sparse vector. Candidates
            missing from this mapping are ignored.
        top_n: Maximum number of documents to return.

    Returns:
        ``{'docids': s}`` where ``s`` is ``str()`` of the list of the best
        document ids, highest similarity first; ``'[]'`` when no candidate
        has a vector.
    """
    doc_ids_list = []
    rows = []
    for doc_id in candidate_doc_ids:
        row = doc_vectors.get(doc_id)
        if row is None:
            continue
        doc_ids_list.append(doc_id)
        rows.append(row)

    if not rows:
        return {'docids': str([])}

    # One matrix-vector product scores all candidates at once.
    scores = vstack(rows).dot(query_vector.T).toarray().ravel()
    keep = min(top_n, len(scores))
    # argpartition selects the best `keep` entries without a full sort;
    # only those few are then ordered.
    best = np.argpartition(-scores, keep - 1)[:keep]
    ranked = sorted(((scores[i], doc_ids_list[i]) for i in best), reverse=True)
    return {'docids': str([doc_id for _, doc_id in ranked])}

# Example usage

def load_vocab(vocab_path):
    """
    Read the term-to-index vocabulary stored as JSON.

    Args:
        vocab_path (str): Location of the vocabulary JSON file.

    Returns:
        dict: The parsed term-to-index mapping.
    """
    with open(vocab_path, 'r') as vocab_file:
        return json.load(vocab_file)

if __name__ == "__main__":
    # Input locations (Kaggle datasets). The query vectors are the only
    # artifact written to the working directory by this run.
    corpus_path = '/kaggle/input/dis-tokens/corpus_tokens.json'
    queries_path = '/kaggle/input/dis-tokens/test_tokens.json'
    doc_tf_idf_path = '/kaggle/input/tf-idf/doc_tf_idf_vectors.json'
    query_tf_idf_path = 'query_tf_idf_vectors.json'
    index_path = '/kaggle/input/tf-idf/inverted_index.json'
    term_to_index_path = '/kaggle/input/tf-idf/vocab.json'

    # Step 1: Calculate IDF using the entire corpus. Only the IDF table is
    # kept; the vocabulary recomputed by this pass is discarded in favour of
    # the saved one loaded below.
    idf_dict = calculate_idf_from_corpus(corpus_path, term_to_index_path)[0]

    # Presumably the saved vocabulary is the one the precomputed document
    # vectors and inverted index were built with, so query vectors must use
    # its column numbering.
    term_to_index = load_vocab(term_to_index_path)

    # Step 2: Compute TF-IDF vectors for all documents
    # (disabled: the vectors are read from doc_tf_idf_path; enable to rebuild)
    # compute_tf_idf_for_documents(corpus_path, idf_dict, term_to_index, doc_tf_idf_path)

    # Step 3: Compute TF-IDF vectors for all queries
    compute_tf_idf_for_queries(queries_path, idf_dict, term_to_index, query_tf_idf_path)

    # Step 4: Build the inverted index and save it to disk incrementally
    # (disabled: the index is read from index_path; enable to rebuild)
    # build_inverted_index_on_disk(doc_tf_idf_path, index_path)

    # Step 5: Retrieve top documents using the inverted index with limited candidates
    submission_data = retrieve_top_documents_with_limited_candidates(
        query_tf_idf_path,
        doc_tf_idf_path,
        index_path,
        term_to_index_path,
        top_n=10,
        K=30,   # Top K terms per query
        M=2000  # Top M documents per term
    )

    # Step 6: Convert to DataFrame and save to CSV
    submission_df = pd.DataFrame(submission_data)
    submission_df = submission_df[['id', 'docids']]  # Ensure correct column order

    # The submission must contain exactly one row per query. Raise explicitly
    # instead of using assert, which is skipped when Python runs with -O.
    expected_queries = 2000
    if len(submission_df) != expected_queries:
        raise ValueError(f"Expected {expected_queries} queries, but got {len(submission_df)}")

    submission_file_path = 'submission.csv'
    submission_df.to_csv(submission_file_path, index=False)

    log(f"Submission file '{submission_file_path}' created successfully.")
    log(submission_df.head())


2024-11-09 20:35:59 Starting IDF calculation from corpus...
2024-11-09 20:36:53 Processed 50000 documents...
2024-11-09 20:37:40 Processed 100000 documents...
2024-11-09 20:38:27 Processed 150000 documents...
2024-11-09 20:39:14 Processed 200000 documents...
2024-11-09 20:42:16 Processed 250000 documents...
2024-11-09 20:43:29 IDF calculation complete.
2024-11-09 20:43:49 Starting TF-IDF computation for queries...
2024-11-09 20:43:49 Total queries to process: 2000
2024-11-09 20:43:49 Processed 100/2000 queries.
2024-11-09 20:43:49 Processed 200/2000 queries.
2024-11-09 20:43:49 Processed 300/2000 queries.
2024-11-09 20:43:49 Processed 400/2000 queries.
2024-11-09 20:43:49 Processed 500/2000 queries.
2024-11-09 20:43:49 Processed 600/2000 queries.
2024-11-09 20:43:49 Processed 700/2000 queries.
2024-11-09 20:43:49 Processed 800/2000 queries.
2024-11-09 20:43:49 Processed 900/2000 queries.
2024-11-09 20:43:49 Processed 1000/2000 queries.
2024-11-09 20:43:49 Processed 1100/2000 queries.
2

In [1]:
import json
import math
import heapq
import pandas as pd
import numpy as np
from collections import defaultdict
from scipy.sparse import csr_matrix, vstack
import os
import time
from datetime import datetime

def log(message):
    """Print ``message`` to stdout prefixed with the current local time.

    The prefix is formatted as ``YYYY-MM-DD HH:MM:SS`` and is separated from
    the message by a single space.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"{timestamp} {message}")

def compute_smoothed_tf(tokens):
    """Return log-scaled term frequencies for a list of tokens.

    Each distinct token maps to ``1 + ln(count)``, where ``count`` is the
    number of times the token occurs in ``tokens``. Keys appear in order of
    first occurrence. An empty input yields an empty dict.
    """
    counts = {}
    for token in tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    # Sublinear scaling: a single occurrence weighs 1.0 and repeated
    # occurrences grow only logarithmically.
    return {token: 1 + math.log(count) for token, count in counts.items()}

def compute_smoothed_idf(corpus_terms_set, num_docs):
    """Return smoothed inverse document frequencies.

    Args:
        corpus_terms_set: Mapping of term -> number of documents containing it.
        num_docs: Total number of documents.

    Returns:
        dict mapping each term to ``ln((1 + num_docs) / (1 + df)) + 1``, the
        same smoothed formula scikit-learn uses.
    """
    return {
        term: math.log((1 + num_docs) / (1 + doc_freq)) + 1
        for term, doc_freq in corpus_terms_set.items()
    }

def compute_tf_idf_vector(tokens, idf_dict, term_to_index):
    """Build an L2-normalised TF-IDF row vector for one token list.

    Args:
        tokens: Tokens of the document or query.
        idf_dict: Mapping term -> IDF weight; a term missing here gets IDF 0.
        term_to_index: Mapping term -> column index. Terms absent from this
            mapping are dropped. Its length is the vector width.

    Returns:
        A ``1 x len(term_to_index)`` ``csr_matrix``. The stored values are
        divided by their Euclidean norm unless that norm is zero.
    """
    columns = []
    weights = []
    for term, tf_weight in compute_smoothed_tf(tokens).items():
        if term not in term_to_index:
            continue  # out-of-vocabulary terms have no column
        columns.append(term_to_index[term])
        weights.append(tf_weight * idf_dict.get(term, 0))

    # A single CSR row: indptr is [0, nnz].
    row = csr_matrix(
        (weights, columns, [0, len(columns)]),
        shape=(1, len(term_to_index)),
    )
    length = np.linalg.norm(row.data)
    if length > 0:
        row.data = row.data / length
    return row

def build_term_to_queries_mapping_top_terms(queries, K):
    """Map each term index to the queries that rank it among their top K terms.

    Args:
        queries: Mapping query_id -> single-row sparse vector exposing
            ``.indices`` (term indices) and ``.data`` (weights).
        K: Maximum number of highest-weighted terms kept per query. A value
            of 0 or less keeps no terms.

    Returns:
        ``(term_to_queries, query_top_terms)`` where ``term_to_queries`` is a
        ``defaultdict(list)`` of term index -> query ids and
        ``query_top_terms`` maps each query id to the set of its kept terms.
    """
    term_to_queries = defaultdict(list)
    query_top_terms = {}
    for query_id, query_vector in queries.items():
        indices = query_vector.indices
        data = query_vector.data
        if K <= 0:
            # A slice of [-0:] would select *every* term, so the "keep
            # nothing" case has to be handled explicitly.
            top_terms = indices[:0]
        elif len(data) > K:
            # argsort is ascending, so the last K positions carry the
            # largest weights.
            top_terms = indices[np.argsort(data)[-K:]]
        else:
            top_terms = indices
        query_top_terms[query_id] = set(top_terms)
        for term_idx in top_terms:
            term_to_queries[term_idx].append(query_id)
    return term_to_queries, query_top_terms

def retrieve_top_documents_with_limited_candidates(query_tf_idf_path, doc_tf_idf_path, index_path, term_to_index_path, top_n=10, K=5, M=200):
    """Retrieve the top N documents per query via a candidate-limited index scan.

    Stages:
      1. Load every query vector from ``query_tf_idf_path``.
      2. Keep the K highest-weighted terms of each query.
      3. Scan ``index_path`` line by line; for each term used by some query,
         add the first ``M`` document ids stored on that line to the
         candidate set of those queries.
      4. Load the vectors of all candidate documents from ``doc_tf_idf_path``.
      5. Score each query against its candidates with ``process_single_query``.

    Args:
        query_tf_idf_path: JSON-lines file of ``{query_id: {"indices", "data"}}``.
        doc_tf_idf_path: JSON-lines file of ``{doc_id: {"indices", "data"}}``.
        index_path: JSON-lines inverted index, ``{term_index: [doc_id, ...]}``.
        term_to_index_path: JSON vocabulary; only its size is used, as the
            width of every sparse vector.
        top_n: Number of documents to return per query.
        K: Number of top-weighted terms considered per query.
        M: Maximum number of document ids taken from each index line.

    Returns:
        A list of ``{'id': position, 'docids': str}`` dicts in query-file
        order. ``'id'`` is the query's 0-based position in the query file
        (not its query id) and ``'docids'`` is the string form of the ranked
        document-id list.
    """
    log("Starting retrieval of top documents for each query...")
    start_time = time.time()

    # Only the vocabulary size is needed, so the (large) mapping itself is
    # not kept alive for the rest of the function.
    with open(term_to_index_path, 'r') as f:
        vector_size = len(json.load(f))
    log("Term to index mapping loaded.")

    # Load query vectors, remembering file order for the output ids.
    queries = {}
    query_ids_list = []
    with open(query_tf_idf_path, 'r') as f:
        for line in f:
            query_entry = json.loads(line)
            query_id = list(query_entry.keys())[0]
            data = query_entry[query_id]
            indices = np.array(data['indices'])
            values = np.array(data['data'])
            queries[query_id] = csr_matrix((values, indices, [0, len(indices)]), shape=(1, vector_size))
            query_ids_list.append(query_id)
    log("Query vectors loaded.")

    # Build term_to_queries mapping considering top K terms per query
    term_to_queries, _query_top_terms = build_term_to_queries_mapping_top_terms(queries, K)
    log("Term to queries mapping for top terms created.")

    query_candidates = defaultdict(set)
    all_candidate_doc_ids = set()

    # Scan the inverted index and hand documents to the interested queries.
    with open(index_path, 'r') as index_file:
        for idx, line in enumerate(index_file, 1):
            term_entry = json.loads(line)
            term_idx_str = list(term_entry.keys())[0]
            # Index keys are JSON strings; the query side uses integers.
            relevant_queries = term_to_queries.get(int(term_idx_str), [])
            if relevant_queries:
                # Limit the number of documents per term to M
                limited_doc_ids = term_entry[term_idx_str][:M]
                # Same ids for every interested query: record them once.
                all_candidate_doc_ids.update(limited_doc_ids)
                for query_id in relevant_queries:
                    query_candidates[query_id].update(limited_doc_ids)

            if idx % 1000000 == 0:
                log(f"Processed {idx} terms from the inverted index.")

    log("Candidate documents collected for queries.")

    # Load vectors only for documents that are a candidate of some query.
    doc_vectors = {}
    with open(doc_tf_idf_path, 'r') as doc_file:
        for line in doc_file:
            doc_entry = json.loads(line)
            doc_id = list(doc_entry.keys())[0]
            if doc_id in all_candidate_doc_ids:
                data = doc_entry[doc_id]
                indices = data['indices']
                values = data['data']
                doc_vectors[doc_id] = csr_matrix((values, indices, [0, len(indices)]), shape=(1, vector_size))
    log("Document vectors for candidates loaded.")

    # Score every query against its own candidate set.
    submission_data = []
    total_queries = len(query_ids_list)

    for idx, query_id in enumerate(query_ids_list):
        candidate_doc_ids = query_candidates.get(query_id, set())
        query_vector = queries[query_id]
        result = process_single_query(query_id, query_vector, candidate_doc_ids, doc_vectors, top_n)
        submission_data.append({
            'id': idx,
            'docids': result['docids']
        })
        if (idx + 1) % 100 == 0 or (idx + 1) == total_queries:
            log(f"Processed {idx + 1}/{total_queries} queries.")

    log("Top documents retrieved for all queries.")
    end_time = time.time()
    log(f"Retrieval completed in {end_time - start_time:.2f} seconds.")
    return submission_data

def process_single_query(query_id, query_vector, candidate_doc_ids, doc_vectors, top_n):
    """Rank one query's candidate documents by dot-product similarity.

    Args:
        query_id: Identifier of the query (not used in the computation).
        query_vector: Single-row sparse query vector.
        candidate_doc_ids: Iterable of document ids to consider.
        doc_vectors: Mapping doc_id -> single-row sparse vector. Candidates
            missing from this mapping are ignored.
        top_n: Maximum number of documents to return.

    Returns:
        ``{'docids': s}`` where ``s`` is ``str()`` of the list of the best
        document ids, highest similarity first; ``'[]'`` when no candidate
        has a vector.
    """
    doc_ids_list = []
    rows = []
    for doc_id in candidate_doc_ids:
        row = doc_vectors.get(doc_id)
        if row is None:
            continue
        doc_ids_list.append(doc_id)
        rows.append(row)

    if not rows:
        return {'docids': str([])}

    # One matrix-vector product scores all candidates at once.
    scores = vstack(rows).dot(query_vector.T).toarray().ravel()
    keep = min(top_n, len(scores))
    # argpartition selects the best `keep` entries without a full sort;
    # only those few are then ordered.
    best = np.argpartition(-scores, keep - 1)[:keep]
    ranked = sorted(((scores[i], doc_ids_list[i]) for i in best), reverse=True)
    return {'docids': str([doc_id for _, doc_id in ranked])}

# Example usage

if __name__ == "__main__":
    # Input locations (Kaggle datasets). Every artifact used by this run,
    # including the query vectors, is read from the precomputed dataset.
    corpus_path = '/kaggle/input/dis-tokens/corpus_tokens.json'
    queries_path = '/kaggle/input/dis-tokens/test_tokens.json'
    doc_tf_idf_path = '/kaggle/input/tf-idf/doc_tf_idf_vectors.json'
    query_tf_idf_path = '/kaggle/input/tf-idf/query_tf_idf_vectors.json'
    index_path = '/kaggle/input/tf-idf/inverted_index.json'
    term_to_index_path = '/kaggle/input/tf-idf/vocab.json'

    # Steps 1-4 build the artifacts above and are disabled in this cell.
    # The functions they call are not defined in this cell.

    # Step 1: Calculate IDF using the entire corpus
    # idf_dict, term_to_index = calculate_idf_from_corpus(corpus_path, term_to_index_path)

    # Step 2: Compute TF-IDF vectors for all documents
    # compute_tf_idf_for_documents(corpus_path, idf_dict, term_to_index, doc_tf_idf_path)

    # Step 3: Compute TF-IDF vectors for all queries
    # compute_tf_idf_for_queries(queries_path, idf_dict, term_to_index, query_tf_idf_path)

    # Step 4: Build the inverted index and save it to disk incrementally
    # build_inverted_index_on_disk(doc_tf_idf_path, index_path)

    # Step 5: Retrieve top documents using the inverted index with limited candidates
    submission_data = retrieve_top_documents_with_limited_candidates(
        query_tf_idf_path,
        doc_tf_idf_path,
        index_path,
        term_to_index_path,
        top_n=10,
        K=5,   # Top K terms per query
        M=200  # Top M documents per term
    )

    # Step 6: Convert to DataFrame and save to CSV
    submission_df = pd.DataFrame(submission_data)
    submission_df = submission_df[['id', 'docids']]  # Ensure correct column order

    # The submission must contain exactly one row per query. Raise explicitly
    # instead of using assert, which is skipped when Python runs with -O.
    expected_queries = 2000
    if len(submission_df) != expected_queries:
        raise ValueError(f"Expected {expected_queries} queries, but got {len(submission_df)}")

    submission_file_path = 'submission.csv'
    submission_df.to_csv(submission_file_path, index=False)

    log(f"Submission file '{submission_file_path}' created successfully.")
    log(submission_df.head())

2024-11-09 20:02:04 Starting retrieval of top documents for each query...
2024-11-09 20:02:23 Term to index mapping loaded.
2024-11-09 20:02:23 Query vectors loaded.
2024-11-09 20:02:23 Term to queries mapping for top terms created.
2024-11-09 20:03:13 Processed 1000000 terms from the inverted index.
2024-11-09 20:03:21 Processed 2000000 terms from the inverted index.
2024-11-09 20:03:28 Processed 3000000 terms from the inverted index.
2024-11-09 20:03:35 Processed 4000000 terms from the inverted index.
2024-11-09 20:03:43 Processed 5000000 terms from the inverted index.
2024-11-09 20:03:49 Processed 6000000 terms from the inverted index.
2024-11-09 20:03:56 Processed 7000000 terms from the inverted index.
2024-11-09 20:04:05 Processed 8000000 terms from the inverted index.
2024-11-09 20:04:11 Processed 9000000 terms from the inverted index.



KeyboardInterrupt

