# Part 3. Ranking & Filtering

Author/s: <font color="blue">Jhonatan Barcos Gambaro | Daniel Alexander Yearwood</font>

E-mail: <font color="blue">jhonatan.barcos01@estudiant.upf.edu | danielalexander.yearwood01@estudiant.upf.edu </font>

Date: <font color="blue">20/11/2025</font>

In [23]:
# Import libraries
import numpy as np
import pandas as pd
import re

from collections import defaultdict
from array import array
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la

import time



## 0. Data Preprocesing, Indexing and Queries (Recap Part 1-2)

In [None]:
# Upload dataset
data_path = '../../data/fashion_products_dataset.json'
products = pd.read_json(data_path)

# Text Preprocessing
stop_words = set(stopwords.words("english"))
stemmer = nltk.PorterStemmer()

# Define new stop words that depends on the domain of the data
stop_words_domain = {
    'made', 'india', 'proudly', 'use', 'year', 'round', 
    'look', 'design', 'qualiti', 'day', 'make',       
    'feel', 'perfect', 'great', 'wash', 'style',      
}
stop_words = stop_words.union(stop_words_domain)

# Redefine clean_text function to build_terms to return a list of tokens
def build_terms(text):
    text = re.sub(r'\d+', '', text)
    word_tokens = word_tokenize(text.lower())
    textos_limpios = [word for word in word_tokens if word not in stop_words and word.isalnum()]      
    textos_limpios = [stemmer.stem(word) for word in textos_limpios]
    return textos_limpios


# Helper function to clean numeric fields (EL TEU CODI)
def clean_numeric(value):
    if isinstance(value, str):
        value = re.sub(r'[^\d.,]', '', value).replace(',', '')
    try:
        return float(value)
    except:
        return np.nan

# Apply cleaning (EL TEU CODI)
for col in ['selling_price', 'actual_price', 'discount', 'average_rating']:
    products[col] = products[col].apply(clean_numeric)

# Assegurem que els NaN es converteixin en 0
products['average_rating'] = products['average_rating'].fillna(0)

# Apply build_terms function to the columns 'title' and 'description' of the products dataset
products_cleaned = products.copy()

products_cleaned['title'] = products_cleaned['title'].apply(build_terms)
products_cleaned['description'] = products_cleaned['description'].apply(build_terms)

In [None]:
# Define page_contents and title_index
title_index = products['title'].to_dict()
products['content_to_index'] = products['title'].fillna('') + ' ' + products['description'].fillna('')
page_contents = products['content_to_index'].tolist()

# Mappings between doc_ids and pids
doc_id_to_pid = products['pid'].to_dict()
pid_to_doc_id = {pid: i for i, pid in doc_id_to_pid.items()}

N = len(page_contents) # Number of documents

In [None]:
# We adapt our function create_index_tfidf from part 2 to return also the length of each document vector and the normalized term frequencies.
def create_index_part3(documents_content, title_index_map, num_documents):

    # Initialize data structures
    index = defaultdict(list)
    tf = defaultdict(list)
    df = defaultdict(int)
    idf = defaultdict(float)
    title_index = defaultdict(str)
    
    doc_lengths = {} 
    
    # Process each document
    for page_id, content in enumerate(documents_content):
        
        # Build terms for the document
        terms = build_terms(content)
        
        doc_lengths[page_id] = len(terms) 

        # Build index
        title = title_index_map.get(page_id, "No Title Found")
        title_index[page_id] = title    

        
        current_page_index = {}

        for position, term in enumerate(terms): 
            try:
                current_page_index[term][1].append(position)
            except KeyError:
                current_page_index[term] = [page_id, array('I', [position])]
                
        # Normalize term frequencies
        norm = 0
        for term, posting in current_page_index.items():
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        # Calculate tf normalized and DF
        for term, posting in current_page_index.items():
            if norm > 0:
                tf[term].append(np.round(len(posting[1]) / norm, 4))
            else:
                tf[term].append(0.0)
            
            df[term] += 1 

        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

    # Calculate IDF 
    for term in df:
        idf[term] = np.round(np.log(float(num_documents / df[term])), 4)

    return index, tf, df, idf, title_index, doc_lengths

In [None]:
# Execution of the index construction
start_time = time.time()

# Call the new function
index, tf_norm, df, idf, title_index, doc_lengths = create_index_part3(page_contents, title_index, N)

# Print total time taken
print("Total time to create the TD-IDF index: {} seconds" .format(np.round(time.time() - start_time, 2)))

# Calculate average document length
avg_doc_length = sum(doc_lengths.values()) / N

Total time to create the TD-IDF index: 59.73 seconds


In [None]:
# Same function search_tfidf as in part 2, but isolated here for clarity
def find_candidate_docs(query, index):
    query_terms = build_terms(query)

    if not query_terms:
        return set()

    candidate_docs = None 
    
    for term in query_terms:
        
        term_docs = {posting[0] for posting in index[term]}
            
        if candidate_docs is None:
            candidate_docs = term_docs
        else:
            candidate_docs &= term_docs 

    return candidate_docs 

In [None]:
test_queries = {
        # Q1: Compulsory (validation_labels.csv)
        "q1": "women full sleeve sweatshirt cotton",
        
        # Q2: Compulsory (validation_labels.csv)
        "q2": "men slim jeans blue",
        
        # Q3: High Frequency Query (Based on Top DF)
        "q3": "neck solid fit",

        # Q4: Small Frequency Query (Based on Low DF)
        "q4": "trendiest glossi",
        
        # Q5: Combined Query (User Simulation)
        "q5": "trendiest women"
    }

## 1. TF-IDF Ranking + cosine similarity

In [None]:
def rank_documents_tfidf(query_terms, docs_to_rank, index, df, N):
    # Build query vector
    query_vector = {}
    query_term_counts = collections.Counter(query_terms) 
    
    # Build vector for the query
    for term, count in query_term_counts.items():
        if term not in df: continue
            
        # TF de la consulta
        tf_q = 1 + math.log10(count)
        
        # IDF de la consulta 
        idf_q = math.log10(N / df[term])
        
        query_vector[term] = tf_q * idf_q

    # Calculate document scores (Dot Product)
    # Create a dictionary {doc_id: score}
    doc_scores = defaultdict(float)
    
    # Iterate over each term in the QUERY (
    for term, query_weight in query_vector.items():
        # Calculate the weight of this term for the documents
        idf_d = math.log10(N / df[term]) 
        
        for posting in index[term]:
            doc_id = posting[0]
            
            if doc_id in docs_to_rank:
                
                # Calculate Raw TF
                raw_tf = len(posting[1]) 
                
                # Calculate TF of the document (logarithmic)
                tf_d = 1 + math.log10(raw_tf)
                
                doc_weight = tf_d * idf_d

                # Accumulate the dot product (Q_weight * D_weight)
                doc_scores[doc_id] += query_weight * doc_weight
    
    # Sort documents by score
    ranked_docs = sorted(doc_scores.keys(), key=lambda d: doc_scores[d], reverse=True)
    
    scores_dict = {doc_id: doc_scores[doc_id] for doc_id in ranked_docs}
    return ranked_docs, scores_dict

In [None]:
# Rank test queries
for qid, query in test_queries.items():
    query_terms = build_terms(query)
    candidate_docs = find_candidate_docs(query, index)
    ranked_docs, scores = rank_documents_tfidf(query_terms, candidate_docs, index, df, N)

    print('Query ID:', qid)
    print('Query:', query)
    print('Top 5 Ranked Document IDs:', ranked_docs[:5])
    print('Scores:', [scores[doc_id] for doc_id in ranked_docs[:5]])
    print("-" * 50)

Query ID: q1
Query: women full sleeve sweatshirt cotton
Top 5 Ranked Document IDs: [4288, 4290, 24129, 25300, 23179]
Scores: [4.957146787053152, 4.957146787053152, 4.92628436019739, 4.92628436019739, 4.853784875481479]
--------------------------------------------------
Query ID: q2
Query: men slim jeans blue
Top 5 Ranked Document IDs: [7595, 7605, 7617, 7623, 7634]
Scores: [4.0298552985852965, 4.0298552985852965, 4.0298552985852965, 4.0298552985852965, 4.0298552985852965]
--------------------------------------------------
Query ID: q3
Query: neck solid fit
Top 5 Ranked Document IDs: [9808, 9810, 25250, 25251, 25281]
Scores: [1.0420485082185222, 0.9794479497805191, 0.9383224938524526, 0.9383224938524526, 0.9383224938524526]
--------------------------------------------------
Query ID: q4
Query: trendiest glossi
Top 5 Ranked Document IDs: []
Scores: []
--------------------------------------------------
Query ID: q5
Query: trendiest women
Top 5 Ranked Document IDs: [794, 821, 826, 852, 178

## 2. BM25 Ranking

In [None]:
# Define BM25 parameters
K1 = 1.2
B = 0.75

def rank_documents_bm25(query_terms, docs_to_rank, index, df, N, doc_lengths, avg_doc_length):

    # Initialize document scores
    doc_scores = defaultdict(float)
    
    # Precompute IDF for query terms
    idf_cache = {}
    for term in query_terms:
        if term not in df: continue
        # Fórmula IDF de BM25 (compte, logaritme natural)
        df_term = df[term]
        idf_cache[term] = math.log(1 + (N - df_term + 0.5) / (df_term + 0.5))

    # Itearate over documents to rank
    for doc_id in docs_to_rank:
        doc_len = doc_lengths[doc_id] 
        
        # Iterate over query terms
        for term in query_terms:
            if term not in idf_cache: continue 
                
            # Obtain Raw TF for the term in the document
            raw_tf = 0
            for posting in index[term]:
                if posting[0] == doc_id:
                    raw_tf = len(posting[1]) 
                    break 
            
            if raw_tf == 0: continue 
            
            # BM25 TF component
            tf_num = raw_tf * (K1 + 1)
            tf_den = raw_tf + K1 * (1 - B + B * (doc_len / avg_doc_length))
            tf_score = tf_num / tf_den
            
            # Scoring
            doc_scores[doc_id] += idf_cache[term] * tf_score

    # Sort documents by score
    ranked_docs = sorted(doc_scores.keys(), key=lambda d: doc_scores[d], reverse=True)
    scores_dict = {doc_id: doc_scores[doc_id] for doc_id in ranked_docs}
    return ranked_docs, scores_dict

In [None]:
# Rank test queries
for qid, query in test_queries.items():
    query_terms = build_terms(query)
    candidate_docs = find_candidate_docs(query, index)
    ranked_docs_bm25, scores_bm25 = rank_documents_bm25(query_terms, candidate_docs, index, df, N, doc_lengths, avg_doc_length)

    print('Query ID:', qid)
    print('Query:', query)
    print('Top 5 Ranked Document IDs:', ranked_docs_bm25[:5])
    print('Scores:', [scores_bm25[doc_id] for doc_id in ranked_docs_bm25[:5]])
    print("-" * 50)

Query ID: q1
Query: women full sleeve sweatshirt cotton
Top 5 Ranked Document IDs: [4288, 4290, 14655, 25149, 25151]
Scores: [12.100026652674897, 12.100026652674897, 11.889998681164478, 11.127435413300294, 10.922320440059151]
--------------------------------------------------
Query ID: q2
Query: men slim jeans blue
Top 5 Ranked Document IDs: [24544, 24547, 11292, 10283, 26174]
Scores: [11.063364735088644, 11.01005550711725, 10.567155611301338, 10.567155611301338, 10.567155611301338]
--------------------------------------------------
Query ID: q3
Query: neck solid fit
Top 5 Ranked Document IDs: [12184, 12143, 12152, 12224, 24712]
Scores: [4.392796330249203, 4.374840279049088, 4.374840279049088, 4.2867579498439055, 4.2229630784650825]
--------------------------------------------------
Query ID: q4
Query: trendiest glossi
Top 5 Ranked Document IDs: []
Scores: []
--------------------------------------------------
Query ID: q5
Query: trendiest women
Top 5 Ranked Document IDs: [821, 794, 826

## 3. NEW Ranking

### 3.1. Recap Hybrid approach (Part 1)

In [None]:
products_cleaned = products_cleaned.copy()

# Convert to boolean
products_cleaned['out_of_stock'] = products_cleaned['out_of_stock'].astype(bool)

# Helper function to clean numeric fields
def clean_numeric(value):
    if isinstance(value, str):
        value = re.sub(r'[^\d.,]', '', value).replace(',', '')
    try:
        return float(value)
    except:
        return np.nan

# Apply cleaning to numeric columns
for col in ['selling_price', 'actual_price', 'discount', 'average_rating']:
    products_cleaned[col] = products_cleaned[col].apply(clean_numeric)

# Ensure discount is in valid range
products_cleaned['discount'] = products_cleaned['discount'].clip(0, 100)

#Display result to verify
products_cleaned[['pid', 'out_of_stock', 'selling_price', 'actual_price', 'discount', 'average_rating']].head()

In [None]:
def rank_documents_your_score(ranked_docs_bm25, scores_bm25, products_df):
    """
    Re-puntua els resultats de BM25 utilitzant una fórmula híbrida
    que inclou la valoració (average_rating) del producte.
    """
    
    W_BM25 = 0.7  # 70% rellevància textual
    W_RATING = 0.3 # 30% qualitat del producte
    
    your_scores = {}
    
    max_bm25_score = next(iter(scores_bm25.values())) if scores_bm25 else 0
    if max_bm25_score == 0: 
        max_bm25_score = 1 
        
    MAX_RATING = 5.0
    
    for doc_id in ranked_docs_bm25:
        
        norm_bm25 = scores_bm25[doc_id] / max_bm25_score
        
        try:
            rating = products_df.at[doc_id, 'average_rating']
            if not isinstance(rating, (int, float)): rating = 0
        except:
            rating = 0
            
        norm_rating = rating / MAX_RATING
        
        final_score = (W_BM25 * norm_bm25) + (W_RATING * norm_rating)
        
        your_scores[doc_id] = final_score
        
    ranked_docs = sorted(your_scores.keys(), key=lambda d: your_scores[d], reverse=True)
    scores_dict = {doc_id: your_scores[doc_id] for doc_id in ranked_docs}
    
    return ranked_docs, scores_dict

print("Funció 'rank_documents_your_score' definida.")