# Part 3. Ranking & Filtering

Author/s: <font color="blue">Jhonatan Barcos Gambaro | Daniel Alexander Yearwood</font>

E-mail: <font color="blue">jhonatan.barcos01@estudiant.upf.edu | danielalexander.yearwood01@estudiant.upf.edu </font>

Date: <font color="blue">20/11/2025</font>

In [17]:
# Import libraries
import numpy as np
import pandas as pd
import re

from collections import defaultdict
from array import array
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
from gensim.models.word2vec import Word2Vec

import time


## 0. Data Preprocesing, Indexing and Queries (Recap Part 1-2)

In [2]:
# Upload dataset
data_path = '../../data/fashion_products_dataset.json'
products = pd.read_json(data_path)

# Text Preprocessing
stop_words = set(stopwords.words("english"))
stemmer = nltk.PorterStemmer()

# Define new stop words that depends on the domain of the data
stop_words_domain = {
    'made', 'india', 'proudly', 'use', 'year', 'round', 
    'look', 'design', 'qualiti', 'day', 'make',       
    'feel', 'perfect', 'great', 'wash', 'style',      
}
stop_words = stop_words.union(stop_words_domain)

# Redefine clean_text function to build_terms to return a list of tokens
def build_terms(text):
    text = re.sub(r'\d+', '', text)
    word_tokens = word_tokenize(text.lower())
    textos_limpios = [word for word in word_tokens if word not in stop_words and word.isalnum()]      
    textos_limpios = [stemmer.stem(word) for word in textos_limpios]
    return textos_limpios

# Helper function to clean numeric fields 
def clean_numeric(value):
    if isinstance(value, str):
        value = re.sub(r'[^\d.,]', '', value).replace(',', '')
    try:
        return float(value)
    except:
        return np.nan

# Apply cleaning to numeric columns
for col in ['selling_price', 'actual_price', 'discount', 'average_rating']:
    products[col] = products[col].apply(clean_numeric)

# Assegurem que els NaN es converteixin en 0
products['average_rating'] = products['average_rating'].fillna(0)

# Apply build_terms function to the columns 'title' and 'description' of the products dataset
products_cleaned = products.copy()

products_cleaned['title'] = products_cleaned['title'].apply(build_terms)
products_cleaned['description'] = products_cleaned['description'].apply(build_terms)

In [3]:
# Define page_contents and title_index
title_index = products['title'].to_dict()
products['content_to_index'] = products['title'].fillna('') + ' ' + products['description'].fillna('')
page_contents = products['content_to_index'].tolist()

# Mappings between doc_ids and pids
doc_id_to_pid = products['pid'].to_dict()
pid_to_doc_id = {pid: i for i, pid in doc_id_to_pid.items()}

N = len(page_contents) # Number of documents

In [4]:
# We adapt our function create_index_tfidf from part 2 to return also the length of each document vector and the normalized term frequencies.
def create_index_part3(documents_content, title_index_map, num_documents):

    # Initialize data structures
    index = defaultdict(list)
    tf = defaultdict(list)
    df = defaultdict(int)
    idf = defaultdict(float)
    title_index = defaultdict(str)
    
    doc_lengths = {} 
    
    # Process each document
    for page_id, content in enumerate(documents_content):
        
        # Build terms for the document
        terms = build_terms(content)
        
        doc_lengths[page_id] = len(terms) 

        # Build index
        title = title_index_map.get(page_id, "No Title Found")
        title_index[page_id] = title    

        
        current_page_index = {}

        for position, term in enumerate(terms): 
            try:
                current_page_index[term][1].append(position)
            except KeyError:
                current_page_index[term] = [page_id, array('I', [position])]
                
        # Normalize term frequencies
        norm = 0
        for term, posting in current_page_index.items():
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        # Calculate tf normalized and DF
        for term, posting in current_page_index.items():
            if norm > 0:
                tf[term].append(np.round(len(posting[1]) / norm, 4))
            else:
                tf[term].append(0.0)
            
            df[term] += 1 

        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

    # Calculate IDF 
    for term in df:
        idf[term] = np.round(np.log(float(num_documents / df[term])), 4)

    return index, tf, df, idf, title_index, doc_lengths

In [5]:
# Execution of the index construction
start_time = time.time()

# Call the new function
index, tf_norm, df, idf, title_index, doc_lengths = create_index_part3(page_contents, title_index, N)

# Print total time taken
print("Total time to create the TD-IDF index: {} seconds" .format(np.round(time.time() - start_time, 2)))

# Calculate average document length
avg_doc_length = sum(doc_lengths.values()) / N

Total time to create the TD-IDF index: 14.62 seconds


In [6]:
# Same function search_tfidf as in part 2, but isolated here for clarity
def find_candidate_docs(query, index):
    query_terms = build_terms(query)

    if not query_terms:
        return set()

    candidate_docs = None 
    
    for term in query_terms:
        
        term_docs = {posting[0] for posting in index[term]}
            
        if candidate_docs is None:
            candidate_docs = term_docs
        else:
            candidate_docs &= term_docs 

    return candidate_docs 

In [7]:
test_queries = {
        # Q1: Compulsory (validation_labels.csv)
        "q1": "women full sleeve sweatshirt cotton",
        
        # Q2: Compulsory (validation_labels.csv)
        "q2": "men slim jeans blue",
        
        # Q3: High Frequency Query (Based on Top DF)
        "q3": "neck solid fit",

        # Q4: Small Frequency Query (Based on Low DF)
        "q4": "trendiest glossi",
        
        # Q5: Combined Query (User Simulation)
        "q5": "trendiest women"
    }

## 1. Rankings approaches

### 1.1. TF-IDF Ranking + cosine similarity

In [8]:
def rank_documents_tfidf(query_terms, docs_to_rank, index, df, N):
    # Build query vector
    query_vector = {}
    query_term_counts = collections.Counter(query_terms) 
    
    # Build vector for the query
    for term, count in query_term_counts.items():
        if term not in df: continue
            
        # TF de la consulta
        tf_q = 1 + math.log10(count)
        
        # IDF de la consulta 
        idf_q = math.log10(N / df[term])
        
        query_vector[term] = tf_q * idf_q

    # Calculate document scores (Dot Product)
    # Create a dictionary {doc_id: score}
    doc_scores = defaultdict(float)
    
    # Iterate over each term in the QUERY (
    for term, query_weight in query_vector.items():
        # Calculate the weight of this term for the documents
        idf_d = math.log10(N / df[term]) 
        
        for posting in index[term]:
            doc_id = posting[0]
            
            if doc_id in docs_to_rank:
                
                # Calculate Raw TF
                raw_tf = len(posting[1]) 
                
                # Calculate TF of the document (logarithmic)
                tf_d = 1 + math.log10(raw_tf)
                
                doc_weight = tf_d * idf_d

                # Accumulate the dot product (Q_weight * D_weight)
                doc_scores[doc_id] += query_weight * doc_weight
    
    # Sort documents by score
    ranked_docs = sorted(doc_scores.keys(), key=lambda d: doc_scores[d], reverse=True)
    
    scores_dict = {doc_id: doc_scores[doc_id] for doc_id in ranked_docs}
    return ranked_docs, scores_dict

In [9]:
# Rank test queries
for qid, query in test_queries.items():
    query_terms = build_terms(query)
    candidate_docs = find_candidate_docs(query, index)
    ranked_docs, scores = rank_documents_tfidf(query_terms, candidate_docs, index, df, N)

    print('Query ID:', qid)
    print('Query:', query)
    print('Top 5 Ranked Document IDs:', ranked_docs[:5])
    print('Scores:', [scores[doc_id] for doc_id in ranked_docs[:5]])
    print("-" * 50)

Query ID: q1
Query: women full sleeve sweatshirt cotton
Top 5 Ranked Document IDs: [4288, 4290, 24129, 25300, 23179]
Scores: [4.957146787053152, 4.957146787053152, 4.92628436019739, 4.92628436019739, 4.853784875481479]
--------------------------------------------------
Query ID: q2
Query: men slim jeans blue
Top 5 Ranked Document IDs: [7595, 7605, 7617, 7623, 7634]
Scores: [4.0298552985852965, 4.0298552985852965, 4.0298552985852965, 4.0298552985852965, 4.0298552985852965]
--------------------------------------------------
Query ID: q3
Query: neck solid fit
Top 5 Ranked Document IDs: [9808, 9810, 25250, 25251, 25281]
Scores: [1.0420485082185222, 0.9794479497805191, 0.9383224938524526, 0.9383224938524526, 0.9383224938524526]
--------------------------------------------------
Query ID: q4
Query: trendiest glossi
Top 5 Ranked Document IDs: []
Scores: []
--------------------------------------------------
Query ID: q5
Query: trendiest women
Top 5 Ranked Document IDs: [794, 821, 826, 852, 178

**TF-IDF (Dot Product)**

*Pros:*
- Simple & Intuitive: The model is easy to understand and implement.
- Good Baseline: It provides a solid, classic baseline to measure other models against.

*Cons:*
- Biased Towards Length: Favors longer documents, which is often not what a user wants.
- No TF Saturation: Can be "gamed" by keyword-stuffing, rewarding documents that overuse a query term.

### 1.2. BM25 Ranking

In [10]:
# Define BM25 parameters
K1 = 1.2
B = 0.75

def rank_documents_bm25(query_terms, docs_to_rank, index, df, N, doc_lengths, avg_doc_length):

    # Initialize document scores
    doc_scores = defaultdict(float)
    
    # Precompute IDF for query terms
    idf_cache = {}
    for term in query_terms:
        if term not in df: continue
        # Fórmula IDF de BM25 (compte, logaritme natural)
        df_term = df[term]
        idf_cache[term] = math.log(1 + (N - df_term + 0.5) / (df_term + 0.5))

    # Itearate over documents to rank
    for doc_id in docs_to_rank:
        doc_len = doc_lengths[doc_id] 
        
        # Iterate over query terms
        for term in query_terms:
            if term not in idf_cache: continue 
                
            # Obtain Raw TF for the term in the document
            raw_tf = 0
            for posting in index[term]:
                if posting[0] == doc_id:
                    raw_tf = len(posting[1]) 
                    break 
            
            if raw_tf == 0: continue 
            
            # BM25 TF component
            tf_num = raw_tf * (K1 + 1)
            tf_den = raw_tf + K1 * (1 - B + B * (doc_len / avg_doc_length))
            tf_score = tf_num / tf_den
            
            # Scoring
            doc_scores[doc_id] += idf_cache[term] * tf_score

    # Sort documents by score
    ranked_docs = sorted(doc_scores.keys(), key=lambda d: doc_scores[d], reverse=True)
    scores_dict = {doc_id: doc_scores[doc_id] for doc_id in ranked_docs}
    return ranked_docs, scores_dict

In [11]:
# Rank test queries
for qid, query in test_queries.items():
    query_terms = build_terms(query)
    candidate_docs = find_candidate_docs(query, index)
    ranked_docs_bm25, scores_bm25 = rank_documents_bm25(query_terms, candidate_docs, index, df, N, doc_lengths, avg_doc_length)

    print('Query ID:', qid)
    print('Query:', query)
    print('Top 5 Ranked Document IDs:', ranked_docs_bm25[:5])
    print('Scores:', [scores_bm25[doc_id] for doc_id in ranked_docs_bm25[:5]])
    print("-" * 50)

Query ID: q1
Query: women full sleeve sweatshirt cotton
Top 5 Ranked Document IDs: [4288, 4290, 14655, 25149, 25151]
Scores: [12.100026652674897, 12.100026652674897, 11.889998681164478, 11.127435413300294, 10.922320440059151]
--------------------------------------------------
Query ID: q2
Query: men slim jeans blue
Top 5 Ranked Document IDs: [24544, 24547, 11292, 10283, 26174]
Scores: [11.063364735088644, 11.01005550711725, 10.567155611301338, 10.567155611301338, 10.567155611301338]
--------------------------------------------------
Query ID: q3
Query: neck solid fit
Top 5 Ranked Document IDs: [12184, 12143, 12152, 12224, 24712]
Scores: [4.392796330249203, 4.374840279049088, 4.374840279049088, 4.2867579498439055, 4.2229630784650825]
--------------------------------------------------
Query ID: q4
Query: trendiest glossi
Top 5 Ranked Document IDs: []
Scores: []
--------------------------------------------------
Query ID: q5
Query: trendiest women
Top 5 Ranked Document IDs: [821, 794, 826

**BM25**

*Pros:*
- State-of-the-Art: It is the industry-standard lexical ranking function for a reason, generally providing superior relevance.
- Sophisticated Normalization: It intelligently balances TF saturation ($k_1$) and document length ($b$).

*Cons:*
- "Black Box" Parameters: Requires tuning $k_1$ and $b$ (we used standard defaults), which can be complex.
- Computationally Heavier: Requires more pre-calculated data (specifically doc_lengths and avg_doc_length).

### 1.3. Hybrid Ranking (your score)

In [12]:
def rank_documents_your_score(ranked_docs_bm25, scores_bm25, products_df):
    # Define weights
    W_BM25 = 0.8  # 80% textual relevance
    W_RATING = 0.2 # 20% quality of the product
    
    your_scores = {}
    
    max_bm25_score = next(iter(scores_bm25.values())) if scores_bm25 else 0
    if max_bm25_score == 0: 
        max_bm25_score = 1 
     
    # Define maximum rating   
    MAX_RATING = 5.0
    
    # Iterate over documents
    for doc_id in ranked_docs_bm25:
     
        norm_bm25 = scores_bm25[doc_id] / max_bm25_score
        
        try:
            rating = products_df.at[doc_id, 'average_rating']
            if not isinstance(rating, (int, float)): rating = 0
        except:
            rating = 0
        
        # Normalize rating
        norm_rating = rating / MAX_RATING
        
        # Calculate final score
        final_score = (W_BM25 * norm_bm25) + (W_RATING * norm_rating)
        
        # Store final score
        your_scores[doc_id] = final_score
        
    ranked_docs = sorted(your_scores.keys(), key=lambda d: your_scores[d], reverse=True)
    scores_dict = {doc_id: your_scores[doc_id] for doc_id in ranked_docs}
    
    return ranked_docs, scores_dict


In [13]:
# Rank test queries
for qid, query in test_queries.items():
    query_terms = build_terms(query)
    candidate_docs = find_candidate_docs(query, index)
    ranked_docs_bm25, scores_bm25 = rank_documents_bm25(query_terms, candidate_docs, index, df, N, doc_lengths, avg_doc_length)
    ranked_docs_yourscore, scores_yourscore = rank_documents_your_score(ranked_docs_bm25, scores_bm25, products_cleaned)

    print('Query ID:', qid)
    print('Query:', query)
    print('Top 5 Ranked Document IDs:', ranked_docs_yourscore[:5])
    print('Scores:', [f"{scores_yourscore[doc_id]:.4f}" for doc_id in ranked_docs_yourscore[:5]])
    print("-" * 50)
    

Query ID: q1
Query: women full sleeve sweatshirt cotton
Top 5 Ranked Document IDs: [4288, 4290, 14655, 25149, 25015]
Scores: ['0.9880', '0.9880', '0.9781', '0.9277', '0.9221']
--------------------------------------------------
Query ID: q2
Query: men slim jeans blue
Top 5 Ranked Document IDs: [10303, 24544, 24547, 10401, 10348]
Scores: ['0.9641', '0.9520', '0.9481', '0.9481', '0.9361']
--------------------------------------------------
Query ID: q3
Query: neck solid fit
Top 5 Ranked Document IDs: [21243, 21273, 14713, 13512, 24730]
Scores: ['0.9449', '0.9449', '0.9357', '0.9348', '0.9331']
--------------------------------------------------
Query ID: q4
Query: trendiest glossi
Top 5 Ranked Document IDs: []
Scores: []
--------------------------------------------------
Query ID: q5
Query: trendiest women
Top 5 Ranked Document IDs: [821, 794, 17877, 17869, 826]
Scores: ['0.9600', '0.9600', '0.8192', '0.7498', '0.7376']
--------------------------------------------------


**Hybrid Ranking**

*Pros:*
- Smarter Tie-Breaker: When textual relevance is similar, the product with the better rating wins.
- Better User Experience: Aligns with real-world user intent (users want good, relevant products, not just textually relevant ones).
- Leverages Part 1 Work: Directly uses the average_rating field we identified and cleaned in Part 1.

*Cons:*
- Popularity Bias: This is the main drawback. New products with 0 ratings are unfairly penalized, making it very difficult for them to ever appear in the top results.
- Arbitrary Weights: The 80/2s0 split is an educated guess. The optimal weights would need to be found experimentally (e.g., via a Grid Search).

## 2. Word2vec + cosine ranking score

### 2.1. Training the Word2Vec model


In [18]:
corpus = (products_cleaned["title"] + products_cleaned["description"]).tolist()

print("Number of documents in corpus:", len(corpus))
print("Example document tokens:", corpus[0][:20]) 

# Train Word2Vec model
w2v_model = Word2Vec(
    sentences=corpus,
    vector_size=100,  
    window=5,       
    min_count=2,       
    workers=4,        
    sg=1,            
    epochs=10         
)


Number of documents in corpus: 28080
Example document tokens: ['solid', 'women', 'multicolor', 'track', 'pant', 'yorker', 'trackpant', 'rich', 'comb', 'cotton', 'give', 'rich', 'comfort', 'skin', 'friendli', 'fabric', 'waistband']


### 2.2 Building text embeddings (average of word vectors)


In [19]:
def terms_to_w2v_vector(terms, model):
    vectors = []
    for t in terms:
        if t in model.wv:
            vectors.append(model.wv[t])
    if not vectors:
        return None
    return np.mean(vectors, axis=0)

doc_w2v_vectors = {}

for doc_id in range(N):
    terms = products_cleaned.loc[doc_id, "title"] + products_cleaned.loc[doc_id, "description"]
    vec = terms_to_w2v_vector(terms, w2v_model)
    if vec is not None:
        doc_w2v_vectors[doc_id] = vec

print("Docs with Word2Vec vector:", len(doc_w2v_vectors), "of", N)

Docs with Word2Vec vector: 28080 of 28080


### 2.3 Ranking documents with Word2Vec + cosine (top-20 for the 5 queries)


In [None]:
def cosine_sim(a, b):
    denom = la.norm(a) * la.norm(b)
    if denom == 0:
        return 0.0
    return float(np.dot(a, b) / denom)

def rank_documents_word2vec(query, docs_to_rank, model, doc_vectors):
    query_terms = build_terms(query)
    q_vec = terms_to_w2v_vector(query_terms, model)
    if q_vec is None or not docs_to_rank:
        return [], {}
    
    scores = {}
    for doc_id in docs_to_rank:
        d_vec = doc_vectors.get(doc_id)
        if d_vec is None:
            continue
        scores[doc_id] = cosine_sim(q_vec, d_vec)
    
    ranked_docs = sorted(scores.keys(), key=lambda d: scores[d], reverse=True)
    return ranked_docs, scores

for qid, query in test_queries.items():
    candidate_docs = find_candidate_docs(query, index)
    ranked_docs_w2v, scores_w2v = rank_documents_word2vec(
        query, candidate_docs, w2v_model, doc_w2v_vectors
    )
    print("Query ID:", qid)
    print("Query:", query)
    print("Number of candidate docs:", len(candidate_docs))
    print("Top 20 doc IDs:", ranked_docs_w2v[:20])
    print("-" * 60)


Query ID: q1
Query: women full sleeve sweatshirt cotton
Number of candidate docs: 215
Top 20 doc IDs: [14655, 4288, 4290, 22995, 23042, 23044, 23046, 23054, 23056, 23060, 22856, 22869, 22892, 22924, 22985, 22990, 23006, 23015, 23021, 23029]
------------------------------------------------------------
Query ID: q2
Query: men slim jeans blue
Number of candidate docs: 176
Top 20 doc IDs: [11292, 10283, 26174, 10303, 10308, 26184, 10313, 26186, 11339, 11350, 10348, 10391, 10401, 5797, 10415, 10416, 10417, 5827, 5828, 6858]
------------------------------------------------------------
Query ID: q3
Query: neck solid fit
Number of candidate docs: 742
Top 20 doc IDs: [13512, 12990, 12992, 12988, 12184, 12989, 12223, 26052, 12143, 12208, 11403, 12104, 12224, 21243, 21270, 21334, 21352, 12222, 21762, 21802]
------------------------------------------------------------
Query ID: q4
Query: trendiest glossi
Number of candidate docs: 0
Top 20 doc IDs: []
-----------------------------------------------

## 3. Can you imagine a better representation than word2vec?



Word2Vec gives us a good starting point, but it is still quite simple: it learns a separate vector for each word and then we represent a document by just averaging those word vectors. In that process we lose information about document structure and some nuances of meaning.

One possible improvement is **Doc2Vec**, which directly learns a vector for each document instead of only for individual words. In this way, the model can capture more global information about the document (topics, style, etc.) in a single embedding. The drawback is that Doc2Vec is usually harder to train and more sensitive to hyperparameters and data size than the simple “average of Word2Vec vectors”, so it is also more difficult to tune and debug in practice.
