# Part 2. Indexing and Evaluation

Author/s: <font color="blue">Jhonatan Barcos Gambaro | Daniel Alexander Yearwood</font>

E-mail: <font color="blue">jhonatan.barcos01@estudiant.upf.edu | danielalexander.yearwood01@estudiant.upf.edu </font>

Date: <font color="blue">31/10/2025</font>

In [23]:
# Import libraries
import collections
import math
import re
import time
from array import array
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from numpy import linalg as la


## 0. Data Preprocessing (Recap Part 1)

For this part of the project, we only need to clean the text of the “title” and “description” fields. Therefore, we reproduce here only the steps from Part 1 that are essential for indexing and evaluation.

In [11]:
# Load the raw product catalogue (one row per product).
data_path = '../../data/fashion_products_dataset.json'
products = pd.read_json(data_path)

# Text preprocessing resources shared by build_terms.
stop_words = set(stopwords.words("english"))
# PorterStemmer is imported by name in the import cell; the bare `nltk`
# module name is not bound there, so `nltk.PorterStemmer()` fails with a
# NameError on a fresh kernel.
stemmer = PorterStemmer()

# Domain-specific stop words: terms that are too frequent in this catalogue
# to help discriminate between products.
stop_words_domain = {
    'made', 'india', 'proudly', 'use', 'year', 'round',
    'look', 'design', 'qualiti', 'day', 'make',
    'feel', 'perfect', 'great', 'wash', 'style',
}
stop_words = stop_words.union(stop_words_domain)

# Redefinition of clean_text from Part 1: build_terms returns a list of tokens
def build_terms(text):
    """
    Turn a raw text into the list of normalized terms used for indexing and querying.

    Steps: remove digits, lowercase, tokenize, drop stop words and
    non-alphanumeric tokens, stem, and drop stems that are stop words.

    Relies on the module-level `stop_words`, `stemmer` and `word_tokenize`.

    Argument:
    text -- raw text (title, description or user query)

    Returns:
    list of stemmed terms, in order of appearance; an empty list when `text`
    is not a string (e.g. a missing value such as None or NaN)
    """
    # Missing values reach this function as None / float('nan'); they carry no terms.
    if not isinstance(text, str):
        return []

    text = re.sub(r'\d+', '', text)
    word_tokens = word_tokenize(text.lower())

    terms = []
    for word in word_tokens:
        if word in stop_words or not word.isalnum():
            continue
        stemmed = stemmer.stem(word)
        # Some stop words are listed in stemmed form (e.g. 'qualiti'), so the
        # stop list has to be checked after stemming as well; otherwise
        # 'quality' would survive the first filter and be indexed as 'qualiti'.
        if stemmed not in stop_words:
            terms.append(stemmed)
    return terms

# Apply build_terms to the 'title' and 'description' columns of a copy of the
# dataset, so the raw `products` frame stays available for later cells.
products_cleaned = products.copy()
# Missing values are replaced by '' first (the same guard used when building
# 'content_to_index') so that build_terms always receives a string.
products_cleaned['title'] = products_cleaned['title'].fillna('').apply(build_terms)
products_cleaned['description'] = products_cleaned['description'].fillna('').apply(build_terms)

In [None]:
# Raw title of every product, keyed by the row label of `products`
title_index = products['title'].to_dict()

# Text that gets indexed for each product: title followed by description,
# with missing values treated as empty strings
titles_filled = products['title'].fillna('')
descriptions_filled = products['description'].fillna('')
products['content_to_index'] = titles_filled + ' ' + descriptions_filled
page_contents = products['content_to_index'].tolist()

# Two-way mapping between row labels (doc ids) and product ids (pids)
doc_id_to_pid = products['pid'].to_dict()
pid_to_doc_id = {pid: doc_id for doc_id, pid in doc_id_to_pid.items()}

# Number of documents in the collection
N = len(page_contents)

Creant estructures de dades (page_contents, title_index)...


## 1. Indexing

### 1.1. Build inverted index

In [None]:
# Function adapted from Lab 1
def create_index_tfidf(documents_content, title_index_map, num_documents):
    """
    Build a positional inverted index together with tf, df and idf weights.

    Each document is tokenized with `build_terms`; its id is its position in
    `documents_content`.

    Argument:
    documents_content -- iterable with the text of each document
    title_index_map -- dict mapping document ids to titles
    num_documents -- total number of documents (numerator of the idf)

    Returns:
    index -- term -> list of postings [doc_id, array('I', positions)], one
             posting per document containing the term, in document order
    tf -- term -> list of normalized term frequencies; tf[term][i] belongs to
          the document of index[term][i]. Each value is the raw count divided
          by the Euclidean norm of all raw counts of that document, rounded to
          4 decimals
    df -- term -> number of documents containing the term
    idf -- term -> log(num_documents / df[term]), rounded to 4 decimals
    title_index -- doc_id -> title taken from title_index_map, or
                   "No Title Found" when the id is missing there
    """
    index = defaultdict(list)
    tf = defaultdict(list)
    df = defaultdict(int)
    idf = defaultdict(float)
    title_index = defaultdict(str)

    for page_id, content in enumerate(documents_content):
        terms = build_terms(content)
        title_index[page_id] = title_index_map.get(page_id, "No Title Found")

        # Postings of the current document only: term -> [page_id, positions].
        # Example for document 1 with text "web retrieval information retrieval":
        #   {'web': [1, [0]], 'retrieval': [1, [1, 3]], 'information': [1, [2]]}
        page_postings = {}
        for position, term in enumerate(terms):
            if term in page_postings:
                page_postings[term][1].append(position)
            else:
                page_postings[term] = [page_id, array('I', [position])]

        # Euclidean norm of the raw term counts; shared by every term of the document.
        # The number of stored positions of a term is its raw count.
        norm = math.sqrt(sum(len(positions) ** 2 for _, positions in page_postings.values()))

        # Update the global structures with this document's postings.
        for term, posting in page_postings.items():
            tf[term].append(np.round(len(posting[1]) / norm, 4))
            df[term] += 1
            index[term].append(posting)

    # idf can only be computed once every document frequency is final.
    for term, doc_frequency in df.items():
        idf[term] = np.round(np.log(float(num_documents / doc_frequency)), 4)

    return index, tf, df, idf, title_index


In [None]:
# Build the TF-IDF index over the whole collection and report how long it took.
# Note: `title_index` is passed in as the raw title mapping and is rebound to
# the mapping returned by create_index_tfidf.
build_started_at = time.time()

index, tf, df, idf, title_index = create_index_tfidf(page_contents, title_index, N)

elapsed_seconds = np.round(time.time() - build_started_at, 2)
print("Total time to create the TD-IDF index: {} seconds".format(elapsed_seconds))

Total time to create the TD-IDF index: 24.39 seconds


### 1.2. Propose test queries

Five test queries were defined:
  q1: women full sleeve sweatshirt cotton
  q2: men slim jeans blue
  q3: yorker trackpants
  q4: saree silk traditional
  q5: black solid jacket


### 1.3. Rank your results

In [None]:
# Function adapted from Lab 1 for AND search

def search_tf_idf(query, index):
    """
    Conjunctive (AND) search ranked by TF-IDF.

    The query is normalized with `build_terms`; only documents containing
    every query term are kept and then ranked with `rank_documents`, using the
    module-level `idf`, `tf` and `title_index` built by create_index_tfidf.

    Argument:
    query -- raw query string
    index -- inverted index: term -> list of postings [doc_id, positions]

    Returns:
    ranked_docs -- ids of the matching documents, best match first
    doc_scores -- list of [score, doc_id] pairs in the same order
    Both lists are empty when the query has no indexable term or when no
    document contains all the terms.
    """
    query = build_terms(query)
    if not query:
        # Nothing left after preprocessing (e.g. only stop words).
        return [], []

    docs = None
    for term in query:
        # Test membership explicitly: `index` is a defaultdict, so index[term]
        # never raises KeyError and would instead insert an empty posting list
        # for every unknown query term.
        if term not in index:
            return [], []
        term_docs = {posting[0] for posting in index[term]}
        docs = term_docs if docs is None else docs & term_docs
        if not docs:
            # The intersection is already empty; no document can match all terms.
            return [], []

    ranked_docs, doc_scores = rank_documents(query, list(docs), index, idf, tf, title_index)
    return ranked_docs, doc_scores


In [None]:
# Ranking procedure:
#  1. Represent the query as a weighted tf-idf vector
#  2. Represent each document as a weighted tf-idf vector
#  3. Score each document with the dot product of its vector and the query vector
#  4. Sort the documents by score, best first (the caller keeps the top K)

# Function adapted from Lab 1 for ranking documents based on TF-IDF
def rank_documents(terms, docs, index, idf, tf, title_index):
    """
    Rank the documents matching a query by their tf-idf score.

    Argument:
    terms -- list of (already preprocessed) query terms
    docs -- collection of ids of the documents to rank
    index -- inverted index: term -> list of postings [doc_id, positions]
    idf -- inverse document frequency of each term
    tf -- term -> list of term frequencies, aligned with index[term]
    title_index -- mapping between page id and page title (not used here;
                   kept so existing callers keep working)

    Returns:
    result_docs -- document ids sorted by decreasing score
    doc_scores -- list of [score, doc_id] pairs in the same order
    Both lists are empty when no document of `docs` contains any query term;
    in that case a short notice is printed.
    """
    # Set membership keeps the per-posting check below constant time.
    candidate_docs = set(docs)

    # Only the vector components of the query terms matter: every other
    # component would be multiplied by 0 in the dot product.
    # doc_vectors[d] is created on first access as a zero vector.
    doc_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    # Frequency of each term in the query,
    # e.g. Counter(["hello", "hello", "world"]) --> Counter({'hello': 2, 'world': 1})
    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  # termIndex is the position of the term in the query
        if term not in index:
            continue

        # tf * idf of the query term (tf normalized as done for the documents)
        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        # index[term][doc_index] is the posting [doc, positions] and
        # tf[term][doc_index] is the tf of `term` in that same document.
        for doc_index, (doc, postings) in enumerate(index[term]):
            if doc in candidate_docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]

    # Score of each document: dot product between its vector and the query vector.
    doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]

    if len(result_docs) == 0:
        # Only report the empty result. Prompting with input() here would
        # block non-interactive runs, and the outcome of a nested search was
        # never returned to the caller anyway.
        print("No results found, try again")
    return result_docs, doc_scores

In [28]:
# Demo: run the first test query and list the best-ranked products.
print("Insert your query (i.e.: women full sleeve sweatshirt cotton):\n")
query = "women full sleeve sweatshirt cotton"
ranked_docs, scores = search_tf_idf(query, index)
top = 10  # number of results to display

summary_header = "\n======================\nTop {} results out of {} for the searched query:\n"
print(summary_header.format(top, len(ranked_docs)))
for d_id in ranked_docs[:top]:
    result_line = "page_id= {} - page_title: {}".format(d_id, title_index[d_id])
    print(result_line)

Insert your query (i.e.: women full sleeve sweatshirt cotton):


Top 10 results out of 215 for the searched query:

page_id= 4290 - page_title: Full Sleeve Solid Women Sweatshirt
page_id= 4288 - page_title: Full Sleeve Solid Women Sweatshirt
page_id= 25149 - page_title: Full Sleeve Self Design Women Sweatshirt
page_id= 25300 - page_title: Full Sleeve Solid Women Sweatshirt
page_id= 14655 - page_title: Full Sleeve Solid Women Sweatshirt
page_id= 25151 - page_title: Full Sleeve Color Block Women Sweatshirt
page_id= 25015 - page_title: Full Sleeve Color Block Women Sweatshirt
page_id= 22995 - page_title: Full Sleeve Color Block Women Sweatshirt
page_id= 25142 - page_title: Full Sleeve Self Design, Color Block Women Sweatshirt
page_id= 24129 - page_title: Full Sleeve Graphic Print Women Sweatshirt
