# Part 1: Text Processing and Exploratory Data Analysis

Author/s: <font color="blue">Jhonatan Barcos Gambaro | Daniel Alexander Yearwood</font>

E-mail: <font color="blue">jhonatan.barcos01@estudiant.upf.edu | danielalexander.yearwood01@estudiant.upf.edu </font>

Date: <font color="blue">24/10/2025</font>

In [66]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [67]:
# Download nltk resources
#nltk.download('punkt')
#nltk.download('stopwords')

In [68]:
# Load the fashion products dataset from its JSON file
data_path = '../../data/fashion_products_dataset.json'
products = pd.read_json(data_path)

# Preview the first five rows (head() defaults to 5)
display(products.head())

Unnamed: 0,_id,actual_price,average_rating,brand,category,crawled_at,description,discount,images,out_of_stock,pid,product_details,seller,selling_price,sub_category,title,url
0,fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a,2999,3.9,York,Clothing and Accessories,2021-02-10 20:11:51,Yorker trackpants made from 100% rich combed c...,69% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EA7H5FYZH,"[{'Style Code': '1005COMBO2'}, {'Closure': 'El...",Shyam Enterprises,921,Bottomwear,Solid Women Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...
1,893e6980-f2a0-531f-b056-34dd63fe912c,1499,3.9,York,Clothing and Accessories,2021-02-10 20:11:52,Yorker trackpants made from 100% rich combed c...,66% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EJZV2UVRZ,"[{'Style Code': '1005BLUE'}, {'Closure': 'Draw...",Shyam Enterprises,499,Bottomwear,Solid Men Blue Track Pants,https://www.flipkart.com/yorker-solid-men-blue...
2,eb4c8eab-8206-59d0-bcd1-a724d96bf74f,2999,3.9,York,Clothing and Accessories,2021-02-10 20:11:52,Yorker trackpants made from 100% rich combed c...,68% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EHFCY5Z4Y,"[{'Style Code': '1005COMBO4'}, {'Closure': 'El...",Shyam Enterprises,931,Bottomwear,Solid Men Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...
3,3f3f97bb-5faf-57df-a9ff-1af24e2b1045,2999,3.9,York,Clothing and Accessories,2021-02-10 20:11:53,Yorker trackpants made from 100% rich combed c...,69% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9ESZZ7YWEF,"[{'Style Code': '1005COMBO3'}, {'Closure': 'El...",Shyam Enterprises,911,Bottomwear,Solid Women Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...
4,750caa3d-6264-53ca-8ce1-94118a1d8951,2999,3.9,York,Clothing and Accessories,2021-02-10 20:11:53,Yorker trackpants made from 100% rich combed c...,68% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EVXKBSUD7,"[{'Style Code': '1005COMBO1'}, {'Closure': 'Dr...",Shyam Enterprises,943,Bottomwear,"Solid Women Brown, Grey Track Pants",https://www.flipkart.com/yorker-solid-men-brow...


## 1.1. Pre-Processing text

Pre-processing of the documents, in particular of the text fields (title,
description).

In [69]:
# Text normalization pipeline implemented by clean_text:
# 1. Lowercase and tokenize with nltk's word_tokenize
# 2. Remove English stop words
# 3. Remove punctuation (any token that is not purely alphanumeric)
# 4. Stem the remaining tokens with nltk's PorterStemmer
stop_words = set(stopwords.words("english"))
stemmer = nltk.PorterStemmer()

def clean_text(text):
    """Return `text` normalized into a space-separated string of stems.

    Args:
        text: Raw document text (e.g. a product title or description).

    Returns:
        str: The stemmed tokens joined by single spaces. Non-string input
        (None, NaN, numbers) yields an empty string instead of raising, so the
        function can be applied to columns with missing values.
    """
    if not isinstance(text, str):
        return ''

    # Filter and stem in a single pass: the kept tokens are purely
    # alphanumeric, so joining and tokenizing them again is unnecessary.
    kept_tokens = [
        token
        for token in word_tokenize(text.lower())
        if token not in stop_words and token.isalnum()
    ]
    return ' '.join(stemmer.stem(token) for token in kept_tokens)

In [70]:
def clean_text_without_stemming(text):
    """Return `text` lowercased, tokenized and filtered, without stemming.

    Args:
        text: Raw document text.

    Returns:
        str: Tokens that are alphanumeric and not English stop words, joined
        by single spaces. Non-string input (None, NaN, numbers) yields an
        empty string instead of raising.
    """
    if not isinstance(text, str):
        return ''

    # The local is deliberately not called `clean_text`, which would shadow
    # the sibling function of that name inside this scope.
    kept_tokens = [
        token
        for token in word_tokenize(text.lower())
        if token not in stop_words and token.isalnum()
    ]
    return ' '.join(kept_tokens)

In [71]:
# Build the cleaned frame from a copy of `products`: the 'title' column is
# replaced by its cleaned version and the cleaned description is stored in a
# new 'cleaned' column
products_cleaned = products.assign(
    title=products['title'].apply(clean_text),
    cleaned=products['description'].apply(clean_text),
)

In [72]:
# Show the raw and cleaned title/description side by side for the first 2 products
for row in range(2):
    raw_row = products.iloc[row]
    cleaned_row = products_cleaned.iloc[row]
    print("Title:", raw_row['title'])
    print("Cleaned Title:", cleaned_row['title'])
    print("Description:", raw_row['description'])
    print("Cleaned Description:", cleaned_row['cleaned'], "\n")

Title: Solid Women Multicolor Track Pants
Cleaned Title: solid women multicolor track pant
Description: Yorker trackpants made from 100% rich combed cotton giving it a rich look.Designed for Comfort,Skin friendly fabric,itch-free waistband & great for all year round use Proudly made in India
Cleaned Description: yorker trackpant made 100 rich comb cotton give rich comfort skin friendli fabric waistband great year round use proudli made india 

Title: Solid Men Blue Track Pants
Cleaned Title: solid men blue track pant
Description: Yorker trackpants made from 100% rich combed cotton giving it a rich look.Designed for Comfort,Skin friendly fabric,itch-free waistband & great for all year round use Proudly made in India
Cleaned Description: yorker trackpant made 100 rich comb cotton give rich comfort skin friendli fabric waistband great year round use proudli made india 



## 1.2. Handling of category, sub_category, brand, product_details, and seller during pre-processing


In [73]:
# Inspect category, sub_category, brand, product_details and seller before
# deciding how to handle them during pre-processing
columns_to_analyze = ['category', 'sub_category', 'brand', 'product_details', 'seller']
# product_details holds lists, which nunique() cannot hash, so it is excluded here
columns_to_analyze_reduced = [col for col in columns_to_analyze if col != 'product_details']

# Sample of the raw values
print("Displaying first 3 products of the columns to analyze:")
sample_rows = products[columns_to_analyze].head(3)
display(sample_rows)

# Cardinality of each scalar column
print("\nDisplaying unique values in each column to analyze:")
unique_counts = products[columns_to_analyze_reduced].nunique()
display(unique_counts)

# Raw product_details lists of the first 10 products
print("\nDisplaying values in product_details for the first 10 products:\n")
details_column = products['product_details']
for position in range(10):
    print("Product", position + 1, "product_details:", details_column.iloc[position])

Displaying first 3 products of the columns to analyze:


Unnamed: 0,category,sub_category,brand,product_details,seller
0,Clothing and Accessories,Bottomwear,York,"[{'Style Code': '1005COMBO2'}, {'Closure': 'El...",Shyam Enterprises
1,Clothing and Accessories,Bottomwear,York,"[{'Style Code': '1005BLUE'}, {'Closure': 'Draw...",Shyam Enterprises
2,Clothing and Accessories,Bottomwear,York,"[{'Style Code': '1005COMBO4'}, {'Closure': 'El...",Shyam Enterprises



Displaying unique values in each column to analyze:


category          4
sub_category     24
brand           325
seller          535
dtype: int64


Displaying values in product_details for the first 10 products:

Product 1 product_details: [{'Style Code': '1005COMBO2'}, {'Closure': 'Elastic'}, {'Pockets': 'Side Pockets'}, {'Fabric': 'Cotton Blend'}, {'Pattern': 'Solid'}, {'Color': 'Multicolor'}]
Product 2 product_details: [{'Style Code': '1005BLUE'}, {'Closure': 'Drawstring, Elastic'}, {'Pockets': 'Side Pockets'}, {'Fabric': 'Cotton Blend'}, {'Pattern': 'Solid'}, {'Color': 'Blue'}]
Product 3 product_details: [{'Style Code': '1005COMBO4'}, {'Closure': 'Elastic'}, {'Pockets': 'Side Pockets'}, {'Fabric': 'Cotton Blend'}, {'Pattern': 'Solid'}, {'Color': 'Multicolor'}]
Product 4 product_details: [{'Style Code': '1005COMBO3'}, {'Closure': 'Elastic'}, {'Pockets': 'Side Pockets'}, {'Fabric': 'Cotton Blend'}, {'Pattern': 'Solid'}, {'Color': 'Multicolor'}]
Product 5 product_details: [{'Style Code': '1005COMBO1'}, {'Closure': 'Drawstring, Elastic'}, {'Pockets': 'Side Pockets'}, {'Fabric': 'Cotton Blend'}, {'Pattern': 'Solid'}, {'Color': 'Br

Once we have analyzed the columns to pre-process, we will study validation_labels.csv to make the best decision; this file will play a pivotal role in the project’s
second phase.

```text
For your reference:
query_1: women full sleeve sweatshirt cotton
query_2: men slim jeans blue
```


In [74]:
# Markdown and display are imported here because the notebook's import cell
# does not provide them; without this the cell raises NameError on a fresh
# kernel (Restart & Run All).
from IPython.display import Markdown, display

# Written answer for section 1.2, rendered as Markdown below
explanation = """
Recommendation — hybrid approach
- Index category, sub_category, brand and seller as separate fields (fielded index / facets).
- Parse product_details (list of dicts) into structured attribute fields (e.g. color, fabric, pattern) and also produce a flattened details_text.
- Additionally create a merged `all_text` field (title + description + flattened product_details + brand) for full‑text retrieval and ranking.

Justification
- Distinctiveness: category/sub_category give coarse topical signals; brand and seller are metadata/facets that strongly affect filtering and precision; product_details contain fine-grained attributes (color, fabric, fit) that are crucial for exact-match and attribute queries.
- Retrieval effectiveness improves when you can both do free-text matching (high recall) on an aggregated field and apply fielded boosting/filters (high precision) on structured fields.

Pros & cons
- Merged single field only
    - Pros: simpler index; higher recall for free-text queries.
    - Cons: loses attribute semantics (cannot boost by brand or filter by color easily); ranking cannot exploit field importance.
- Separate fields only
    - Pros: precise filtering, faceted navigation, field-specific boosting, better precision.
    - Cons: may reduce recall for queries where words appear across different fields; more index complexity.
- Hybrid (recommended)
    - Pros: best of both — use `all_text` for recall and phrase matching, use structured fields and facets for precision and business rules (boost brand, filter by category, exact match on attributes).
    - Cons: larger index size, slightly more complex ingestion (parse product_details), need to tune field weights and analyzers.

Implementation notes
- Normalize values: lowercase, canonicalize brand/seller names, map synonyms (e.g., "tee" → "t-shirt").
- product_details: extract key/value pairs; index keys as fields and also index "key:value" tokens (e.g., `color_blue`) to preserve semantics and support exact attribute queries.
- Field analyzers: use keyword (non-tokenized) for brand and category IDs, use text analyzer (with stemming/stopword removal) for title/description/details_text.
- Use field boosting at query time (e.g., title>brand>details>all_text) and provide faceted filters on category/sub_category/brand/seller/attributes.

Short summary
- Parse product_details into structured attributes, keep category/sub_category/brand/seller as separate, and also maintain a merged `all_text` for full-text ranking. Tune analyzers and field weights for best balance of recall and precision.
"""

display(Markdown(explanation))


Recommendation — hybrid approach
- Index category, sub_category, brand and seller as separate fields (fielded index / facets).
- Parse product_details (list of dicts) into structured attribute fields (e.g. color, fabric, pattern) and also produce a flattened details_text.
- Additionally create a merged `all_text` field (title + description + flattened product_details + brand) for full‑text retrieval and ranking.

Justification
- Distinctiveness: category/sub_category give coarse topical signals; brand and seller are metadata/facets that strongly affect filtering and precision; product_details contain fine-grained attributes (color, fabric, fit) that are crucial for exact-match and attribute queries.
- Retrieval effectiveness improves when you can both do free-text matching (high recall) on an aggregated field and apply fielded boosting/filters (high precision) on structured fields.

Pros & cons
- Merged single field only
    - Pros: simpler index; higher recall for free-text queries.
    - Cons: loses attribute semantics (cannot boost by brand or filter by color easily); ranking cannot exploit field importance.
- Separate fields only
    - Pros: precise filtering, faceted navigation, field-specific boosting, better precision.
    - Cons: may reduce recall for queries where words appear across different fields; more index complexity.
- Hybrid (recommended)
    - Pros: best of both — use `all_text` for recall and phrase matching, use structured fields and facets for precision and business rules (boost brand, filter by category, exact match on attributes).
    - Cons: larger index size, slightly more complex ingestion (parse product_details), need to tune field weights and analyzers.

Implementation notes
- Normalize values: lowercase, canonicalize brand/seller names, map synonyms (e.g., "tee" → "t-shirt").
- product_details: extract key/value pairs; index keys as fields and also index "key:value" tokens (e.g., `color_blue`) to preserve semantics and support exact attribute queries.
- Field analyzers: use keyword (non-tokenized) for brand and category IDs, use text analyzer (with stemming/stopword removal) for title/description/details_text.
- Use field boosting at query time (e.g., title>brand>details>all_text) and provide faceted filters on category/sub_category/brand/seller/attributes.

Short summary
- Parse product_details into structured attributes, keep category/sub_category/brand/seller as separate, and also maintain a merged `all_text` for full-text ranking. Tune analyzers and field weights for best balance of recall and precision.


**Should they be merged into a single text field, indexed as separate fields in the inverted index or any other alternative?**

**Justify your choice, considering how their distinctiveness may affect retrieval effectiveness.**

**What are pros and cons of each approach?**

In [None]:
# Hybrid approach implementation (fixed flattening for product_details structured as list of single-key dicts)

# Define auxiliar functions 
# 1. preprocess_full_text: to preprocess text for full-text field (All Text)
def preprocess_full_text(text):
    """Turn raw text into the list of stemmed tokens used for the full-text field.

    Args:
        text: Raw field value.

    Returns:
        list: Stems of the lowercased tokens that are alphanumeric and not
        English stop words; an empty list when `text` is not a string.
    """
    if isinstance(text, str):
        stems = []
        for token in nltk.word_tokenize(text.lower()):
            # Keep only alphanumeric, non-stop-word tokens
            if token.isalnum() and token not in stop_words:
                stems.append(stemmer.stem(token))
        return stems

    return []

# 2. preprocess_keyword: to preprocess text for keyword fields (Fielded Index)
def preprocess_keyword(text):
    """Normalize a field value into a single keyword token for exact matching.

    The value is lowercased and trimmed, whitespace runs become underscores,
    and every character that is neither alphanumeric nor an underscore is
    removed. Underscore runs left behind by removed punctuation are collapsed
    and leading/trailing underscores are stripped, so "Men & Women" becomes
    "men_women" rather than "men__women".

    Args:
        text: Raw field value (e.g. a brand, category or attribute value).

    Returns:
        str or None: The normalized keyword, or None when `text` is not a
        string or nothing usable remains after normalization.
    """
    if not isinstance(text, str):
        return None

    lowered = text.lower().strip()
    if not lowered:
        return None

    underscored = re.sub(r'\s+', '_', lowered)
    kept = ''.join(char for char in underscored if char.isalnum() or char == '_')

    # Punctuation surrounded by spaces leaves "__" or dangling "_" behind
    normalized = re.sub(r'_+', '_', kept).strip('_')

    # Punctuation-only input yields no keyword: report it as missing (None)
    # instead of an empty string.
    return normalized or None

# 3. process_product_details: to process product_details list of dicts into text for All Text and keyword fields for Fielded Index
def process_product_details(details_list):
    """Flatten a product_details list into full-text and keyword representations.

    Args:
        details_list: A list of dicts mapping attribute names to values, e.g.
            [{'Color': 'Blue'}, {'Fabric': 'Cotton Blend'}].

    Returns:
        tuple: `(details_text, keyword_fields)` where `details_text` is every
        string key and value joined by spaces (for the full-text field) and
        `keyword_fields` maps `detail_<normalized key>` to the normalized value
        (for the fielded index). Returns `('', {})` when `details_list` is not
        a list. Entries whose key or value is not a string are skipped.
    """
    text_for_all_text_list = []
    keyword_fields_dict = {}
    if not isinstance(details_list, list):
        return '', keyword_fields_dict

    for item_dict in details_list:
        if not isinstance(item_dict, dict):
            continue
        for key, value in item_dict.items():
            # Non-string keys would break the final ' '.join
            if not (isinstance(key, str) and isinstance(value, str)):
                continue

            # 1. Recall channel (all-text field): keep the raw key and value
            text_for_all_text_list.extend((key, value))

            # 2. Precision channel (fielded index). Check the normalized key
            # BEFORE adding the prefix: "detail_" + None is always truthy and
            # would create a bogus "detail_None" field.
            kw_key = preprocess_keyword(key)
            kw_value = preprocess_keyword(value)
            if kw_key and kw_value:
                keyword_fields_dict[f"detail_{kw_key}"] = kw_value

    return ' '.join(text_for_all_text_list), keyword_fields_dict

In [76]:
# Hybrid architecture implementation

# --- Fielded index: one normalized keyword column per metadata field ---
products_cleaned['brand_kw'] = products['brand'].apply(preprocess_keyword)
products_cleaned['category_kw'] = products['category'].apply(preprocess_keyword)
products_cleaned['sub_category_kw'] = products['sub_category'].apply(preprocess_keyword)
products_cleaned['seller_kw'] = products['seller'].apply(preprocess_keyword)

# --- product_details: flattened text chunk + dict of detail_* keyword fields ---
# process_product_details returns a (text, dict) tuple per row; both frames are
# built in one constructor call instead of expanding row by row with
# apply(pd.Series).
details_processed = products['product_details'].apply(process_product_details)
temp_df = pd.DataFrame(
    details_processed.tolist(),
    index=products.index,
    columns=['details_text_chunk', 'details_kw_dict'],
)
details_kw_df = pd.DataFrame(temp_df['details_kw_dict'].tolist(), index=products.index)

# Drop detail_* columns left by a previous run of this cell so that re-running
# it does not append duplicate columns.
products_cleaned = pd.concat(
    [products_cleaned.drop(columns=details_kw_df.columns, errors='ignore'), details_kw_df],
    axis=1,
)

# --- Full-text channel: token lists per source field ---
title_cleaned_tokens = products['title'].apply(preprocess_full_text)
description_cleaned_tokens = products['description'].apply(preprocess_full_text)
brand_cleaned_tokens = products['brand'].apply(preprocess_full_text)
category_cleaned_tokens = products['category'].apply(preprocess_full_text)
sub_category_cleaned_tokens = products['sub_category'].apply(preprocess_full_text)
seller_cleaned_tokens = products['seller'].apply(preprocess_full_text)
details_cleaned_tokens = temp_df['details_text_chunk'].apply(preprocess_full_text)

# Combine all tokens into the 'all_text' field ('+' concatenates the per-row lists)
products_cleaned['all_text'] = (
    title_cleaned_tokens +
    description_cleaned_tokens +
    brand_cleaned_tokens +
    category_cleaned_tokens +
    sub_category_cleaned_tokens +
    seller_cleaned_tokens +
    details_cleaned_tokens
)

In [77]:
# Final check: preview the full-text field, two keyword fields and, when
# present, the colour and fabric attribute columns
print("\n--- Verificació del Resultat Final ---")
optional_detail_cols = ('detail_color', 'detail_fabric')
cols_to_show = ['pid', 'all_text', 'brand_kw', 'category_kw']
cols_to_show += [col for col in optional_detail_cols if col in products_cleaned.columns]

display(products_cleaned[cols_to_show].head())


--- Verificació del Resultat Final ---


Unnamed: 0,pid,all_text,brand_kw,category_kw,detail_color,detail_fabric
0,TKPFCZ9EA7H5FYZH,"[solid, women, multicolor, track, pant, yorker...",york,clothing_and_accessories,multicolor,cotton_blend
1,TKPFCZ9EJZV2UVRZ,"[solid, men, blue, track, pant, yorker, trackp...",york,clothing_and_accessories,blue,cotton_blend
2,TKPFCZ9EHFCY5Z4Y,"[solid, men, multicolor, track, pant, yorker, ...",york,clothing_and_accessories,multicolor,cotton_blend
3,TKPFCZ9ESZZ7YWEF,"[solid, women, multicolor, track, pant, yorker...",york,clothing_and_accessories,multicolor,cotton_blend
4,TKPFCZ9EVXKBSUD7,"[solid, women, brown, grey, track, pant, yorke...",york,clothing_and_accessories,brown_grey,cotton_blend


## 1.3. Handling of out_of_stock, selling_price, discount, actual_price, and average_rating

**Decide how these should be handled during pre-processing to use in further search.**


**Should they be indexed as textual terms?**

# PART 2: Exploratory Data Analysis