In [1]:
import os
import json
import nltk # if you do not have 'nltk', the following command should work "python -m pip install nltk"
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import math
import numpy as np
import collections
from numpy import linalg as la
import string
import re

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/julialopezpinot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/julialopezpinot/nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/julialopezpinot/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/julialopezpinot/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/julialopezpinot/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
docs_path = os.path.join('..', '..', 'data', 'fashion_products_dataset.json')

with open(docs_path, 'r') as f:
    products = json.load(f)

print("Total number of products in the corpus: {}".format(len(products)))

Total number of products in the corpus: 28080


In [4]:
#Display the first 5 products
display(products[:5])

#Display the keys of the first product to see available information
if products:
    print("\nKeys available for each product:")
    print(products[0].keys())

[{'_id': 'fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a',
  'actual_price': '2,999',
  'average_rating': '3.9',
  'brand': 'York',
  'category': 'Clothing and Accessories',
  'crawled_at': 1612987911000,
  'description': 'Yorker trackpants made from 100% rich combed cotton giving it a rich look.Designed for Comfort,Skin friendly fabric,itch-free waistband & great for all year round use Proudly made in India',
  'discount': '69% off',
  'images': ['https://rukminim1.flixcart.com/image/128/128/jr3t5e80/track-pant/z/y/n/m-1005combo2-yorker-original-imafczg3xfh5qqd4.jpeg?q=70',
   'https://rukminim1.flixcart.com/image/128/128/jr58l8w0/track-pant/w/d/a/l-1005combo8-yorker-original-imafczg3pgtxgraq.jpeg?q=70'],
  'out_of_stock': False,
  'pid': 'TKPFCZ9EA7H5FYZH',
  'product_details': [{'Style Code': '1005COMBO2'},
   {'Closure': 'Elastic'},
   {'Pockets': 'Side Pockets'},
   {'Fabric': 'Cotton Blend'},
   {'Pattern': 'Solid'},
   {'Color': 'Multicolor'}],
  'seller': 'Shyam Enterprises',
  'selling_


Keys available for each product:
dict_keys(['_id', 'actual_price', 'average_rating', 'brand', 'category', 'crawled_at', 'description', 'discount', 'images', 'out_of_stock', 'pid', 'product_details', 'seller', 'selling_price', 'sub_category', 'title', 'url'])


In [5]:
# display list of selling_price values for each product (products is a list of dicts)
display([p.get('discount') for p in products])

['69% off',
 '66% off',
 '68% off',
 '69% off',
 '68% off',
 '74% off',
 '68% off',
 '63% off',
 '67% off',
 '64% off',
 '76% off',
 '49% off',
 '49% off',
 '61% off',
 '61% off',
 '48% off',
 '61% off',
 '61% off',
 '61% off',
 '69% off',
 '70% off',
 '67% off',
 '70% off',
 '76% off',
 '71% off',
 '70% off',
 '71% off',
 '76% off',
 '67% off',
 '67% off',
 '66% off',
 '66% off',
 '68% off',
 '64% off',
 '29% off',
 '79% off',
 '79% off',
 '73% off',
 '80% off',
 '79% off',
 '79% off',
 '81% off',
 '79% off',
 '80% off',
 '79% off',
 '72% off',
 '80% off',
 '79% off',
 '79% off',
 '81% off',
 '79% off',
 '79% off',
 '77% off',
 '80% off',
 '79% off',
 '81% off',
 '79% off',
 '79% off',
 '67% off',
 '79% off',
 '71% off',
 '79% off',
 '79% off',
 '79% off',
 '77% off',
 '77% off',
 '76% off',
 '79% off',
 '77% off',
 '77% off',
 '77% off',
 '79% off',
 '79% off',
 '77% off',
 '79% off',
 '79% off',
 '79% off',
 '77% off',
 '77% off',
 '77% off',
 '81% off',
 '79% off',
 '82% off',
 '81

# Preprocessing Functions

In [6]:
def setup_preprocessing_tools():
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    stop_words.update(['made', 'wear', 'buy', 'product', 'latest'])
    stop_words.discard('no')
    stop_words.discard('not')
    return stemmer, lemmatizer, stop_words

In [7]:
def preprocess_text(text, lemmatizer, stemmer, stop_words):
    """
    Preprocess natural text:
    - lowercase
    - remove punctuation/numbers
    - tokenize
    - remove stopwords and non-alphabetic tokens
    - stem + lemmatize
    """
    if not isinstance(text, str):
        return []
    
    text = text.replace('-', ' ')

    # lowercase
    text = text.lower()

    # remove punctuation and digits
    text = re.sub(f"[{re.escape(string.punctuation)}0-9]", " ", text)

    # tokenize
    tokens = word_tokenize(text)

    # filter tokens (stopwords, non-alpha, short tokens)
    tokens = [w for w in tokens if w.isalpha() and w not in stop_words]

    # stem + lemmatize
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    tokens = [stemmer.stem(w) for w in tokens]

    # Optional: normalize color/material terms (bonus)
    color_map = {'navy': 'blue', 'grey': 'gray', 'fucsia': 'pink', 'burgundy': 'red', 'violet': 'purple', 'beige': 'brown', 'magenta': 'pink', 'indigo': 'blue', 
                 'charcoal': 'gray', 'crimson': 'red', 'teal': 'green', 'lavender': 'purple', 'mustard': 'yellow', 'turquoise': 'blue', 'peach': 'orange'}
    
    #!!!!!!!!!!!!!!!!!!!!!!!!!hink more materials to add here
    material_map = {'polyester': 'synthetic', 'denim': 'cotton'}
    tokens = [color_map.get(w, material_map.get(w, w)) for w in tokens]

    return tokens


In [8]:
def clean_numeric(value, value_type='float'):
  """Convert strings to numeric values"""
  if not isinstance(value, str):
      return None

  #Eliminate non-numerical characters (except dots)
  cleaned_value = re.sub(r'[^0-9.]', '', value)
  if not cleaned_value:
      return None

  try:
      if value_type == 'int':
          return int(float(cleaned_value))
      else:
          return float(cleaned_value)
  except (ValueError, TypeError):
      return None

In [9]:
#Apply some preprocessing to categorical fields such as "brand". Here we only use conversion to lowercase and join everything into a single token, since it's already clean for the most part
def clean_categorical(value):
    """Few processing for categorical fields"""
    if not isinstance(value, str):
        return None

    #Just convert to lowercase and join everything into a single token separating words by "_"
    return value.lower().strip().replace(' ', '_')

In [None]:
stemmer, lemmatizer, stop_words = setup_preprocessing_tools()

processed_corpus = []
for prod in products:
    processed_prod={
        'pid': prod.get('pid'),
        'url': prod.get('url'),
    }

    # text fields

    title_text = prod.get('title', '') or ''
    description_text = prod.get('description', '') or ''
    full_text = title_text + " " + description_text

    processed_prod['processed_text'] = preprocess_text(full_text, lemmatizer, stemmer, stop_words)

    #store raw title/description to include in final output
    processed_prod['title'] = prod.get('title')
    processed_prod['description'] = prod.get('description')

    #Processing of categorical fields
    processed_prod['brand_facet'] = clean_categorical(prod.get('brand'))
    processed_prod['category_facet'] = clean_categorical(prod.get('category'))
    processed_prod['subcategory_facet'] = clean_categorical(prod.get('sub_category'))
    processed_prod['seller_facet'] = clean_categorical(prod.get('seller'))

    #Processing of numerical fields (passed to float)
    processed_prod['discount'] = clean_numeric(prod.get('discount'), 'float')
    processed_prod['selling_price'] = clean_numeric(prod.get('selling_price'), 'float')
    processed_prod['actual_price'] = clean_numeric(prod.get('actual_price'), 'float')
    processed_prod['average_rating'] = clean_numeric(prod.get('average_rating'), 'float')
    
    
    #Process product_details separately
    #Extract the different values and process them as natural text alltogether
    details_text = ''
    if isinstance(prod.get('product_details'), list):
        details_values = [list(item.values())[0] for item in prod.get('product_details', []) if item and isinstance(list(item.values())[0], str)]
        details_text = ' '.join(details_values)
    processed_prod['attributes'] = preprocess_text(details_text, lemmatizer, stemmer, stop_words)

    processed_corpus.append(processed_prod)
    

In [15]:
#Example of product preprocessed
display(processed_corpus[0])

{'pid': 'TKPFCZ9EA7H5FYZH',
 'url': 'https://www.flipkart.com/yorker-solid-men-multicolor-track-pants/p/itmd2c76aadce459?pid=TKPFCZ9EA7H5FYZH&lid=LSTTKPFCZ9EA7H5FYZHVYXWP0&marketplace=FLIPKART&srno=b_1_1&otracker=browse&fm=organic&iid=177a46eb-d053-4732-b3de-fcad6ff59cbd.TKPFCZ9EA7H5FYZH.SEARCH&ssid=utkd4t3gb40000001612415717799',
 'processed_text': ['solid',
  'woman',
  'multicolor',
  'track',
  'pant',
  'yorker',
  'trackpant',
  'rich',
  'comb',
  'cotton',
  'give',
  'rich',
  'look',
  'design',
  'comfort',
  'skin',
  'friendli',
  'fabric',
  'itch',
  'free',
  'waistband',
  'great',
  'year',
  'round',
  'use',
  'proudli',
  'india'],
 'brand_facet': 'york',
 'category_facet': 'clothing_and_accessories',
 'subcategory_facet': 'bottomwear',
 'seller_facet': 'shyam_enterprises',
 'discount': 69.0,
 'selling_price': 921.0,
 'actual_price': 2999.0,
 'average_rating': 3.9,
 'attributes': ['combo',
  'elast',
  'side',
  'pocket',
  'cotton',
  'blend',
  'solid',
  'multic