In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime.
# NOTE(review): force_remount presumably re-mounts even if the drive is already mounted -- confirm in the Colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [None]:
# Requires NLTK. If it is not installed, the command "python -m pip install nltk" should work.
import nltk
# English stop-word lists (read elsewhere in the notebook through nltk.corpus.stopwords)
nltk.download('stopwords')
# Punkt tokenizer data -- presumably the resource word_tokenize relies on; TODO confirm
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import math
import numpy as np
import collections
from numpy import linalg as la
import string
import re

In [None]:
import json

#Location of the dataset inside the mounted Google Drive
docs_path = '/content/drive/Shareddrives/RIAW/DELIV 1/fashion_products_dataset.json'

#Start from an empty list so that `products` is defined (and not stale from a previous run) even if loading fails
products = []
#Decode the file explicitly as UTF-8 (the usual encoding of JSON files) instead of relying on the platform default encoding
with open(docs_path, 'r', encoding='utf-8') as f:
    products = json.load(f)

print("Total number of products in the corpus: {}".format(len(products)))

Total number of products in the corpus: 28080


In [None]:
#Preview the first 5 products of the corpus
display(products[:5])

#List the fields available on a product, taking the first one as reference (skipped when the corpus is empty)
if len(products) > 0:
    print("\nKeys available for each product:", products[0].keys(), sep="\n")

[{'_id': 'fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a',
  'actual_price': '2,999',
  'average_rating': '3.9',
  'brand': 'York',
  'category': 'Clothing and Accessories',
  'crawled_at': 1612987911000,
  'description': 'Yorker trackpants made from 100% rich combed cotton giving it a rich look.Designed for Comfort,Skin friendly fabric,itch-free waistband & great for all year round use Proudly made in India',
  'discount': '69% off',
  'images': ['https://rukminim1.flixcart.com/image/128/128/jr3t5e80/track-pant/z/y/n/m-1005combo2-yorker-original-imafczg3xfh5qqd4.jpeg?q=70',
   'https://rukminim1.flixcart.com/image/128/128/jr58l8w0/track-pant/w/d/a/l-1005combo8-yorker-original-imafczg3pgtxgraq.jpeg?q=70'],
  'out_of_stock': False,
  'pid': 'TKPFCZ9EA7H5FYZH',
  'product_details': [{'Style Code': '1005COMBO2'},
   {'Closure': 'Elastic'},
   {'Pockets': 'Side Pockets'},
   {'Fabric': 'Cotton Blend'},
   {'Pattern': 'Solid'},
   {'Color': 'Multicolor'}],
  'seller': 'Shyam Enterprises',
  'selling_


Keys available for each product:
dict_keys(['_id', 'actual_price', 'average_rating', 'brand', 'category', 'crawled_at', 'description', 'discount', 'images', 'out_of_stock', 'pid', 'product_details', 'seller', 'selling_price', 'sub_category', 'title', 'url'])


In [None]:
#PREPROCESSING FUNCTIONS
#Module-level state shared by the preprocessing helpers defined in this cell.

#Single Porter stemmer instance, reused for every token
stemmer = PorterStemmer()
#Start from NLTK's English stop-word list (stored as a set) and adapt it below
stop_words = set(stopwords.words("english"))
stop_words.add('made') #Extra stop word added on top of the ones provided by the nltk library
#Keep the negations 'no' and 'not' as regular tokens: eliminating negations could make us obtain the opposite of what we are looking for.
#Note: set.remove raises KeyError if the word is not present in the set.
stop_words.remove('no')
stop_words.remove('not')
#Translation table that deletes every character of string.punctuation (not used by the functions visible in this cell)
translator = str.maketrans('', '', string.punctuation)

#Empty list, not referenced by the code visible here -- NOTE(review): confirm whether later cells use it
processed_tokens = []

#Preprocessing of natural text: lowercase, remove punctuation and digits, tokenize, drop stop words and stem
def preprocess_text(text):
    """Turn a raw string into a list of stemmed, alphabetic, non-stop-word tokens.

    Anything that is not a str yields an empty list.
    """
    if not isinstance(text, str):
        return []

    #Character class matching every punctuation character and every digit
    strip_pattern = f'[{re.escape(string.punctuation)}0-9]'

    #Lowercase first, then replace punctuation/digits by blanks (not by nothing) so that
    #neighbouring words are not glued together and text attached to a number is kept
    lowered = text.lower()
    spaced = re.sub(strip_pattern, ' ', lowered)

    #word_tokenize is used instead of split() as it is more accurate
    stems = []
    for token in word_tokenize(spaced):
        #Skip stop words and any token that is not purely alphabetical
        if token in stop_words or not token.isalpha():
            continue
        stems.append(stemmer.stem(token))

    return stems

#Conversion of numbers that appear inside a string to numerical values
def clean_numeric(value, value_type='float'):
    """Extract the first number found in a string and return it as a numeric value.

    Thousands separators are dropped and surrounding text is ignored, e.g.
    '2,999' -> 2999.0, '69% off' -> 69 (with value_type='int'), 'Rs. 500' -> 500.0.
    Only the first number is used, so '4.5/5' -> 4.5 (digits of separate numbers
    are never glued together).

    Args:
        value: Raw field value. Anything that is not a str yields None.
        value_type: 'int' to truncate the parsed number to an int; any other
            value returns a float.

    Returns:
        The parsed int/float, or None when no number can be extracted.
    """
    if not isinstance(value, str):
        return None

    #First alternative: digits with optional comma groups and optional decimals.
    #Second alternative: a bare decimal such as '.5'; the lookbehind rejects a dot that
    #ends an abbreviation ('Rs.500'), so it is not mistaken for a decimal point.
    match = re.search(r'[0-9]+(?:,[0-9]+)*(?:\.[0-9]+)?|(?<![A-Za-z])\.[0-9]+', value)
    if match is None:
        return None
    cleaned_value = match.group(0).replace(',', '')

    try:
        number = float(cleaned_value)
        if value_type == 'int':
            #int() raises OverflowError when the float overflowed to infinity
            return int(number)
        return number
    except (ValueError, TypeError, OverflowError):
        return None

#Light preprocessing for categorical fields such as "brand": these are already mostly clean, so we only lowercase them and join the words into a single token
def clean_categorical(value):
    """Normalize a categorical field into a single lowercase token.

    Words are joined with "_". Any run of whitespace (repeated spaces, tabs,
    newlines) counts as one separator and leading/trailing whitespace is ignored,
    so values that differ only in spacing map to the same token.

    Args:
        value: Raw field value. Anything that is not a str yields None.

    Returns:
        The normalized token (an empty string for an empty/blank input), or None.
    """
    if not isinstance(value, str):
        return None

    #split() with no argument splits on any whitespace run and drops empty pieces
    return '_'.join(value.lower().split())

In [None]:
#APPLY PREPROCESSING TO THE CORPUS

def _details_to_text(product_details):
    """Join every string value found in a product_details list into one text.

    product_details is expected to be a list of small dicts such as
    [{'Closure': 'Elastic'}, {'Pockets': 'Side Pockets'}]. Entries that are not
    dicts and values that are not strings are skipped, so malformed records give
    an empty text instead of raising.
    """
    if not isinstance(product_details, list):
        return ''
    return ' '.join(
        detail_value
        for item in product_details
        if isinstance(item, dict)
        for detail_value in item.values()
        if isinstance(detail_value, str)
    )


def _process_product(prod):
    """Build the processed record (a new dict) for one raw product dict."""
    #Original fields, copied unchanged
    processed_prod = {
        'pid': prod.get('pid'),
        'title': prod.get('title'),
        'description': prod.get('description'),
        'brand': prod.get('brand'),
        'category': prod.get('category'),
        'sub_category': prod.get('sub_category'),
        'product_details': prod.get('product_details'),
        'seller': prod.get('seller'),
        'url': prod.get('url'),
    }

    #Natural text: title and description are combined into a single field.
    #Missing or non-string values count as empty text.
    text_parts = (prod.get('title'), prod.get('description'))
    full_text = ' '.join(part if isinstance(part, str) else '' for part in text_parts)
    processed_prod['processed_text'] = preprocess_text(full_text)

    #Categorical fields
    processed_prod['brand_facet'] = clean_categorical(prod.get('brand'))
    processed_prod['category_facet'] = clean_categorical(prod.get('category'))
    processed_prod['subcategory_facet'] = clean_categorical(prod.get('sub_category'))
    processed_prod['seller_facet'] = clean_categorical(prod.get('seller'))

    #product_details is handled separately: its values are joined and processed as natural text
    processed_prod['attributes'] = preprocess_text(_details_to_text(prod.get('product_details')))

    #Numerical and boolean fields (a product with no 'out_of_stock' key is treated as out of stock)
    processed_prod['out_of_stock'] = prod.get('out_of_stock', True)
    processed_prod['selling_price'] = clean_numeric(prod.get('selling_price'), 'float')
    processed_prod['actual_price'] = clean_numeric(prod.get('actual_price'), 'float')
    processed_prod['discount'] = clean_numeric(prod.get('discount'), 'int')
    processed_prod['average_rating'] = clean_numeric(prod.get('average_rating'), 'float')

    return processed_prod


#One processed record per product, in the same order as `products`
processed_corpus = []
for prod in products:
    processed_prod = _process_product(prod)
    processed_corpus.append(processed_prod)

In [None]:
#Example of a preprocessed product (guarded so that an empty corpus does not raise IndexError)
if processed_corpus:
    display(processed_corpus[0])
else:
    print("The processed corpus is empty.")

{'pid': 'TKPFCZ9EA7H5FYZH',
 'title': 'Solid Women Multicolor Track Pants',
 'description': 'Yorker trackpants made from 100% rich combed cotton giving it a rich look.Designed for Comfort,Skin friendly fabric,itch-free waistband & great for all year round use Proudly made in India',
 'brand': 'York',
 'category': 'Clothing and Accessories',
 'sub_category': 'Bottomwear',
 'product_details': [{'Style Code': '1005COMBO2'},
  {'Closure': 'Elastic'},
  {'Pockets': 'Side Pockets'},
  {'Fabric': 'Cotton Blend'},
  {'Pattern': 'Solid'},
  {'Color': 'Multicolor'}],
 'seller': 'Shyam Enterprises',
 'url': 'https://www.flipkart.com/yorker-solid-men-multicolor-track-pants/p/itmd2c76aadce459?pid=TKPFCZ9EA7H5FYZH&lid=LSTTKPFCZ9EA7H5FYZHVYXWP0&marketplace=FLIPKART&srno=b_1_1&otracker=browse&fm=organic&iid=177a46eb-d053-4732-b3de-fcad6ff59cbd.TKPFCZ9EA7H5FYZH.SEARCH&ssid=utkd4t3gb40000001612415717799',
 'processed_text': ['solid',
  'women',
  'multicolor',
  'track',
  'pant',
  'yorker',
  'trackpa