# unprocessed product metadata

In [10]:
import json
import gzip
import pandas as pd
from collections import Counter
from tqdm import tqdm
import numpy as np

In [2]:
amazon_meta_raw_path = '../../data/amazon/All_Amazon_Meta.json.gz'

In [5]:
amazon_cats_counter = Counter()
data = {}
with gzip.open(amazon_meta_raw_path) as f:
    for l in tqdm(f):
        dat = json.loads(l.strip())
        dat['category'] = [
            i.lower() for i in dat['category']
        ]
        data[dat['title']] = dat['category']
        amazon_cats_counter.update(dat['category'])


15023059it [29:47, 8402.56it/s] 


# 1. raw data has too many cateogry nodes and paths (need cleaning)

In [11]:
{
    '# products': len(data),
    '# category paths': len(set([tuple(data[i]) for i in data])),
    '# category nodes': len(amazon_cats_counter),
    'avg path length': np.mean([len(data[i]) for i in data])
}

{'# products': 14127828,
 '# category paths': 1430707,
 '# category nodes': 2405474,
 'avg path length': 4.259480296617427}

In [12]:
# just on average 9 products for each category path (too few)
14127828 / 1430707

9.874717884234858

In [13]:
# way more category nodes than normal (400 times more)
2405474 / 6000

400.9123333333333

In [15]:
# l1 and l2 category size seems reasonable
{
    '# l1 nodes': len(set([data[i][0] for i in data if len(data[i])>0])),
    '# l2 nodes': len(set([data[i][1] for i in data if len(data[i])>1])),
    '# l3 nodes': len(set([data[i][2] for i in data if len(data[i])>2])),
    '# l4 nodes': len(set([data[i][3] for i in data if len(data[i])>3])),
}

{'# l1 nodes': 29, '# l2 nodes': 740, '# l3 nodes': 14139, '# l4 nodes': 48417}

In [17]:
# l1 -> l2 category expansion seems reasonable
740 / 29

25.517241379310345

In [19]:
# clothing, shows & jewelry dominates
Counter([data[i][0] for i in data if len(data[i])>0]).most_common()

[('clothing, shoes & jewelry', 2499458),
 ('books', 2416775),
 ('home & kitchen', 1263764),
 ('automotive', 877805),
 ('sports & outdoors', 846780),
 ('electronics', 740508),
 ('toys & games', 556598),
 ('tools & home improvement', 526186),
 ('cell phones & accessories', 518564),
 ('kindle store', 473020),
 ('cds & vinyl', 327383),
 ('office products', 286094),
 ('arts, crafts & sewing', 281825),
 ('grocery & gourmet food', 280126),
 ('patio, lawn & garden', 272410),
 ('movies & tv', 176207),
 ('pet supplies', 175083),
 ('industrial & scientific', 155848),
 ('musical instruments', 105877),
 ('video games', 66537),
 ('appliances', 29172),
 ('collectibles & fine art', 26330),
 ('software', 19211),
 ('gift cards', 1334),
 ('alexa skills', 914),
 ('home & business services', 179),
 ('magazine subscriptions', 91),
 ('handmade', 7),
 ('digital music', 2)]

## 2. raw data's category is noisy

In [4]:
dat

{'category': ['clothing, shoes & jewelry',
  'women',
  'clothing',
  'tops, tees & blouses',
  'blouses & button-down shirts',
  'import',
  'versatile occasions - great for daily,casual,i am sure you will like it!',
  'black friday cyber monday christmas loose blouse v-neck blouse solid color blouse sling blouse fashion blouse cool blouse vintage blouse popular blouse fun blouse sexy blouse lace vest blouse wild blouse sleeveless blouse lace blouse closure',
  'printing vest rose blouse velvet shirt couple blouse elegant blouse long sleeve blouse lace vest printed blouse high low hem blouse top short sleeve blouses hollow clothing casual shirts out shoulder blouse three quarter sleeve shirts chiffon blouse floral printed button shirt stailored blouse button blouse crew neck blouse zip blouse cheap blouse expensive blouse discount blouse box pattern blouse twill fabric blouse straight blouse rolling blouse crimping blouse leaf blouse sling blouse',
  'shawl blouse transparent blouse c

# processed data (restrict number of category nodes and remove badly-named categories)

In [28]:
# process script: ../preprocess/process_amazon_product_categories.py

In [20]:
import pandas as pd
from collections import Counter
from tqdm import tqdm
import numpy as np

In [21]:
df_train = pd.read_json('../../data/amazon/All_Amazon_Meta_Train.json', lines=True)
df_test = pd.read_json('../../data/amazon/All_Amazon_Meta_Test.json', lines=True)

In [22]:
{
    '# products': len(df_train),
    '# category paths': len(set(df_train.category.apply(tuple))),
    '# category nodes': len(set([j for i in df_train.to_dict('records') for j in i['category']])),
    'avg path length': df_train.category.apply(len).mean()
}

{'# products': 12794705,
 '# category paths': 36354,
 '# category nodes': 5701,
 'avg path length': 3.8124077890033417}

In [26]:
# after cleaning, on average 350 products for each category path, more reasonable
12794705 / 36354

351.94765362821147

In [25]:
# after cleaning, l1 to l4 category size are resonable
{
    '# l1 nodes': len(set(df_train.category.apply(lambda x: x[0] if len(x) > 0 else ''))),
    '# l2 nodes': len(set(df_train.category.apply(lambda x: x[1] if len(x) > 1 else ''))),
    '# l3 nodes': len(set(df_train.category.apply(lambda x: x[2] if len(x) > 2 else ''))),
    '# l4 nodes': len(set(df_train.category.apply(lambda x: x[3] if len(x) > 3 else ''))),
}

{'# l1 nodes': 31, '# l2 nodes': 505, '# l3 nodes': 2411, '# l4 nodes': 2933}

In [27]:
df_train_paths = set(df_train.category.apply(tuple))
df_test_paths = set(df_test.category.apply(tuple))

In [29]:
len(df_test_paths.intersection(df_train_paths)) / len(df_test_paths)

0.9900327689787002

In [31]:
len(df_test_paths.intersection(df_train_paths)) / len(df_train_paths)

0.19945535566925235

In [33]:
df_train.sample(10).to_dict('records')

[{'title': 'Champion Cooling, 3 Row All Aluminum Radiator for Jeep CJ Series, CC583',
  'category': ['automotive',
   'replacement parts',
   'engine cooling & climate control',
   'radiators'],
  'text': 'Champion Cooling, 3 Row All Aluminum Radiator for Jeep CJ Series, CC583 -> [automotive][replacement parts][engine cooling & climate control][radiators]'},
 {'title': 'Yoga Clothing For You Ladies OM Symbol Angel Fleece Full-Zip Hoodie',
  'category': ['sports & outdoors',
   'sports & fitness',
   'exercise & fitness',
   'yoga',
   'clothing'],
  'text': 'Yoga Clothing For You Ladies OM Symbol Angel Fleece Full-Zip Hoodie -> [sports & outdoors][sports & fitness][exercise & fitness][yoga][clothing]'},
 {'title': "Troy Lee Designs Ruckus Men's 3/4 Sleeve Bicycle BMX Jersey - Native Turquoise",
  'category': ['automotive',
   'motorcycle & powersports',
   'protective gear',
   'jerseys'],
  'text': "Troy Lee Designs Ruckus Men's 3/4 Sleeve Bicycle BMX Jersey - Native Turquoise -> [aut