# Download meta data file into folder, then unzip

In [None]:
#Download the gzipped product metadata file; -P tells wget which directory to save it into
!wget https://s3.amazonaws.com/vargo.aprd6342/data/meta_Clothing_Shoes_and_Jewelry.json.gz -P /content/drive/MyDrive/APRD6343/TopicModeling

In [None]:
#-d decompresses the gzip (.gz) archive, producing the .json file in the same folder
!gzip -d /content/drive/MyDrive/APRD6343/TopicModeling/meta_Clothing_Shoes_and_Jewelry.json.gz

# Importing packages, setting working directory, and opening the meta data file

In [None]:
import ast
import json
import pickle

#Folder that holds the downloaded metadata and the files this notebook writes
working_directory = '/content/drive/MyDrive/APRD6343/TopicModeling'
#Full path of the decompressed metadata file (%s is filled in with working_directory)
working_file = '%s/meta_Clothing_Shoes_and_Jewelry.json' % (working_directory,)
#Open the metadata file for reading; a later cell iterates over this handle line by line
loadedjson = open(working_file, mode='r')

# Exploring products and categories in the file

In [None]:
from time import sleep

#Getting reviews is a 2 step process
#1) Go through Amazon's product catalog for "Clothing, Shoes and Jewelry" and extract out matching products by their ASIN
#2) Go through review data and parse out matching reviews by ASIN

In [None]:
#1) Extracting ASINs by brand
#First, iterate through data and store it as a dictionary keyed by ASIN

#Set the counter to see how many products there are in the json file
count = 0
allproducts = {}

#Open the file inside this cell (instead of consuming a handle opened elsewhere) so that
#re-running the cell re-reads the whole file rather than silently finding it exhausted,
#and so the handle is closed as soon as loading finishes
with open(working_file, 'r') as metadatafile:
  #Each line is a product and its metadata
  for aline in metadatafile:
    aline = aline.strip()
    if not aline: #Skip blank lines (e.g. a trailing newline at the end of the file) instead of crashing on them
      continue
    count += 1
    if count % 100000 == 0: #Only print count every 100k, so it doesn't spam the output console
      print(count)
    #SECURITY: this text comes from a downloaded file, so it must not be passed to eval(),
    #which would execute any code embedded in it. ast.literal_eval() only accepts Python
    #literals (dicts, lists, strings, numbers, ...) and raises on anything else.
    #NOTE(review): presumably the lines are Python dict literals rather than strict JSON,
    #since the original used eval() and not json.loads() - confirm against the data
    aproduct = ast.literal_eval(aline)
    allproducts[aproduct['asin']] = aproduct #Make dictionary entry with the ASIN of the product as the key and the metadata as nested dictionaries

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000


In [None]:
#Next, explore product data to see what categories are common in the data
#This is a subset of prod categories, so you need to find the list of product ids
#For Steve Madden - use 'categories' meta data field to find the brand


#Create a dictionary of all product subcategories
#Keys are category names, values are the number of times each name appears across all products' category lists

allcategories = {}
count = 0
totalproducts = len(allproducts) #Taken from the data rather than hard-coded, so the progress fraction stays right for any catalog size

for aproduct in allproducts.values():
  count += 1 #Counter to know progress in processing the entire catalog
  if count % 100000 == 0:
    #Fraction of the catalog processed so far - when it reaches 1, it is done
    print(count/totalproducts)
  #'categories' holds a list of category lists; products without that field contribute nothing
  for categories in aproduct.get('categories', []): #Iterate through each list of categories
    for acategory in categories: #Iterate for every category in each list
      #Start at 0 the first time a category is seen, then add 1 for this occurrence
      allcategories[acategory] = allcategories.get(acategory, 0) + 1

0.06651660520532346
0.13303321041064692
0.19954981561597038
0.26606642082129384
0.33258302602661727
0.39909963123194075
0.4656162364372642
0.5321328416425877
0.5986494468479111
0.6651660520532345
0.7316826572585581
0.7981992624638815
0.8647158676692049
0.9312324728745284
0.9977490780798518


In [None]:
#Build (count, category name) pairs; putting the count first makes the sort order by count
sortedlist = sorted(
  [(occurrences, acategory) for acategory, occurrences in allcategories.items()],
  reverse=True) #Sort list descending

#Prints top product categories in the product data
#Slicing (rather than indexing 0..49) avoids an IndexError when there are fewer than 50 categories
for item in sortedlist[:50]:
  print(item)

(3429257, 'Clothing, Shoes & Jewelry')
(1086181, 'Women')
(617092, 'Clothing')
(541681, 'Men')
(537761, 'Novelty, Costumes & More')
(432653, 'Shoes')
(339900, 'Novelty')
(268065, 'Shoes & Accessories: International Shipping Available')
(255454, 'Jewelry')
(174962, 'Accessories')
(97095, 'Girls')
(93596, 'Tops & Tees')
(87688, 'Dresses')
(84549, 'T-Shirts')
(82063, 'Boots')
(80302, 'Shirts')
(79897, 'Sandals')
(79545, 'Watches')
(77684, 'Boys')
(73507, 'Jewelry: International Shipping Available')
(72372, 'Athletic')
(71414, 'Wrist Watches')
(70335, 'Sports & Outdoors')
(59763, 'Petite')
(58350, 'Fashion')
(53826, 'Costumes & Accessories')
(53021, 'Earrings')
(51728, 'Baby')
(50943, 'Comfort Shoes')
(50662, 'Casual')
(50357, 'Boot Shop')
(50124, 'C')
(49599, 'Active')
(49491, 'Band & Music Fan')
(46004, 'New Arrivals')
(43722, 'Necklaces & Pendants')
(43410, 'Intimates')
(43100, 'S')
(41709, 'Lingerie, Sleep & Lounge')
(41542, 'Handbags & Wallets')
(41400, 'Rings')
(40832, "Women's Luxur

In [None]:
allcategories['Steve Madden'] #Count of category entries named exactly 'Steve Madden' (case-sensitive dictionary lookup; raises KeyError if that name never occurred)

4593

# Extracting ASINs

In [None]:
#Now, go through dictionary and extract out matching ASINs for Steve Madden
#First, create a set to store the ASINs
#Note: use a set here to not have duplicates

allSMasins = set()
count = 0
totalproducts = len(allproducts) #Taken from the data rather than hard-coded, so the progress fraction stays right for any catalog size

for theproduct in allproducts.values():
  count += 1
  if count % 100000 == 0:
    print(count/totalproducts) #Fraction of the catalog processed so far

  #Iterate for each category for a product - any given product can be assigned multiple product categories
  #Each category is encoded as a list, so have to iterate 2 times
  #.get() with an empty default skips products that have no 'categories' field instead of raising KeyError
  for categories in theproduct.get('categories', []):
    for acategory in categories:
      #Checking to see if product category matches Steve Madden
      #Lowercasing the category string in case capitalization might get in the way of a match
      if 'steve madden' in acategory.lower():
        #Store it to the set of Steve Madden ASINs
        allSMasins.add(theproduct['asin'])

#Number of unique products with any category containing 'steve madden'.
#This need not equal allcategories['Steve Madden']: that figure counts occurrences of the exact
#category name, while this is a case-insensitive substring match counted once per product
print(len(allSMasins))

0.06651660520532346
0.13303321041064692
0.19954981561597038
0.26606642082129384
0.33258302602661727
0.39909963123194075
0.4656162364372642
0.5321328416425877
0.5986494468479111
0.6651660520532345
0.7316826572585581
0.7981992624638815
0.8647158676692049
0.9312324728745284
0.9977490780798518
5291


# Write out ASINs to use in the next file (2-Pulling-Reviews) to extract product reviews

In [None]:
#Write all Steve Madden ASINs to one comma-separated line
#The with-block closes the file even if the write fails
with open('%s/allSMasins.txt' % working_directory, 'w') as outputfile:
  #Sets have no defined order, so sort the ASINs to get an identical file on every run
  outputfile.write(','.join(sorted(allSMasins)))