In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
import collections
from scipy import stats

%matplotlib inline

In [2]:
# Making Train, Test, and Validation Datasets
# Every line of the products file is parsed as a JSON array of product
# records. Records are indexed by their 'id' field; if an id shows up more
# than once, the record read last replaces the earlier one.
PRODUCTS_FILEPATH = 'data/products.csv'

id_to_product = {}

with open(PRODUCTS_FILEPATH, 'r') as f:
    for i, row in enumerate(f):
        # Progress marker every 500 lines.
        if not i % 500:
            print(i)
        id_to_product.update(
            {product['id']: product for product in json.loads(row)}
        )

# Flat list of every product id that was loaded.
ids = list(id_to_product)

# Index every category attached to any product by its numeric id ('numId').
# When the same numId occurs more than once, the last occurrence is kept.
id_to_category = {
    category['numId']: category
    for product in id_to_product.values()
    for category in product['categories']
}

# Category 'numId' values of the four jewelry classes the datasets are built from.
category_ids_of_interest = {
    70,  # bracelets
    72,  # earrings
    73,  # necklaces
    75,  # rings
}

# Bucket products by jewelry category. A product is kept only when exactly one
# of its categories is a category of interest; every other product (none, or
# more than one match) is tallied in `i` and otherwise ignored.
category_to_count = collections.Counter()
category_to_products = collections.defaultdict(set)

i = 0
for product in id_to_product.values():
    category_ids = {category['numId'] for category in product['categories']}
    ids_in_common = category_ids_of_interest & category_ids
    if len(ids_in_common) == 1:
        # Exactly one element, so unpacking yields the single matching id.
        (category_id,) = ids_in_common
        category_to_count[category_id] += 1
        category_to_products[category_id].add(product['id'])
    else:
        i += 1
print(category_to_count)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
Counter({72: 9401, 73: 6937, 70: 4507, 75: 4099})


In [3]:
# Draw a fixed-size random sample of product ids from each jewelry category.
#
# The draw is made reproducible across kernel restarts in two ways:
#   * a dedicated, seeded RandomState is used instead of NumPy's global RNG;
#   * the ids are sorted before sampling, because the iteration order of a
#     set is not guaranteed to be the same from one session to the next.
# Reproducibility matters because copy_files() skips files that already
# exist: a different sample on a re-run would add further images to the
# dataset folders on top of the ones copied earlier.
SAMPLE_SEED = 42
SAMPLES_PER_CATEGORY = 2000

_sampler = np.random.RandomState(SAMPLE_SEED)


def _sample_product_ids(category_id):
    """Return SAMPLES_PER_CATEGORY distinct product ids from one category.

    Args:
        category_id: key into ``category_to_products``.

    Returns:
        A NumPy array of product ids drawn without replacement.

    Raises:
        ValueError: raised by NumPy when the category holds fewer ids than
            SAMPLES_PER_CATEGORY.
    """
    candidates = np.array(sorted(category_to_products[category_id]))
    return _sampler.choice(candidates, SAMPLES_PER_CATEGORY, replace=False)


# The draw order below is fixed; changing it changes which ids each class gets.
random_bracelets = _sample_product_ids(70)
random_necklaces = _sample_product_ids(73)
random_rings = _sample_product_ids(75)
random_earrings = _sample_product_ids(72)

In [4]:
import shutil, pathlib, os

# Show the working directory, since every path used below is relative to it.
print(os.getcwd())

# Class labels and dataset splits; each (split, label) pair gets its own folder.
jewelry_type = ['bracelets', 'necklaces', 'rings', 'earrings']
dataset_type = ['training', 'testing', 'validation']

for jewelry in jewelry_type:
    for dataset in dataset_type:
        path = f'data/{dataset}/{jewelry}'
        # Creates missing parent folders and is a no-op if the folder exists.
        os.makedirs(path, exist_ok=True)

/Users/linhchau/Desktop/galvanize/jewelery_recommender


In [5]:
import multiprocessing
import os


# Lookup from jewelry label to the array of sampled product ids for that label.
jewelry_type_to_product_ids = dict(
    bracelets=random_bracelets,
    necklaces=random_necklaces,
    rings=random_rings,
    earrings=random_earrings,
)

def copy_files(dataset):
    """Copy one dataset split's images into ``data/<dataset>/<jewelry>/``.

    Each split owns a fixed index range into the sampled id arrays in
    ``jewelry_type_to_product_ids``: 'training' uses indices 0-999,
    'testing' 1000-1499 and 'validation' 1500-1999. For every index and every
    label in ``jewelry_type`` the image ``data/data/<product_id>.jpg`` is
    copied to the split folder, unless the destination file already exists.
    The index is printed every 100 steps as a progress marker.

    Args:
        dataset: one of 'training', 'testing' or 'validation'.

    Raises:
        KeyError: if ``dataset`` is not one of the three known splits.
    """
    index_bounds = {
        'training': [1000],
        'testing': [1000, 1500],
        'validation': [1500, 2000],
    }[dataset]

    for position in range(*index_bounds):
        if not position % 100:
            print(position)
        for jewelry in jewelry_type:
            product_id = jewelry_type_to_product_ids[jewelry][position]
            destination = f'data/{dataset}/{jewelry}/{product_id}.jpg'
            # Only copy images that are not already in place.
            if not os.path.isfile(destination):
                shutil.copyfile(f'data/data/{product_id}.jpg', destination)

# Copy the three dataset splits concurrently, one task per split.
#
# A thread pool is used instead of a process pool. The work is file copying
# (I/O), and threads do not have to import `copy_files` from `__main__`,
# which worker processes cannot do for interactively defined functions when
# the multiprocessing start method is "spawn". The pool is sized to the
# number of splits, since there are never more tasks than that.
from multiprocessing.pool import ThreadPool

with ThreadPool(len(dataset_type)) as workers:
    workers.map(copy_files, dataset_type)

# Serial equivalent, without any pool:
# for dataset in dataset_type:
#     copy_files(dataset)


0
1000
1500
100
1100
1600
1200
200
1700
1300
1800
300
1400
1900
400
500
600
700
800
900


In [6]:
# Collect the directory listing of every split/label folder and print how
# many entries each one holds (splits in the outer loop, labels in the inner).
filenames = collections.defaultdict(dict)

for dataset in dataset_type:
    listings_for_split = filenames[dataset]
    for jewelry in jewelry_type:
        entries = os.listdir(f'data/{dataset}/{jewelry}')
        listings_for_split[jewelry] = entries
        print(len(entries))

1000
1000
1000
1000
500
500
500
500
500
500
500
500
