In [1]:
import APIParsingFunctions as apf
import pandas as pd
import time
import json
import datetime
import concurrent.futures

In [None]:
# Read the current date exactly once so that month, day and year are guaranteed
# to describe the same calendar day. Three separate today() calls could straddle
# midnight and produce a mixed, non-existent date in the file names built from them.
today = datetime.date.today()
month, day, year = today.month, today.day, today.year

### import ulta json data from scrapy

In [2]:
# Path of the scraped export for the run date; the name embeds month_day_year.
file = 'data/ulta_%d_%d_%d.json' % (month, day, year)

# Decode explicitly as UTF-8 (the JSON interchange encoding) rather than relying on
# the platform's locale default, and let json parse directly from the file handle.
with open(file, 'r', encoding='utf-8') as f:
    ultaJson = json.load(f)

In [4]:
# Build a frame from the parsed JSON, then keep only the first occurrence of
# any fully identical row.
ultaRecords = pd.DataFrame(ultaJson)
ulta = ultaRecords.drop_duplicates()

In [5]:
# Product ids to request from the API, derived from the scraped frame by the
# project helper (presumably one id per product — confirm in APIParsingFunctions).
productIds = apf.get_product_ids(ulta)

### use thread pools to asynchronously scrape product and default sku data from the ulta api

In [None]:
# Wall-clock start of the scraping phase; a later cell subtracts it from `end`.
start = time.time()

# Accumulators that every completed product request merges its results into.
productsDict, skusDict, skusDirectoryDict = {}, {}, {}

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Remember which product id each future belongs to so failures can be
    # reported by id.
    future2productId = {}
    for productId in productIds:
        future2productId[executor.submit(apf.get_product_data, productId)] = productId

    # Consume results in completion order. A request that fails (or returns
    # something that cannot be unpacked/merged) is reported and skipped so the
    # remaining products are still collected.
    for future in concurrent.futures.as_completed(future2productId):
        productId = future2productId[future]
        try:
            skuDirectory, productDict, skuDict = future.result()
            for target, fetched in (
                (skusDirectoryDict, skuDirectory),
                (productsDict, productDict),
                (skusDict, skuDict),
            ):
                target.update(fetched)
        except Exception as exc:
            print('%r generated an exception: %s' % (productId, exc))

### get list of skus for which we need data

In [None]:
# Sku ids that still need their own API request. The helper chooses them from the
# sku directory and product data collected by the product-scraping cell
# (the selection rule lives in APIParsingFunctions — TODO confirm).
skuIds = apf.get_skus_to_scrape(skusDirectoryDict, productsDict)

### use thread pools to asynchronously scrape sku data from the ulta api

In [None]:
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Map each submitted request back to its sku id for error reporting.
    future2skuId = {}
    for skuId in skuIds:
        future2skuId[executor.submit(apf.get_sku_data, skuId)] = skuId

    # Merge each result into skusDict as soon as it finishes; a failing sku is
    # reported and skipped rather than aborting the whole run.
    for future in concurrent.futures.as_completed(future2skuId):
        skuId = future2skuId[future]
        try:
            skusDict.update(future.result())
        except Exception as exc:
            print('%r generated an exception: %s' % (skuId, exc))

In [None]:
# Wall-clock timestamp (seconds since the epoch) taken once scraping has finished;
# paired with `start` from the product-scraping cell.
end = time.time()

In [None]:
# Elapsed scraping time in minutes (time.time() differences are in seconds).
(end-start)/60

### create category table and save it to a csv file

In [None]:
# Category table built by the project helper from the collected product data.
category = apf.parse_category(productsDict)

# Write it under a date-stamped name (month_day_year); to_csv keeps its default
# options, so the index is written as well.
runDate = (month, day, year)
file = 'data/category_%d_%d_%d.csv' % runDate
category.to_csv(file)

### create category_directory table and save it to a csv file

In [None]:
# Category-directory table built by the project helper from the collected product data.
categoryDirectory = apf.get_category_directory(productsDict)

# Write it under a date-stamped name (month_day_year) with to_csv's default options.
runDate = (month, day, year)
file = 'data/categoryDirectory_%d_%d_%d.csv' % runDate
categoryDirectory.to_csv(file)

### create product table and save it to a csv file

In [None]:
# Source column -> output column, listed in the order the columns should appear
# in the product table. The same mapping drives both the selection and the rename.
productColumns = {
    'data_product_id': 'productId',
    'data_product_displayName': 'displayName',
    'data_brand_brandName': 'brandName',
    'data_product_live': 'isLive',
    'data_reviewSummary_rating': 'rating',
    'data_reviewSummary_reviewCount': 'reviewCount',
    'meta_lastFetchedTime': 'lastFetchedTime',
}

# One row per key of productsDict; keep only the mapped columns (a missing
# column raises KeyError), then give them their output names.
productsAll = pd.DataFrame.from_dict(productsDict, orient='index')
products = productsAll.loc[:, list(productColumns)].rename(columns=productColumns)

In [None]:
# Write the product table under a date-stamped name (month_day_year) with
# to_csv's default options.
runDate = (month, day, year)
file = 'data/products_%d_%d_%d.csv' % runDate
products.to_csv(file)

### create the skus dataframe (which holds the data for both the sku table and the price table) and save it to a csv file

In [None]:
# Columns taken from the raw sku records, in output order.
skuColumns = [
    'id',
    'UPC',
    'displayName',
    'storeOnly',
    'onlineOnlyStatus',
    'price_onlineOnlySalePrice',
    'price_listPrice_amount',
    'price_salePrice_amount',
    'variant_variantType',
    'variant_variantDesc',
    'size',
    'UOM',
    'inventoryStatus',
    'couponEligible',
    'badges_items',
]

# Flattened API names -> output names; columns not listed keep their name.
skuRenames = {
    'id': 'skuId',
    'price_onlineOnlySalePrice': 'onlineOnlySalePrice',
    'price_listPrice_amount': 'listPrice',
    'price_salePrice_amount': 'salePrice',
    'variant_variantType': 'variantType',
    'variant_variantDesc': 'variantDesc',
}

skus = (
    pd.DataFrame.from_dict(skusDict, orient='index')
    .loc[:, skuColumns]
    .rename(columns=skuRenames)
    # 'badge' is computed row by row by the project helper while 'badges_items'
    # is still present; the raw column is dropped right afterwards.
    .assign(badge=lambda frame: frame.apply(apf.get_badges, axis=1))
    .drop(columns=['badges_items'])
    # No join keys are given, so pandas joins (inner) on whatever columns the
    # two frames share — NOTE(review): confirm the intended key in
    # apf.get_sku_directory.
    .merge(apf.get_sku_directory(skusDirectoryDict))
    # Rows without a list price are discarded.
    .dropna(subset=['listPrice'])
)

In [None]:
# Write the sku table under a date-stamped name (month_day_year) with
# to_csv's default options.
runDate = (month, day, year)
file = 'data/skus_%d_%d_%d.csv' % runDate
skus.to_csv(file)