In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import requests
import time
from datetime import datetime
from bs4 import BeautifulSoup # for HTML parsing

In [2]:
def get_products(product_base_url, page_nbr, timeout=30):
    """Fetch one page of products from the store site.

    Args:
        product_base_url: request URL that already includes the limit parameter
            (eg. https://aplusrstore.com/products.json?limit=250). The page
            parameter is appended with "&", so a query string must be present.
        page_nbr: page number to extract.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The value stored under the "products" key of the JSON response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within `timeout`.
    """
    # add page number parameter to the product info request
    url = f"{product_base_url}&page={page_nbr}"

    # debug / demonstrate progress
    print(url)

    # get the page of products from the store site
    response = requests.get(url, timeout=timeout)

    # fail loudly on an error status instead of trying to parse an error page as JSON
    response.raise_for_status()

    # extract the page of products from the response formatted as JSON
    return response.json()["products"]

In [3]:
def extract_size_from_html(html):
    """Parse the body_html element of a product and return its size information.

    The size information is looked up in the section that starts at the
    <h4>Size</h4> heading and ends at the next heading (<h1>-<h4>). Inside that
    section a <ul> is preferred; if there is none, the first <p> is used.

    Args:
        html: body_html element from product JSON (may be None or empty).

    Returns:
        "" (empty string) if there is no HTML or no <h4>Size</h4> heading.
        Otherwise a list of plain-text strings: one per <li> of the size list,
        or one per direct child node of the size paragraph. The list is empty
        when the Size section contains neither a <ul> nor a <p>.
    """
    # a product without a description has nothing to parse
    if not html:
        return "" # empty string

    # create HTML parser
    soup = BeautifulSoup(html, 'html.parser')

    # Find the <h4> tag with the text "Size"
    size_tag = soup.find('h4', string='Size')

    # If we can't find a size tag
    if not size_tag:
        return "" # empty string

    # Walk forward from the heading, but stop at the next heading so that a
    # <ul> or <p> belonging to a later section (eg. Material) is never picked up
    size_list = None
    size_paragraph = None
    for element in size_tag.find_all_next(['ul', 'p', 'h1', 'h2', 'h3', 'h4']):
        if element.name == 'ul':
            size_list = element
            break
        if element.name == 'p':
            # remember only the first paragraph; keep looking for a <ul>
            if size_paragraph is None:
                size_paragraph = element
            continue
        # any heading closes the Size section
        break

    # if we found a <ul> tag, extract the size information from its <li> tags
    if size_list is not None:
        return [li.text.strip() for li in size_list.find_all('li')]

    # otherwise extract the size information from the children of the <p> tag
    if size_paragraph is not None:
        return [child.text.strip() for child in size_paragraph]

    # Size heading present but nothing usable underneath it
    return []

In [4]:
# test extract_size_from_html: list case
# the sample description below has a <ul> directly under the <h4>Size</h4> heading,
# so the result should contain one entry per <li> (Small / Medium / Large)
html = """
<h2>Description</h2>
<!-- split -->
<p class="lead">Hand-woven in India by skilled artisans, Gubi Rug offers a nuanced update on\xa0traditional techniques. A mix of New Zealand wool and viscose lends a depth of color and a soft and luxurious silky finish, as well as strength and durability. The tight weave makes the rug easy to clean and ensures that the pile remains straight even with heavy use. Available in 3 rectangular sizes that have been carefully selected for likely placement in front of a sofa, under a bed, or beneath a table and chairs.\xa0</p>
<!-- split -->
<h2>Specifications</h2>
<!-- split -->
<h4>Size</h4>
<ul>
<li>Small: 94.5" l x 66.9" w (240x170cm)</li>
<li>Medium: 118.1" l x 98.4" w (300x250cm)</li>
<li>Large: 137.8" l x 98.4" w (350x250cm)</li>
</ul>
<h4>Material</h4>
<p>Hand woven, wool viscose lends</p>
"""

# last expression of the cell, so the returned list is displayed as the cell output
extract_size_from_html(html)

['Small: 94.5" l x 66.9" w (240x170cm)',
 'Medium: 118.1" l x 98.4" w (300x250cm)',
 'Large: 137.8" l x 98.4" w (350x250cm)']

In [5]:
# test extract_size_from_html: paragraph case
# the sample description below contains no <ul> at all; the size is given in a
# single <p> directly under the <h4>Size</h4> heading, so the result should be a
# one-element list holding that paragraph's text
test_html = """
<h2>Description</h2>\n<!-- split -->\n<p class="lead">Made for the Architectural Biennale of Venice, designer Kari Virtaneon’s stool-table references organic Finnish style with its seemingly naturally hewn surface and demonstrably impressive proportions. Biennale is fashioned of solid oak and its carefully crafted cracks, wormholes, unevenness and knots make each piece unique—a one-of-a-kind work of art.</p>\n<!-- split -->\n<h2>Specifications</h2>\n<!-- split -->\n<h4>Size</h4>\n<p>17.7" h x 13.8" w x 11.8" d (45x35x30cm)</p>\n<h4>Material</h4>\n<p>Oak</p>
"""

# last expression of the cell, so the returned list is displayed as the cell output
extract_size_from_html(test_html)

['17.7" h x 13.8" w x 11.8" d (45x35x30cm)']

In [6]:
# Scrape every page of the store's product feed and flatten it into product_list,
# one record per product variant. product_list is re-initialised here, so the cell
# can be re-run without duplicating rows.
domain_url = "https://aplusrstore.com"
productRequestStr = "https://aplusrstore.com/products.json"

product_list = [] # initialize empty product list
page_size = 250 # maximum number of products to retrieve with each page request
page_nbr = 1 # keeps track of current page number being requested
url = f"{productRequestStr}?limit={page_size}" # product request URL including maximum number of products to retrieve with each page request

# loop through pages of products
while True:

    # get a page of products
    product_batch = get_products(url, page_nbr)

    # for each product in the page
    for i in product_batch:
        title = i["title"]
        prod_id = i["id"]
        slug = i["handle"]
        publish_date = i["published_at"]
        updated_date = i["updated_at"]
        vendor = i["vendor"]
        product_type = i["product_type"]
        tags = i["tags"]
        full_url = domain_url + "/products/" + slug

        # not every product variant has its own image
        # so keep a backup image for the whole product:
        # the first image in the image list, or None if the product has no images at all
        images = i["images"]
        backup_image_src = images[0]["src"] if images else None

        # size options is a tricky one - it will be sparse
        # will be populated only for the special case where product -> "options" -> "Size" exists
        # in this case we will concatenate together the size options
        options = i["options"]
        size_options = ""

        # for every option
        for o in options:

            # if this is the "Size" option
            if o["name"] == "Size":

                # loop through all the size values
                for size_val in o["values"]:

                    # and add the value to our size options string
                    size_options = size_options + size_val + " | "

        # extract size info from body_html element of product JSON
        # (a product without a description gets the same empty value as one without size info)
        body_html = i["body_html"]
        size_info = extract_size_from_html(body_html) if body_html else ""

        # for every product variant
        for v in i["variants"]:

            variant_title = v["title"]
            price = v["price"]
            sku = v["sku"]
            featured_image = v["featured_image"]

            # choose the image for THIS variant only: its own featured image if it
            # has one, otherwise the product-level backup. Deciding per variant keeps
            # one variant's image from leaking into the following variants.
            if featured_image:
                featured_image_src = featured_image["src"]
            else:
                featured_image_src = backup_image_src

            # create product variant record
            details = [title, prod_id, full_url, publish_date, updated_date, vendor, product_type, tags, variant_title, price, sku, featured_image_src, size_info, size_options]

            # append the product variant record to the list
            product_list.append(details)

    # if the number of products in the page just retrieved is smaller than the page size
    if len(product_batch) < page_size:

        # this is the last page, we have all the products...
        break

    # pause before we retrieve the next page
    time.sleep(10)

    # increment the page number
    page_nbr += 1
    

https://aplusrstore.com/products.json?limit=250&page=1
https://aplusrstore.com/products.json?limit=250&page=2
https://aplusrstore.com/products.json?limit=250&page=3
https://aplusrstore.com/products.json?limit=250&page=4
https://aplusrstore.com/products.json?limit=250&page=5
https://aplusrstore.com/products.json?limit=250&page=6
https://aplusrstore.com/products.json?limit=250&page=7
https://aplusrstore.com/products.json?limit=250&page=8
https://aplusrstore.com/products.json?limit=250&page=9
https://aplusrstore.com/products.json?limit=250&page=10
https://aplusrstore.com/products.json?limit=250&page=11
https://aplusrstore.com/products.json?limit=250&page=12
https://aplusrstore.com/products.json?limit=250&page=13
https://aplusrstore.com/products.json?limit=250&page=14
https://aplusrstore.com/products.json?limit=250&page=15
https://aplusrstore.com/products.json?limit=250&page=16
https://aplusrstore.com/products.json?limit=250&page=17
https://aplusrstore.com/products.json?limit=250&page=18
h

In [7]:
# number of records collected by the scrape loop (one list entry per product variant)
len(product_list)

105314

In [8]:
# create a Pandas dataframe from the product list
# Pandas dataframes are easy to export as CSV
# The column names are handed to the constructor, in the same order as the fields
# of each record. An empty product list then still yields a well-formed zero-row
# dataframe instead of a length-mismatch error when names are assigned afterwards.
df = pd.DataFrame(
    product_list,
    columns=[
        'Product Name',
        'Product ID',
        'Product URL',
        'Published At',
        'Updated At',
        'Vendor',
        'Product type',
        'Tags',
        'Variant Name',
        'Price',
        'SKU',
        'Featured Image',
        'Size Info',
        'Size Options',
    ],
)

# take a look at the dataframe
df

Unnamed: 0,Product Name,Product ID,Product URL,Published At,Updated At,Vendor,Product type,Tags,Variant Name,Price,SKU,Featured Image,Size Info,Size Options
0,Lamina Wall Lamp,7229139812410,https://aplusrstore.com/products/santa-cole-la...,2024-04-12T11:54:26-07:00,2024-04-15T19:19:53-07:00,Santa & Cole • Antoni Arola,Wall Lighting,"[1500-2000, 2000, fullprice, New Product-Light...",45 / White Grey,1650.00,SC-LAMWA01UL,https://cdn.shopify.com/s/files/1/0012/2005/10...,,
1,Lamina Wall Lamp,7229139812410,https://aplusrstore.com/products/santa-cole-la...,2024-04-12T11:54:26-07:00,2024-04-15T19:19:53-07:00,Santa & Cole • Antoni Arola,Wall Lighting,"[1500-2000, 2000, fullprice, New Product-Light...",45 / Gold,1720.00,SC-LATWA01UL,https://cdn.shopify.com/s/files/1/0012/2005/10...,,
2,Lamina Wall Lamp,7229139812410,https://aplusrstore.com/products/santa-cole-la...,2024-04-12T11:54:26-07:00,2024-04-15T19:19:53-07:00,Santa & Cole • Antoni Arola,Wall Lighting,"[1500-2000, 2000, fullprice, New Product-Light...",165 / White Grey,2300.00,SC-LAMWE01UL,https://cdn.shopify.com/s/files/1/0012/2005/10...,,
3,Lamina 45 Table Lamp,7229139779642,https://aplusrstore.com/products/santa-cole-la...,2024-04-12T11:53:30-07:00,2024-04-15T19:19:53-07:00,Santa & Cole • Antoni Arola,Table + Task Lamps,"[2000, fullprice, New Product-Lighting, Santa ...",White Grey,2000.00,SC-LAMTA01UL,https://cdn.shopify.com/s/files/1/0012/2005/10...,,
4,Lamina 45 Table Lamp,7229139779642,https://aplusrstore.com/products/santa-cole-la...,2024-04-12T11:53:30-07:00,2024-04-15T19:19:53-07:00,Santa & Cole • Antoni Arola,Table + Task Lamps,"[2000, fullprice, New Product-Lighting, Santa ...",Gold,2000.00,SC-LATTA01UL,https://cdn.shopify.com/s/files/1/0012/2005/10...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105309,Dustpan + Broom,1690091716666,https://aplusrstore.com/products/normann-copen...,2018-07-11T09:46:52-07:00,2024-04-15T19:24:44-07:00,Normann Copenhagen • Ole Jensen,Tools,"[0-300, fullprice, Normann Copenhagen]",Black,35.00,NOR-303200,https://cdn.shopify.com/s/files/1/0012/2005/10...,"[12.6"" h x 9.4"" w x 9.4"" d (32x24x24cm)]",
105310,Dustpan + Broom,1690091716666,https://aplusrstore.com/products/normann-copen...,2018-07-11T09:46:52-07:00,2024-04-15T19:24:44-07:00,Normann Copenhagen • Ole Jensen,Tools,"[0-300, fullprice, Normann Copenhagen]",Light Grey,35.00,NOR-303000,https://cdn.shopify.com/s/files/1/0012/2005/10...,"[12.6"" h x 9.4"" w x 9.4"" d (32x24x24cm)]",
105311,Pen Pen Pencil Holder,2094195081274,https://aplusrstore.com/products/essey-pen-pen...,2018-07-11T09:42:09-07:00,2024-04-15T19:24:44-07:00,Essey • John Brauer,Objects,"[0-300, Essey, fullprice, Quick Ship]",Red,20.00,ES-PPRD,https://cdn.shopify.com/s/files/1/0012/2005/10...,"[3.9"" h x 3.9"" dia (10x10cm)]",
105312,Pen Pen Pencil Holder,2094195081274,https://aplusrstore.com/products/essey-pen-pen...,2018-07-11T09:42:09-07:00,2024-04-15T19:24:44-07:00,Essey • John Brauer,Objects,"[0-300, Essey, fullprice, Quick Ship]",Graphite,20.00,ES-PPGT,https://cdn.shopify.com/s/files/1/0012/2005/10...,"[3.9"" h x 3.9"" dia (10x10cm)]",


In [9]:
# summary statistics; called without arguments, describe() only covers numeric columns
# NOTE(review): Price does not show up in the recorded output, so it is presumably
# stored as text - confirm and convert it before doing any arithmetic on prices
df.describe()

Unnamed: 0,Product ID
count,105314.0
mean,5379942000000.0
std,2017428000000.0
min,1690092000000.0
25%,4406575000000.0
50%,6616112000000.0
75%,6900082000000.0
max,7236747000000.0


In [10]:
# capture "now" once, as a raw epoch value and as a local datetime built from it
x = time.time()
dt = datetime.fromtimestamp(x)

# show both representations
print("Timestamp:", x)
print("The date and time is:", dt)

Timestamp: 1713234285.114668
The date and time is: 2024-04-16 02:24:45.114668


In [11]:
# Save the DataFrame to a CSV file
# The timestamp is formatted explicitly: the default text form of dt contains a
# space and colons, and colons are not allowed in file names on Windows.
df.to_csv(f"products_{dt:%Y-%m-%d_%H-%M-%S}.csv", index=False)