### Import libraries

In [1]:
import os
import requests
import shutil
import random 
import pandas as pd
import numpy as np
import urllib.parse
import matplotlib.pyplot as plt
from selenium import webdriver
from bs4 import BeautifulSoup
from PIL import Image as PilImage
from io import BytesIO
%load_ext autotime

### Data Processing

In [2]:
# Get productIds
# Build the pool of distinct product ids from the shoes review dataset and draw
# the 10K / 20K random subsets used by the scraping cells.
SAMPLE_SEED = 42  # fixed so a fresh kernel reproduces the same subsets
shoes_df = pd.read_csv('./amazon_reviews_us_Shoes_v1_00_help_voted_And_cut_lognTail.csv')
raw_productIds = shoes_df['product_id'].to_list()
# np.unique de-duplicates and sorts, so the sampling pool has a stable order.
unique_products = list(np.unique(raw_productIds))
# A dedicated generator makes the draws reproducible without reseeding the
# global `random` state.
sample_rng = random.Random(SAMPLE_SEED)
products_10k = sample_rng.sample(unique_products, 10000)
products_20k = sample_rng.sample(unique_products, 20000)

time: 288 ms


In [3]:
# Report how many ids each stage of the sampling produced.
size_report = (
    ("Raw shoes dataset:", raw_productIds),
    ("Unique shoes products:", unique_products),
    ("10K samples:", products_10k),
    ("20K samples:", products_20k),
)
for label, ids in size_report:
    print(label, len(ids))

Raw shoes dataset: 155509
Unique shoes products: 97758
10K samples: 10000
20K samples: 20000
time: 1.86 ms


In [4]:
# Top-1 recommended product for each user.
# Load the prediction file and keep the 'asin' found at every 10th row among
# the first 1000 rows.
# NOTE(review): the stride of 10 presumably corresponds to 10 ranked rows per
# user for 100 users — confirm against the CSV layout.
shoes_df = pd.read_csv('../prediction_JH/Shoes_for_100_users_per_100_products_prediction_Ver4.csv')
productIds_for_100_users = shoes_df['asin'].tolist()
top100recommended = [
    productIds_for_100_users[row_idx]
    for row_idx in range(0, 1000, 10)
]

time: 42.1 ms


In [5]:
# Add every recommended product to both samples, then collapse any ids that
# were already present (np.unique also sorts the result).
products_10k.extend(top100recommended)
products_20k.extend(top100recommended)

products_10k = list(np.unique(products_10k))
products_20k = list(np.unique(products_20k))

for label, sample in (("10K sample:", products_10k), ("20K_sample:", products_20k)):
    print(label, len(sample))

10K sample: 10088
20K_sample: 20079
time: 36.3 ms


In [14]:
# Draw the 50K subset, add the recommended products and de-duplicate.
# The draw uses its own seeded generator so the subset is identical after a
# kernel restart and does not depend on how much of the global `random`
# stream other cells have consumed.
products_50k = random.Random(43).sample(unique_products, 50000)
products_50k.extend(top100recommended)
products_50k = list(np.unique(products_50k))

print("50K_sample:", len(products_50k))

50K_sample: 50050
time: 120 ms


### Get image urls ( 10K dataset )

In [6]:
# Map every product id of the 10K sample to its Amazon product-page address.
AMAZON_DP_BASE = 'https://www.amazon.com/dp/'
product_urls_10k = {
    asin: urllib.parse.urljoin(AMAZON_DP_BASE, asin)
    for asin in products_10k
}

time: 139 ms


In [7]:
# product_urls_10k 

time: 550 µs


In [8]:
# Execute Chromedriver
# Start a headless Chrome session that the scraping cells drive.
from selenium.webdriver.chrome.options import Options
options = Options()
# Pass the flag explicitly: the `Options.headless` property setter was
# deprecated and later removed in Selenium 4, while the argument form works on
# both old and new releases.
options.add_argument("--headless")
# NOTE(review): placeholder user-agent value — presumably meant to be replaced
# with a real browser UA string before scraping; confirm.
options.add_argument("user-agent=whatever you want")
driver = webdriver.Chrome(options=options)

time: 1h 5min 50s


In [None]:
# Materialise the (asin, url) pairs of the 10K lookup as a list.
product_urls_10k_list = [*product_urls_10k.items()]

In [None]:
# Extract img urls
# Load each product page in the browser and record, per ASIN, the `src` of the
# <img> found inside the div with id "imgTagWrapperId".
image_urls_10k = {}
for asin, url in product_urls_10k.items():
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    for div in soup.find_all('div', id="imgTagWrapperId"):
        image = div.find('img', alt=True)
        # find() returns None when the wrapper has no matching <img>, and the
        # tag may lack a src attribute; skip those pages instead of aborting
        # the whole scrape with a TypeError/KeyError.
        if image is None or not image.get('src'):
            continue
        image_urls_10k[asin] = image['src']

In [9]:
# Number of products for which an image url was collected.
len(image_urls_10k)

776

time: 4.49 ms


In [11]:
# Persist the scraped (asin, image url) pairs as a two-column CSV.
asin_url_10k_df = pd.DataFrame(
    list(image_urls_10k.items()),
    columns=['asin', 'url'],
)
asin_url_10k_df.to_csv(r'./asin_url_for_10k.csv', index=False)

time: 52 ms


In [12]:
# Download images
# Stream every scraped image url to ./10K_Shoes_images/<asin>.jpg.
image_dir = "./10K_Shoes_images/"
# Create the target folder up front so open() does not fail when it is missing.
os.makedirs(image_dir, exist_ok=True)
for asin, url in image_urls_10k.items():
    path = os.path.join(image_dir, asin + "." + "jpg")
    # The context managers release the HTTP connection and close the file on
    # every iteration, even if the copy raises; previously neither was closed.
    with requests.get(url, stream=True) as resp:
        # Skip error responses so an error body is never saved as a .jpg file.
        if not resp.ok:
            print("Skipped", asin, "- HTTP status", resp.status_code)
            continue
        # Have the raw stream undo any gzip/deflate content encoding so the
        # bytes written to disk are the image itself.
        resp.raw.decode_content = True
        with open(path, 'wb') as local_file:
            # Copy the response stream raw data to the local image file.
            shutil.copyfileobj(resp.raw, local_file)

time: 2min 45s


### Get image urls ( 20K dataset )

In [None]:
# Build the asin -> Amazon product-page lookup for the 20K sample.
product_urls_20k = dict(
    (asin, urllib.parse.urljoin('https://www.amazon.com/dp/', asin))
    for asin in products_20k
)

In [None]:
# Execute Chromedriver
# Start a headless Chrome session for scraping the 20K sample.
from selenium.webdriver.chrome.options import Options
options = Options()
# Pass the flag explicitly: the `Options.headless` property setter was
# deprecated and later removed in Selenium 4, while the argument form works on
# both old and new releases.
options.add_argument("--headless")
# NOTE(review): placeholder user-agent value — presumably meant to be replaced
# with a real browser UA string before scraping; confirm.
options.add_argument("user-agent=whatever you want")
driver = webdriver.Chrome(options=options)

# Extract img urls
# Load each product page and record, per ASIN, the `src` of the <img> found
# inside the div with id "imgTagWrapperId".
image_urls_20k = {}
for asin, url in product_urls_20k.items():
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    for div in soup.find_all('div', id="imgTagWrapperId"):
        image = div.find('img', alt=True)
        # find() returns None when the wrapper has no matching <img>, and the
        # tag may lack a src attribute; skip those pages instead of aborting
        # the whole scrape with a TypeError/KeyError.
        if image is None or not image.get('src'):
            continue
        image_urls_20k[asin] = image['src']