In [1]:
import gzip
import os
import re
from itertools import islice
from pathlib import Path
from subprocess import check_call
from typing import Optional, List, Dict, Set, Iterable

import jsonlines
import pandas as pd
import requests
import tqdm

# 1. Download Data

In [2]:
# Local folder (relative to the working directory) under which the notebook
# stores every file it downloads.
BASE_DATA_FOLDER = Path('data/amazon/')


def download_wget(url: str, dest_file: str):
    """Download ``url`` to ``dest_file`` by shelling out to ``wget``.

    The payload is first written to a sibling ``<name>.part`` file and only
    renamed to ``dest_file`` once wget exits successfully, so a failed or
    interrupted transfer never leaves a truncated file at the final location
    (callers treat "file exists" as "already downloaded").

    Args:
        url: Address of the resource to fetch.
        dest_file: Destination path; a ``str`` or path-like object. Missing
            parent folders are created.

    Raises:
        subprocess.CalledProcessError: If wget exits with a non-zero status.
    """
    dest_path = Path(dest_file)
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    partial_path = dest_path.with_name(dest_path.name + '.part')

    try:
        # NOTE(review): '--no-check-certificate' disables TLS verification.
        # It is kept because the original relied on it, but it should be
        # dropped if the server's certificate validates.
        check_call(['wget',
                    '--progress=bar:force',
                    '--no-check-certificate',
                    url,
                    '--output-document',
                    str(partial_path)])
        os.replace(partial_path, dest_path)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # removes whatever wget managed to write.
        if partial_path.exists():
            partial_path.unlink()


def get_metafile(filename: str) -> Path:
    """Return the local path of a metadata file, downloading it if missing.

    Args:
        filename: Bare file name (no directory component) of a file served
            under the dataset's ``metaFiles/`` URL, for example
            ``'meta_Gift_Cards.json.gz'``.

    Returns:
        Path of the file inside ``BASE_DATA_FOLDER``.

    Raises:
        ValueError: If ``filename`` is empty, contains ``/``, or is ``.`` or
            ``..``.
    """
    # '.' and '..' carry no slash but resolve to existing directories, which
    # would otherwise be reported as an already-downloaded file.
    if not filename or '/' in filename or filename in ('.', '..'):
        raise ValueError('invalid filename')

    dest_file = BASE_DATA_FOLDER / filename
    if os.path.exists(dest_file):
        print(f'Not downloading {filename}. File already exist')
    else:
        url = f'https://jmcauley.ucsd.edu/data/amazon_v2/metaFiles/{filename}'
        download_wget(url, dest_file)

    return dest_file


# Fetch the Gift Cards product-metadata archive unless it is already on disk,
# then display the local path it lives at.
metadata_filepath = get_metafile('meta_Gift_Cards.json.gz')
metadata_filepath

Not downloading meta_Gift_Cards.json.gz. File already exist


PosixPath('data/amazon/meta_Gift_Cards.json.gz')

In [3]:
def get_categoryfile(filename: str) -> Path:
    """Return the local path of a category file, downloading it if missing.

    Args:
        filename: Bare file name (no directory component) of a file served
            under the dataset's ``categoryFiles/`` URL, for example
            ``'Gift_Cards.json.gz'``.

    Returns:
        Path of the file inside ``BASE_DATA_FOLDER``.

    Raises:
        ValueError: If ``filename`` is empty, contains ``/``, or is ``.`` or
            ``..``.
    """
    # '.' and '..' carry no slash but resolve to existing directories, which
    # would otherwise be reported as an already-downloaded file.
    if not filename or '/' in filename or filename in ('.', '..'):
        raise ValueError('invalid filename')

    dest_file = BASE_DATA_FOLDER / filename
    if os.path.exists(dest_file):
        print(f'Not downloading {filename}. File already exist')
    else:
        base_url = 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFiles/'
        download_wget(f'{base_url}{filename}', dest_file)

    return dest_file


# Fetch the Gift Cards category archive (loaded further down as the reviews
# table) unless it is already on disk, then display its local path.
category_filepath = get_categoryfile('Gift_Cards.json.gz')
category_filepath

Not downloading Gift_Cards.json.gz. File already exist


PosixPath('data/amazon/Gift_Cards.json.gz')

In [4]:
def load_metadata(src: Path, data_limit: Optional[int] = None) -> pd.DataFrame:
    """Load product metadata from a gzip-compressed JSON Lines file.

    Args:
        src: Path of the ``.gz`` file holding one JSON object per line.
        data_limit: Maximum number of records to read from the start of the
            file. ``None`` reads every record; zero or a negative value
            reads none.

    Returns:
        A DataFrame built from the list of parsed records.
    """
    # islice rejects negative stops; clamp so a negative limit keeps
    # producing an empty frame as before.
    limit = None if data_limit is None else max(data_limit, 0)

    with gzip.open(src) as file:
        with jsonlines.Reader(file) as reader:
            # islice stops after exactly `limit` records, whereas
            # zip(reader, range(limit)) parsed and discarded one extra.
            records = list(islice(reader, limit))

    return pd.DataFrame(records)


# Load at most 10,000 product-metadata records into a DataFrame and show it.
metadata = load_metadata(metadata_filepath, data_limit=10000)
metadata

Unnamed: 0,category,tech1,description,fit,title,also_buy,image,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin
0,"[Gift Cards, Gift Cards]",,[Gift card for the purchase of goods or servic...,,Serendipity 3 $100.00 Gift Card,[],[],,Serendipity 3,[],[],[],{'  Product Dimensions: ': '3.4 x 2.1 ...,Grocery,,,,B001BKEWF2
1,"[Gift Cards, Gift Cards]",,[Amazon.com Gift Cards are the perfect way to ...,,Amazon.com Gift Cards,[],[https://images-na.ssl-images-amazon.com/image...,,Amazon,[Amazon.com Gift cards never expire and carry ...,[],"[BT00DC6QU4, B01I4AHZXC, B0719C5P56, B01K8RLHZ...","{'Shipping Weight:': '0.5 ounces', 'Domestic S...",Gift Cards,,,,B001GXRQW0
2,"[Gift Cards, Gift Cards]",,"[<div class=""aplus""> <br>Amazon.com Gift Cards...",,"Amazon.com Gift Cards, Pack of 50 (Old Version...","[B005ESMEBQ, B007EE5WNA, B007EE5OBU, B07HJHK8Y...",[https://images-na.ssl-images-amazon.com/image...,,Amazon,"[Contains 50 Gift Cards, Gift Card has no fees...","1,786 in Gift Cards (","[B005ESMEBQ, B007EE5OBU, B007EE5WNA, B007EE60N...","{'Shipping Weight:': '1.8 pounds (', 'Domestic...",Gift Cards,,,"$1,250.00",B001H53QE4
3,"[Gift Cards, Gift Cards]",,"[<div class=""aplus""> <h4>Amazon.com Gift Cards...",,"Amazon.com $50 Gift Cards, Pack of 50 (Old Ve...",[],[https://images-na.ssl-images-amazon.com/image...,,Amazon,"[Contains 50 Gift Cards, Gift Card has no fees...",[],[],"{'Shipping Weight:': '1.6 pounds', 'Domestic S...",Gift Cards,,,,B001H53QEO
4,"[Gift Cards, Gift Cards]",,[],,Wood Puzzle Magic Box Gift Card,[],[https://images-na.ssl-images-amazon.com/image...,,Creative Crafthouse,[],"[>#2,893,775 in Toys & Games (See Top 100 in T...",[],{},Toys & Games,,,,B001KMWN2K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,"[Gift Cards, Gift Cards]",,"[Hand-Pattied Burgers Chicken, Steaks & Fish ...",,Harvey&#39;s Grill and Bar &amp; Hayden&#39;s...,[],[https://images-na.ssl-images-amazon.com/image...,,,[],"5,808 in Gift Cards (",[],"{'Domestic Shipping: ': 'Currently, item can b...",Gift Cards,,,,B01GMFJ8JU
1543,"[Gift Cards, Gift Cards]",,[<strong>Smoothie King Gift Cards. The Perfect...,,Smoothie King Gift Card,"[B00NVUDIZ0, B00BXLVE6Y, B00O4I5S7O, B00GOLGWV...",[https://images-na.ssl-images-amazon.com/image...,,QuickGifts,"[Over 500 Locations Nationwide!, Smoothies, He...","1,201 in Gift Cards (","[B00NVUDIZ0, B00GOLGWVK, B07JKD8G5X, B00BXLVE6...",{'ASIN:': 'B01GOQIKRI'},Gift Cards,,,$10.00,B01GOQIKRI
1544,"[Gift Cards, Gift Cards]",,"[Cattle Baron Restaurants, Inc.is a privately ...",,Cattle Baron Restaurant Gift Card,"[B00CXZPG0O, B00PG84W32]",[https://images-na.ssl-images-amazon.com/image...,,QuickGifts,"[9 Locations over New Mexico and Texas!, Join ...","3,302 in Gift Cards (",[],{'ASIN:': 'B01GOQHGL4'},Gift Cards,,,,B01GOQHGL4
1545,"[Gift Cards, Gift Cards]",,"[The cards, available in $25 increments up to ...",,Go Play Golf Gift Card,"[B077CZY68Z, B07JJHVWXL, B00NVUDIZ0, B01CZ54L6...",[],,Go Play Golf by Fairway Rewards,[Go Play Golf - the only national gift card th...,"1,015 in Gift Cards (","[B01LYIR4CZ, B077CZY68Z, B07736DNZB, B01N9AE2Z...","{'Shipping Weight:': '0.6 ounces (', 'ASIN:': ...",Gift Cards,,,$50.00,B01GP1W4LA


In [5]:
def get_duplicated_product_list() -> Path:
    """Return the local path of the duplicate-products list.

    The list is stored as ``duplicate_products.txt`` inside
    ``BASE_DATA_FOLDER``; it is downloaded only when that file is not there
    yet, otherwise a notice is printed and the existing file is reused.
    """
    duplicates_path = BASE_DATA_FOLDER / 'duplicate_products.txt'

    if not os.path.exists(duplicates_path):
        download_wget(
            'https://jmcauley.ucsd.edu/data/amazon_v2/metaFiles/duplicates.txt',
            duplicates_path,
        )
        return duplicates_path

    print(f'Not downloading {duplicates_path}. File already exist')
    return duplicates_path


# Fetch the duplicate-products list unless it is already on disk, then
# display its local path.
dup_products_filepath = get_duplicated_product_list()
dup_products_filepath

Not downloading data/amazon/duplicate_products.txt. File already exist


PosixPath('data/amazon/duplicate_products.txt')

In [6]:
def read_duplicate_products(
    src: Path,
    relevant_asins: Iterable[str]
) -> List[Set[str]]:
    """Extract the duplicate groups that involve the products of interest.

    Every line of ``src`` is read as one whitespace-separated group of ASINs.
    Each group is narrowed down to the ASINs found in ``relevant_asins`` and
    kept only when at least two of them remain: an empty remainder is
    irrelevant, and a single survivor means its duplicates lie outside the
    product set.

    Args:
        src: Text file with one duplicate group per line.
        relevant_asins: ASINs to restrict the groups to.

    Returns:
        The surviving groups, in file order, as sets of ASINs.
    """
    # A frozenset gives constant-time membership checks.
    wanted = frozenset(relevant_asins)

    with open(src, 'r') as handle:
        narrowed = (wanted.intersection(row.split()) for row in handle)
        return [set(group) for group in narrowed if len(group) >= 2]


# Collect the duplicate groups restricted to the ASINs present in `metadata`.
# For the data loaded above the result is empty, so no de-duplication step is
# needed.
duplicate_sets = read_duplicate_products(
    src=dup_products_filepath,
    relevant_asins=metadata['asin']
)
duplicate_sets

[]

In [8]:
def collect_image_paths(
    asin: str,
    images: List[str],
    max_dimension: int = 400
) -> Dict[str, Path]:
    """Map the image URLs of one product to their local destination files.

    Each listed URL is parsed into an image name and a size marker. The size
    marker is replaced by ``_SX<max_dimension>_`` to form the download URL,
    and the destination is ``BASE_DATA_FOLDER / 'metadata' / <name>.jpg``.
    Images whose destination file already exists are left out.

    Args:
        asin: Product identifier. Currently unused by the function.
        images: Image URLs as listed for the product.
        max_dimension: Value placed after ``SX`` in the rewritten URL.

    Returns:
        Mapping of download URL to destination path for every image that is
        not on disk yet.

    Raises:
        ValueError: If a URL does not match the expected pattern, or if two
            listed images produce the same download URL.
    """
    image_re = re.compile(
        r'(?P<prefix>https:\/\/images-na\.ssl-images-amazon\.com\/images\/I\/'
        r'(?P<name>.*)\.)'
        r'(?P<dimensions>_((AC_)?(SX\d+_SY\d+_CR(,\d+)+)|(SR\d+,\d+)|(SS\d+))_)'
        r'(?P<suffix>\.jpg)'
    )
    # Loop-invariant: every image is stored in the same folder.
    images_folder = BASE_DATA_FOLDER / 'metadata'

    image_urls: Dict[str, Path] = {}
    for image in images:
        match = image_re.match(image)
        if not match:
            raise ValueError('Invalid Image')

        name, suffix = match.group('name', 'suffix')
        dest_path = images_folder / f'{name}{suffix}'
        if dest_path.exists():
            continue  # Avoid downloading the image again

        # 'prefix' already ends with the dot that precedes the size marker.
        url = f'{match.group("prefix")}_SX{max_dimension}_{suffix}'
        if url in image_urls:
            raise ValueError('Duplicated URL (should not happen)')

        image_urls[url] = dest_path

    return image_urls


def download_images(metadata_df: pd.DataFrame, timeout: float = 30.0):
    """Download every product image that is not stored locally yet.

    The download plan is built by calling ``collect_image_paths`` for each
    row, so images shared by several products are fetched once. Each image is
    streamed into a sibling ``<name>.part`` file and renamed to its final
    name only after the transfer completed, so a failed download never leaves
    a truncated file that a later run would mistake for a finished one.

    Args:
        metadata_df: Frame with an ``asin`` column and an ``image`` column
            holding the list of image URLs of each product.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the run forever.

    Raises:
        ValueError: Propagated from ``collect_image_paths``.
        AssertionError: If one URL is mapped to two different destinations.
        requests.HTTPError: If the server answers with an error status.
    """
    print('Collecting URLs')
    image_urls: Dict[str, Path] = {}
    for _, item in metadata_df.iterrows():
        item_urls = collect_image_paths(item['asin'], item['image'])

        # Multiple products have the same image. Download it once
        for url, dest_file in item_urls.items():
            old_dest_file = image_urls.setdefault(url, dest_file)
            # Same URLs should have the same destination file
            assert old_dest_file == dest_file

    if not image_urls:
        print('Nothing to download')
        return

    print(f'I need to download {len(image_urls)} files')

    for url, dest in tqdm.tqdm(image_urls.items(), unit='image'):
        dest.parent.mkdir(parents=True, exist_ok=True)
        partial = dest.with_name(dest.name + '.part')
        try:
            # The context manager releases the streamed connection even when
            # the transfer fails half-way.
            with requests.get(url, stream=True, timeout=timeout) as resp:
                resp.raise_for_status()
                with open(partial, 'wb') as partial_file:
                    for chunk in resp.iter_content(chunk_size=16384):
                        partial_file.write(chunk)
            os.replace(partial, dest)
        finally:
            # No-op after a successful rename; removes leftovers otherwise.
            if partial.exists():
                partial.unlink()

    print('DONE')


# Download the images referenced by the loaded metadata; images already on
# disk are skipped.
download_images(metadata)

Collecting URLs
Nothing to download


In [15]:
def load_reviews(src: Path, data_limit: Optional[int] = None) -> pd.DataFrame:
    """Load reviews from a gzip-compressed JSON Lines file.

    Args:
        src: Path of the ``.gz`` file holding one JSON object per line.
        data_limit: Maximum number of records to read from the start of the
            file. ``None`` reads every record; zero or a negative value
            reads none.

    Returns:
        A DataFrame built from the list of parsed records.
    """
    # islice rejects negative stops; clamp so a negative limit keeps
    # producing an empty frame as before.
    limit = None if data_limit is None else max(data_limit, 0)

    with gzip.open(src) as file:
        with jsonlines.Reader(file) as reader:
            # islice stops after exactly `limit` records, whereas
            # zip(reader, range(limit)) parsed and discarded one extra.
            review_records = list(islice(reader, limit))

    return pd.DataFrame(review_records)


# Load at most 1,000,000 reviews from the category archive into a DataFrame
# and show it.
reviews = load_reviews(category_filepath, data_limit=1000000)
reviews

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,1.0,25,False,"12 19, 2008",APV13CM0919JD,B001GXRQW0,{'Gift Amount:': ' 50'},LEH,"Amazon,\nI am shopping for Amazon.com gift car...",Merry Christmas.,1229644800,
1,5.0,,False,"12 17, 2008",A3G8U1G1V082SN,B001GXRQW0,{'Gift Amount:': ' 50'},Tali,"I got this gift card from a friend, and it was...",Gift card with best selection,1229472000,
2,5.0,4,False,"12 17, 2008",A11T2Q0EVTUWP,B001GXRQW0,{'Gift Amount:': ' 50'},Z,aren't we going to save trees?! :) People who ...,A convenient and great gift for the environmen...,1229472000,
3,5.0,,False,"12 17, 2008",A9YKGBH3SV22C,B001GXRQW0,{'Gift Amount:': ' 25'},Giotravels,You can always get someone something from Amaz...,Totally make sense,1229472000,
4,1.0,,True,"12 17, 2008",A34WZIHVF3OKOL,B001GXRQW0,,King Dad,Why take 50 dollars of good money with no limi...,Give CASH!,1229472000,
...,...,...,...,...,...,...,...,...,...,...,...,...
147189,5.0,,True,"09 15, 2018",A2K9WVQW9TLWNK,B01H5PPJT4,,Mark.,I always enjoy getting these Gift cards via em...,Just always great service!!,1536969600,
147190,4.0,,True,"09 7, 2018",A149ALSR6TPGF7,B01H5PPJT4,,timothy kuta,Worked great,Good to have,1536278400,
147191,5.0,,True,"08 29, 2018",A2Q066NZCQSCOR,B01H5PPJT4,,Jess,Gift card,Gift card,1535500800,
147192,5.0,,True,"08 18, 2018",A1KJLWCW7XBS8I,B01H5PPJT4,,Average Homeowner,"What is there to say, It's a gift card.",Easy to buy and give as a gift.,1534550400,


# 2. Analyze Data

In [25]:
# Debugging aid: list the names currently defined in the interactive namespace.
%who

BASE_DATA_FOLDER	 Dict	 Iterable	 List	 Optional	 Path	 Set	 category_filepath	 check_call	 
collect_image_paths	 download_images	 download_wget	 dup_products_filepath	 duplicate_sets	 get_categoryfile	 get_duplicated_product_list	 get_metafile	 gzip	 
json	 jsonlines	 load_metadata	 load_reviews	 metadata	 metadata_filepath	 os	 pd	 re	 
read_duplicate_products	 requests	 reviews	 tqdm	 


In [37]:
# Distribution of reviewer activity: the index is a number of reviews written
# by a single reviewer, the value is how many reviewers wrote exactly that many.
reviews.groupby('reviewerID')['reviewerID'].count().value_counts()

1     117322
2       8157
3       1950
4        719
5        333
6        151
7         87
8         61
9         35
10        17
11        11
12         8
14         6
16         4
13         4
15         3
19         2
22         1
23         1
32         1
20         1
17         1
39         1
26         1
Name: reviewerID, dtype: int64