Goes through the process of downloading data from the 2018 Amazon dataset
and places it locally in a SQlite Database for further processing

In [1]:
from datetime import date
from pathlib import Path
from typing import NamedTuple, Optional, Tuple, List

import pandas as pd
import plotly.express as px
import scipy.sparse as sp

import amazon_dataset

# 1. Download Data

In [2]:
DATASETS = [
    'Baby',
    'Clothing_Shoes_and_Jewelry',
    'Home_and_Kitchen',
    'Movies_and_TV',
    'Musical_Instruments',
    'Office_Products',
    'Sports_and_Outdoors',
    'Toys_and_Games',
]

# 2. Import data to database

In [11]:
try:
    amazon_dataset.load_amazon_dataset(
        'Clothing_Shoes_and_Jewelry',
        force=False,
        min_date=date.fromisoformat('2018-03-01'),
        max_date=date.fromisoformat('2018-10-01'),
        min_reviews_per_reviewer=5,
        min_reviews_per_asin=5
    )
except ValueError as ex:
    # This is OK, since we don't want to blow up the Database. Extracting
    # data can take some minutes
    print(ex)

There are reviews. Use force=True to force removal


In [4]:
some_reviews = amazon_dataset.reviews_df('Clothing_Shoes_and_Jewelry')
some_reviews

Unnamed: 0,id,asin,reviewerID,reviewerName,overall,text,reviewTime,summary,verified,vote
0,676,5120053084,A35EUS1E3WK1HC,Kiley and Mars,5.0,"It's a cute top, works good for nursing and la...",2018-04-10,Decent lounge around top,True,
1,677,5120053084,AKIZYAIS4SYVF,Bethany,5.0,Looks really cute and super easy to nurse my d...,2018-03-22,Cute,True,
2,679,5120053084,A2L74OWEP7H1VC,Shelby0516,3.0,The tie is longer than the pictures showed. Ha...,2018-03-14,Awkward tie,True,
3,681,5120053084,A260RMKZXGDHVH,Kelly Kennedy,5.0,Of all the nursing shirts I bought my daughter...,2018-03-07,this one is her favorite. She wears it with pa...,True,
4,1300,7709260373,A13QI8GT2FFGN6,Amy,5.0,For the price... this is awesome!,2018-03-13,this is awesome!,True,
...,...,...,...,...,...,...,...,...,...,...
178939,32291840,B01HJCSCLK,AAHWQ4FMWLNH3,amazonlover,5.0,"Beautiful. Strong, durable, and chic but subtl...",2018-07-25,Beautiful,True,
178940,32291855,B01HJDVCJI,A2WUHKA1I75SL3,FRCP,3.0,Fit is great on these and they are very comfor...,2018-09-03,Comfortable,False,
178941,32291863,B01HJDVCJI,A7B48AJT6IC0A,Lives2read,4.0,Excellent arch support. Unique tongue design c...,2018-08-13,Unique look and comfort,False,
178942,32291875,B01HJDZM30,A2CJOG4NUHVDGK,Brittney Mitchell,5.0,Bought this for my husband and he absolutely l...,2018-08-29,Five Stars,False,


Donwnload Product images. This process can take around 5 hours

In [6]:
amazon_dataset.download_images('Clothing_Shoes_and_Jewelry')

Download products with missing images using an alternate way

In [14]:
amazon_dataset.add_image_for_remaining_products('Clothing_Shoes_and_Jewelry')

100%|██████████| 485/485 [06:44<00:00,  1.20image/s, Errors 229 https://ws-na.amazon-adsystem.com/widgets/q?_encoding=UTF8&MarketPlace=US&ASIN=B01HH3ZL3S&ServiceVersion=20070822&ID=AsinImage&WS=1&Format=SS400]


In [7]:
some_products = amazon_dataset.products_df('Clothing_Shoes_and_Jewelry')
some_products

Unnamed: 0_level_0,asin,description,title,brand,main_cat,rank,price,image_slug,image_url,feature,category,tech_detail
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
47,5120053084,Top Length (Neck to Bottom Hem) Small - 29 inc...,sofsy Soft-Touch Rayon Blend Tie Front Nursing...,,,"87,615inClothing,ShoesJewelry(",,"[51HJbA8UG2L, 51FufN7RbSL, 51vKjwQ6eAL, 410fEp...",[https://images-na.ssl-images-amazon.com/image...,[SAVE 10% WHEN YOU ADD 2 OR MORE sofsy PRODUCT...,"[Clothing, Shoes & Jewelry, Women, Maternity, ...",
144,7709260373,,LJYH Children's Collar Motorcycle Faux Leather...,LJYH,,"15,484inClothing,ShoesJewelry(",$23.99 - $29.99,"[41MAnSZ8QTL, 51qL9W098pL, 51QWVSqlAML, 51eOan...",[https://images-na.ssl-images-amazon.com/image...,"[100% New Faux Leather Coat, Fashion New Slim ...","[Clothing, Shoes & Jewelry, Boys, Clothing, Ja...",
284,B00001TOXD,Includes One Broom. This broom goes great with...,Adult Witch Broom,,Toys & Games,,$4.99,[21VAYWoNedL],[https://m.media-amazon.com/images/I/21VAYWoNe...,[Brand new authentic licensed Pegan Witch broo...,"[Clothing, Shoes & Jewelry, Costumes & Accesso...",
316,B00004U1J2,Tired of having your kid running around the ho...,Buzz Lightyear Jet Pack,,Toys & Games,,$7.41,[31y%2BZwVQJJL],[https://images-na.ssl-images-amazon.com/image...,"[Polyester, Imported, 16"" high, 12"" wide, Qual...","[Clothing, Shoes & Jewelry, Costumes & Accesso...",
333,B00004VWJ3,Birkenstock's Black Super Birki Clog is made o...,"Birki's Super Pu, Black, 44 M EU (13 Women /11...",Birki's,,">#3,164 in Patio, Lawn & Garden (See Top 100 i...",,"[51g%2BtUP7YSL, 41Jn0zjKRuL, 41ElBt2gWdL, 31Aw...",[https://images-na.ssl-images-amazon.com/image...,[Birkenstock SUPER BIRKI BLACK PU 44],"[Clothing, Shoes & Jewelry, Women, Shoes, Mule...",
...,...,...,...,...,...,...,...,...,...,...,...,...
2684954,B01HJDBMUM,When you need a fashionable shoe for your son-...,Deer Stags Kid's Brilliant Dress Comfort Oxfor...,,,"343,935inClothing,ShoesJewelry(",$16.80 - $40.00,"[51Q9Np4I4OL, 41JgD3Klb6L, 31hD-OLssoL, 41ABLP...",[https://images-na.ssl-images-amazon.com/image...,"[100% Manmade, Synthetic sole, QUALITY YOU CAN...","[Clothing, Shoes & Jewelry, Girls, Shoes, Oxfo...",
2684955,B01HJCZ02O,"Dynamic in design, the honor sneaker is comfor...",Dansko Women's Honor Sneaker,,,"44,554inClothing,ShoesJewelry(",$74.93 - $221.43,"[51wGv%2BSmTHL, 41AE5SULqtL, 311xxeLqZ2L, 41Js...",[https://images-na.ssl-images-amazon.com/image...,"[100% Leather, Imported, Rubber sole, Shaft me...","[Clothing, Shoes & Jewelry, Women, Shoes, Fash...",
2684979,B01HJDZM30,Part of our Performance Fishing Gear line of p...,Columbia PFG Mesh Snap Back Ball Cap,Columbia,Sports & Outdoors,"5,346inSportsOutdoors(",$20.90,"[51cajtyCrBL, 416yFSOGLdL, 41t%2Bt3iFPYL, 513t...",[https://images-na.ssl-images-amazon.com/image...,"[blend, ADVANCED TECHNOLOGY: Columbia's PFG Me...","[Clothing, Shoes & Jewelry, Men, Accessories, ...",
2684987,B01HJDVCJI,An edgy take on Adidas running-inspired herita...,adidas Originals Men's Tubular Shadow Fashion ...,,,"74,828inClothing,ShoesJewelry(",$48.45 - $199.00,"[51wvO%2BYYxLL, 51VDqEjkM1L, 31Pab7SYh7L, 41gg...",[https://images-na.ssl-images-amazon.com/image...,"[100% Leather and Textile, Imported, Synthetic...","[Clothing, Shoes & Jewelry, Men, Shoes, Fashio...",


# 2. Analyze Data

Simple command to debug which variables we have defined

In [8]:
fig = px.histogram(some_reviews, x="overall", title='Stars per review')
fig.show()

In [10]:
fig = px.bar(
    some_reviews.groupby('reviewerID')['reviewerID'].count().value_counts(),
    log_y=True,
    title='Users vs Number of Reviews'
)
fig.show()

# 3. Build interaction matrices

In [25]:
def series_to_indexes(serie: pd.Series, name: str) -> pd.Series:
    vals = serie.unique()
    return pd.Series(index=vals, data=range(len(vals)), name=name)


class ReviewDataSet(NamedTuple):
    reviewer_idx: pd.Series
    asin_idx: pd.Series
    train_df: pd.DataFrame
    test_df: pd.DataFrame
    train_matrix: sp.spmatrix
    test_matrix: sp.spmatrix


def build_interation_matrix(
    df_with_idx: pd.DataFrame,
    reviewer_idx: pd.Series,
    asin_idx: pd.Series,
) -> sp.coo_matrix:
    grouped_df = df_with_idx.groupby(['reviewerIdx', 'asinIdx'])
    # We are super lenient and we only take the highest review number
    grouped_review_series = grouped_df['overall'].max()

    # TODO: decide if we should use csr or coo or whatever
    return sp.coo_matrix(
        (
            grouped_review_series.values, 
            (
                grouped_review_series.index.get_level_values(0),
                grouped_review_series.index.get_level_values(1),
            )
        ),
        shape=(len(reviewer_idx), len(asin_idx))
    )


def split_df(
    df: pd.DataFrame,
    col: Optional[str] = None,
    train_frac: float = 1
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    if not (0 < train_frac <= 1):
        raise ValueError('train_frac should be between 0 and 1')

    if col:
        rank = df[col].rank(pct=True)
        train_df = df.loc[rank <= train_frac]
        test_df = df.loc[rank > train_frac]
    else:
        train_df = df.sample(frac=train_frac)
        test_df = df.drop(train_df.index)

    return train_df, test_df


def build_interaction_dataset(
    df: pd.DataFrame,
    train_frac: float = 1,
    split_by_time: bool = False
) -> ReviewDataSet:
    reviewer_idx = series_to_indexes(df['reviewerID'], 'reviewerIdx')
    asin_idx = series_to_indexes(df['asin'], 'asinIdx')

    df_with_idx = df.join(reviewer_idx, 'reviewerID').join(asin_idx, on='asin')

    # This should not happen, all rows must have an index
    assert df_with_idx['reviewerIdx'].isna().sum() == 0
    assert df_with_idx['asinIdx'].isna().sum() == 0

    if split_by_time:
        col = 'reviewTime'
    else:
        col = None

    train_df, test_df = split_df(df_with_idx, train_frac=train_frac, col=col)

    train_matrix = build_interation_matrix(
        train_df,
        reviewer_idx=reviewer_idx,
        asin_idx=asin_idx
    )

    test_matrix = build_interation_matrix(
        test_df,
        reviewer_idx=reviewer_idx,
        asin_idx=asin_idx
    )

    return ReviewDataSet(
        reviewer_idx=reviewer_idx, 
        asin_idx=asin_idx,
        train_df=train_df,
        test_df=test_df,
        train_matrix=train_matrix,
        test_matrix=test_matrix
    )


reviews_dataset = build_interaction_dataset(
    reviews,
    split_by_time=True,
    train_frac=0.8
)

In [27]:
len(reviews_dataset.test_df)

29439

In [28]:
len(reviews_dataset.train_df)

117755

In [29]:
test_reviewers = reviews_dataset.test_df.groupby(by=['reviewerID']).size().to_frame('test_freq')
train_reviewers = reviews_dataset.train_df.groupby(by=['reviewerID']).size().to_frame('train_freq')
train_reviewers.join(test_reviewers, how='outer').describe()

Unnamed: 0,train_freq,test_freq
count,105149.0,25448.0
mean,1.119887,1.15683
std,0.556151,0.619089
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,1.0,1.0
max,32.0,22.0


# 4. Quick check using BPR

In [30]:
from vbpr import VBPR

In [32]:
train = reviews_dataset.train_matrix
test = reviews_dataset.test_matrix

In [33]:
train

<128877x1548 sparse matrix of type '<class 'numpy.float64'>'
	with 117723 stored elements in COOrdinate format>

In [34]:
test

<128877x1548 sparse matrix of type '<class 'numpy.float64'>'
	with 29417 stored elements in COOrdinate format>

In [35]:
reviews_dataset.reviewer_idx

APV13CM0919JD          0
A3G8U1G1V082SN         1
A11T2Q0EVTUWP          2
A9YKGBH3SV22C          3
A34WZIHVF3OKOL         4
                   ...  
A2K9WVQW9TLWNK    128872
A149ALSR6TPGF7    128873
A2Q066NZCQSCOR    128874
A1KJLWCW7XBS8I    128875
ANABUB0FRZXRM     128876
Name: reviewerIdx, Length: 128877, dtype: int64

In [36]:
test.sum(axis=1).std()

2.5673645477680496

In [37]:
bpr = VBPR()
bpr.fit(train, test, epochs=10, lr=.1, verbose=0)
print("Train AUC: {}".format(bpr.auc_score(train)))
print("Test AUC: {}".format(bpr.auc_score(test)))

Epoch 0
Training AUC: 0.8461198590990738
Test AUC: 0.7700241490921965
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Train AUC: 0.8777223738540135
Test AUC: 0.7848739442674354
