In [5]:
from pathlib import Path
from typing import List
from tqdm import tqdm

import numpy as np
import pandas as pd

import amazon_dataset

# Intialize Datasets

In [3]:
DATASET = 'Clothing_Shoes_and_Jewelry'
VALIDATION_SIZE=0.15
RANDOM_SEED = 20230219
ALEXNET_IMAGE_FEATURES = Path(f'data/amazon/{DATASET}_alexnet_features.npz')
VIT_IMAGE_FEATURES = Path(f'data/amazon/{DATASET}_vit_features.npz')
TEXT_FEATURES = Path(f'data/amazon/{DATASET}_bert_features.npz')
SEM_MACRID_DEST_FOLDER = Path(f'data/SEM-MacridVAE-{DATASET}')
DMRL_DEST_FOLDER = Path(f'data/DMRL-{DATASET}')

assert ALEXNET_IMAGE_FEATURES.exists()
assert VIT_IMAGE_FEATURES.exists()
assert TEXT_FEATURES.exists()
SEM_MACRID_DEST_FOLDER.mkdir(exist_ok=True)
DMRL_DEST_FOLDER.mkdir(exist_ok=True)

In [3]:
def generate_unique_ids(series: pd.Series) -> pd.Series:
    rng = np.random.default_rng(RANDOM_SEED)
    unique_ids = series.unique()
    return pd.Series(
        index=rng.permutation(unique_ids), 
        data=range(len(unique_ids))
    )

def df_stats(df: pd.DataFrame) -> str:
    n_items = len(df['asin'].unique())
    n_users = len(df['reviewerID'].unique())
    sparsity = 1. * len(df) / (n_users * n_items)
    return f'{n_items} items {n_users} users. Sparsity {sparsity * 100:.3f}%'

def save_numerized(
    reviews: pd.DataFrame, 
    uids: pd.Series, 
    product_ids: pd.Series,
    user_id_column: str,
    product_ids_column: str,
    columns: List[str],
    dest: Path
):
    """Save a Dataframe following userids and product_ids"""
    joined = reviews.join(uids.to_frame(user_id_column), on='reviewerID')
    assert joined[user_id_column].isna().sum() == 0
    
    joined = joined.join(product_ids.to_frame(product_ids_column), on='asin')
    assert joined[product_ids_column].isna().sum() == 0
    
    res = joined[columns]
    
    res.to_csv(dest, index=False)

    return res

def split_train_test_proportion(df: pd.DataFrame, test_prop=0.2):
    """
    Split the dataframe by reviewer and take exactly `test_prop` records
    for test and leave the rest for training
    """
    rng = np.random.default_rng(RANDOM_SEED)

    res = df.copy()
    res['rnd'] = rng.random(size=len(df))
    res['rnd_rank'] = res.groupby('reviewerID')['rnd'].rank(pct=True)
    condition = res['rnd_rank'] <= test_prop
    
    (_, train), (_, test) = res.groupby(condition)
    
    assert(len(set(train.index) & set(test.index)) == 0)

    return df.loc[train.index], df.loc[test.index]

In [4]:
reviews_df = amazon_dataset.reviews_df(DATASET)
reviews_df

Unnamed: 0,id,asin,reviewerID,reviewerName,overall,text,reviewTime,summary,verified,vote
0,676,5120053084,A35EUS1E3WK1HC,Kiley and Mars,5.0,"It's a cute top, works good for nursing and la...",2018-04-10,Decent lounge around top,True,
1,677,5120053084,AKIZYAIS4SYVF,Bethany,5.0,Looks really cute and super easy to nurse my d...,2018-03-22,Cute,True,
2,679,5120053084,A2L74OWEP7H1VC,Shelby0516,3.0,The tie is longer than the pictures showed. Ha...,2018-03-14,Awkward tie,True,
3,681,5120053084,A260RMKZXGDHVH,Kelly Kennedy,5.0,Of all the nursing shirts I bought my daughter...,2018-03-07,this one is her favorite. She wears it with pa...,True,
4,1300,7709260373,A13QI8GT2FFGN6,Amy,5.0,For the price... this is awesome!,2018-03-13,this is awesome!,True,
...,...,...,...,...,...,...,...,...,...,...
178939,32291840,B01HJCSCLK,AAHWQ4FMWLNH3,amazonlover,5.0,"Beautiful. Strong, durable, and chic but subtl...",2018-07-25,Beautiful,True,
178940,32291855,B01HJDVCJI,A2WUHKA1I75SL3,FRCP,3.0,Fit is great on these and they are very comfor...,2018-09-03,Comfortable,False,
178941,32291863,B01HJDVCJI,A7B48AJT6IC0A,Lives2read,4.0,Excellent arch support. Unique tongue design c...,2018-08-13,Unique look and comfort,False,
178942,32291875,B01HJDZM30,A2CJOG4NUHVDGK,Brittney Mitchell,5.0,Bought this for my husband and he absolutely l...,2018-08-29,Five Stars,False,


# Generate Data for SEM-MacridVAE

Some data stats

In [5]:
print(df_stats(reviews_df))

38493 items 23318 users. Sparsity 0.020%


Sort some users randomly

In [6]:
user_ids = generate_unique_ids(reviews_df['reviewerID'])
user_ids

A30NKRF3KBGA06        0
AL0XGCBE6Z22M         1
AMT5LF0TKY67C         2
A2BY8EVXA3NRHD        3
AWE6KR1ELIYQ3         4
                  ...  
A1MFBF49ZFMH2N    23313
A36AF5I7D0VO8F    23314
A35ZS7JT3G9B8     23315
A3GC94SEKQI3QU    23316
A185C12Y9XLYGY    23317
Length: 23318, dtype: int64

Same with products

In [7]:
item_ids = generate_unique_ids(reviews_df['asin'])
item_ids

B000B6AV7K        0
B0143D7EE4        1
B0105V2DEY        2
B014EY21H2        3
B005LUROIK        4
              ...  
B00A9R2P7A    38488
B017HK485S    38489
B008H7UKYY    38490
B006K6PJTK    38491
B01DUSBHZ0    38492
Length: 38493, dtype: int64

Split train test and validation by user

In [8]:
train_reviews, validation_test_reviews = split_train_test_proportion(reviews_df, test_prop=0.4)
validation_reviews, test_reviews = split_train_test_proportion(validation_test_reviews, test_prop=0.5)

In [9]:
len(train_reviews)

113836

len(validation_reviews)

In [10]:
len(test_reviews)

30728

Sanity checks. All users are in the train validation and test sets!

In [11]:
set(train_reviews['reviewerID']) == set(test_reviews['reviewerID']) ==  set(validation_reviews['reviewerID'])

True

Check that the train set and the validation set all have at least one item (same check as above)

In [12]:
pd.concat([
    train_reviews.groupby('reviewerID').size().to_frame('train_size'),
    validation_reviews.groupby('reviewerID').size().to_frame('val_size'),
    test_reviews.groupby('reviewerID').size().to_frame('test_size')
], axis=1).describe()

Unnamed: 0,train_size,val_size,test_size
count,23318.0,23318.0,23318.0
mean,4.881894,1.474397,1.31778
std,3.529635,1.185903,1.091225
min,3.0,1.0,1.0
25%,3.0,1.0,1.0
50%,4.0,1.0,1.0
75%,5.0,2.0,1.0
max,61.0,20.0,20.0


Generate users

In [13]:
user_ids.to_csv(SEM_MACRID_DEST_FOLDER / 'users.txt')
item_ids.to_csv(SEM_MACRID_DEST_FOLDER / 'items.txt', header=['item_id'])

In [14]:
SEMMACRID_NUMERIZED_COMMON_PARAMS = {
    'uids': user_ids, 
    'product_ids': item_ids,
    'user_id_column': 'user',
    'product_ids_column': 'item',
    'columns': ['user', 'item']
}

save_numerized(
    train_reviews, 
    dest=SEM_MACRID_DEST_FOLDER / 'train.txt',
    **SEMMACRID_NUMERIZED_COMMON_PARAMS
)

Unnamed: 0,user,item
0,14891,30409
4,19073,29339
6,21394,38246
10,22275,38246
14,6565,38246
...,...,...
178934,5195,6878
178935,418,7919
178937,11611,5762
178938,16539,5762


In [15]:
save_numerized(
    validation_reviews, 
    dest=SEM_MACRID_DEST_FOLDER / 'validation.txt',
    **SEMMACRID_NUMERIZED_COMMON_PARAMS
)
'Saved validation data'

'Saved validation data'

In [16]:
save_numerized(
    test_reviews, 
    dest=SEM_MACRID_DEST_FOLDER / 'test.txt',
    **SEMMACRID_NUMERIZED_COMMON_PARAMS
)

'Saved test data'

'Saved test data'

In [30]:
def copy_semmacrid_features(
        item_ids: pd.Series, 
        features_file: Path
):
    print('Opening file')
    with np.load(features_file) as features:
        some_embedding = next(iter(features.values()))
        embedding_shape, = some_embedding.shape
        array_shape = (len(item_ids),  embedding_shape)
        print(f'Initializing array {array_shape}')
        res = np.full(array_shape,  fill_value=np.nan)

        for asin, idx in tqdm(item_ids.items(), 
                total=len(item_ids), unit_scale=True, unit='items', 
                desc='Copying features'):
            value = features[asin]
            
            assert np.isnan(value).sum() == 0, "Feature has NaN Values"
            
            res[idx, :] = value

        return res


image_features_array = copy_semmacrid_features(
    item_ids=item_ids, 
    features_file=VIT_IMAGE_FEATURES
)
image_features_array.shape

Opening file
Initializing array (38493, 1024)


Copying features: 100%|██████████| 38.5k/38.5k [01:30<00:00, 424items/s]


(38493, 1024)

In [31]:
SEMMACRID_IMAGE_EMBED_DEST = SEM_MACRID_DEST_FOLDER / 'embed_image.npy'
np.save(SEMMACRID_IMAGE_EMBED_DEST, image_features_array)

print(f'{str(SEMMACRID_IMAGE_EMBED_DEST)}: {SEMMACRID_IMAGE_EMBED_DEST.stat().st_size // 2**20}MiB')

data/SEM-MacridVAE-Clothing_Shoes_and_Jewelry/embed_image.npy: 300MiB


In [32]:
semmacrid_text_features = copy_semmacrid_features(
    item_ids=item_ids, 
    features_file=TEXT_FEATURES
)
semmacrid_text_features.shape

Opening file
Initializing array (38493, 1024)


Copying features: 100%|██████████| 38.5k/38.5k [01:29<00:00, 432items/s]


(38493, 1024)

In [33]:
SEMMACRID_TEXT_EMBED_DEST = SEM_MACRID_DEST_FOLDER / 'embed_text.npy'
np.save(SEMMACRID_TEXT_EMBED_DEST, semmacrid_text_features)

print(f'{str(SEMMACRID_TEXT_EMBED_DEST)}: {SEMMACRID_TEXT_EMBED_DEST.stat().st_size // 2**20}MiB')

data/SEM-MacridVAE-Clothing_Shoes_and_Jewelry/embed_text.npy: 300MiB


# Generate Data for DMRL

In [17]:
print(df_stats(reviews_df))

38493 items 23318 users. Sparsity 0.020%


In [18]:
DMRL_NUMERIZED_COMMON_PARAMS = {
    'uids': user_ids, 
    'product_ids': item_ids,
    'user_id_column': 'userID',
    'product_ids_column': 'itemID',
    'columns': ['itemID','userID', 'reviewerID', 'asin']
}

save_numerized(train_reviews,     
    dest=DMRL_DEST_FOLDER / 'train.csv',
    **DMRL_NUMERIZED_COMMON_PARAMS)

Unnamed: 0,itemID,userID,reviewerID,asin
0,30409,14891,A35EUS1E3WK1HC,5120053084
4,29339,19073,A13QI8GT2FFGN6,7709260373
6,38246,21394,AWR7V9DKUUFLT,B00006XXGO
10,38246,22275,AMJ40C6RPXII9,B00006XXGO
14,38246,6565,A39ASGCNTOMC1E,B00006XXGO
...,...,...,...,...
178934,6878,5195,A1VX77NHMT1XJN,B01HJC0WSQ
178935,7919,418,A3IPC0UT5N4RYK,B01HJCNFDU
178937,5762,11611,A2WUDTAL0ADE6S,B01HJDBMUM
178938,5762,16539,A2N0T25TWYXPMP,B01HJDBMUM


*Note* : We use a validation set for the `test.csv` reviews in DMRL. 

In [20]:
save_numerized(
    validation_reviews,
    dest=DMRL_DEST_FOLDER / 'test.csv',
    **DMRL_NUMERIZED_COMMON_PARAMS
)

Unnamed: 0,itemID,userID,reviewerID,asin
1,30409,13868,AKIZYAIS4SYVF,5120053084
3,30409,3474,A260RMKZXGDHVH,5120053084
7,38246,5995,A23Z52PYAKMXE7,B00006XXGO
9,38246,8671,AW15SAKQYDR8Z,B00006XXGO
15,38246,16043,A16Q571QVLTIXU,B00006XXGO
...,...,...,...,...
178936,19135,3632,A3RVPCAPF92RK1,B01HJCYJ4Y
178940,31137,15473,A2WUHKA1I75SL3,B01HJDVCJI
178941,31137,15875,A7B48AJT6IC0A,B01HJDVCJI
178942,12170,20531,A2CJOG4NUHVDGK,B01HJDZM30


In [22]:
def copy_dmrl_image_features(
    item_ids: pd.Series,
    features_file: Path
):
    with np.load(features_file) as image_features, \
        tqdm(total=len(image_features), unit_scale=True, unit='image', 
             desc='Copying image features') as progress:
        res = {}

        for asin, feature in image_features.items():
            if asin in item_ids:
                # DMRL uses lists for image features
                res[asin] = feature.tolist()
            progress.update()
            
        assert set(res.keys()) == set(item_ids.index), 'Items were not copied!'

        return res

dmrl_image_features = copy_dmrl_image_features(
    item_ids, 
    VIT_IMAGE_FEATURES
)

Copying image features: 100%|██████████| 38.5k/38.5k [01:32<00:00, 415image/s]


In [23]:
DMRL_IMAGE_DEST = DMRL_DEST_FOLDER / 'item_feature.npy'
np.save(DMRL_IMAGE_DEST, dmrl_image_features)

print(f'{str(DMRL_IMAGE_DEST)}: {DMRL_IMAGE_DEST.stat().st_size // 2**20}MiB')

data/DMRL-Clothing_Shoes_and_Jewelry/item_feature.npy: 339MiB


In [24]:
def copy_dmrl_text_features(
    item_ids: pd.Series,
    features_file: Path
):
    with np.load(features_file) as image_features, \
        tqdm(total=len(image_features), unit_scale=True, unit='product', 
             desc='Copying textual features') as progress:
        res = {}

        for asin, feature in image_features.items():
            if asin in item_ids:
                # DMRL uses column vectors :)
                res[asin] = np.reshape(feature, (1, -1))
            progress.update()
            
        assert set(res.keys()) == set(item_ids.index), 'Items were not copied!'

        return res

dmrl_text_features = copy_dmrl_text_features(
    item_ids, 
    TEXT_FEATURES
)

Copying textual features: 100%|██████████| 38.5k/38.5k [01:26<00:00, 446product/s]


In [25]:
DMRL_TEXT_DEST = DMRL_DEST_FOLDER / 'review.npz'
np.savez(DMRL_TEXT_DEST, dmrl_text_features)

print(f'{str(DMRL_IMAGE_DEST)}: {DMRL_IMAGE_DEST.stat().st_size // 2**20}MiB')

data/DMRL-Clothing_Shoes_and_Jewelry/item_feature.npy: 339MiB


In [None]:
user_ids.to_csv(DMRL_DEST_FOLDER / 'users.txt')
item_ids.to_csv(DMRL_DEST_FOLDER / 'items.txt', header=['item_id'])

In [4]:
amazon_dataset.product_images_df(DATASET)

Unnamed: 0_level_0,url,slug,product_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,https://images-na.ssl-images-amazon.com/images...,51HJbA8UG2L,47
2,https://images-na.ssl-images-amazon.com/images...,51FufN7RbSL,47
3,https://images-na.ssl-images-amazon.com/images...,51vKjwQ6eAL,47
4,https://images-na.ssl-images-amazon.com/images...,410fEp9sdjL,47
5,https://images-na.ssl-images-amazon.com/images...,51vFScdjWiL,47
...,...,...,...
366210,https://m.media-amazon.com/images/I/41BtbDbVag...,41BtbDbVagL,2681802
366211,https://m.media-amazon.com/images/I/51HIWL51dW...,51HIWL51dWL,2681870
366212,https://m.media-amazon.com/images/I/41wqQkCn7T...,41wqQkCn7TL,2682043
366213,https://m.media-amazon.com/images/I/41v526rIgz...,41v526rIgzL,2682259
