In [1]:
from pathlib import Path
from typing import List, Tuple
from tqdm import tqdm

import numpy as np
import pandas as pd

import amazon_dataset

# Initialize Datasets

In [2]:
# Amazon review category to process; it is interpolated into every path below.
DATASET = 'Clothing_Shoes_and_Jewelry'
# Share of users held out for validation; the same share is held out again for test.
VALIDATION_SIZE=0.15
# Seed used by the shuffling and splitting helpers of this notebook.
RANDOM_SEED = 20230219
# Inputs: pre-computed feature archives (.npz) looked up by item `asin`.
ALEXNET_IMAGE_FEATURES = Path(f'data/amazon/{DATASET}_alexnet_features.npz')
VIT_IMAGE_FEATURES = Path(f'data/amazon/{DATASET}_vit_features.npz')
TEXT_FEATURES = Path(f'data/amazon/{DATASET}_text_features.npz')
# Outputs: one destination folder per target model.
SEM_MACRID_DEST_FOLDER = Path(f'data/SEM-MacridVAE-{DATASET}')
DMRL_DEST_FOLDER = Path(f'data/DMRL-{DATASET}')

# Fail fast if any input archive is missing.
assert ALEXNET_IMAGE_FEATURES.exists()
assert VIT_IMAGE_FEATURES.exists()
assert TEXT_FEATURES.exists()
# Create the output folders; only the last path component is created, so `data/` must already exist.
SEM_MACRID_DEST_FOLDER.mkdir(exist_ok=True)
DMRL_DEST_FOLDER.mkdir(exist_ok=True)

In [3]:
def generate_unique_ids(series: pd.Series) -> pd.Series:
    """
    Map every distinct value of `series` to an integer ID.

    The distinct values are shuffled with a generator seeded by
    `RANDOM_SEED` and then numbered 0..n-1 in that shuffled order.

    Returns a Series whose index holds the original values and whose
    data holds the assigned integer IDs.
    """
    distinct = series.unique()
    shuffled = np.random.default_rng(RANDOM_SEED).permutation(distinct)
    return pd.Series(data=range(len(distinct)), index=shuffled)

def df_stats(df: pd.DataFrame) -> str:
    """
    Summarize an interaction table in one line.

    Counts the distinct `asin` (items) and `reviewerID` (users) values and
    reports, under the label "Sparsity", the share of the user x item grid
    that is filled: rows / (users * items), as a percentage.
    """
    item_count = df['asin'].nunique(dropna=False)
    user_count = df['reviewerID'].nunique(dropna=False)
    fill_ratio = float(len(df)) / (user_count * item_count)
    return f'{item_count} items {user_count} users. Sparsity {fill_ratio * 100:.3f}%'

def save_numerized(
    reviews: pd.DataFrame, 
    uids: pd.Series, 
    product_ids: pd.Series,
    user_id_column: str,
    product_ids_column: str,
    columns: List[str],
    dest: Path
):
    """
    Attach integer user/item IDs to `reviews` and write the result as CSV.

    `uids` and `product_ids` are lookup Series indexed by the raw
    `reviewerID` / `asin` values. They are joined onto `reviews` as new
    columns named `user_id_column` and `product_ids_column`; an
    AssertionError is raised if any row fails to find its ID.

    Only `columns` are kept, written to `dest` without the index, and
    returned (the returned frame keeps the index of `reviews`).
    """
    user_lookup = uids.to_frame(user_id_column)
    item_lookup = product_ids.to_frame(product_ids_column)

    # Every review must resolve to a known user...
    with_users = reviews.join(user_lookup, on='reviewerID')
    assert not with_users[user_id_column].isna().any()

    # ...and to a known item.
    with_items = with_users.join(item_lookup, on='asin')
    assert not with_items[product_ids_column].isna().any()

    selected = with_items[columns]
    selected.to_csv(dest, index=False)
    return selected

def split_train_test_proportion(df: pd.DataFrame, test_prop=0.2, seed=None):
    """
    Randomly hold out a share of every reviewer's records for testing.

    Each row receives a random score; scores are percentile-ranked within
    the row's `reviewerID` group and rows whose rank is <= `test_prop` go to
    the test set. A reviewer with n rows therefore contributes about
    floor(n * test_prop) test rows, so reviewers with fewer than
    1 / test_prop rows stay entirely in the training set.

    Parameters:
        df: interaction table with a `reviewerID` column.
        test_prop: upper bound on the per-reviewer share sent to test.
        seed: seed for the random scores; defaults to `RANDOM_SEED`.

    Returns:
        `(train, test)`: two frames that partition the rows of `df`,
        each keeping the original row order. Either may be empty.
    """
    rng = np.random.default_rng(RANDOM_SEED if seed is None else seed)

    # Work positionally (plain arrays as keys/masks) so the split is correct
    # even when `df` has repeated index labels.
    scores = pd.Series(rng.random(size=len(df)))
    reviewer_keys = df['reviewerID'].to_numpy()
    pct_rank = scores.groupby(reviewer_keys).rank(pct=True)
    is_test = (pct_rank <= test_prop).to_numpy()

    # Boolean masks instead of unpacking a two-group groupby: the unpacking
    # fails whenever all rows end up on the same side of the split.
    train, test = df.loc[~is_test], df.loc[is_test]

    assert len(train) + len(test) == len(df)

    return train, test

In [4]:
# Load the review table for DATASET through the project's `amazon_dataset` helper.
reviews_df = amazon_dataset.reviews_df(DATASET)
# Bare last expression so the notebook renders a preview of the frame.
reviews_df

Unnamed: 0,id,asin,reviewerID,reviewerName,overall,text,reviewTime,summary,verified,vote
0,676,5120053084,A35EUS1E3WK1HC,Kiley and Mars,5.0,"It's a cute top, works good for nursing and la...",2018-04-10,Decent lounge around top,True,
1,677,5120053084,AKIZYAIS4SYVF,Bethany,5.0,Looks really cute and super easy to nurse my d...,2018-03-22,Cute,True,
2,679,5120053084,A2L74OWEP7H1VC,Shelby0516,3.0,The tie is longer than the pictures showed. Ha...,2018-03-14,Awkward tie,True,
3,681,5120053084,A260RMKZXGDHVH,Kelly Kennedy,5.0,Of all the nursing shirts I bought my daughter...,2018-03-07,this one is her favorite. She wears it with pa...,True,
4,1300,7709260373,A13QI8GT2FFGN6,Amy,5.0,For the price... this is awesome!,2018-03-13,this is awesome!,True,
...,...,...,...,...,...,...,...,...,...,...
178939,32291840,B01HJCSCLK,AAHWQ4FMWLNH3,amazonlover,5.0,"Beautiful. Strong, durable, and chic but subtl...",2018-07-25,Beautiful,True,
178940,32291855,B01HJDVCJI,A2WUHKA1I75SL3,FRCP,3.0,Fit is great on these and they are very comfor...,2018-09-03,Comfortable,False,
178941,32291863,B01HJDVCJI,A7B48AJT6IC0A,Lives2read,4.0,Excellent arch support. Unique tongue design c...,2018-08-13,Unique look and comfort,False,
178942,32291875,B01HJDZM30,A2CJOG4NUHVDGK,Brittney Mitchell,5.0,Bought this for my husband and he absolutely l...,2018-08-29,Five Stars,False,


: 

# Generate Data for SEM-MacridVAE

In [4]:
# binarize the data (only keep ratings >= 4)
# Binarize the feedback: a rating of 4 or more counts as a positive
# interaction, and only the item/user ID pair of each positive review is kept.
sem_macrid_relevant_reviews = reviews_df.loc[
    reviews_df['overall'].ge(4), ['asin', 'reviewerID']
]
sem_macrid_relevant_reviews

Unnamed: 0,asin,reviewerID
0,5120053084,A35EUS1E3WK1HC
1,5120053084,AKIZYAIS4SYVF
3,5120053084,A260RMKZXGDHVH
4,7709260373,A13QI8GT2FFGN6
5,7709260373,A1246QM67H27LN
...,...,...
178938,B01HJDBMUM,A2N0T25TWYXPMP
178939,B01HJCSCLK,AAHWQ4FMWLNH3
178941,B01HJDVCJI,A7B48AJT6IC0A
178942,B01HJDZM30,A2CJOG4NUHVDGK


Some data stats

In [5]:
# Item/user counts and fill ratio of the positive-only interactions.
print(df_stats(sem_macrid_relevant_reviews))

34778 items 22181 users. Sparsity 0.018%


Shuffle the users and assign each one a sequential integer ID

In [6]:
# Map every reviewer that has at least one kept review to a shuffled integer ID.
sem_macrid_user_ids = generate_unique_ids(sem_macrid_relevant_reviews['reviewerID'])
sem_macrid_user_ids

A244Z4EM8ZX0X6        0
A2O23L14H5P0ZY        1
A6A4UR3UKVEGV         2
AG1C839UYKQJ2         3
A4RUSOW0ZKXUG         4
                  ...  
A2RUZ3SRDQIPXD    22176
A3DXDF5LTJ3L1D    22177
A14HN3YVM4XZNJ    22178
A1C42QMCGX6IQR    22179
A3SWTMD8AQLYAW    22180
Length: 22181, dtype: int64

Same with products

In [9]:
# Map every item that has at least one kept review to a shuffled integer ID.
sem_macrid_item_ids = generate_unique_ids(sem_macrid_relevant_reviews['asin'])
sem_macrid_item_ids

B000IG6FCI        0
B00EDFKVJS        1
B013N774YG        2
B00BG4WFR0        3
B0009G3QH4        4
              ...  
B00HLST7QC    34773
B01DUSBHZ0    34774
B00G2J7H2C    34775
B007CRR7YQ    34776
B01FSDN8UM    34777
Length: 34778, dtype: int64

Split the users into train, validation, and test sets

In [10]:
def split_uids(series: pd.Series, proportion: float) -> Tuple:
    """
    Cut `series` into three consecutive slices: train, validation and test.

    The validation and test slices each take roughly `proportion` of the
    entries, from the end of `series`; the train slice keeps the leading
    remainder. Slicing is positional, so `series` should already be in the
    desired (e.g. shuffled) order.

    Returns:
        A `(train, validation, test)` tuple of Series that keep their
        original index labels.

    Raises:
        AssertionError: if `proportion` is not in [0, 0.5).
    """
    assert 0 <= proportion < 0.5, 'proportion must be in [0, 0.5)'

    n_entries = len(series)
    validation_start = int(n_entries * (1 - proportion * 2))
    test_start = int(n_entries * (1 - proportion))

    # Positional slicing rather than np.split: NumPy's splitter goes through
    # Series.swapaxes, which pandas has deprecated, and callers rely on the
    # pieces still being Series with their index intact.
    return (
        series.iloc[:validation_start],
        series.iloc[validation_start:test_start],
        series.iloc[test_start:],
    )

# The user IDs were shuffled by generate_unique_ids, so these positional slices are random user subsets.
train_users, validation_users, test_users = split_uids(sem_macrid_user_ids, VALIDATION_SIZE)
print(f'{len(train_users)=}, {len(validation_users)=}, {len(test_users)=}')

len(train_users)=15526, len(validation_users)=3327, len(test_users)=3328


In [11]:
# Keep only the interactions of training users (the raw reviewer IDs are the index of `train_users`).
train_reviews = sem_macrid_relevant_reviews.loc[
    sem_macrid_relevant_reviews['reviewerID'].isin(train_users.index)
]
train_reviews

Unnamed: 0,asin,reviewerID
0,5120053084,A35EUS1E3WK1HC
1,5120053084,AKIZYAIS4SYVF
4,7709260373,A13QI8GT2FFGN6
6,B00006XXGO,AWR7V9DKUUFLT
10,B00006XXGO,AMJ40C6RPXII9
...,...,...
178933,B01HJBEZ3A,A2U2XESSJX57EQ
178937,B01HJDBMUM,A2WUDTAL0ADE6S
178939,B01HJCSCLK,AAHWQ4FMWLNH3
178942,B01HJDZM30,A2CJOG4NUHVDGK


Save the user and item ID mappings

In [12]:
# Persist the raw-ID -> integer-ID mappings next to the numerized splits.
# NOTE(review): the users file keeps pandas' default header while the items file
# names its column 'item_id' — confirm the downstream loader expects this asymmetry.
sem_macrid_user_ids.to_csv(SEM_MACRID_DEST_FOLDER / 'users.txt')
sem_macrid_item_ids.to_csv(SEM_MACRID_DEST_FOLDER / 'items.txt', header=['item_id'])

In [14]:
# Arguments shared by every SEM-MacridVAE save_numerized call: the ID lookups,
# the names of the integer-ID columns, and the columns written to disk.
SEMMACRID_NUMERIZED_COMMON_PARAMS = {
    'uids': sem_macrid_user_ids, 
    'product_ids': sem_macrid_item_ids,
    'user_id_column': 'user',
    'product_ids_column': 'item',
    'columns': ['user', 'item']
}

# Training users keep all of their interactions in a single file.
save_numerized(
    train_reviews, 
    dest=SEM_MACRID_DEST_FOLDER / 'train.txt',
    **SEMMACRID_NUMERIZED_COMMON_PARAMS
)

Unnamed: 0,user,item
0,13505,24700
1,12567,24700
4,2391,23840
6,14750,34549
10,4934,34549
...,...,...
178933,4925,913
178937,2533,1325
178939,1028,5373
178942,10398,8037


Use the code at `/disentangle-recsys/data/prep.py` to split users

In [15]:
# Interactions of the validation users only.
validation_reviews = sem_macrid_relevant_reviews.loc[
    sem_macrid_relevant_reviews['reviewerID'].isin(validation_users.index)
]

# Per user, hold out part of the interactions (default test_prop=0.2):
# the kept part goes to valid_tr.txt, the held-out part to valid_te.txt.
validation_train, validation_test = split_train_test_proportion(validation_reviews)

save_numerized(
    validation_train, 
    dest=SEM_MACRID_DEST_FOLDER / 'valid_tr.txt',
    **SEMMACRID_NUMERIZED_COMMON_PARAMS
)

save_numerized(
    validation_test, 
    dest=SEM_MACRID_DEST_FOLDER / 'valid_te.txt',
    **SEMMACRID_NUMERIZED_COMMON_PARAMS
)

# Bare string so the cell displays a short confirmation instead of the last returned frame.
'Saved validation data'

'Saved validation data'

In [16]:
# Interactions of the test users only.
test_reviews = sem_macrid_relevant_reviews.loc[
    sem_macrid_relevant_reviews['reviewerID'].isin(test_users.index)
]
# Per user, hold out part of the interactions (default test_prop=0.2):
# the kept part goes to test_tr.txt, the held-out part to test_te.txt.
test_train, test_test = split_train_test_proportion(test_reviews)

save_numerized(
    test_train, 
    dest=SEM_MACRID_DEST_FOLDER / 'test_tr.txt',
    **SEMMACRID_NUMERIZED_COMMON_PARAMS
)

save_numerized(
    test_test, 
    dest=SEM_MACRID_DEST_FOLDER / 'test_te.txt',
    **SEMMACRID_NUMERIZED_COMMON_PARAMS
)

# Bare string so the cell displays a short confirmation instead of the last returned frame.
'Saved test data'

'Saved test data'

In [17]:
def copy_semmacrid_image_features(item_ids: pd.Series, features_file: Path):
    """
    Stack per-item feature vectors into one 2-D array ordered by item ID.

    `item_ids` maps each `asin` (index) to its integer row (value);
    `features_file` is an .npz archive holding one 1-D vector per `asin`.
    Row `i` of the result is the vector of the item whose ID is `i`.

    Raises KeyError if an `asin` is missing from the archive and
    AssertionError if a stored vector contains NaN.
    """
    print('Opening file')
    with np.load(features_file) as archive:
        # Peek at one stored vector to learn the feature length
        # (the unpacking also enforces that it is 1-D).
        (feature_dim,) = next(iter(archive.values())).shape

        print('Initializing array')
        # Start from NaN so a row that was never written would stand out.
        matrix = np.full((len(item_ids), feature_dim), fill_value=np.nan)

        progress = tqdm(
            item_ids.items(),
            total=len(item_ids), unit_scale=True, unit='image',
            desc='Copying features',
        )
        for asin, row in progress:
            vector = archive[asin]

            assert not np.isnan(vector).any(), "Image should be all defined!"

            matrix[row, :] = vector

        return matrix

# Build the item-by-feature matrix from the AlexNet archive, rows ordered by SEM-MacridVAE item ID.
image_features_array = copy_semmacrid_image_features(
    item_ids=sem_macrid_item_ids, 
    features_file=ALEXNET_IMAGE_FEATURES
)
image_features_array.shape

Opening file
Initializing array


Copying features: 100%|██████████| 34.8k/34.8k [00:52<00:00, 660image/s]


(34778, 9216)

In [20]:
# Write the feature matrix as a single .npy file in the SEM-MacridVAE folder.
SEMMACRID_EMBED_DEST = SEM_MACRID_DEST_FOLDER / 'embed.npy'
np.save(SEMMACRID_EMBED_DEST, image_features_array)

# Report the on-disk size in MiB.
print(f'{str(SEMMACRID_EMBED_DEST)}: {SEMMACRID_EMBED_DEST.stat().st_size // 2**20}MiB')

data/SEM-MacridVAE-Clothing_Shoes_and_Jewelry/embed.npy: 2445MiB


# Generate Data for DMRL

In [5]:
# Stats over all reviews: the DMRL section applies no rating filter.
print(df_stats(reviews_df))

38493 items 23318 users. Sparsity 0.020%


In [8]:
# Per-reviewer random hold-out over the full review table (default test_prop=0.2).
train_dmrl, test_dmrl = split_train_test_proportion(reviews_df)

In [9]:
# Shuffled integer IDs for every reviewer in the full review table.
dmrl_user_ids = generate_unique_ids(reviews_df['reviewerID'])
dmrl_user_ids

A30NKRF3KBGA06        0
AL0XGCBE6Z22M         1
AMT5LF0TKY67C         2
A2BY8EVXA3NRHD        3
AWE6KR1ELIYQ3         4
                  ...  
A1MFBF49ZFMH2N    23313
A36AF5I7D0VO8F    23314
A35ZS7JT3G9B8     23315
A3GC94SEKQI3QU    23316
A185C12Y9XLYGY    23317
Length: 23318, dtype: int64

In [10]:
# Shuffled integer IDs for every item in the full review table.
dmrl_item_ids = generate_unique_ids(reviews_df['asin'])
dmrl_item_ids

B000B6AV7K        0
B0143D7EE4        1
B0105V2DEY        2
B014EY21H2        3
B005LUROIK        4
              ...  
B00A9R2P7A    38488
B017HK485S    38489
B008H7UKYY    38490
B006K6PJTK    38491
B01DUSBHZ0    38492
Length: 38493, dtype: int64

In [12]:
# Arguments shared by both DMRL save_numerized calls. The written files keep
# the raw reviewerID/asin next to the integer itemID/userID columns.
DMRL_NUMERIZED_COMMON_PARAMS = {
    'uids': dmrl_user_ids, 
    'product_ids': dmrl_item_ids,
    'user_id_column': 'userID',
    'product_ids_column': 'itemID',
    'columns': ['itemID','userID', 'reviewerID', 'asin']
}

# Training part of the per-reviewer split.
save_numerized(train_dmrl,     
    dest=DMRL_DEST_FOLDER / 'train.csv',
    **DMRL_NUMERIZED_COMMON_PARAMS)

Unnamed: 0,itemID,userID,reviewerID,asin
0,30409,14891,A35EUS1E3WK1HC,5120053084
3,30409,3474,A260RMKZXGDHVH,5120053084
4,29339,19073,A13QI8GT2FFGN6,7709260373
5,29339,16285,A1246QM67H27LN,7709260373
6,38246,21394,AWR7V9DKUUFLT,B00006XXGO
...,...,...,...,...
178936,19135,3632,A3RVPCAPF92RK1,B01HJCYJ4Y
178937,5762,11611,A2WUDTAL0ADE6S,B01HJDBMUM
178938,5762,16539,A2N0T25TWYXPMP,B01HJDBMUM
178939,3855,13738,AAHWQ4FMWLNH3,B01HJCSCLK


In [16]:
# Held-out part of the per-reviewer split.
save_numerized(
    test_dmrl,
    dest=DMRL_DEST_FOLDER / 'test.csv',
    **DMRL_NUMERIZED_COMMON_PARAMS
)

Unnamed: 0,itemID,userID,reviewerID,asin
1,30409,13868,AKIZYAIS4SYVF,5120053084
2,30409,23021,A2L74OWEP7H1VC,5120053084
7,38246,5995,A23Z52PYAKMXE7,B00006XXGO
8,38246,7533,A3MLZHW2I4536P,B00006XXGO
11,38246,22554,A3EZJBNH70UNW3,B00006XXGO
...,...,...,...,...
178928,31391,1090,A3JKEN4MXCP5Y4,B01HJ2HUQI
178932,11093,12787,A3OXB8POGTLMFY,B01HJBL2S6
178940,31137,15473,A2WUHKA1I75SL3,B01HJDVCJI
178941,31137,15875,A7B48AJT6IC0A,B01HJDVCJI


In [54]:
def copy_dmrl_image_features(
    item_ids: pd.Series,
    features_file: Path
):
    """
    Collect the image feature vector of every known item into a dict.

    Walks all entries of the .npz archive `features_file` and keeps those
    whose key is an `asin` present in the index of `item_ids`, converting
    each vector to a plain Python list.

    Returns a dict `asin -> list`. Raises AssertionError when the copied
    keys do not match the index of `item_ids` exactly.
    """
    wanted = item_ids.index
    copied = {}

    with np.load(features_file) as archive, \
        tqdm(total=len(archive), unit_scale=True, unit='image', 
             desc='Copying image features') as progress:
        for asin, vector in archive.items():
            # Membership is checked against the index, i.e. the raw asin labels.
            if asin in wanted:
                # DMRL uses lists for image features
                copied[asin] = vector.tolist()
            progress.update()

        assert set(copied) == set(wanted), 'Items were not copied!'

        return copied

# DMRL image features are taken from the ViT archive.
dmrl_image_features = copy_dmrl_image_features(
    dmrl_item_ids, 
    VIT_IMAGE_FEATURES
)

Copying features: 100%|██████████| 38.5k/38.5k [01:36<00:00, 398image/s]


In [55]:
# np.save receives a dict here, so the file holds a pickled object array
# and must be read back with allow_pickle=True.
DMRL_IMAGE_DEST = DMRL_DEST_FOLDER / 'item_feature.npy'
np.save(DMRL_IMAGE_DEST, dmrl_image_features)

# Report the on-disk size in MiB.
print(f'{str(DMRL_IMAGE_DEST)}: {DMRL_IMAGE_DEST.stat().st_size // 2**20}MiB')

data/DMRL-Clothing_Shoes_and_Jewelry/item_feature.npy: 339MiB


In [58]:
def copy_dmrl_text_features(
    item_ids: pd.Series,
    features_file: Path
):
    """
    Collect the text feature vector of every known item into a dict.

    Walks all entries of the .npz archive `features_file` and keeps those
    whose key is an `asin` present in the index of `item_ids`, reshaping
    each vector to a single-row 2-D array of shape (1, n).

    Returns a dict `asin -> ndarray`. Raises AssertionError when the copied
    keys do not match the index of `item_ids` exactly.
    """
    wanted = item_ids.index
    copied = {}

    with np.load(features_file) as archive, \
        tqdm(total=len(archive), unit_scale=True, unit='product', 
             desc='Copying textual features') as progress:
        for asin, vector in archive.items():
            # Membership is checked against the index, i.e. the raw asin labels.
            if asin in wanted:
                # Stored as a (1, n) row vector.
                copied[asin] = vector.reshape(1, -1)
            progress.update()

        assert set(copied) == set(wanted), 'Items were not copied!'

        return copied

# DMRL text features are taken from the text-feature archive.
dmrl_text_features = copy_dmrl_text_features(
    dmrl_item_ids, 
    TEXT_FEATURES
)

Copying textual features: 100%|██████████| 38.5k/38.5k [00:41<00:00, 920product/s]  


In [None]:
DMRL_TEXT_DEST = DMRL_DEST_FOLDER / 'review.npz'
# The dict is passed positionally, so NumPy stores it as one pickled object
# array under the key 'arr_0'; readers need allow_pickle=True and `.item()`.
# NOTE(review): confirm this is the layout the DMRL loader expects, as opposed
# to one archive entry per asin (`np.savez(dest, **features)`).
np.savez(DMRL_TEXT_DEST, dmrl_text_features)

# Report the size of the text archive that was just written.
print(f'{str(DMRL_TEXT_DEST)}: {DMRL_TEXT_DEST.stat().st_size // 2**20}MiB')

In [53]:
# Sanity check of the saved image features: the dict comes back wrapped in an
# object array, so it needs allow_pickle=True and `.item()` to unwrap.
# Only unpickle files produced by this notebook; pickle can execute code.
x = np.load(DMRL_IMAGE_DEST, allow_pickle=True)
type(x.item())

dict