In [1]:
from pathlib import Path
from typing import List
from tqdm import tqdm

import numpy as np
import pandas as pd

import amazon_dataset

# Initialize Datasets

In [2]:
# Dataset to process and global settings for this notebook.
DATASET = 'Clothing_Shoes_and_Jewelry'
# NOTE(review): VALIDATION_SIZE is not referenced by any cell visible in this
# notebook (the split cell hard-codes its proportions) - confirm it is needed.
VALIDATION_SIZE = 0.15
RANDOM_SEED = 20230219

# Pre-computed image feature archives (.npz).
ALEXNET_IMAGE_FEATURES = Path(f'data/amazon/{DATASET}_alexnet_features.npz')
VIT_IMAGE_FEATURES = Path(f'data/amazon/{DATASET}_vit_features.npz')
CLIP_IMAGE_FEATURES = Path(f'data/amazon/{DATASET}_clipimage_features.npz')

# Pre-computed text feature archives (.npz).
CLIP_TEXT_FEATURES = Path(f'data/amazon/{DATASET}_cliptext_features.npz')
BERT_TEXT_FEATURES = Path(f'data/amazon/{DATASET}_bert_features.npz')

# Folder that receives every file generated by this notebook.
DEST_FOLDER = Path(f'data/{DATASET}')

# Fail early, naming the missing file, if any feature archive is absent.
for _features_file in (
    ALEXNET_IMAGE_FEATURES,
    VIT_IMAGE_FEATURES,
    CLIP_IMAGE_FEATURES,
    BERT_TEXT_FEATURES,
    CLIP_TEXT_FEATURES,
):
    assert _features_file.exists(), f'Missing features file: {_features_file}'

# parents=True so the cell also works when the parent folder does not exist yet.
DEST_FOLDER.mkdir(parents=True, exist_ok=True)

In [3]:
def generate_unique_ids(series: pd.Series) -> pd.Series:
    """
    Map every distinct value of `series` to an integer id.

    The distinct values are shuffled with a generator seeded from
    `RANDOM_SEED`, so the assignment is random but repeatable.

    Returns a Series indexed by the original values whose data are the
    integers 0..n-1 (n = number of distinct values).
    """
    distinct_values = series.unique()
    shuffled_values = np.random.default_rng(RANDOM_SEED).permutation(distinct_values)
    sequential_ids = range(len(distinct_values))
    return pd.Series(data=sequential_ids, index=shuffled_values)

def df_stats(df: pd.DataFrame) -> str:
    """
    Summarise a reviews frame in one line.

    Counts the distinct `asin` and `reviewerID` values and reports the number
    of rows divided by (users * items) as a percentage. Note that the value
    labelled "Sparsity" is therefore the share of user-item pairs that DO
    have a row.
    """
    item_count = df['asin'].nunique(dropna=False)
    user_count = df['reviewerID'].nunique(dropna=False)
    filled_fraction = len(df) / (user_count * item_count)
    return f'{item_count} items {user_count} users. Sparsity {filled_fraction * 100:.3f}%'

def save_numerized(
    reviews: pd.DataFrame, 
    uids: pd.Series, 
    product_ids: pd.Series,
    user_id_column: str,
    product_ids_column: str,
    columns: List[str],
    dest: Path
):
    """
    Attach integer user / product ids to `reviews` and write them as CSV.

    `uids` is looked up through the `reviewerID` column and `product_ids`
    through the `asin` column; the looked-up ids are stored in
    `user_id_column` and `product_ids_column`. An AssertionError is raised
    if any row has no matching id. Only `columns` are kept; they are written
    to `dest` without the index and also returned.
    """
    lookups = (
        ('reviewerID', user_id_column, uids),
        ('asin', product_ids_column, product_ids),
    )

    numerized = reviews
    for key_column, id_column, id_mapping in lookups:
        numerized = numerized.join(id_mapping.to_frame(id_column), on=key_column)
        # A missing value means the key was absent from the id mapping.
        assert not numerized[id_column].isna().any()

    selected = numerized[columns]
    selected.to_csv(dest, index=False)
    return selected

def split_train_test_proportion(df: pd.DataFrame, test_prop=0.2):
    """
    Split the reviews of every reviewer into a train part and a test part.

    Every row receives a random number (generator seeded from `RANDOM_SEED`).
    Within each `reviewerID` the rows are ranked by that number as a
    percentile, and the rows whose percentile rank is at most `test_prop`
    form the test part; all other rows form the train part.

    Returns a `(train, test)` tuple of row subsets of `df`, in the original
    row order and without any helper columns. Either part may be empty, for
    example when every reviewer has a single review and `test_prop` < 1.
    """
    rng = np.random.default_rng(RANDOM_SEED)

    # Only the grouping key is needed next to the random draw, so the whole
    # frame is not copied.
    draws = df[['reviewerID']].assign(rnd=rng.random(size=len(df)))
    pct_rank = draws.groupby('reviewerID')['rnd'].rank(pct=True)

    # Positional boolean mask: selecting with it cannot fail when one side is
    # empty (unpacking a two-group groupby did) and cannot duplicate rows.
    is_test = (pct_rank <= test_prop).to_numpy()

    return df.loc[~is_test], df.loc[is_test]

In [4]:
reviews_df = amazon_dataset.reviews_df(DATASET)
# Seed the preview so the displayed rows are the same on every run.
reviews_df.sample(n=3, random_state=RANDOM_SEED)

Unnamed: 0,id,asin,reviewerID,reviewerName,overall,text,reviewTime,summary,verified,vote
70627,16228232,B00SAS4FGO,A2Y6ETKBQ4EHLP,Victoria E Aurand,5.0,Love them ! Was not looking for Mets shoes but...,2018-03-05,Love them! Was not looking for Mets shoes but ...,True,
113401,20411120,B0194E63NU,A1CFZTW88EV2T6,nakeisha barber,5.0,Great socks,2018-07-11,Love them,True,
44834,9986440,B00CAXQV3K,A262MWE69MEQE3,Mike Mendenhall,4.0,Could be a little thicker..,2018-04-02,Four Stars,True,


# Generate Data for SEM-MacridVAE & DMRL

Some data stats

In [5]:
# One-line summary: distinct items, distinct users and the percentage of
# user-item pairs that have a review.
print(df_stats(reviews_df))

38493 items 23318 users. Sparsity 0.020%


Assign each user an integer id in a random (seeded) order

In [6]:
# Integer id per reviewer, indexed by the original reviewerID.
user_ids = generate_unique_ids(reviews_df['reviewerID'])
user_ids

A30NKRF3KBGA06        0
AL0XGCBE6Z22M         1
AMT5LF0TKY67C         2
A2BY8EVXA3NRHD        3
AWE6KR1ELIYQ3         4
                  ...  
A1MFBF49ZFMH2N    23313
A36AF5I7D0VO8F    23314
A35ZS7JT3G9B8     23315
A3GC94SEKQI3QU    23316
A185C12Y9XLYGY    23317
Length: 23318, dtype: int64

Same with products

In [7]:
# Integer id per product, indexed by the original asin.
item_ids = generate_unique_ids(reviews_df['asin'])
item_ids

B000B6AV7K        0
B0143D7EE4        1
B0105V2DEY        2
B014EY21H2        3
B005LUROIK        4
              ...  
B00A9R2P7A    38488
B017HK485S    38489
B008H7UKYY    38490
B006K6PJTK    38491
B01DUSBHZ0    38492
Length: 38493, dtype: int64

Split into train, validation and test sets by user

In [8]:
# First split: per user, the rows whose random percentile rank is at most 0.4
# are held out; the remaining rows form the training set.
train_reviews, validation_test_reviews = split_train_test_proportion(reviews_df, test_prop=0.4)
# Second split: the held-out rows are divided again per user; the first
# returned frame is used as validation and the second as test.
# NOTE(review): the proportions are hard-coded here and VALIDATION_SIZE from
# the configuration cell is not used - confirm which one is intended.
validation_reviews, test_reviews = split_train_test_proportion(validation_test_reviews, test_prop=0.5)

In [9]:
# Number of rows in the training split.
len(train_reviews)

113836

# Number of rows in the validation split.
len(validation_reviews)

In [10]:
# Number of rows in the test split.
len(test_reviews)

30728

Sanity check: every user appears in the train, validation and test sets.

In [11]:
# Chained comparison: True only when all three splits contain exactly the
# same set of reviewers.
set(train_reviews['reviewerID']) == set(test_reviews['reviewerID']) ==  set(validation_reviews['reviewerID'])

True

Check that every user has at least one item in each of the train, validation and test sets (same check as above, seen through the per-user counts)

In [12]:
# Per-user row counts of each split, placed side by side (one column per
# split) and summarised; a `min` of at least 1 in every column means no user
# is missing from a split.
pd.concat(
    [
        train_reviews.groupby('reviewerID').size().rename('train_size'),
        validation_reviews.groupby('reviewerID').size().rename('val_size'),
        test_reviews.groupby('reviewerID').size().rename('test_size'),
    ],
    axis=1,
).describe()

Unnamed: 0,train_size,val_size,test_size
count,23318.0,23318.0,23318.0
mean,4.881894,1.474397,1.31778
std,3.529635,1.185903,1.091225
min,3.0,1.0,1.0
25%,3.0,1.0,1.0
50%,4.0,1.0,1.0
75%,5.0,2.0,1.0
max,61.0,20.0,20.0


Save the user and item id mappings

In [13]:
# Persist the id mappings (original id -> integer id) in the output folder.
# NOTE(review): items.txt names its value column 'item_id' while users.txt is
# written with the default header - confirm downstream readers expect this.
user_ids.to_csv(DEST_FOLDER / 'users.txt')
item_ids.to_csv(DEST_FOLDER / 'items.txt', header=['item_id'])

In [14]:
# Keyword arguments shared by the train / validation / test exports:
# the id mappings, the names of the id columns and the columns to keep.
NUMERIZED_COMMON_PARAMS = {
    'uids': user_ids, 
    'product_ids': item_ids,
    'user_id_column': 'user',
    'product_ids_column': 'item',
    'columns': ['user', 'item']
}

# Write the training rows as (user, item) integer id pairs. The call is the
# last expression of the cell, so the returned frame is also displayed.
save_numerized(
    train_reviews, 
    dest=DEST_FOLDER / 'train.txt',
    **NUMERIZED_COMMON_PARAMS
)

Unnamed: 0,user,item
0,14891,30409
4,19073,29339
6,21394,38246
10,22275,38246
14,6565,38246
...,...,...
178934,5195,6878
178935,418,7919
178937,11611,5762
178938,16539,5762


In [15]:
# Write the validation rows as (user, item) integer id pairs.
save_numerized(
    validation_reviews, 
    dest=DEST_FOLDER / 'validation.txt',
    **NUMERIZED_COMMON_PARAMS
)
# The string is the cell's last expression, so it is displayed instead of the
# frame returned by save_numerized.
'Saved validation data'

'Saved validation data'

In [16]:
# Write the test rows as (user, item) integer id pairs.
save_numerized(
    test_reviews, 
    dest=DEST_FOLDER / 'test.txt',
    **NUMERIZED_COMMON_PARAMS
)
# The string is the cell's last expression, so it is displayed instead of the
# frame returned by save_numerized.
'Saved test data'

'Saved test data'

In [17]:
def copy_features(
        item_ids: pd.Series, 
        features_file: Path
):
    """
    Build a feature matrix whose rows follow the integer ids in `item_ids`.

    `item_ids` maps an archive key (the index) to a row number (the value).
    `features_file` is loaded with `np.load`; the embedding stored under each
    key is copied into the row given by its id. The embedding width is taken
    from one embedding of the archive, which must be one-dimensional.

    Returns an array of shape (len(item_ids), embedding width), pre-filled
    with NaN. An AssertionError is raised if a stored embedding contains NaN.
    """
    print('Opening file')
    with np.load(features_file) as archive:
        # Peek at one stored embedding to learn the embedding width.
        (embedding_width,) = next(iter(archive.values())).shape
        array_shape = (len(item_ids), embedding_width)
        print(f'Initializing array {array_shape}')
        stacked = np.full(array_shape, fill_value=np.nan)

        progress = tqdm(
            item_ids.items(),
            total=len(item_ids),
            unit_scale=True,
            unit='items',
            desc='Copying features',
        )
        for asin, row in progress:
            embedding = archive[asin]
            assert not np.isnan(embedding).any(), "Feature has NaN Values"
            stacked[row] = embedding

    return stacked


# Build the image feature matrix, ordered by integer item id, from the
# ALEXNET_IMAGE_FEATURES archive, then show its shape.
image_features_array = copy_features(
    item_ids=item_ids, 
    features_file=ALEXNET_IMAGE_FEATURES
)
image_features_array.shape

Opening file
Initializing array (38493, 1024)


Copying features: 100%|██████████| 38.5k/38.5k [01:11<00:00, 541items/s]


(38493, 1024)

In [18]:
# Save the image feature matrix in .npy format in the output folder.
IMAGE_EMBED_DEST = DEST_FOLDER / 'embed_image.npy'
np.save(IMAGE_EMBED_DEST, image_features_array)

# Report the size of the written file in whole MiB (2**20 bytes).
print(f'{str(IMAGE_EMBED_DEST)}: {IMAGE_EMBED_DEST.stat().st_size // 2**20}MiB')

data/Clothing_Shoes_and_Jewelry/embed_image.npy: 300MiB


In [19]:
# Build the text feature matrix, ordered by integer item id, from the
# BERT_TEXT_FEATURES archive, then show its shape.
text_features_array = copy_features(
    item_ids=item_ids, 
    features_file=BERT_TEXT_FEATURES
)
text_features_array.shape

Opening file
Initializing array (38493, 1024)


Copying features: 100%|██████████| 38.5k/38.5k [01:08<00:00, 564items/s]


(38493, 1024)

In [20]:
# Save the text feature matrix in .npy format in the output folder.
TEXT_EMBED_DEST = DEST_FOLDER / 'embed_text.npy'
np.save(TEXT_EMBED_DEST, text_features_array)

# Report the size of the written file in whole MiB (2**20 bytes).
print(f'{str(TEXT_EMBED_DEST)}: {TEXT_EMBED_DEST.stat().st_size // 2**20}MiB')

data/Clothing_Shoes_and_Jewelry/embed_text.npy: 300MiB
