In [1]:
from functools import lru_cache
from pathlib import Path
from typing import List, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

import amazon_dataset

# Initialize Datasets

In [2]:
# Amazon category to process; swap the active line to target another one.
DATASET = 'Musical_Instruments'
#DATASET = 'Clothing_Shoes_and_Jewelry'
#DATASET = 'Home_and_Kitchen'
#DATASET = 'Movies_and_TV'

VALIDATION_SIZE = 0.15
RANDOM_SEED = 20230219  # shared seed for every random draw in this notebook

# Pre-computed image embeddings (one .npz archive per encoder).
#ALEXNET_IMAGE_FEATURES = Path(f'data/amazon/{DATASET}_alexnet_features.npz')
VIT_IMAGE_FEATURES = Path(f'data/amazon/{DATASET}_vit_features.npz')
CLIP_IMAGE_FEATURES = Path(f'data/amazon/{DATASET}_clipimage_features.npz')

# Pre-computed text embeddings (one .npz archive per encoder).
CLIP_TEXT_FEATURES = Path(f'data/amazon/{DATASET}_cliptext_features.npz')
BERT_TEXT_FEATURES = Path(f'data/amazon/{DATASET}_bert_features.npz')

# Fail fast, before any processing, when a feature archive is missing.
#assert ALEXNET_IMAGE_FEATURES.exists()
for _required_file in (
    VIT_IMAGE_FEATURES,
    CLIP_IMAGE_FEATURES,
    BERT_TEXT_FEATURES,
    CLIP_TEXT_FEATURES,
):
    assert _required_file.exists(), f"{_required_file} does not exist"

#DEST_FOLDER = Path(f'data/{DATASET}')
# DEST_FOLDER.mkdir(exist_ok=True)

# with open(DEST_FOLDER / 'signature.txt', mode='w') as f:
#     f.write(f'Image: {image_features.stem}\nText: {text_featues.stem}')

In [3]:
def generate_unique_ids(series: pd.Series) -> pd.Series:
    """Assign a consecutive integer id to each distinct value of `series`.

    The distinct values are shuffled with a generator seeded by
    `RANDOM_SEED`, so the assignment is random but repeatable.

    Returns a Series indexed by the shuffled distinct values whose data is
    0, 1, ..., n_distinct - 1.
    """
    distinct_values = series.unique()
    shuffled = np.random.default_rng(RANDOM_SEED).permutation(distinct_values)
    return pd.Series(data=range(len(distinct_values)), index=shuffled)

def df_stats(df: pd.DataFrame) -> str:
    """Summarise a reviews frame as a one-line string.

    Counts the distinct values of the `reviewerID` and `asin` columns and the
    number of rows. The figure labelled "Sparsity" is
    rows / (users * items) as a percentage, i.e. the share of the user-item
    grid that is filled.

    An empty frame yields a ratio of 0 instead of raising ZeroDivisionError.
    """
    # dropna=False keeps the same count as len(series.unique()).
    n_items = df['asin'].nunique(dropna=False)
    n_users = df['reviewerID'].nunique(dropna=False)
    n_ratings = len(df)

    n_cells = n_users * n_items
    sparsity = n_ratings / n_cells if n_cells else 0.0
    return f'{n_users} users {n_items} items ratings: {n_ratings}. Sparsity {sparsity * 100:.3f}%'

def save_numerized(
    reviews: pd.DataFrame, 
    uids: pd.Series, 
    product_ids: pd.Series,
    user_id_column: str,
    product_ids_column: str,
    columns: List[str],
    dest: Path
):
    """Attach numeric user/item ids to `reviews` and write the result as CSV.

    `uids` is joined on the `reviewerID` column and stored under
    `user_id_column`; `product_ids` is joined on the `asin` column and stored
    under `product_ids_column`. An AssertionError is raised as soon as a row
    is left without an id after either join.

    Only `columns` are kept; they are written to `dest` without the index and
    also returned.
    """
    lookups = (
        (uids, user_id_column, 'reviewerID'),
        (product_ids, product_ids_column, 'asin'),
    )

    numerized = reviews
    for id_map, new_column, key_column in lookups:
        numerized = numerized.join(id_map.to_frame(new_column), on=key_column)
        # A NaN here means the key was absent from the id mapping.
        assert not numerized[new_column].isna().any()

    selected = numerized[columns]
    selected.to_csv(dest, index=False)
    return selected

def split_train_test_proportion(df: pd.DataFrame, test_prop=0.2, seed: Optional[int] = None):
    """
    Randomly split the rows of `df` into (train, test), reviewer by reviewer.

    Every row gets a random draw; within each `reviewerID` the draws are
    ranked as a fraction of that reviewer's row count, and rows whose
    fractional rank is <= `test_prop` go to the test part. A reviewer with
    n rows therefore contributes about floor(n * test_prop) test rows, and
    none at all when n * test_prop < 1.

    Parameters:
        df: frame with a `reviewerID` column.
        test_prop: per-reviewer fraction of rows to send to the test part.
        seed: seed for the random draws; defaults to `RANDOM_SEED`.

    Returns:
        (train, test): two row subsets of `df`, in the original row order,
        that together contain every row exactly once. Either one may be
        empty (e.g. `test_prop=0`).
    """
    rng = np.random.default_rng(RANDOM_SEED if seed is None else seed)

    # Work positionally so a non-unique index on `df` can neither misalign
    # the draws nor duplicate rows in the result.
    draws = pd.Series(rng.random(size=len(df)))
    reviewers = df['reviewerID'].reset_index(drop=True)
    pct_rank = draws.groupby(reviewers).rank(pct=True)

    # A boolean mask partitions the rows by construction and, unlike
    # unpacking groupby(condition), still works when one side is empty.
    in_test = (pct_rank <= test_prop).to_numpy()

    return df[~in_test], df[in_test]

In [4]:
# Load the reviews of the selected category.
reviews_df = amazon_dataset.reviews_df(DATASET)
# Seed the preview so re-running the notebook shows the same three rows.
reviews_df.sample(n=3, random_state=RANDOM_SEED)

Unnamed: 0,id,asin,reviewerID,reviewerName,overall,text,reviewTime,summary,verified,vote
1835,69651,B0002E1NNC,A2WV9MAEV6I3R4,Benjamin Reign,5.0,I love these strings keep their tone longer\n...,2018-01-08,Sound great but pricey,True,
29559,1137772,B000OR2T0I,ALG0YK6C46XPS,Chet Mortenson,3.0,The loop end unwound as I tuned up the high e ...,2017-12-07,Good for the price but Argentines are better,False,
39548,1501224,B01EG26WH2,A2K3M2TEV17EJK,tazwayneiac,5.0,works very good,2018-04-30,Five Stars,True,


# Generate Data for SEM-MacridVAE & DMRL

Some data stats

In [5]:
# Dataset name on the first line, user/item/rating summary on the second.
print(DATASET, df_stats(reviews_df), sep='\n')

Musical_Instruments
5422 users 8460 items ratings: 40118. Sparsity 0.087%


Assign every user an integer id, in a random (seeded) order

In [6]:
# reviewerID -> integer user id
user_ids = reviews_df['reviewerID'].pipe(generate_unique_ids)
user_ids

A18ISN6NS073V2       0
A1XFJTXA00NYX1       1
A3GITGRRVQ4IP0       2
A1W2V8Z9EWTZMM       3
AEOQG6GFYR009        4
                  ... 
A2LSUIA9THHK57    5417
A31D3XL44W1NBM    5418
A3775OP5VTX5ON    5419
A1G0HYMR02WM2W    5420
A7CN7OR0X715S     5421
Length: 5422, dtype: int64

Same with products

In [7]:
# asin -> integer item id
item_ids = reviews_df['asin'].pipe(generate_unique_ids)
item_ids

B0002GXZK4       0
B00ZUBQY9K       1
B00VUU6JL8       2
B00PUQK4FU       3
B00UTUKFHE       4
              ... 
B000EEJ8VE    8455
B00CAL9ZXK    8456
B00U1XK5Z6    8457
B0002D0CNA    8458
B016L4MRMW    8459
Length: 8460, dtype: int64

Split the reviews into train, validation and test sets, user by user

In [8]:
# First hold out up to 40% of each reviewer's rows (rounded down per reviewer) ...
train_reviews, validation_test_reviews = split_train_test_proportion(
    df=reviews_df,
    test_prop=0.4,
)
# ... then divide the held-out rows between validation and test.
validation_reviews, test_reviews = split_train_test_proportion(
    df=validation_test_reviews,
    test_prop=0.5,
)

In [9]:
# Number of training rows
train_reviews.shape[0]

25628

# Number of validation rows
validation_reviews.shape[0]

In [10]:
# Number of test rows
test_reviews.shape[0]

6744

Sanity check: every user appears in the train, validation and test sets.

In [11]:
# Distinct reviewers in each split; all three sets must coincide.
train_reviewers = set(train_reviews['reviewerID'])
test_reviewers = set(test_reviews['reviewerID'])
validation_reviewers = set(validation_reviews['reviewerID'])
train_reviewers == test_reviewers and test_reviewers == validation_reviewers

True

Check that every user has at least one item in each of the train, validation and test sets (same check as above)

In [12]:
# Rows per reviewer in each split, side by side, then summarised.
per_user_counts = [
    part.groupby('reviewerID').size().rename(label)
    for part, label in (
        (train_reviews, 'train_size'),
        (validation_reviews, 'val_size'),
        (test_reviews, 'test_size'),
    )
]
pd.concat(per_user_counts, axis=1).describe()

Unnamed: 0,train_size,val_size,test_size
count,5422.0,5422.0,5422.0
mean,4.726669,1.428624,1.243821
std,2.451181,0.84009,0.720182
min,3.0,1.0,1.0
25%,3.0,1.0,1.0
50%,4.0,1.0,1.0
75%,5.0,2.0,1.0
max,36.0,12.0,12.0


In [21]:
# One row per (image encoder, text encoder) pairing.
# None marks a modality for which no feature archive is used.
combinations = pd.DataFrame.from_records(
    [
        ('vit', 'bert', VIT_IMAGE_FEATURES, BERT_TEXT_FEATURES),
        ('none', 'bert', None, BERT_TEXT_FEATURES),
        ('vit', 'none', VIT_IMAGE_FEATURES, None),
        ('clip', 'clip', CLIP_IMAGE_FEATURES, CLIP_TEXT_FEATURES),
        ('none', 'clip', None, CLIP_TEXT_FEATURES),
        ('clip', 'none', CLIP_IMAGE_FEATURES, None),
        ('none', 'none', None, None),
    ],
    columns=['image', 'text', 'image_features', 'text_features'],
)
# Output folder of each pairing, named after the dataset and both encoders.
combinations['dest_folder'] = [
    Path(f'data/{DATASET}-{image_name}_{text_name}')
    for image_name, text_name in zip(combinations['image'], combinations['text'])
]
combinations

Unnamed: 0,image,text,image_features,text_features,dest_folder
0,vit,bert,data/amazon/Musical_Instruments_vit_features.npz,data/amazon/Musical_Instruments_bert_features.npz,data/Musical_Instruments-vit_bert
1,none,bert,,data/amazon/Musical_Instruments_bert_features.npz,data/Musical_Instruments-none_bert
2,vit,none,data/amazon/Musical_Instruments_vit_features.npz,,data/Musical_Instruments-vit_none
3,clip,clip,data/amazon/Musical_Instruments_clipimage_feat...,data/amazon/Musical_Instruments_cliptext_featu...,data/Musical_Instruments-clip_clip
4,none,clip,,data/amazon/Musical_Instruments_cliptext_featu...,data/Musical_Instruments-none_clip
5,clip,none,data/amazon/Musical_Instruments_clipimage_feat...,,data/Musical_Instruments-clip_none
6,none,none,,,data/Musical_Instruments-none_none


In [23]:
# Print the "<dataset>-<image>_<text>" label for every dataset / encoder pairing.
dataset_names = ['Movies_and_TV', 'Home_and_Kitchen', 'Musical_Instruments', 'Clothing_Shoes_and_Jewelry']
for dataset_name in dataset_names:
    for combo in combinations.itertuples():
        print(f'{dataset_name}-{combo.image}_{combo.text}')

Movies_and_TV-vit_bert
Movies_and_TV-none_bert
Movies_and_TV-vit_none
Movies_and_TV-clip_clip
Movies_and_TV-none_clip
Movies_and_TV-clip_none
Movies_and_TV-none_none
Home_and_Kitchen-vit_bert
Home_and_Kitchen-none_bert
Home_and_Kitchen-vit_none
Home_and_Kitchen-clip_clip
Home_and_Kitchen-none_clip
Home_and_Kitchen-clip_none
Home_and_Kitchen-none_none
Musical_Instruments-vit_bert
Musical_Instruments-none_bert
Musical_Instruments-vit_none
Musical_Instruments-clip_clip
Musical_Instruments-none_clip
Musical_Instruments-clip_none
Musical_Instruments-none_none
Clothing_Shoes_and_Jewelry-vit_bert
Clothing_Shoes_and_Jewelry-none_bert
Clothing_Shoes_and_Jewelry-vit_none
Clothing_Shoes_and_Jewelry-clip_clip
Clothing_Shoes_and_Jewelry-none_clip
Clothing_Shoes_and_Jewelry-clip_none
Clothing_Shoes_and_Jewelry-none_none


Generate users

In [14]:
# Write the id mappings into every output folder.
# NOTE(review): users.txt is written without an explicit header while
# items.txt gets 'item_id' - confirm that downstream readers expect this.
for combo in combinations.itertuples():
    target_dir = combo.dest_folder
    target_dir.mkdir(exist_ok=True)

    user_ids.to_csv(target_dir / 'users.txt')
    item_ids.to_csv(target_dir / 'items.txt', header=['item_id'])

In [15]:
# Arguments shared by every save_numerized call: the id mappings, the names
# of the numeric columns, and the columns to keep.
NUMERIZED_COMMON_PARAMS = dict(
    uids=user_ids,
    product_ids=item_ids,
    user_id_column='user',
    product_ids_column='item',
    columns=['user', 'item'],
)

# Write the numerized training interactions into every output folder.
for combo in combinations.itertuples():
    target_dir = combo.dest_folder
    target_dir.mkdir(exist_ok=True)

    save_numerized(train_reviews, dest=target_dir / 'train.txt', **NUMERIZED_COMMON_PARAMS)


In [16]:
# Write the numerized validation interactions into every output folder.
for combo in combinations.itertuples():
    target_dir = combo.dest_folder
    target_dir.mkdir(exist_ok=True)

    save_numerized(validation_reviews, dest=target_dir / 'validation.txt', **NUMERIZED_COMMON_PARAMS)

'Saved validation data'

'Saved validation data'

In [17]:
# Write the numerized test interactions into every output folder.
for combo in combinations.itertuples():
    test_dest = combo.dest_folder / 'test.txt'
    save_numerized(test_reviews, dest=test_dest, **NUMERIZED_COMMON_PARAMS)

'Saved test data'

'Saved test data'

In [19]:
@lru_cache()
def copy_features(features_file: Optional[Path]):
    """Build the item-embedding matrix, one row per entry of `item_ids`.

    With `features_file=None` the result is an all-zero float32 matrix with
    256 columns. Otherwise the .npz archive is opened and, for every
    (asin, idx) pair of `item_ids`, the array stored under `asin` is copied
    into row `idx`. The column count is taken from the first array of the
    archive, which must be one-dimensional. Items absent from the archive
    are reported on stdout and get an all-zero row; an array containing NaN
    triggers an AssertionError.

    Results are memoised per `features_file` by `lru_cache`, so repeated
    calls return the very same array object.
    """
    n_items = len(item_ids)

    if features_file is None:
        print('Returning Zeros')
        return np.zeros((n_items, 256), dtype=np.float32)

    print(f'Opening file {features_file}')
    with np.load(features_file) as archive:
        # Any entry will do to discover the embedding width.
        first_vector = next(iter(archive.values()))
        (embedding_dim,) = first_vector.shape
        target_shape = (n_items, embedding_dim)
        print(f'Initializing array {target_shape}')
        matrix = np.full(target_shape, fill_value=np.nan)

        progress = tqdm(
            item_ids.items(),
            total=n_items,
            unit_scale=True,
            unit='items',
            desc='Copying features',
        )
        for asin, row in progress:
            vector = archive.get(asin)
            if vector is not None:
                assert np.isnan(vector).sum() == 0, "Feature has NaN Values"
                matrix[row, :] = vector
                continue

            print(f'Item {asin} not found in features file')
            matrix[row, :] = 0    

        return matrix

# Save the image embedding matrix of every pairing as embed_image.npy and
# report the size of the written file.
for c in combinations.itertuples():
    print(c)
    features_array = copy_features(features_file=c.image_features)
    embed_dest = c.dest_folder / 'embed_image.npy'
    np.save(embed_dest, features_array)

    print(f'{str(embed_dest)}: {embed_dest.stat().st_size // 2**20}MiB')

# Drop the memoised matrices once every folder has been written.
copy_features.cache_clear()

Pandas(Index=0, image='vit', text='bert', image_features=PosixPath('data/amazon/Musical_Instruments_vit_features.npz'), text_features=PosixPath('data/amazon/Musical_Instruments_bert_features.npz'), dest_folder=PosixPath('data/amazon/Musical_Instruments-vit_bert'))
Opening file data/amazon/Musical_Instruments_vit_features.npz
Initializing array (8460, 1024)


Copying features: 100%|██████████| 8.46k/8.46k [00:04<00:00, 1.76kitems/s]


data/amazon/Musical_Instruments-vit_bert/embed_image.npy: 66MiB
Pandas(Index=1, image='none', text='bert', image_features=None, text_features=PosixPath('data/amazon/Musical_Instruments_bert_features.npz'), dest_folder=PosixPath('data/amazon/Musical_Instruments-none_bert'))
Returning Zeros
data/amazon/Musical_Instruments-none_bert/embed_image.npy: 8MiB
Pandas(Index=2, image='vit', text='none', image_features=PosixPath('data/amazon/Musical_Instruments_vit_features.npz'), text_features=None, dest_folder=PosixPath('data/amazon/Musical_Instruments-vit_none'))
data/amazon/Musical_Instruments-vit_none/embed_image.npy: 66MiB
Pandas(Index=3, image='clip', text='clip', image_features=PosixPath('data/amazon/Musical_Instruments_clipimage_features.npz'), text_features=PosixPath('data/amazon/Musical_Instruments_cliptext_features.npz'), dest_folder=PosixPath('data/amazon/Musical_Instruments-clip_clip'))
Opening file data/amazon/Musical_Instruments_clipimage_features.npz
Initializing array (8460, 768)

Copying features: 100%|██████████| 8.46k/8.46k [00:04<00:00, 1.76kitems/s]

data/amazon/Musical_Instruments-clip_clip/embed_image.npy: 49MiB
Pandas(Index=4, image='none', text='clip', image_features=None, text_features=PosixPath('data/amazon/Musical_Instruments_cliptext_features.npz'), dest_folder=PosixPath('data/amazon/Musical_Instruments-none_clip'))
data/amazon/Musical_Instruments-none_clip/embed_image.npy: 8MiB
Pandas(Index=5, image='clip', text='none', image_features=PosixPath('data/amazon/Musical_Instruments_clipimage_features.npz'), text_features=None, dest_folder=PosixPath('data/amazon/Musical_Instruments-clip_none'))
data/amazon/Musical_Instruments-clip_none/embed_image.npy: 49MiB
Pandas(Index=6, image='none', text='none', image_features=None, text_features=None, dest_folder=PosixPath('data/amazon/Musical_Instruments-none_none'))
data/amazon/Musical_Instruments-none_none/embed_image.npy: 8MiB





In [20]:
# Save the text embedding matrix of every pairing as embed_text.npy and
# report the size of the written file.
for c in combinations.itertuples():
    print(c)
    features_array = copy_features(features_file=c.text_features)
    embed_dest = c.dest_folder / 'embed_text.npy'
    np.save(embed_dest, features_array)

    print(f'{str(embed_dest)}: {embed_dest.stat().st_size // 2**20}MiB')

# Drop the memoised matrices, as the image-embedding cell does, so they do
# not stay resident in the kernel.
copy_features.cache_clear()

Pandas(Index=0, image='vit', text='bert', image_features=PosixPath('data/amazon/Musical_Instruments_vit_features.npz'), text_features=PosixPath('data/amazon/Musical_Instruments_bert_features.npz'), dest_folder=PosixPath('data/amazon/Musical_Instruments-vit_bert'))
Opening file data/amazon/Musical_Instruments_bert_features.npz
Initializing array (8460, 1024)


Copying features: 100%|██████████| 8.46k/8.46k [00:04<00:00, 1.82kitems/s]


data/amazon/Musical_Instruments-vit_bert/embed_text.npy: 66MiB
Pandas(Index=1, image='none', text='bert', image_features=None, text_features=PosixPath('data/amazon/Musical_Instruments_bert_features.npz'), dest_folder=PosixPath('data/amazon/Musical_Instruments-none_bert'))
data/amazon/Musical_Instruments-none_bert/embed_text.npy: 66MiB
Pandas(Index=2, image='vit', text='none', image_features=PosixPath('data/amazon/Musical_Instruments_vit_features.npz'), text_features=None, dest_folder=PosixPath('data/amazon/Musical_Instruments-vit_none'))
Returning Zeros
data/amazon/Musical_Instruments-vit_none/embed_text.npy: 8MiB
Pandas(Index=3, image='clip', text='clip', image_features=PosixPath('data/amazon/Musical_Instruments_clipimage_features.npz'), text_features=PosixPath('data/amazon/Musical_Instruments_cliptext_features.npz'), dest_folder=PosixPath('data/amazon/Musical_Instruments-clip_clip'))
Opening file data/amazon/Musical_Instruments_cliptext_features.npz
Initializing array (8460, 768)


Copying features: 100%|██████████| 8.46k/8.46k [00:04<00:00, 1.91kitems/s]


data/amazon/Musical_Instruments-clip_clip/embed_text.npy: 49MiB
Pandas(Index=4, image='none', text='clip', image_features=None, text_features=PosixPath('data/amazon/Musical_Instruments_cliptext_features.npz'), dest_folder=PosixPath('data/amazon/Musical_Instruments-none_clip'))
data/amazon/Musical_Instruments-none_clip/embed_text.npy: 49MiB
Pandas(Index=5, image='clip', text='none', image_features=PosixPath('data/amazon/Musical_Instruments_clipimage_features.npz'), text_features=None, dest_folder=PosixPath('data/amazon/Musical_Instruments-clip_none'))
data/amazon/Musical_Instruments-clip_none/embed_text.npy: 8MiB
Pandas(Index=6, image='none', text='none', image_features=None, text_features=None, dest_folder=PosixPath('data/amazon/Musical_Instruments-none_none'))
data/amazon/Musical_Instruments-none_none/embed_text.npy: 8MiB
