In [1]:
from pathlib import Path
from typing import List

import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

import amazon_dataset

# Initialize Datasets

In [102]:
# Amazon category to process. Exactly one DATASET line should be active; the
# commented alternatives are kept for quick switching.
#DATASET = 'Musical_Instruments'
#DATASET = 'Clothing_Shoes_and_Jewelry'
#DATASET = 'Home_and_Kitchen'
DATASET = 'Movies_and_TV'

VALIDATION_SIZE = 0.15
RANDOM_SEED = 20230219

# Per-item feature archives (.npz), one file per encoder.
#ALEXNET_IMAGE_FEATURES = Path(f'data/amazon/{DATASET}_alexnet_features.npz')
VIT_IMAGE_FEATURES = Path('data/amazon') / f'{DATASET}_vit_features.npz'
CLIP_IMAGE_FEATURES = Path('data/amazon') / f'{DATASET}_clipimage_features.npz'

CLIP_TEXT_FEATURES = Path('data/amazon') / f'{DATASET}_cliptext_features.npz'
BERT_TEXT_FEATURES = Path('data/amazon') / f'{DATASET}_bert_features.npz'

# Everything this notebook generates is written below this folder.
DEST_FOLDER = Path('data') / DATASET

# Fail early if any feature archive is missing.
# NOTE(review): the ViT and BERT archives are required here although only the
# CLIP ones are selected below — confirm this is intended.
#assert ALEXNET_IMAGE_FEATURES.exists()
for _required_features in (
    VIT_IMAGE_FEATURES,
    CLIP_IMAGE_FEATURES,
    BERT_TEXT_FEATURES,
    CLIP_TEXT_FEATURES,
):
    assert _required_features.exists(), f"{_required_features} does not exist"

# Feature archives actually exported by the cells below.
image_features = CLIP_IMAGE_FEATURES
text_featues = CLIP_TEXT_FEATURES

DEST_FOLDER.mkdir(exist_ok=True)

# Record which encoders produced the exported embeddings.
(DEST_FOLDER / 'signature.txt').write_text(
    f'Image: {image_features.stem}\nText: {text_featues.stem}'
)

In [103]:
def generate_unique_ids(series: pd.Series) -> pd.Series:
    """Assign a shuffled, consecutive integer id to every distinct value.

    The distinct values of ``series`` are permuted with a generator seeded
    by ``RANDOM_SEED`` and numbered ``0 .. n-1`` in that shuffled order.

    Returns:
        A Series indexed by the original values whose data are the new ids.
    """
    shuffled_values = np.random.default_rng(RANDOM_SEED).permutation(series.unique())
    new_ids = range(len(shuffled_values))
    return pd.Series(data=new_ids, index=shuffled_values)

def df_stats(df: pd.DataFrame) -> str:
    """Summarise an interactions frame as a one-line string.

    Args:
        df: frame with one row per rating and the columns ``asin`` (item)
            and ``reviewerID`` (user).

    Returns:
        A string with the number of distinct users and items, the number of
        rows, and the percentage of the user x item grid that is filled.
        The label says "Sparsity", but the value is the share of observed
        cells (rows / (users * items)), i.e. the density of the matrix.
        For an empty frame the percentage is reported as 0 instead of
        raising ``ZeroDivisionError``.
    """
    n_items = len(df['asin'].unique())
    n_users = len(df['reviewerID'].unique())
    n_cells = n_users * n_items
    # Guard the empty-frame case: there is no grid to be filled.
    sparsity = len(df) / n_cells if n_cells else 0.0
    return f'{n_users} users {n_items} items ratings: {len(df)}. Sparsity {sparsity * 100:.3f}%'

def save_numerized(
    reviews: pd.DataFrame, 
    uids: pd.Series, 
    product_ids: pd.Series,
    user_id_column: str,
    product_ids_column: str,
    columns: List[str],
    dest: Path
):
    """Attach integer user/item ids to ``reviews`` and export them as CSV.

    ``uids`` is looked up through the ``reviewerID`` column and
    ``product_ids`` through the ``asin`` column; the looked-up ids are stored
    in ``user_id_column`` and ``product_ids_column`` respectively. Every row
    must find an id in both mappings, otherwise an ``AssertionError`` is
    raised.

    Only ``columns`` are kept; they are written to ``dest`` without the index
    and also returned.
    """
    lookups = (
        (uids, user_id_column, 'reviewerID'),
        (product_ids, product_ids_column, 'asin'),
    )

    numerized = reviews
    for mapping, id_column, key_column in lookups:
        numerized = numerized.join(mapping.to_frame(id_column), on=key_column)
        # A missing id means the mapping does not cover this frame.
        assert not numerized[id_column].isna().any()

    selected = numerized[columns]
    selected.to_csv(dest, index=False)
    return selected

def split_train_test_proportion(df: pd.DataFrame, test_prop=0.2, seed=None):
    """
    Randomly split each reviewer's records into a train and a test part.

    Every row gets a random number; within each `reviewerID` the rows are
    ranked by that number (as a fraction of the reviewer's row count) and the
    rows whose rank is <= `test_prop` go to the test part. A reviewer with
    `n` rows therefore contributes `floor(n * test_prop)` rows to the test
    part (possibly none), not exactly `test_prop` of them.

    Args:
        df: interactions frame with a `reviewerID` column.
        test_prop: per-reviewer fraction of rows to hold out.
        seed: seed for the random generator; defaults to the notebook-level
            `RANDOM_SEED` when omitted.

    Returns:
        `(train, test)` — two disjoint subsets of `df` that together contain
        every row of `df`, each in the original row order. Either part may be
        empty.
    """
    rng = np.random.default_rng(RANDOM_SEED if seed is None else seed)

    res = df.copy()
    res['rnd'] = rng.random(size=len(df))
    rnd_rank = res.groupby('reviewerID')['rnd'].rank(pct=True)

    # Select positionally with a boolean mask instead of unpacking
    # `groupby(condition)` and re-selecting by label: this still works when
    # one side is empty and when `df` has repeated index labels.
    is_test = (rnd_rank <= test_prop).to_numpy()
    train, test = df.loc[~is_test], df.loc[is_test]

    assert len(train) + len(test) == len(df)

    return train, test

In [104]:
reviews_df = amazon_dataset.reviews_df(DATASET)
reviews_df.sample(n=3)

Unnamed: 0,id,asin,reviewerID,reviewerName,overall,text,reviewTime,summary,verified,vote
8750,329672,B0010CYHW4,A1K6EBWW0BRCBP,enero30,4.0,very nice finish and quality for the price. my...,2017-04-20,Very happy for the price!,True,
31374,1213860,B003D33W3I,APZB58E43OOS1,Philip Turner,5.0,Finally a ligature with a good tight hold on m...,2017-03-31,A Winner Now,True,
29147,1122833,B000EELFGU,A1TFH8GU91KIBO,me,5.0,"great price, product and delivery",2017-10-24,Five Stars,True,


# Generate Data for SEM-MacridVAE & DMRL

Some data stats

In [105]:
print(DATASET)
print(df_stats(reviews_df))

Musical_Instruments
5422 users 8460 items ratings: 40118. Sparsity 0.087%


Assign every user a random integer ID (a seeded shuffle of the unique reviewer IDs)

In [106]:
user_ids = generate_unique_ids(reviews_df['reviewerID'])
user_ids

A18ISN6NS073V2       0
A1XFJTXA00NYX1       1
A3GITGRRVQ4IP0       2
A1W2V8Z9EWTZMM       3
AEOQG6GFYR009        4
                  ... 
A2LSUIA9THHK57    5417
A31D3XL44W1NBM    5418
A3775OP5VTX5ON    5419
A1G0HYMR02WM2W    5420
A7CN7OR0X715S     5421
Length: 5422, dtype: int64

Same with products

In [107]:
item_ids = generate_unique_ids(reviews_df['asin'])
item_ids

B0002GXZK4       0
B00ZUBQY9K       1
B00VUU6JL8       2
B00PUQK4FU       3
B00UTUKFHE       4
              ... 
B000EEJ8VE    8455
B00CAL9ZXK    8456
B00U1XK5Z6    8457
B0002D0CNA    8458
B016L4MRMW    8459
Length: 8460, dtype: int64

Split the reviews into train, validation and test sets, user by user

In [108]:
# Stage 1: within each user, the rows whose random rank is <= 0.4 are held out
# (validation + test pool); the remaining rows form the training set.
train_reviews, validation_test_reviews = split_train_test_proportion(reviews_df, test_prop=0.4)
# Stage 2: split the held-out pool again per user; rows ranked <= 0.5 become
# the test set and the rest the validation set.
validation_reviews, test_reviews = split_train_test_proportion(validation_test_reviews, test_prop=0.5)

In [109]:
len(train_reviews)

25628

len(validation_reviews)

In [35]:
len(test_reviews)

6744

Sanity checks. All users are in the train validation and test sets!

In [36]:
set(train_reviews['reviewerID']) == set(test_reviews['reviewerID']) ==  set(validation_reviews['reviewerID'])

True

Check that every user has at least one item in each of the train, validation and test sets (same check as above)

In [37]:
pd.concat([
    train_reviews.groupby('reviewerID').size().to_frame('train_size'),
    validation_reviews.groupby('reviewerID').size().to_frame('val_size'),
    test_reviews.groupby('reviewerID').size().to_frame('test_size')
], axis=1).describe()

Unnamed: 0,train_size,val_size,test_size
count,5422.0,5422.0,5422.0
mean,4.726669,1.428624,1.243821
std,2.451181,0.84009,0.720182
min,3.0,1.0,1.0
25%,3.0,1.0,1.0
50%,4.0,1.0,1.0
75%,5.0,2.0,1.0
max,36.0,12.0,12.0


Save the user and item ID mappings

In [13]:
# Persist the original-id -> integer-id mappings next to the generated dataset.
# NOTE(review): users.txt is written with the default header while items.txt
# names its value column 'item_id' — confirm downstream readers expect this.
user_ids.to_csv(DEST_FOLDER / 'users.txt')
item_ids.to_csv(DEST_FOLDER / 'items.txt', header=['item_id'])

In [14]:
# Keyword arguments shared by the train / validation / test exports: the id
# mappings, the names of the generated id columns and the columns to keep.
NUMERIZED_COMMON_PARAMS = {
    'uids': user_ids, 
    'product_ids': item_ids,
    'user_id_column': 'user',
    'product_ids_column': 'item',
    'columns': ['user', 'item']
}

# Export the training interactions as (user, item) integer pairs; the returned
# frame is displayed as the cell output.
save_numerized(
    train_reviews, 
    dest=DEST_FOLDER / 'train.txt',
    **NUMERIZED_COMMON_PARAMS
)

Unnamed: 0,user,item
0,13260,15762
4,17977,15762
5,14483,15762
6,20167,15762
9,7290,15762
...,...,...
216099,21143,21474
216101,14427,17537
216102,3341,10725
216103,15373,19613


In [15]:
save_numerized(
    validation_reviews, 
    dest=DEST_FOLDER / 'validation.txt',
    **NUMERIZED_COMMON_PARAMS
)
'Saved validation data'

'Saved validation data'

In [16]:
save_numerized(
    test_reviews, 
    dest=DEST_FOLDER / 'test.txt',
    **NUMERIZED_COMMON_PARAMS
)
'Saved test data'

'Saved test data'

In [17]:
def copy_features(
        item_ids: pd.Series, 
        features_file: Path
):
    """Build a dense feature matrix whose rows follow the integer item ids.

    Args:
        item_ids: Series indexed by the item key used in the archive (asin)
            whose values are the row positions in the result.
        features_file: ``.npz`` archive holding one 1-D feature vector per
            item key.

    Returns:
        An array of shape ``(len(item_ids), embedding_size)``. Items missing
        from the archive get an all-zero row and are reported; rows that no
        id points to stay NaN.

    Raises:
        ValueError: if the archive contains no arrays.
        AssertionError: if a stored feature vector contains NaN values.
    """
    print('Opening file')
    with np.load(features_file) as features:
        # Any stored vector tells us the embedding size.
        some_embedding = next(iter(features.values()), None)
        if some_embedding is None:
            raise ValueError(f'{features_file} contains no features')
        embedding_shape, = some_embedding.shape
        array_shape = (len(item_ids),  embedding_shape)
        print(f'Initializing array {array_shape}')
        # NaN-filled so that rows that are never written stay detectable.
        res = np.full(array_shape,  fill_value=np.nan)

        for asin, idx in tqdm(item_ids.items(), 
                total=len(item_ids), unit_scale=True, unit='items', 
                desc='Copying features'):
            value = features.get(asin)
            if value is None:
                # tqdm.write keeps the message from tearing the progress bar.
                tqdm.write(f'Item {asin} not found in features file')
                res[idx, :] = 0    
            else:
                assert np.isnan(value).sum() == 0, "Feature has NaN Values"
                res[idx, :] = value

        return res


image_features_array = copy_features(
    item_ids=item_ids, 
    features_file=image_features
)
image_features_array.shape

Opening file
Initializing array (23958, 768)


Copying features:   0%|          | 101/24.0k [00:00<00:46, 510items/s] 

Item B0018CWW96 not found in features file


Copying features:   3%|▎         | 602/24.0k [00:01<00:45, 518items/s]

Item B0013KU93E not found in features file


Copying features:   6%|▋         | 1.53k/24.0k [00:04<01:43, 216items/s]

Item B001C3O6QS not found in features file


Copying features:   8%|▊         | 1.85k/24.0k [00:05<01:45, 209items/s]

Item B0014VPFPO not found in features file


Copying features:  15%|█▍        | 3.56k/24.0k [00:12<01:15, 269items/s]

Item B001ANQY7Y not found in features file
Item B001C00AHA not found in features file


Copying features:  15%|█▌        | 3.67k/24.0k [00:12<01:34, 216items/s]

Item B001B3LIOC not found in features file


Copying features:  23%|██▎       | 5.43k/24.0k [00:15<00:29, 630items/s]

Item B0017W22LK not found in features file


Copying features:  26%|██▌       | 6.22k/24.0k [00:17<00:27, 654items/s]

Item B0013LRKRQ not found in features file


Copying features:  29%|██▉       | 6.91k/24.0k [00:18<00:30, 551items/s]

Item B0017ANB08 not found in features file


Copying features:  58%|█████▊    | 13.8k/24.0k [00:31<00:18, 563items/s]

Item B0014CKCCY not found in features file
Item B0014F20GM not found in features file


Copying features:  61%|██████    | 14.6k/24.0k [00:32<00:15, 587items/s]

Item B001ADKATC not found in features file


Copying features:  68%|██████▊   | 16.2k/24.0k [00:35<00:11, 683items/s]

Item B0016MOWOG not found in features file


Copying features:  71%|███████   | 16.9k/24.0k [00:36<00:11, 608items/s]

Item B0013TR4GK not found in features file


Copying features:  72%|███████▏  | 17.1k/24.0k [00:36<00:11, 612items/s]

Item B0013LRKV2 not found in features file


Copying features:  74%|███████▍  | 17.8k/24.0k [00:37<00:10, 589items/s]

Item B0019ESNGE not found in features file
Item B0015XHR5C not found in features file


Copying features:  83%|████████▎ | 19.9k/24.0k [00:41<00:08, 495items/s]

Item B001AQR39O not found in features file


Copying features:  93%|█████████▎| 22.2k/24.0k [00:45<00:02, 690items/s]

Item B0017RFXRK not found in features file


Copying features:  97%|█████████▋| 23.3k/24.0k [00:46<00:00, 792items/s]

Item B0013K7ZUE not found in features file


Copying features: 100%|██████████| 24.0k/24.0k [00:47<00:00, 503items/s]


(23958, 768)

In [18]:
IMAGE_EMBED_DEST = DEST_FOLDER / 'embed_image.npy'
np.save(IMAGE_EMBED_DEST, image_features_array)

print(f'{str(IMAGE_EMBED_DEST)}: {IMAGE_EMBED_DEST.stat().st_size // 2**20}MiB')

data/Movies_and_TV/embed_image.npy: 140MiB


In [19]:
text_features_array = copy_features(
    item_ids=item_ids, 
    features_file=text_featues
)
text_features_array.shape

Opening file
Initializing array (23958, 768)


Copying features:   0%|          | 71.0/24.0k [00:00<00:34, 687items/s]

Item B0018CWW96 not found in features file


Copying features:   3%|▎         | 655/24.0k [00:00<00:33, 696items/s] 

Item B0013KU93E not found in features file


Copying features:   7%|▋         | 1.59k/24.0k [00:02<00:41, 542items/s]

Item B001C3O6QS not found in features file


Copying features:   8%|▊         | 1.92k/24.0k [00:03<00:41, 527items/s]

Item B0014VPFPO not found in features file


Copying features:  15%|█▌        | 3.65k/24.0k [00:06<00:31, 638items/s]

Item B001ANQY7Y not found in features file
Item B001C00AHA not found in features file
Item B001B3LIOC not found in features file


Copying features:  23%|██▎       | 5.46k/24.0k [00:08<00:29, 622items/s]

Item B0017W22LK not found in features file


Copying features:  26%|██▋       | 6.30k/24.0k [00:09<00:23, 763items/s]

Item B0013LRKRQ not found in features file


Copying features:  29%|██▉       | 6.92k/24.0k [00:10<00:23, 739items/s]

Item B0017ANB08 not found in features file


Copying features:  58%|█████▊    | 13.8k/24.0k [00:21<00:17, 584items/s]

Item B0014CKCCY not found in features file
Item B0014F20GM not found in features file


Copying features:  61%|██████    | 14.6k/24.0k [00:22<00:18, 497items/s]

Item B001ADKATC not found in features file


Copying features:  68%|██████▊   | 16.3k/24.0k [00:25<00:12, 616items/s]

Item B0016MOWOG not found in features file


Copying features:  71%|███████   | 17.0k/24.0k [00:26<00:09, 773items/s]

Item B0013TR4GK not found in features file


Copying features:  72%|███████▏  | 17.1k/24.0k [00:26<00:09, 757items/s]

Item B0013LRKV2 not found in features file


Copying features:  74%|███████▍  | 17.8k/24.0k [00:27<00:08, 746items/s]

Item B0019ESNGE not found in features file
Item B0015XHR5C not found in features file


Copying features:  83%|████████▎ | 20.0k/24.0k [00:30<00:05, 783items/s]

Item B001AQR39O not found in features file


Copying features:  93%|█████████▎| 22.3k/24.0k [00:33<00:02, 792items/s]

Item B0017RFXRK not found in features file


Copying features:  97%|█████████▋| 23.3k/24.0k [00:34<00:00, 768items/s]

Item B0013K7ZUE not found in features file


Copying features: 100%|██████████| 24.0k/24.0k [00:35<00:00, 676items/s]


(23958, 768)

In [20]:
TEXT_EMBED_DEST = DEST_FOLDER / 'embed_text.npy'
np.save(TEXT_EMBED_DEST, text_features_array)

print(f'{str(TEXT_EMBED_DEST)}: {TEXT_EMBED_DEST.stat().st_size // 2**20}MiB')

data/Movies_and_TV/embed_text.npy: 140MiB


# Generate images

In [54]:
products_df = amazon_dataset.items_df(DATASET)
products_images_dir = amazon_dataset.product_images_dir(DATASET)
products_images_dir

PosixPath('data/amazon/Musical_Instruments_product_images')

This is necessary only for generating the fancy graph. It puts the images on a transparent background.

In [22]:
def convert_image(src: str, dst: str):
    """Copy an image to ``dst`` with its near-white pixels masked out.

    Pixels whose three colour channels all lie in ``[235, 255]`` are removed
    by the mask; the remaining pixels are kept with an added alpha channel.
    ``dst`` should use a format that stores alpha (callers in this notebook
    pass ``.png`` paths).

    Raises:
        OSError: if ``src`` cannot be read or ``dst`` cannot be written.
            OpenCV signals both through return values rather than exceptions.
    """
    min_white = np.array([235, 235, 235], np.uint8)
    white = np.array([255, 255, 255], np.uint8)
    img = cv2.imread(src)
    if img is None:
        # imread returns None for a missing or undecodable file.
        raise OSError(f'Could not read image {src}')

    # inRange marks near-white pixels with 255; invert it to keep the rest.
    mask = 255 - cv2.inRange(img, min_white, white)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2BGRA)
    res = cv2.bitwise_and(img, img, mask=mask)
    if not cv2.imwrite(dst, res):
        raise OSError(f'Could not write image {dst}')

def save_images(products_df: pd.DataFrame, item_ids: pd.Series, dest: Path, images_dir: Path = None):
    """Export the first image of every item as ``<item_id>.png`` in ``dest``.

    Args:
        products_df: product metadata with an ``asin`` column and an
            ``image_slug`` column holding a list of image slugs.
        item_ids: Series indexed by asin whose values are the integer ids.
        dest: output folder, created if needed.
        images_dir: folder containing ``<slug>.jpg`` files; defaults to the
            notebook-level ``products_images_dir``.

    Items whose ``image_slug`` is not a list are reported and skipped; items
    with an empty list are counted in the progress bar and skipped.
    """
    if images_dir is None:
        # Fall back to the notebook-level variable the original relied on.
        images_dir = products_images_dir

    products_by_asin = products_df.set_index('asin')
    joined = item_ids.to_frame('item_id').join(
        products_by_asin, validate='one_to_one'
    )
    dest.mkdir(parents=True, exist_ok=True)

    no_images = 0

    progress = tqdm(
        joined.itertuples(), 
        desc='Copying images', 
        total=len(joined), 
        unit_scale=True, unit='items'
    )
    for row in progress:
        if not isinstance(row.image_slug, list):
            # row.Index is the asin; tqdm.write keeps the progress bar intact.
            tqdm.write(f'Item {row.Index}: image_slug is not a list')
            continue
        if len(row.image_slug) == 0:
            no_images += 1
            progress.set_postfix_str(f'No images: {no_images}', refresh=False)
            continue
        slug = row.image_slug[0]
        src = images_dir / f"{slug}.jpg"
        assert src.exists(), f"{src} does not exist"
        dst = dest / f'{row.item_id}.png'
        convert_image(str(src), str(dst))

save_images(products_df, item_ids,  DEST_FOLDER / 'images')

Copying images:   0%|          | 40.0/24.0k [00:00<04:27, 89.2items/s, No images: 4]

Not a list


Copying images:   2%|▏         | 538/24.0k [00:06<04:52, 80.1items/s, No images: 19]

Not a list


Copying images:   6%|▋         | 1.52k/24.0k [00:19<04:59, 74.9items/s, No images: 41]

Not a list


Copying images:   8%|▊         | 1.83k/24.0k [00:23<04:24, 83.8items/s, No images: 53]

Not a list


Copying images:  15%|█▍        | 3.56k/24.0k [00:43<03:45, 90.4items/s, No images: 87]

Not a list


Copying images:  15%|█▌        | 3.60k/24.0k [00:43<03:50, 88.4items/s, No images: 87]

Not a list


Copying images:  15%|█▌        | 3.66k/24.0k [00:44<04:37, 73.3items/s, No images: 87]

Not a list


Copying images:  22%|██▏       | 5.36k/24.0k [01:04<03:28, 89.4items/s, No images: 135]

Not a list


Copying images:  26%|██▌       | 6.16k/24.0k [01:13<03:21, 88.2items/s, No images: 161]

Not a list


Copying images:  29%|██▊       | 6.84k/24.0k [01:21<03:16, 87.2items/s, No images: 181]

Not a list


Copying images:  57%|█████▋    | 13.8k/24.0k [02:42<02:06, 80.5items/s, No images: 334]

Not a list


Copying images:  57%|█████▋    | 13.8k/24.0k [02:42<02:04, 81.7items/s, No images: 334]

Not a list


Copying images:  61%|██████    | 14.5k/24.0k [02:51<01:44, 90.4items/s, No images: 354]

Not a list


Copying images:  67%|██████▋   | 16.2k/24.0k [03:09<01:28, 88.5items/s, No images: 397]

Not a list


Copying images:  70%|███████   | 16.9k/24.0k [03:17<01:16, 92.5items/s, No images: 413]

Not a list


Copying images:  71%|███████   | 17.0k/24.0k [03:19<01:17, 89.5items/s, No images: 422]

Not a list


Copying images:  74%|███████▍  | 17.7k/24.0k [03:26<01:07, 93.0items/s, No images: 443]

Not a list


Copying images:  74%|███████▍  | 17.8k/24.0k [03:27<01:03, 96.2items/s, No images: 445]

Not a list


Copying images:  83%|████████▎ | 19.8k/24.0k [03:49<00:41, 99.9items/s, No images: 509]

Not a list


Copying images:  93%|█████████▎| 22.2k/24.0k [04:16<00:18, 93.4items/s, No images: 561]

Not a list


Copying images:  97%|█████████▋| 23.2k/24.0k [04:28<00:08, 90.0items/s, No images: 588]

Not a list


Copying images: 100%|██████████| 24.0k/24.0k [04:36<00:00, 86.5items/s, No images: 607]


In [23]:
def save_titles(products_df: pd.DataFrame, item_ids: pd.Series, dest: Path):
    """Write ``item_titles.txt`` (CSV with ``item_id,title``) into ``dest``.

    Args:
        products_df: product metadata with ``asin`` and ``title`` columns.
        item_ids: Series indexed by asin whose values are the integer ids.
        dest: existing folder that receives ``item_titles.txt``.

    Items without a matching product are written with an empty title.
    """
    products_by_asin = products_df.set_index('asin')
    joined = item_ids.to_frame('item_id').join(
        products_by_asin, validate='one_to_one'
    )
    joined[['item_id', 'title']].to_csv(dest / 'item_titles.txt', index=False)

# Not really needed for anything yet
#save_titles(products_df, item_ids,  DEST_FOLDER)

# Generate categories

In [58]:
categories_df = amazon_dataset.product_categories_df(DATASET)

In [25]:
# categories_id = generate_unique_ids(categories_df['name'])
# categories_id.to_csv(DEST_FOLDER / 'categories.txt', header=['category_id'])
# categories_id

In [26]:
def build_categories_matrix(
        categories_df: pd.DataFrame, 
        products_df: pd.DataFrame, 
        item_ids: pd.Series, 
        categories_id: pd.Series
    ):
    """Build the (item_id, category_id) pairs for a subset of categories.

    Args:
        categories_df: one row per (product, category) with the columns
            ``product_id`` and ``name``.
        products_df: product metadata indexed by product id, with an ``asin``
            column.
        item_ids: Series indexed by asin whose values are the integer item ids.
        categories_id: Series indexed by category name whose values are the
            integer category ids.

    Returns:
        A frame with the columns ``item_id`` and ``category_id``, one row per
        kept (product, category) pair.

    Raises:
        AssertionError: if a kept category has no id in ``categories_id``.

    An earlier revision returned the products without any category from a
    leftover debugging ``return``, which made the final return unreachable;
    only their count is printed now.
    """
    # Categories ordered from most to least frequent.
    categories_support = categories_df['name'].value_counts()

    # Keep the 2nd to 8th most frequent categories (the top one is skipped).
    allowed_categories_names = categories_support.iloc[1:8].index

    print("How many allowed categories")
    print(len(allowed_categories_names))
    print(allowed_categories_names)

    filtered_categories = categories_df.loc[
        categories_df['name'].isin(allowed_categories_names)
    ]

    products_df = products_df.join(
        item_ids.to_frame('item_id'), on='asin', validate='1:1')

    res = filtered_categories.set_index('product_id').join(
        products_df, validate='m:1')
    res = res.reset_index(drop=True)

    # Check all categories are in categories_id
    res = res.join(categories_id.to_frame('category_id'), on='name')
    assert res['category_id'].isna().sum() == 0

    items_with_no_category = set(item_ids) - set(res['item_id'].unique())
    print(f"items with no category: {len(items_with_no_category)}")

    return res[['item_id', 'category_id']]

# categories_matrix = build_categories_matrix(
#     categories_df, products_df, item_ids, categories_id
# )
#categories_matrix
#categories_matrix.to_csv(DEST_FOLDER / 'categorical.txt', index=False)

This is a VERY provisional way to analyze categories

In [126]:
def analyse_semantic_categories(products_df: pd.DataFrame, categories_df: pd.DataFrame, item_ids: pd.Series):
    """Print the ten most common category names of each semantic category.

    The semantic category of every item is read from ``item_cate.npy`` in
    ``DEST_FOLDER`` (position ``i`` belongs to item id ``i``), attached to the
    products through their asin and then to ``categories_df`` through
    ``product_id``. Every category row must end up with a semantic category,
    otherwise an ``AssertionError`` is raised.
    """
    semantic_by_item_id = pd.Series(
        np.load(DEST_FOLDER / 'item_cate.npy')
    ).to_frame('semantic_category')

    # asin -> item_id -> semantic category
    semantic_by_asin = item_ids.to_frame('item_id').join(
        semantic_by_item_id,
        on='item_id',
        validate='1:1'
    )

    # Add the product metadata and re-key the result by product id.
    products_by_asin = products_df.reset_index().set_index('asin')
    semantic_by_product = semantic_by_asin.join(products_by_asin).set_index('id')

    categories_and_semantic = categories_df.join(
        semantic_by_product, on='product_id', validate='m:1'
    )
    assert categories_and_semantic['semantic_category'].isna().sum() == 0

    for category_id, members in categories_and_semantic.groupby('semantic_category'):
        print(f"Category {category_id} Size: {len(members)}")
        top_names = members['name'].value_counts().head(n=10)
        print(top_names)

#analyse_semantic_categories(products_df, categories_df, item_ids)