In [1]:
from functools import lru_cache
from pathlib import Path
from typing import List, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

import amazon_dataset
import movielens_dataset

# Intialize Datasets

In [2]:
def get_base_path(dataset: str) -> Path:
    if dataset.startswith('ml-'):
        return movielens_dataset.BASE_DATA_FOLDER
    return amazon_dataset.BASE_DATA_FOLDER

In [3]:
#DATASET = 'Musical_Instruments'
#DATASET = 'Clothing_Shoes_and_Jewelry'
#DATASET = 'Home_and_Kitchen'
DATASET = 'Movies_and_TV'
#DATASET = 'ml-1m'

VALIDATION_SIZE=0.15
RANDOM_SEED = 20230219
MIN_RATING = 0

base_path = get_base_path(DATASET)

#ALEXNET_IMAGE_FEATURES = BASE_PATH / f'{DATASET}_alexnet_features.npz'
VIT_IMAGE_FEATURES = base_path / f'{DATASET}_vit_features.npz'
CLIP_IMAGE_FEATURES = base_path / f'{DATASET}_clipimage_features.npz'

CLIP_TEXT_FEATURES = base_path / f'{DATASET}_cliptext_features.npz'
BERT_TEXT_FEATURES = base_path / f'{DATASET}_bert_features.npz'


#assert ALEXNET_IMAGE_FEATURES.exists()
assert VIT_IMAGE_FEATURES.exists(), f"{VIT_IMAGE_FEATURES} does not exist"
assert CLIP_IMAGE_FEATURES.exists(), f"{CLIP_IMAGE_FEATURES} does not exist"
assert BERT_TEXT_FEATURES.exists(), f"{BERT_TEXT_FEATURES} does not exist"
assert CLIP_TEXT_FEATURES.exists(), f"{CLIP_TEXT_FEATURES} does not exist"

In [4]:
print(f"VIT features {VIT_IMAGE_FEATURES}")
print(f"CLIP Image features {CLIP_IMAGE_FEATURES}")
print(f"BERT Text features {BERT_TEXT_FEATURES}")
print(f"CLIP Text features {CLIP_TEXT_FEATURES}")

VIT features data/amazon/Movies_and_TV_vit_features.npz
CLIP Image features data/amazon/Movies_and_TV_clipimage_features.npz
BERT Text features data/amazon/Movies_and_TV_bert_features.npz
CLIP Text features data/amazon/Movies_and_TV_cliptext_features.npz


In [5]:
def generate_unique_ids(series: pd.Series) -> pd.Series:
    rng = np.random.default_rng(RANDOM_SEED)
    unique_ids = series.unique()
    return pd.Series(
        index=rng.permutation(unique_ids), 
        data=range(len(unique_ids))
    )

def df_stats(df: pd.DataFrame) -> str:
    n_items = len(df['item_id'].unique())
    n_users = len(df['user_id'].unique())
    sparsity = 1. * len(df) / (n_users * n_items)
    return f'{n_users} users {n_items} items ratings: {len(df)}. Sparsity {sparsity * 100:.3f}%'

def save_numerized(
    reviews: pd.DataFrame, 
    uids: pd.Series, 
    product_ids: pd.Series,
    user_id_column: str,
    item_ids_column: str,
    columns: List[str],
    dest: Path
):
    """Save a Dataframe following userids and product_ids"""
    joined = reviews.join(uids.to_frame(user_id_column), on='user_id')
    assert joined[user_id_column].isna().sum() == 0
    
    joined = joined.join(product_ids.to_frame(item_ids_column), on='item_id')
    assert joined[item_ids_column].isna().sum() == 0
    
    res = joined[columns]
    
    res.to_csv(dest, index=False)

    return res

def split_train_test_proportion(df: pd.DataFrame, test_prop=0.2):
    """
    Split the dataframe by reviewer and take exactly `test_prop` records
    for test and leave the rest for training
    """
    rng = np.random.default_rng(RANDOM_SEED)

    res = df.copy()
    res['rnd'] = rng.random(size=len(df))
    res['rnd_rank'] = res.groupby('user_id')['rnd'].rank(pct=True)
    condition = res['rnd_rank'] <= test_prop
    
    (_, train), (_, test) = res.groupby(condition)
    
    assert(len(set(train.index) & set(test.index)) == 0)

    return df.loc[train.index], df.loc[test.index]

In [6]:
def get_reviews_df(dataset: str, min_rating: int) -> pd.DataFrame:
    if dataset.startswith('ml-'):
        reviews = movielens_dataset.reviews_df(dataset)
    else:
        reviews = amazon_dataset.reviews_df(dataset)

    with_min_ratings = reviews[reviews['rating'] >= min_rating]
    # valid user ids
    user_id_size = with_min_ratings.groupby('user_id').size()
    valid_user_ids = user_id_size.loc[user_id_size >= 5].index
    return with_min_ratings[with_min_ratings['user_id'].isin(valid_user_ids)]

reviews_df = get_reviews_df(DATASET, min_rating=4)
reviews_df.sample(n=3)

Unnamed: 0,id,asin,reviewerID,reviewerName,overall,text,reviewTime,summary,verified,vote,user_id,item_id,rating
98670,5945972,B00EV4F5TC,AMTF78R74KICJ,WILLEM P ROELANDTS,5.0,Great movie....,2017-11-15,Five Stars,True,,AMTF78R74KICJ,B00EV4F5TC,5.0
105701,6427767,B00KHA5TJ0,AIMD8U35N3ER2,Amazon Customer,5.0,good job,2017-09-11,Five Stars,True,,AIMD8U35N3ER2,B00KHA5TJ0,5.0
44516,2570422,B000FKP4BU,A1QIZM8J946Z6K,Jesse Michaels,5.0,Don't be fooled by the high school setting and...,2017-08-02,Top notch neo noir staged in a highschool univ...,True,,A1QIZM8J946Z6K,B000FKP4BU,5.0


# Generate Data for SEM-MacridVAE & DMRL

Some data stats

In [7]:
print(DATASET)
print(df_stats(reviews_df))

Movies_and_TV
17773 users 22632 items ratings: 168889. Sparsity 0.042%


Sort some users randomly

In [8]:
user_ids = generate_unique_ids(reviews_df['user_id'])
user_ids

ACW9ZM5S4BKPO         0
A1TH4YX8KSQUFB        1
A12SBPEUSJMAJK        2
A28UFWNFTT4CWB        3
ASTLV5GH4DLW8         4
                  ...  
A4UIN3H6SCYYT     17768
A24OG1ZBIRUXY9    17769
ACEAQEX2XT98      17770
A2WNUETI5MUH9Z    17771
ATHEYQAPGSCIH     17772
Length: 17773, dtype: int64

Same with products

In [9]:
item_ids = generate_unique_ids(reviews_df['item_id'])
item_ids

B005DETYW8        0
B00JKEM294        1
B000021Y6N        2
B014DEGUIE        3
6302795567        4
              ...  
6300145786    22627
B002M36R4G    22628
B01FKXU58S    22629
B00005RIY5    22630
B001JJMGIU    22631
Length: 22632, dtype: int64

Split train test and validation by user

In [10]:
train_reviews, validation_test_reviews = split_train_test_proportion(reviews_df, test_prop=0.4)
validation_reviews, test_reviews = split_train_test_proportion(validation_test_reviews, test_prop=0.5)

In [11]:
len(train_reviews)

106979

len(validation_reviews)

In [12]:
len(test_reviews)

28999

Sanity checks. All users are in the train validation and test sets!

In [13]:
set(train_reviews['user_id']) == set(test_reviews['user_id']) ==  set(validation_reviews['user_id'])

True

Check that the train set and the validation set all have at least one item (same check as above)

In [14]:
pd.concat([
    train_reviews.groupby('user_id').size().to_frame('train_size'),
    validation_reviews.groupby('user_id').size().to_frame('val_size'),
    test_reviews.groupby('user_id').size().to_frame('test_size')
], axis=1).describe()

Unnamed: 0,train_size,val_size,test_size
count,17773.0,17773.0,17773.0
mean,6.019186,1.851741,1.631632
std,8.267516,2.766093,2.712988
min,3.0,1.0,1.0
25%,3.0,1.0,1.0
50%,5.0,1.0,1.0
75%,6.0,2.0,2.0
max,850.0,283.0,283.0


In [15]:
combinations = pd.DataFrame.from_records([
    ('vit', 'bert', VIT_IMAGE_FEATURES, BERT_TEXT_FEATURES),
    ('none', 'bert', None, BERT_TEXT_FEATURES),
    ('vit', 'none', VIT_IMAGE_FEATURES, None),
    ('clip', 'clip', CLIP_IMAGE_FEATURES, CLIP_TEXT_FEATURES),
    ('none', 'clip', None, CLIP_TEXT_FEATURES),
    ('clip', 'none', CLIP_IMAGE_FEATURES, None),
    ('none', 'none', None, None),
], columns=['image', 'text', 'image_features', 'text_features'])
combinations['dest_folder'] = combinations.apply(
    lambda x: Path(f'data/{DATASET}-{x["image"]}_{x["text"]}'), axis=1
)
combinations

Unnamed: 0,image,text,image_features,text_features,dest_folder
0,vit,bert,data/amazon/Movies_and_TV_vit_features.npz,data/amazon/Movies_and_TV_bert_features.npz,data/Movies_and_TV-vit_bert
1,none,bert,,data/amazon/Movies_and_TV_bert_features.npz,data/Movies_and_TV-none_bert
2,vit,none,data/amazon/Movies_and_TV_vit_features.npz,,data/Movies_and_TV-vit_none
3,clip,clip,data/amazon/Movies_and_TV_clipimage_features.npz,data/amazon/Movies_and_TV_cliptext_features.npz,data/Movies_and_TV-clip_clip
4,none,clip,,data/amazon/Movies_and_TV_cliptext_features.npz,data/Movies_and_TV-none_clip
5,clip,none,data/amazon/Movies_and_TV_clipimage_features.npz,,data/Movies_and_TV-clip_none
6,none,none,,,data/Movies_and_TV-none_none


Generate users

In [16]:
for c in combinations.itertuples():
    c.dest_folder.mkdir(exist_ok=True)

    user_ids.to_csv(c.dest_folder / 'users.txt')
    item_ids.to_csv(c.dest_folder / 'items.txt', header=['item_id'])

In [17]:
NUMERIZED_COMMON_PARAMS = {
    'uids': user_ids, 
    'product_ids': item_ids,
    'user_id_column': 'user',
    'item_ids_column': 'item',
    'columns': ['user', 'item']
}

for c in combinations.itertuples():
    c.dest_folder.mkdir(exist_ok=True)

    save_numerized(
        train_reviews, 
        dest=c.dest_folder / 'train.txt',
        **NUMERIZED_COMMON_PARAMS
    )


In [18]:
for c in combinations.itertuples():
    c.dest_folder.mkdir(exist_ok=True)

    save_numerized(
        validation_reviews, 
        dest=c.dest_folder / 'validation.txt',
        **NUMERIZED_COMMON_PARAMS
    )


'Saved validation data'

'Saved validation data'

In [19]:
for c in combinations.itertuples():
    save_numerized(
        test_reviews, 
        dest=c.dest_folder / 'test.txt',
        **NUMERIZED_COMMON_PARAMS
    )
'Saved test data'

'Saved test data'

In [20]:
@lru_cache()
def copy_features(features_file: Optional[Path]):
    if features_file is None:
        print('Returning Zeros')
        return np.zeros((len(item_ids), 256), dtype=np.float32)

    print(f'Opening file {features_file}')
    with np.load(features_file) as features:
        some_embedding = next(iter(features.values()))
        embedding_shape, = some_embedding.shape
        array_shape = (len(item_ids),  embedding_shape)
        print(f'Initializing array {array_shape}')
        res = np.full(array_shape,  fill_value=np.nan)

        for item_id, idx in tqdm(item_ids.items(), 
                total=len(item_ids), unit_scale=True, unit='items', 
                desc='Copying features'):
            value = features.get(str(item_id))
            if value is None:
                print(f'Item {item_id} not found in features file')
                res[idx, :] = 0    
            else:
                assert np.isnan(value).sum() == 0, "Feature has NaN Values"
                res[idx, :] = value

        return res

for c in combinations.itertuples():
    print(c)
    features_array = copy_features(features_file=c.image_features)
    embed_dest = c.dest_folder / 'embed_image.npy'
    np.save(embed_dest, features_array)

    print(f'{str(embed_dest)}: {embed_dest.stat().st_size // 2**20}MiB')

copy_features.cache_clear()

Pandas(Index=0, image='vit', text='bert', image_features=PosixPath('data/amazon/Movies_and_TV_vit_features.npz'), text_features=PosixPath('data/amazon/Movies_and_TV_bert_features.npz'), dest_folder=PosixPath('data/Movies_and_TV-vit_bert'))
Opening file data/amazon/Movies_and_TV_vit_features.npz
Initializing array (22632, 1024)


Copying features:   1%|          | 246/22.6k [00:00<00:27, 802items/s] 

Item B0015XHR5C not found in features file


Copying features:  11%|█         | 2.50k/22.6k [00:03<00:23, 862items/s]

Item B001C3O6QS not found in features file


Copying features:  22%|██▏       | 4.94k/22.6k [00:06<00:20, 864items/s]

Item B0016MOWOG not found in features file


Copying features:  26%|██▌       | 5.79k/22.6k [00:07<00:21, 781items/s]

Item B0014VPFPO not found in features file
Item B0017RFXRK not found in features file


Copying features:  29%|██▉       | 6.55k/22.6k [00:08<00:22, 707items/s]

Item B001ADKATC not found in features file


Copying features:  31%|███       | 6.92k/22.6k [00:08<00:21, 733items/s]

Item B0017ANB08 not found in features file


Copying features:  33%|███▎      | 7.38k/22.6k [00:09<00:20, 760items/s]

Item B0014CKCCY not found in features file


Copying features:  35%|███▍      | 7.88k/22.6k [00:10<00:17, 820items/s]

Item B0013KU93E not found in features file


Copying features:  40%|███▉      | 8.98k/22.6k [00:11<00:19, 686items/s]

Item B0013LRKRQ not found in features file


Copying features:  42%|████▏     | 9.57k/22.6k [00:12<00:17, 748items/s]

Item B001AQR39O not found in features file
Item B001B3LIOC not found in features file


Copying features:  61%|██████▏   | 13.9k/22.6k [00:17<00:11, 770items/s]

Item B0018CWW96 not found in features file


Copying features:  63%|██████▎   | 14.3k/22.6k [00:18<00:11, 755items/s]

Item B0014F20GM not found in features file


Copying features:  71%|███████   | 16.1k/22.6k [00:20<00:08, 813items/s]

Item B0019ESNGE not found in features file
Item B0013TR4GK not found in features file


Copying features:  74%|███████▍  | 16.8k/22.6k [00:21<00:07, 761items/s]

Item B001ANQY7Y not found in features file


Copying features:  88%|████████▊ | 19.8k/22.6k [00:25<00:03, 850items/s]

Item B0017W22LK not found in features file


Copying features:  96%|█████████▌| 21.8k/22.6k [00:27<00:01, 768items/s]

Item B0013K7ZUE not found in features file


Copying features:  98%|█████████▊| 22.1k/22.6k [00:27<00:00, 815items/s]

Item B0013LRKV2 not found in features file


Copying features: 100%|██████████| 22.6k/22.6k [00:28<00:00, 791items/s]


data/Movies_and_TV-vit_bert/embed_image.npy: 176MiB
Pandas(Index=1, image='none', text='bert', image_features=None, text_features=PosixPath('data/amazon/Movies_and_TV_bert_features.npz'), dest_folder=PosixPath('data/Movies_and_TV-none_bert'))
Returning Zeros
data/Movies_and_TV-none_bert/embed_image.npy: 22MiB
Pandas(Index=2, image='vit', text='none', image_features=PosixPath('data/amazon/Movies_and_TV_vit_features.npz'), text_features=None, dest_folder=PosixPath('data/Movies_and_TV-vit_none'))
data/Movies_and_TV-vit_none/embed_image.npy: 176MiB
Pandas(Index=3, image='clip', text='clip', image_features=PosixPath('data/amazon/Movies_and_TV_clipimage_features.npz'), text_features=PosixPath('data/amazon/Movies_and_TV_cliptext_features.npz'), dest_folder=PosixPath('data/Movies_and_TV-clip_clip'))
Opening file data/amazon/Movies_and_TV_clipimage_features.npz
Initializing array (22632, 768)


Copying features:   0%|          | 91.0/22.6k [00:00<00:24, 909items/s]

Item B0015XHR5C not found in features file


Copying features:  11%|█         | 2.51k/22.6k [00:03<00:24, 816items/s]

Item B001C3O6QS not found in features file


Copying features:  22%|██▏       | 4.91k/22.6k [00:05<00:21, 836items/s]

Item B0016MOWOG not found in features file


Copying features:  26%|██▌       | 5.85k/22.6k [00:07<00:19, 872items/s]

Item B0014VPFPO not found in features file
Item B0017RFXRK not found in features file


Copying features:  29%|██▉       | 6.64k/22.6k [00:08<00:18, 851items/s]

Item B001ADKATC not found in features file


Copying features:  30%|███       | 6.90k/22.6k [00:08<00:18, 839items/s]

Item B0017ANB08 not found in features file


Copying features:  33%|███▎      | 7.40k/22.6k [00:08<00:18, 834items/s]

Item B0014CKCCY not found in features file


Copying features:  35%|███▍      | 7.91k/22.6k [00:09<00:17, 823items/s]

Item B0013KU93E not found in features file


Copying features:  40%|███▉      | 9.00k/22.6k [00:10<00:16, 836items/s]

Item B0013LRKRQ not found in features file


Copying features:  42%|████▏     | 9.56k/22.6k [00:11<00:14, 907items/s]

Item B001AQR39O not found in features file
Item B001B3LIOC not found in features file


Copying features:  62%|██████▏   | 14.0k/22.6k [00:16<00:09, 887items/s]

Item B0018CWW96 not found in features file


Copying features:  63%|██████▎   | 14.2k/22.6k [00:17<00:10, 768items/s]

Item B0014F20GM not found in features file


Copying features:  71%|███████   | 16.1k/22.6k [00:19<00:09, 707items/s]

Item B0019ESNGE not found in features file
Item B0013TR4GK not found in features file


Copying features:  74%|███████▍  | 16.7k/22.6k [00:20<00:08, 736items/s]

Item B001ANQY7Y not found in features file


Copying features:  87%|████████▋ | 19.8k/22.6k [00:25<00:04, 704items/s]

Item B0017W22LK not found in features file


Copying features:  96%|█████████▌| 21.8k/22.6k [00:27<00:01, 823items/s]

Item B0013K7ZUE not found in features file


Copying features:  98%|█████████▊| 22.1k/22.6k [00:28<00:00, 820items/s]

Item B0013LRKV2 not found in features file


Copying features: 100%|██████████| 22.6k/22.6k [00:28<00:00, 784items/s]


data/Movies_and_TV-clip_clip/embed_image.npy: 132MiB
Pandas(Index=4, image='none', text='clip', image_features=None, text_features=PosixPath('data/amazon/Movies_and_TV_cliptext_features.npz'), dest_folder=PosixPath('data/Movies_and_TV-none_clip'))
data/Movies_and_TV-none_clip/embed_image.npy: 22MiB
Pandas(Index=5, image='clip', text='none', image_features=PosixPath('data/amazon/Movies_and_TV_clipimage_features.npz'), text_features=None, dest_folder=PosixPath('data/Movies_and_TV-clip_none'))
data/Movies_and_TV-clip_none/embed_image.npy: 132MiB
Pandas(Index=6, image='none', text='none', image_features=None, text_features=None, dest_folder=PosixPath('data/Movies_and_TV-none_none'))
data/Movies_and_TV-none_none/embed_image.npy: 22MiB


In [21]:
for c in combinations.itertuples():
    print(c)
    features_array = copy_features(features_file=c.text_features)
    embed_dest = c.dest_folder / 'embed_text.npy'
    np.save(embed_dest, features_array)

    print(f'{str(embed_dest)}: {embed_dest.stat().st_size // 2**20}MiB')

copy_features.cache_clear()

Pandas(Index=0, image='vit', text='bert', image_features=PosixPath('data/amazon/Movies_and_TV_vit_features.npz'), text_features=PosixPath('data/amazon/Movies_and_TV_bert_features.npz'), dest_folder=PosixPath('data/Movies_and_TV-vit_bert'))
Opening file data/amazon/Movies_and_TV_bert_features.npz
Initializing array (22632, 1024)


Copying features:   0%|          | 74.0/22.6k [00:00<00:30, 731items/s]

Item B0015XHR5C not found in features file


Copying features:  11%|█▏        | 2.58k/22.6k [00:03<00:22, 903items/s]

Item B001C3O6QS not found in features file


Copying features:  22%|██▏       | 4.95k/22.6k [00:06<00:19, 914items/s]

Item B0016MOWOG not found in features file


Copying features:  26%|██▌       | 5.83k/22.6k [00:07<00:21, 782items/s]

Item B0014VPFPO not found in features file
Item B0017RFXRK not found in features file


Copying features:  29%|██▉       | 6.58k/22.6k [00:08<00:19, 818items/s]

Item B001ADKATC not found in features file


Copying features:  30%|███       | 6.89k/22.6k [00:08<00:22, 696items/s]

Item B0017ANB08 not found in features file


Copying features:  32%|███▏      | 7.32k/22.6k [00:09<00:24, 613items/s]

Item B0014CKCCY not found in features file


Copying features:  35%|███▍      | 7.90k/22.6k [00:10<00:18, 788items/s]

Item B0013KU93E not found in features file


Copying features:  40%|███▉      | 9.02k/22.6k [00:11<00:18, 727items/s]

Item B0013LRKRQ not found in features file


Copying features:  42%|████▏     | 9.59k/22.6k [00:12<00:15, 820items/s]

Item B001AQR39O not found in features file
Item B001B3LIOC not found in features file


Copying features:  61%|██████▏   | 13.9k/22.6k [00:18<00:10, 813items/s]

Item B0018CWW96 not found in features file


Copying features:  63%|██████▎   | 14.3k/22.6k [00:18<00:10, 807items/s]

Item B0014F20GM not found in features file


Copying features:  71%|███████▏  | 16.1k/22.6k [00:21<00:09, 716items/s]

Item B0019ESNGE not found in features file
Item B0013TR4GK not found in features file


Copying features:  74%|███████▍  | 16.8k/22.6k [00:22<00:07, 806items/s]

Item B001ANQY7Y not found in features file


Copying features:  87%|████████▋ | 19.8k/22.6k [00:25<00:03, 752items/s]

Item B0017W22LK not found in features file


Copying features:  96%|█████████▋| 21.8k/22.6k [00:28<00:01, 810items/s]

Item B0013K7ZUE not found in features file


Copying features:  97%|█████████▋| 22.1k/22.6k [00:28<00:00, 782items/s]

Item B0013LRKV2 not found in features file


Copying features: 100%|██████████| 22.6k/22.6k [00:29<00:00, 765items/s]


data/Movies_and_TV-vit_bert/embed_text.npy: 176MiB
Pandas(Index=1, image='none', text='bert', image_features=None, text_features=PosixPath('data/amazon/Movies_and_TV_bert_features.npz'), dest_folder=PosixPath('data/Movies_and_TV-none_bert'))
data/Movies_and_TV-none_bert/embed_text.npy: 176MiB
Pandas(Index=2, image='vit', text='none', image_features=PosixPath('data/amazon/Movies_and_TV_vit_features.npz'), text_features=None, dest_folder=PosixPath('data/Movies_and_TV-vit_none'))
Returning Zeros
data/Movies_and_TV-vit_none/embed_text.npy: 22MiB
Pandas(Index=3, image='clip', text='clip', image_features=PosixPath('data/amazon/Movies_and_TV_clipimage_features.npz'), text_features=PosixPath('data/amazon/Movies_and_TV_cliptext_features.npz'), dest_folder=PosixPath('data/Movies_and_TV-clip_clip'))
Opening file data/amazon/Movies_and_TV_cliptext_features.npz
Initializing array (22632, 768)


Copying features:   0%|          | 88.0/22.6k [00:00<00:25, 869items/s]

Item B0015XHR5C not found in features file


Copying features:  11%|█         | 2.48k/22.6k [00:03<00:30, 662items/s]

Item B001C3O6QS not found in features file


Copying features:  22%|██▏       | 4.89k/22.6k [00:06<00:21, 816items/s]

Item B0016MOWOG not found in features file


Copying features:  26%|██▌       | 5.81k/22.6k [00:07<00:20, 820items/s]

Item B0014VPFPO not found in features file
Item B0017RFXRK not found in features file


Copying features:  29%|██▉       | 6.62k/22.6k [00:08<00:20, 786items/s]

Item B001ADKATC not found in features file


Copying features:  31%|███       | 6.96k/22.6k [00:09<00:18, 836items/s]

Item B0017ANB08 not found in features file


Copying features:  33%|███▎      | 7.41k/22.6k [00:09<00:17, 882items/s]

Item B0014CKCCY not found in features file


Copying features:  35%|███▍      | 7.86k/22.6k [00:10<00:16, 881items/s]

Item B0013KU93E not found in features file


Copying features:  40%|███▉      | 9.03k/22.6k [00:11<00:15, 888items/s]

Item B0013LRKRQ not found in features file


Copying features:  42%|████▏     | 9.57k/22.6k [00:12<00:14, 898items/s]

Item B001AQR39O not found in features file
Item B001B3LIOC not found in features file


Copying features:  62%|██████▏   | 13.9k/22.6k [00:16<00:09, 933items/s]

Item B0018CWW96 not found in features file


Copying features:  63%|██████▎   | 14.3k/22.6k [00:17<00:09, 922items/s]

Item B0014F20GM not found in features file


Copying features:  71%|███████   | 16.1k/22.6k [00:19<00:08, 766items/s]

Item B0019ESNGE not found in features file
Item B0013TR4GK not found in features file


Copying features:  74%|███████▍  | 16.8k/22.6k [00:20<00:06, 859items/s]

Item B001ANQY7Y not found in features file


Copying features:  88%|████████▊ | 19.9k/22.6k [00:23<00:03, 925items/s]

Item B0017W22LK not found in features file


Copying features:  96%|█████████▋| 21.8k/22.6k [00:25<00:00, 956items/s]

Item B0013K7ZUE not found in features file


Copying features:  98%|█████████▊| 22.1k/22.6k [00:26<00:00, 989items/s]

Item B0013LRKV2 not found in features file


Copying features: 100%|██████████| 22.6k/22.6k [00:26<00:00, 851items/s]


data/Movies_and_TV-clip_clip/embed_text.npy: 132MiB
Pandas(Index=4, image='none', text='clip', image_features=None, text_features=PosixPath('data/amazon/Movies_and_TV_cliptext_features.npz'), dest_folder=PosixPath('data/Movies_and_TV-none_clip'))
data/Movies_and_TV-none_clip/embed_text.npy: 132MiB
Pandas(Index=5, image='clip', text='none', image_features=PosixPath('data/amazon/Movies_and_TV_clipimage_features.npz'), text_features=None, dest_folder=PosixPath('data/Movies_and_TV-clip_none'))
data/Movies_and_TV-clip_none/embed_text.npy: 22MiB
Pandas(Index=6, image='none', text='none', image_features=None, text_features=None, dest_folder=PosixPath('data/Movies_and_TV-none_none'))
data/Movies_and_TV-none_none/embed_text.npy: 22MiB
