In [1]:
import os
from pathlib import Path
from typing import Tuple

import numpy as np
import numpy.typing as npt
import openai
import pandas as pd
from numpy.typing import NDArray
from scipy import sparse

import amazon_dataset

DATASET = 'Clothing_Shoes_and_Jewelry'

# Every per-dataset input lives under data/<DATASET>/; the SEM-MacridVAE run has its own folder.
_DATASET_DIR = Path('data') / DATASET
_SEMMACRID_RUN_DIR = Path('data') / f'SEM-MacridVAE-{DATASET}-run'

DMRL_RES_PATH = _DATASET_DIR / 'best_scores_vitbert_dmrl.npz'
SEMMACRID_RES_PATH = _SEMMACRID_RUN_DIR / 'eval.npz'
TRAIN_PATH = _DATASET_DIR / 'train.txt'
TEST_PATH = _DATASET_DIR / 'test.txt'
ITEMS_PATH = _DATASET_DIR / 'items.txt'
USERS_PATH = _DATASET_DIR / 'users.txt'

In [2]:
# Fail fast, before any heavy loading, if a required input file is missing.
for _required_path in (
    DMRL_RES_PATH,
    SEMMACRID_RES_PATH,
    TRAIN_PATH,
    TEST_PATH,
    ITEMS_PATH,
    USERS_PATH,
):
    assert _required_path.exists()

# The explanation-generation section reads the OpenAI key from this variable.
assert 'OPENAI_APIKEY' in os.environ

#  Load all data

In [3]:
# NOTE(security): allow_pickle=True can execute arbitrary code while unpickling;
# only load result files produced by trusted runs.
# The context manager closes the .npz archive once the scores are in memory
# (the original left the file handle open for the kernel's lifetime).
with np.load(DMRL_RES_PATH, allow_pickle=True) as _dmrl_npz:
    # Take the first array stored in the archive and stack it into one 2-D
    # (users x items) score matrix.
    dmrl_results: NDArray[np.floating] = np.vstack(next(iter(_dmrl_npz.values())))
dmrl_results.shape

(23318, 38493)

In [4]:
# NOTE(security): allow_pickle=True can execute arbitrary code while unpickling;
# only load result files produced by trusted runs.
# The context manager closes the .npz archive once the scores are in memory
# (the original left the file handle open for the kernel's lifetime).
with np.load(SEMMACRID_RES_PATH, allow_pickle=True) as _sem_macrid_npz:
    # First array stored in the archive: the (users x items) score matrix.
    sem_macrid_results: NDArray[np.floating] = next(iter(_sem_macrid_npz.values()))
sem_macrid_results.shape

(23318, 38493)

In [5]:
assert sem_macrid_results.shape == dmrl_results.shape

Get users

In [7]:
users = pd.read_csv(USERS_PATH)
assert len(users) == sem_macrid_results.shape[0]
users

Unnamed: 0.1,Unnamed: 0,0
0,A30NKRF3KBGA06,0
1,AL0XGCBE6Z22M,1
2,AMT5LF0TKY67C,2
3,A2BY8EVXA3NRHD,3
4,AWE6KR1ELIYQ3,4
...,...,...
23313,A1MFBF49ZFMH2N,23313
23314,A36AF5I7D0VO8F,23314
23315,A35ZS7JT3G9B8,23315
23316,A3GC94SEKQI3QU,23316


Get items

In [8]:
items = pd.read_csv(ITEMS_PATH, index_col=0)
assert len(items) == sem_macrid_results.shape[1]
items

Unnamed: 0,item_id
B000B6AV7K,0
B0143D7EE4,1
B0105V2DEY,2
B014EY21H2,3
B005LUROIK,4
...,...
B00A9R2P7A,38488
B017HK485S,38489
B008H7UKYY,38490
B006K6PJTK,38491


In [9]:
def load_interaction_matrix(file: Path, shape: Tuple[int, int]) -> sparse.csr_matrix:
    """Read a CSV of (user, item) interactions into a sparse float matrix.

    Args:
        file: CSV file with at least the integer columns ``user`` and ``item``.
        shape: ``(n_users, n_items)`` of the resulting matrix.

    Returns:
        A CSR matrix holding 1.0 at every listed ``(user, item)`` position;
        a pair that is listed several times is summed.
    """
    interactions = pd.read_csv(file)
    user_idx = interactions['user'].to_numpy()
    item_idx = interactions['item'].to_numpy()

    # One unit of weight per CSV row.
    weights = np.ones(len(interactions), dtype=user_idx.dtype)
    return sparse.csr_matrix((weights, (user_idx, item_idx)), shape=shape, dtype=float)


# Both splits use the same (n_users, n_items) shape as the users/items tables.
interaction_shape = (len(users), len(items))
train = load_interaction_matrix(TRAIN_PATH, interaction_shape)
test = load_interaction_matrix(TEST_PATH, interaction_shape)

# Analyze results

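Mask the items each user already interacted with during training by setting their scores to $-\infty$, so they can never appear among the top recommendations.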

In [10]:
# Coordinates of every (user, item) pair seen in training; computed once and reused.
seen_users, seen_items = train.nonzero()

# -inf guarantees these entries lose every top-k comparison.
dmrl_results[seen_users, seen_items] = -np.inf
sem_macrid_results[seen_users, seen_items] = -np.inf

Compare how much the two models agree on their top-5 recommendations for each user.

In [11]:
k = 5

# argpartition only guarantees that the k highest-scoring items end up in the
# last k columns; the order inside those k columns is unspecified.
best_dmrl = np.argpartition(dmrl_results, -k, axis=1)[:, -k:]
best_sem_macridvae = np.argpartition(sem_macrid_results, -k, axis=1)[:, -k:]

# Get metrics

In [12]:
def get_top_k_items(x: npt.NDArray, k: int) -> npt.NDArray:
    """Return, for each row of ``x``, the column indices of its ``k`` largest values.

    Args:
        x: 2-D score matrix (one row per user, one column per item).
        k: Number of items to keep per row.

    Returns:
        Integer array with ``k`` columns; within each row the indices are
        ordered from the highest score to the lowest.
    """
    # Unordered candidate set: partial sort puts the k largest in the last k columns.
    candidates = np.argpartition(x, kth=-k, axis=1)[:, -k:]
    candidate_scores = np.take_along_axis(x, candidates, axis=1)

    # argsort is ascending, so reverse it to rank candidates best-first.
    ranking = np.argsort(candidate_scores, axis=1)[:, ::-1][:, :k]

    # Map positions within the candidate set back to original column indices.
    return np.take_along_axis(candidates, ranking, axis=1)

def ndcg_at_k(scores: npt.NDArray, test: sparse.csr_matrix, k: int) -> npt.NDArray:
    """Compute NDCG@k for every row (user) of ``scores``.

    Args:
        scores: Dense score matrix with the same shape as ``test``.
        test: Sparse matrix of held-out interactions; any entry > 0 is a positive.
        k: Cut-off rank.

    Returns:
        One NDCG value per row (not a single float); callers aggregate with
        ``.mean()``. Rows without any positive in ``test`` evaluate to 0.
    """
    assert scores.shape == test.shape
    # Despite the name, this holds item *indices* of the top-k scores, best first.
    best_scores = get_top_k_items(scores, k)
    
    # Binarize the relevance labels.
    test = test > 0
    # Number of positives that could possibly be retrieved in k slots.
    num_positives = np.minimum(k, test.sum(axis=1)).astype(np.int32)
    
    # Row-index grid matching best_scores, so test[rows, best_scores] picks the
    # label of each recommended item.
    rows = np.indices(best_scores.shape)[0]
    # Discount log2(rank + 1) for ranks 1..k.
    denominator = np.log2(np.arange(2, k + 2))
    
    # Ideal DCG: cumulative discounted gain of num_positives hits at the top.
    # The index is clamped at 0, so rows with no positives get idcg == 1 and the
    # division below never divides by zero.
    idcg = np.cumsum(1 / denominator)[np.maximum(num_positives - 1, 0)]
    
    dcg = np.sum(test[rows, best_scores] / denominator, axis=1)

    ndcg = dcg / idcg
    return ndcg

def recall_at_k(scores: npt.NDArray, test: sparse.csr_matrix, k: int) -> npt.NDArray:
    """Compute Recall@k for every row (user) of ``scores``.

    The denominator is ``min(k, number of positives)``, i.e. the best hit count
    achievable in ``k`` slots, so a perfect top-k always scores 1.

    Args:
        scores: Dense score matrix with the same shape as ``test``.
        test: Sparse matrix of held-out interactions; any entry > 0 is a positive.
        k: Cut-off rank.

    Returns:
        One recall value per row (not a single float); callers aggregate with
        ``.mean()``. Rows without any positive in ``test`` evaluate to 0,
        matching how ``ndcg_at_k`` treats them.
    """
    assert scores.shape == test.shape
    # Item indices of the top-k scores per row.
    best_scores = get_top_k_items(scores, k)

    # Binarize the relevance labels.
    test = test > 0
    num_positives = np.minimum(k, test.sum(axis=1)).astype(np.int32)

    # Row-index grid so test[rows, best_scores] picks each recommended item's label.
    rows = np.indices(best_scores.shape)[0]
    hits = np.sum(test[rows, best_scores], axis=1)

    # Rows with no positives have zero hits; clamping the denominator to 1 yields
    # 0 for them instead of 0/0 = NaN, which would turn the overall mean into NaN.
    recall = hits / np.maximum(num_positives, 1)
    return recall

def precision_at_k(scores: npt.NDArray, test: sparse.csr_matrix, k: int) -> npt.NDArray:
    """Compute Precision@k for every row (user) of ``scores``.

    Args:
        scores: Dense score matrix with the same shape as ``test``.
        test: Sparse matrix of held-out interactions; any entry > 0 is a positive.
        k: Cut-off rank; also the denominator of the precision.

    Returns:
        One precision value per row (not a single float).
    """
    assert scores.shape == test.shape
    # Despite the name, this holds item *indices* of the top-k scores, best first.
    best_scores = get_top_k_items(scores, k)
    
    # Binarize the relevance labels.
    test = test > 0
    
    # Row-index grid so test[rows, best_scores] picks each recommended item's label.
    rows = np.indices(best_scores.shape)[0]
    # Hits among the k recommendations divided by k.
    precision = np.sum(test[rows, best_scores], axis=1) / k
    return precision

def average_precision(scores: npt.NDArray, test: sparse.csr_matrix) -> npt.NDArray:
    """Compute average precision for every row (user) of ``scores``.

    The ranking is truncated at ``k = max number of positives of any user``;
    precision is accumulated at each rank holding a positive and divided by the
    user's own number of positives.

    Args:
        scores: Dense score matrix with the same shape as ``test``.
        test: Sparse matrix of held-out interactions; any entry > 0 is a positive.

    Returns:
        1-D array with one average-precision value per row (not a single
        float). Rows without any positive in ``test`` evaluate to 0.
    """
    assert scores.shape == test.shape

    # Binarize the relevance labels.
    test = test > 0
    num_positives = np.asarray(test.sum(axis=1).astype(np.int32)).reshape(-1)

    # No users, or no positives at all: nothing to rank, every AP is 0.
    if num_positives.size == 0 or num_positives.max() == 0:
        return np.zeros(scores.shape[0])

    k = num_positives.max()
    # Item indices of the top-k scores per row, best first.
    best_scores = get_top_k_items(scores, k)

    # Row-index grid so test[rows, best_scores] picks each recommended item's label.
    rows = np.indices(best_scores.shape)[0]
    labels = test[rows, best_scores].toarray()

    # Precision at each rank 1..k.
    true_positives_at_k = np.cumsum(labels, axis=1)
    precisions_at_k = true_positives_at_k / np.arange(1, k + 1)

    # Only ranks that hold a positive contribute. Rows without positives have a
    # zero numerator; clamping the denominator to 1 gives 0 instead of 0/0 = NaN.
    return (precisions_at_k * labels).sum(axis=1) / np.maximum(num_positives, 1)


print(f'NDCG@5:      {ndcg_at_k(dmrl_results, test, 5).mean():.6f}')
print(f'Recall@5:    {recall_at_k(dmrl_results, test, 5).mean():.6f}')
print(f'MAP:         {average_precision(dmrl_results, test).mean():.6f}')


NDCG@5:      0.203675
Recall@5:    0.240542
MAP:         0.190090


In [13]:
def build_metrics(results_path: Path, mask_train: bool = True):
    """Load one saved score matrix and compute its ranking metrics against ``test``.

    Args:
        results_path: ``.npz`` archive whose first array holds the score matrix.
        mask_train: When true (default), scores of items present in ``train``
            are set to ``-inf`` before ranking, exactly as is done for the
            matrices evaluated earlier in this notebook; without it the
            numbers are not comparable with those.

    Returns:
        Dict with the file stem under ``'path'`` and the mean ``'ndcg@5'``,
        ``'recall@5'`` and ``'map'``.
    """
    print(f'Loading results from {results_path}')
    # NOTE(security): allow_pickle=True can execute arbitrary code while
    # unpickling; only load result files produced by trusted runs.
    # The context manager closes the archive once the array is in memory.
    with np.load(results_path, allow_pickle=True) as npz_file:
        results = np.vstack(next(iter(npz_file.values())))

    if mask_train:
        # Items already seen in training must not occupy top-k slots.
        results[train.nonzero()] = -np.inf

    print(f'Computing metrics for {results_path}')
    return {
            'path': results_path.stem,
            'ndcg@5': ndcg_at_k(results, test, 5).mean(),
            'recall@5': recall_at_k(results, test, 5).mean(),
            'map': average_precision(results, test).mean(),
        }

def build_results_matrix():
    """Compute metrics for every ``best_scores_*.npz`` file of the current dataset.

    Returns:
        DataFrame indexed by ``'path'`` (the file stem) with one row of metrics
        per score file found under ``data/<DATASET>``.
    """
    score_files = Path(f'data/{DATASET}').glob('best_scores_*.npz')

    records = []
    for score_file in score_files:
        records.append(build_metrics(score_file))

    return pd.DataFrame.from_records(records, index='path')

# Left commented out because it takes a long time to compute; uncomment to run
# all_metrics = build_results_matrix()
# all_metrics

Arrays are very memory intensive so we delete the original results

In [14]:
import gc
del sem_macrid_results, dmrl_results
gc.collect()

0

In [15]:
best_dmrl

array([[ 4753, 30949,  7959, 19594, 23563],
       [16992, 17106,  2121, 23339,  8913],
       [ 7970, 13521, 32930, 24926, 29933],
       ...,
       [14556,  3482, 18381,  4866, 24574],
       [32438,  2918, 24346, 32060, 13147],
       [ 6217, 33684, 13116, 27597, 34301]])

In [16]:
best_sem_macridvae

array([[35166, 26336, 12593, 18611, 24346],
       [18611,  5797,  2121, 23339,  8913],
       [31447, 10080, 12593, 16656, 32930],
       ...,
       [35166, 21514,  3482, 12593, 18611],
       [24346, 13147, 32438, 18611, 12593],
       [12593, 13116, 33684, 27597, 34301]])

In [17]:
best_counts_semmacridvae = pd.Series(best_sem_macridvae.flatten()).value_counts(normalize=True).to_frame('percentage').head(20)
best_counts_semmacridvae

Unnamed: 0,percentage
12593,0.117617
18611,0.102976
21514,0.07161
24346,0.06112
35166,0.054782
26336,0.046728
24956,0.040158
32060,0.031281
18492,0.011914
1235,0.007016


In [18]:
best_counts_dmrl = pd.Series(best_dmrl.flatten()).value_counts(normalize=True).to_frame('percentage').head(20)
best_counts_dmrl

Unnamed: 0,percentage
9258,0.007608
17254,0.006484
6112,0.006416
1235,0.006073
381,0.005635
19635,0.005567
6802,0.005438
6027,0.005412
12593,0.005232
921,0.00506


Items that appear in both models' lists of the 20 most frequently recommended items

In [22]:
set(best_counts_dmrl.index) & set(best_counts_semmacridvae.index)

{1235, 12593, 34043}

In [75]:
products = amazon_dataset.products_df(DATASET)
products.sample(n=3)

Unnamed: 0_level_0,asin,description,title,brand,main_cat,rank,price,image_slug,image_url,feature,category,tech_detail
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2416933,B01BFXNXWM,This butt lift skinny Jeans made of ultra soft...,HyBrid & Company Women's Butt Lift V2 Super Co...,HyBrid & Company,,"31,452inClothing,ShoesJewelry(",$11.79 - $24.99,[31worXZSHQL],[https://m.media-amazon.com/images/I/31worXZSH...,"[78% COTTON 20% POLYESTER 2% SPANDEX, Enhance ...","[Clothing, Shoes & Jewelry, Women, Clothing, J...",
79943,B00131FDTS,"Rain Rain, go away""...you won't be singing thi...",Nomad Women's Puddles Rain Boot,Nomad,,"24,578inClothing,ShoesJewelry(",$22.75 - $65.50,[51ja3jXc04L],[https://m.media-amazon.com/images/I/51ja3jXc0...,"[100% Rubber, Imported, NOTE: This style is av...","[Clothing, Shoes & Jewelry, Women, Shoes, Outd...",
2080052,B013F4GPD8,Plus4u is a company which carries variety styl...,Plus4u Women's Basic Solid Sleeveless Plus Siz...,Plus4u,,"553,119inClothing,ShoesJewelry(",$7.67 - $14.99,[313Orw0FrmL],[https://images-na.ssl-images-amazon.com/image...,"[AWTTSL076, AWTTK0477 (100% RAYON) / AWTTSL077...","[Clothing, Shoes & Jewelry, Women, Clothing, T...",


In [174]:


def analyze_train_vs_predicted(best_results: npt.NDArray[np.integer], user_id=None):
    """Print the training items and the recommended items of one user, by title.

    Args:
        best_results: Integer array with one row per user holding the item ids
            recommended to that user.
        user_id: Row index of the user to inspect. When ``None`` (default) a
            user is drawn at random with ``np.random.choice``; pass a value to
            make the cell reproducible.

    Reads the notebook-level ``train``, ``items`` and ``products`` objects and
    prints to stdout; returns nothing.
    """
    sample = np.random.choice(train.shape[0]) if user_id is None else user_id

    # Only the chosen user's row is needed, so avoid materialising Python sets
    # for every user (the original looped over all non-zero training entries).
    train_item_ids = train[sample].nonzero()[1].tolist()
    # dict.fromkeys de-duplicates while keeping the order given in best_results.
    predicted_item_ids = list(dict.fromkeys(np.asarray(best_results[sample]).tolist()))

    # items is indexed by ASIN; attach product metadata and re-index by item_id.
    products_by_item_id = items.join(products.set_index('asin')).set_index('item_id')

    products_train = products_by_item_id.loc[train_item_ids]
    products_predicted = products_by_item_id.loc[predicted_item_ids]

    print('User ID', sample)
    print('Items train')
    print('---------')
    # Widen title display only for these prints instead of changing the global
    # pandas option for the rest of the notebook.
    with pd.option_context('display.max_colwidth', 80):
        print(products_train.title)
        print('Items predicted')
        print('---------')
        print(products_predicted.title)
        

analyze_train_vs_predicted(best_sem_macridvae)

User ID 1702
Items train
---------
item_id
17385          LEE Women's Relaxed Fit All Day Straight Leg Pant
851            LEE Women's Relaxed Fit All Day Straight Leg Pant
11751    Hanes Men's Short-Sleeve Jersey Pocket Polo (Pack of 2)
Name: title, dtype: object
Items predicted
---------
item_id
15365    Rekucci Women's Ease In To Comfort Fit Barely Bootcut Stretch Pants
20009                       Hanes Men's 5-Pack Exposed Waistband Knit Boxers
21514                 Hanes Men's Pullover EcoSmart Fleece Hooded Sweatshirt
25362                              LEE Women's Relaxed Fit Straight Leg Jean
24346                                         Teva Women's Olowahu Flip-Flop
Name: title, dtype: object


: 

Analyze product duplication: the share of distinct normalized titles that occur more than once in each dataset

In [170]:
import re
import hashlib

products = amazon_dataset.products_df('Clothing_Shoes_and_Jewelry')

def gen_hash(s) -> str:
    """Return an MD5 hex digest of a normalized title, for duplicate detection.

    Normalization lower-cases the text, removes every non-word character
    (including whitespace) and drops non-ASCII characters, so titles differing
    only in case, punctuation or spacing hash identically.

    Args:
        s: The title. Anything that is not a ``str`` (``None``, or the float
            NaN pandas uses for missing values) is treated as missing.

    Returns:
        The 32-character hex digest, or the sentinel ``'???'`` for a missing title.
    """
    # pandas represents missing text as float NaN as well as None; both lack
    # .lower(), so checking only for None would crash on NaN.
    if not isinstance(s, str):
        return '???'

    cleaned = re.sub(r'\W+', '', s.lower().strip()).encode('ascii', errors='ignore')
    # MD5 is used purely as a fingerprint here, not for security.
    return hashlib.md5(cleaned).hexdigest()

def analyze_dupicates():
    """Print, per dataset, the percentage of distinct title hashes seen more than once.

    Titles are normalized and hashed with ``gen_hash``; the percentage is the
    number of hashes occurring at least twice divided by the number of
    distinct hashes.
    """
    dataset_names = (
        'Clothing_Shoes_and_Jewelry',
        'Movies_and_TV',
        'Home_and_Kitchen',
        'Musical_Instruments',
    )
    for dataset in dataset_names:
        catalog = amazon_dataset.products_df(dataset)
        title_hashes = catalog['title'].apply(gen_hash)
        occurrences = title_hashes.value_counts()

        repeated = (occurrences > 1).sum()
        duplicates = repeated / len(occurrences)
        print(f'{dataset}: {100*duplicates:.2f}%')

analyze_dupicates()

Clothing_Shoes_and_Jewelry: 4.77%
Movies_and_TV: 1.94%
Home_and_Kitchen: 0.50%
Musical_Instruments: 0.28%


In [91]:
pd.Series(np.asarray(test.sum(axis=0))[0].astype(int)).sort_values(ascending=False).head(20)

18031    168
38308    165
26373    164
3520     163
8703     160
34043    153
28568    153
10284    152
6388     152
5372     146
1752     140
12593    140
13038    138
23510    131
33683    130
307      123
19838    118
35166     91
31411     82
6536      78
dtype: int64

In [21]:
pd.Series(train.sum(axis=0).tolist()[0]).sort_values(ascending=False)

12593    617.0
33683    598.0
19838    590.0
307      569.0
10284    567.0
         ...  
31833      0.0
8552       0.0
8557       0.0
8558       0.0
21618      0.0
Length: 38493, dtype: float64

In [72]:
items.iloc[2]

Unnamed: 0    B0105V2DEY
item_id                2
Name: 2, dtype: object

# Generate recommendation explanations

In [15]:
some_recommendations_itemids =  854, 30552, 26624, 32501, 34301
some_asins = items.loc[items['item_id'].isin(some_recommendations_itemids)].index
some_asins

Index(['B013G9AYPC', 'B013G9BBA4', 'B013CA13K0', 'B013CZSN5S', 'B0113OB89S'], dtype='object')

In [18]:
reviews = amazon_dataset.reviews_df(DATASET)
reviews

Unnamed: 0,id,asin,reviewerID,reviewerName,overall,text,reviewTime,summary,verified,vote
0,676,5120053084,A35EUS1E3WK1HC,Kiley and Mars,5.0,"It's a cute top, works good for nursing and la...",2018-04-10,Decent lounge around top,True,
1,677,5120053084,AKIZYAIS4SYVF,Bethany,5.0,Looks really cute and super easy to nurse my d...,2018-03-22,Cute,True,
2,679,5120053084,A2L74OWEP7H1VC,Shelby0516,3.0,The tie is longer than the pictures showed. Ha...,2018-03-14,Awkward tie,True,
3,681,5120053084,A260RMKZXGDHVH,Kelly Kennedy,5.0,Of all the nursing shirts I bought my daughter...,2018-03-07,this one is her favorite. She wears it with pa...,True,
4,1300,7709260373,A13QI8GT2FFGN6,Amy,5.0,For the price... this is awesome!,2018-03-13,this is awesome!,True,
...,...,...,...,...,...,...,...,...,...,...
178939,32291840,B01HJCSCLK,AAHWQ4FMWLNH3,amazonlover,5.0,"Beautiful. Strong, durable, and chic but subtl...",2018-07-25,Beautiful,True,
178940,32291855,B01HJDVCJI,A2WUHKA1I75SL3,FRCP,3.0,Fit is great on these and they are very comfor...,2018-09-03,Comfortable,False,
178941,32291863,B01HJDVCJI,A7B48AJT6IC0A,Lives2read,4.0,Excellent arch support. Unique tongue design c...,2018-08-13,Unique look and comfort,False,
178942,32291875,B01HJDZM30,A2CJOG4NUHVDGK,Brittney Mitchell,5.0,Bought this for my husband and he absolutely l...,2018-08-29,Five Stars,False,


In [16]:
products.loc[products['asin'].isin(some_asins)]['title']

id
2024956               TOMS Men's Classic Canvas Slip-On
2077409    TOMS Seasonal Classics Women's Slip on Shoes
2078298               TOMS Men's Classic Canvas Slip-On
2081419    TOMS Seasonal Classics Women's Slip on Shoes
2081430    TOMS Seasonal Classics Women's Slip on Shoes
Name: title, dtype: object

In [23]:
def some_reviews_joined(reviews_df: pd.DataFrame, asins: pd.Series, n: int = 5, random_state=None):
    """Pick a few positive, reasonably long reviews and format them as a bullet list.

    Args:
        reviews_df: Reviews with at least the columns ``asin``, ``overall`` and ``text``.
        asins: ASINs of the products whose reviews may be used.
        n: Maximum number of reviews to include (default 5). Fewer are returned
            when fewer reviews qualify.
        random_state: Optional seed forwarded to ``Series.sample`` for a
            reproducible selection.

    Returns:
        The selected review texts, each prefixed with ``'- '`` and joined by
        newlines; an empty string when no review qualifies.
    """
    def is_long_enough(text) -> bool:
        # Missing text (None or float NaN) is never long enough.
        return isinstance(text, str) and len(text.split()) > 5

    condition = (
        reviews_df['asin'].isin(asins)
        & (reviews_df['overall'] >= 4)
        & reviews_df['text'].apply(is_long_enough).astype(bool)
    )
    # Filter the frame that was passed in; the original indexed the notebook
    # global `reviews`, silently ignoring the `reviews_df` argument.
    candidates = reviews_df.loc[condition, 'text']
    if candidates.empty:
        return ""

    # Cap the sample size so a product with few reviews does not raise.
    picked = candidates.sample(n=min(n, len(candidates)), random_state=random_state)
    return "\n".join(f'- {text}' for text in picked)

def prompt_for_completion(reviews_df: pd.DataFrame, asins: pd.Series):
    """Ask the OpenAI completion endpoint for a recommendation blurb based on reviews.

    Builds a prompt from a sample of reviews (see ``some_reviews_joined``),
    prints it, and returns the text of the single completion.

    Args:
        reviews_df: Reviews table forwarded to ``some_reviews_joined``.
        asins: ASINs of the products whose reviews feed the prompt.

    Returns:
        The generated text of the only choice in the response.
    """
    bullet_list = some_reviews_joined(reviews_df, asins)
    prompt = (
        "The item has the following comments about it"
        f":\n\n{bullet_list}\n\n"
        'The suggestion for this item is:'
        "\n\n"
    )
    print(prompt)

    # The key comes from the environment so it never lands in the notebook.
    openai.api_key = os.getenv("OPENAI_APIKEY")

    # NOTE(review): `openai.Completion` looks like the pre-1.0 SDK interface —
    # confirm the installed openai version and model availability before re-running.
    request_options = dict(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0,
        max_tokens=60,  # short cap: long answers are cut off mid-sentence
        top_p=1,
        frequency_penalty=0.5,
        presence_penalty=0,
    )
    response = openai.Completion.create(**request_options)

    choices = response["choices"]
    assert len(choices) == 1
    return choices[0]["text"]

print(prompt_for_completion(reviews, some_asins))

The item has the following comments about it:

- Comfortable and adaptable to skirts, shorts, dresses.
- Comfortable and light. My daughter loves them!
- I just received these and I'm in the process of breaking them in.  They are a TAD small with socks on.  I normally wear a size 11, and my toe is basically to the end.  I love the comfort otherwise.  They are very light and are a good hybrid slipper/shoe.
- Great shoe for weekends/lazy days...everyday.  Shoe fitted closely a first but with continue wear it now fit perfectly.  The only thing that disappointed me was the colour faded after its first clean.
- Must have summer shoe!! Perfect Toms style and comfort!!

The suggestion for this item is:


This item is perfect for those looking for a comfortable and adaptable shoe that can be worn with skirts, shorts, and dresses. It is lightweight and provides great comfort. The shoe may fit a bit small at first, but with continued wear it will fit perfectly. The colour may fade after its firs