In [3]:
import os
from pathlib import Path
from typing import Tuple

import numpy as np
import numpy.typing as npt
import openai
import pandas as pd
from numpy.typing import NDArray
from scipy import sparse

import amazon_dataset

# NOTE: SEMMACRID_RES_PATH below is hard-coded and is NOT derived from DATASET,
# so check that it points to results computed for the same dataset as DATASET.
#DATASET = 'Clothing_Shoes_and_Jewelry'
DATASET = 'Movies_and_TV'
SEMMACRID_RES_PATH = Path(f'data/semmacrid-run/amazon-movies-DisenEVAE-50E-100B-0.001L-0.0001W-0.5D-0.2b-7k-200d-0.1t-98765s/eval.npz')
#DMRL_RES_PATH = Path(f'data/{DATASET}/best_scores_dmrl.npz')


# Per-dataset input files, all located under data/<DATASET>/.
TRAIN_PATH = Path(f'data/{DATASET}/train.txt')
TEST_PATH = Path(f'data/{DATASET}/test.txt')
ITEMS_PATH = Path(f'data/{DATASET}/items.txt')
USERS_PATH = Path(f'data/{DATASET}/users.txt')

# Fail fast, before any expensive loading, if a required input is missing.
#assert DMRL_RES_PATH.exists(), f'{DMRL_RES_PATH} does not exist'
assert SEMMACRID_RES_PATH.exists(), f'{SEMMACRID_RES_PATH} does not exist'
assert TRAIN_PATH.exists(), f'{TRAIN_PATH} does not exist'
assert TEST_PATH.exists(), f'{TEST_PATH} does not exist'
assert ITEMS_PATH.exists(), f'{ITEMS_PATH} does not exist'
assert USERS_PATH.exists(), f'{USERS_PATH} does not exist'
# The OpenAI key is read from this environment variable by prompt_for_completion.
assert 'OPENAI_APIKEY' in os.environ

#  Load all data

In [3]:
# NOTE(review): DMRL_RES_PATH is commented out in the configuration cell, so this
# cell raises NameError on a fresh kernel until that constant is re-enabled.
# Takes the first array stored in the .npz archive and stacks its rows into one matrix.
# NOTE: allow_pickle=True can run code embedded in the file; only load trusted archives.
dmrl_results: NDArray[np.floating] = np.vstack(next(iter(np.load(DMRL_RES_PATH, allow_pickle=True).values())))
dmrl_results.shape

(23318, 38493)

In [4]:
# Use the first array stored in the .npz archive as the SEM-MacridVAE score matrix.
# NOTE: allow_pickle=True can run code embedded in the file; only load trusted archives.
sem_macrid_results: NDArray[np.floating] = next(iter(np.load(SEMMACRID_RES_PATH, allow_pickle=True).values()))
sem_macrid_results.shape

(21974, 23958)

In [5]:
# Both score matrices must cover the same grid before they can be compared.
# NOTE(review): needs dmrl_results, whose loading cell depends on the commented-out DMRL_RES_PATH.
assert sem_macrid_results.shape == dmrl_results.shape

Get users

In [5]:
# The number of rows in the users file must match the first axis of the score matrix.
users = pd.read_csv(USERS_PATH)
assert len(users) == sem_macrid_results.shape[0]
users

Unnamed: 0.1,Unnamed: 0,0
0,A1MB7I0VC5L7P3,0
1,A2TPW076Q0EXLE,1
2,AT1PIN6WA88V9,2
3,A3NGCB9QQMM9SM,3
4,A19ELFMMKB83HC,4
...,...,...
21969,A2O9FHBZ34B5U5,21969
21970,ARX5VIQRPMX4B,21970
21971,A2EQ95QDK0PQCG,21971
21972,A1PV5TW2LP04YU,21972


Get items

In [6]:
# The first CSV column becomes the index; the number of rows must match the
# second axis of the score matrix.
items = pd.read_csv(ITEMS_PATH, index_col=0)
assert len(items) == sem_macrid_results.shape[1]
items

Unnamed: 0,item_id
B000E6ES94,0
B000PC6YTY,1
6303934870,2
B0024FADA6,3
6305926794,4
...,...
B00YPNIK5U,23953
B001AOC9G8,23954
B00SWFMCHO,23955
B00005JPI6,23956


In [7]:
def load_interaction_matrix(file: Path, shape: Tuple[int, int]) -> sparse.csr_matrix:
    """Read a CSV of interactions and return it as a sparse user x item matrix.

    Args:
        file: CSV file with (at least) the columns ``user`` and ``item``, used
            as row and column indices respectively.
        shape: ``(n_rows, n_cols)`` of the resulting matrix.

    Returns:
        A float CSR matrix holding a 1 for every listed (user, item) pair.
    """
    interactions = pd.read_csv(file)
    user_idx = interactions['user']
    item_idx = interactions['item']
    # One unit entry per listed interaction.
    ones = np.ones_like(user_idx)
    return sparse.csr_matrix((ones, (user_idx, item_idx)), shape=shape, dtype=float)


# Train and test interactions share the same (len(users), len(items)) grid.
train = load_interaction_matrix(TRAIN_PATH, (len(users), len(items)))
test = load_interaction_matrix(TEST_PATH, (len(users), len(items)))

# Analyze results

Set the scores of the items each user already interacted with during training to -inf, so they can never be recommended

In [8]:
# Overwrite, in place, the score of every (user, item) pair present in train
# with -inf so that those items can never end up among the top-scoring ones.
#dmrl_results[train.nonzero()] = -np.inf
sem_macrid_results[train.nonzero()] = -np.inf

Analyze how they agree in the top 5 results

In [9]:
k = 5

# argpartition picks the k highest-scoring columns of every row; within a row
# the selected indices are NOT ordered by score.
#best_dmrl = dmrl_results.argpartition(kth=-k, axis=1)[:, -k:]
best_sem_macridvae = sem_macrid_results.argpartition(kth=-k, axis=1)[:, -k:]

# Get metrics

In [12]:
def get_top_k_items(x: npt.NDArray, k: int) -> npt.NDArray:
    """Return, for every row of ``x``, the column indices of its ``k`` largest values.

    Args:
        x: 2-D array of scores (one row per user, one column per item).
        k: number of columns to keep per row.

    Returns:
        Integer array with ``k`` columns per row, ordered from the highest
        score to the lowest.
    """
    # Unordered candidates: the k largest entries of each row (a partial
    # partition is cheaper than a full sort of every row).
    candidates = np.argpartition(x, kth=-k, axis=1)[:, -k:]

    # Scores of those candidates, still in partition (arbitrary) order.
    candidate_scores = np.take_along_axis(x, candidates, axis=1)

    # Positions of the candidates when ranked by descending score.
    ranking = np.argsort(candidate_scores, axis=1)[:, -1:-k - 1:-1]

    # Re-order the candidate column indices according to that ranking.
    return np.take_along_axis(candidates, ranking, axis=1)

def ndcg_at_k(scores: npt.NDArray, test: sparse.csr_matrix, k: int) -> npt.NDArray:
    """Compute NDCG@k for every row (user) of ``scores``.

    Args:
        scores: dense score matrix; must have the same shape as ``test``.
        test: sparse matrix of held-out interactions; every entry > 0 counts
            as a relevant item.
        k: ranking cut-off.

    Returns:
        One NDCG@k value per row (not a scalar); callers in this notebook
        average it with ``.mean()``.
    """
    assert scores.shape == test.shape
    best_scores = get_top_k_items(scores, k)
    
    # Binarise the held-out interactions.
    test = test > 0
    # Relevant items per user, capped at k.
    num_positives = np.minimum(k, test.sum(axis=1)).astype(np.int32)
    
    # Row index of every entry of best_scores, for pairwise (row, column) lookups.
    rows = np.indices(best_scores.shape)[0]
    # Discount log2(rank + 1) for ranks 1..k.
    denominator = np.log2(np.arange(2, k + 2))
    
    # Ideal DCG: discounted gain when the first num_positives ranks are all hits.
    # np.maximum keeps the index valid for users without relevant items (their
    # IDCG is then 1 and their DCG 0, so their NDCG is 0).
    idcg = np.cumsum(1 / denominator)[np.maximum(num_positives - 1, 0)]
    
    # Actual DCG: discounted hits among the top-k recommended items.
    dcg = np.sum(test[rows, best_scores] / denominator, axis=1)

    ndcg = dcg / idcg
    return ndcg

def recall_at_k(scores: npt.NDArray, test: sparse.csr_matrix, k: int) -> npt.NDArray:
    """Compute Recall@k for every row (user) of ``scores``.

    The denominator is ``min(k, number of relevant items)``, so a user whose
    top-k is filled with relevant items reaches a recall of 1.

    Args:
        scores: dense score matrix; must have the same shape as ``test``.
        test: sparse matrix of held-out interactions; every entry > 0 counts
            as a relevant item.
        k: ranking cut-off.

    Returns:
        One Recall@k value per row (not a scalar). Users without any relevant
        item get 0 instead of NaN, which matches how ``ndcg_at_k`` treats them.
    """
    assert scores.shape == test.shape
    best_scores = get_top_k_items(scores, k)

    # Binarise the held-out interactions.
    test = test > 0
    # Relevant items per user, capped at k.
    num_positives = np.minimum(k, test.sum(axis=1)).astype(np.int32)

    # Row index of every entry of best_scores, for pairwise (row, column) lookups.
    rows = np.indices(best_scores.shape)[0]
    hits = np.sum(test[rows, best_scores], axis=1)

    # Users without relevant items have 0 hits; dividing by 1 instead of 0
    # yields 0 rather than a NaN that would poison the mean over users.
    recall = hits / np.maximum(num_positives, 1)
    return recall

def precision_at_k(scores: npt.NDArray, test: sparse.csr_matrix, k: int) -> float:
    """Compute Precision@k for every row (user) of ``scores``.

    Args:
        scores: dense score matrix; must have the same shape as ``test``.
        test: sparse matrix of held-out interactions; every entry > 0 counts
            as a relevant item.
        k: ranking cut-off.

    Returns:
        The number of relevant items among each row's top-k, divided by k.
        NOTE: despite the ``float`` annotation this is one value per row.
    """
    assert scores.shape == test.shape
    top_items = get_top_k_items(scores, k)

    relevant = test > 0

    # Row index of every entry of top_items, for pairwise (row, column) lookups.
    row_ids = np.indices(top_items.shape)[0]
    hits_per_user = np.sum(relevant[row_ids, top_items], axis=1)
    return hits_per_user / k

def average_precision(scores: npt.NDArray, test: sparse.csr_matrix) -> npt.NDArray:
    """Compute the average precision of every row (user) of ``scores``.

    Only the top ``K`` ranked items of each row are inspected, where ``K`` is
    the largest number of relevant items any single user has.

    Args:
        scores: dense score matrix; must have the same shape as ``test``.
        test: sparse matrix of held-out interactions; every entry > 0 counts
            as a relevant item.

    Returns:
        1-D array with one average-precision value per row. Users without any
        relevant item get 0 instead of NaN.
    """
    assert scores.shape == test.shape

    # Binarise the held-out interactions and count relevant items per user.
    test = test > 0
    num_positives = np.asarray(test.sum(axis=1).astype(np.int32)).reshape(-1)

    k = num_positives.max()
    if k == 0:
        # Nobody has a relevant item. Ranking with k == 0 would select every
        # column and the final division would be 0/0, so return zeros directly.
        return np.zeros(num_positives.shape, dtype=float)
    best_scores = get_top_k_items(scores, k)

    # Row index of every entry of best_scores, for pairwise (row, column) lookups.
    rows = np.indices(best_scores.shape)[0]
    labels = test[rows, best_scores].toarray()

    # Precision at every rank 1..k, counted only at the ranks that are hits.
    true_positives_at_k = np.cumsum(labels, axis=1)
    precisions_at_k = true_positives_at_k / np.arange(1, k + 1)

    # Dividing by at least 1 avoids 0/0 = NaN for users without relevant items.
    return (precisions_at_k * labels).sum(axis=1) / np.maximum(num_positives, 1)


# Mean of the per-user metrics over all users.
# NOTE(review): evaluates dmrl_results, which only exists if the DMRL loading
# cell ran (it depends on the commented-out DMRL_RES_PATH).
print(f'NDCG@5:      {ndcg_at_k(dmrl_results, test, 5).mean():.6f}')
print(f'Recall@5:    {recall_at_k(dmrl_results, test, 5).mean():.6f}')
print(f'MAP:         {average_precision(dmrl_results, test).mean():.6f}')


NDCG@5:      0.203675
Recall@5:    0.240542
MAP:         0.190090


In [13]:
def build_metrics(results_path: Path):
    """Load one stored score matrix and compute its ranking metrics.

    Args:
        results_path: ``.npz`` archive whose first stored array holds the
            score rows; they are evaluated against the global ``test`` matrix.

    Returns:
        Dict with the file stem under ``'path'`` and the mean ``'ndcg@5'``,
        ``'recall@5'`` and ``'map'`` over all users.
    """
    print(f'Loading results from {results_path}')
    # NOTE(security): allow_pickle=True can execute code embedded in the
    # archive; only use this on files produced by trusted runs.
    # The context manager closes the underlying file once the array is read.
    with np.load(results_path, allow_pickle=True) as archive:
        results = np.vstack(next(iter(archive.values())))
    print(f'Computing metrics for {results_path}')
    return {
            'path': results_path.stem,
            'ndcg@5': ndcg_at_k(results, test, 5).mean(),
            'recall@5': recall_at_k(results, test, 5).mean(),
            'map': average_precision(results, test).mean(),
        }

def build_results_matrix():
    """Compute the metrics of every ``best_scores_*.npz`` file of DATASET.

    Returns:
        DataFrame indexed by ``'path'`` with one row of metrics per file found
        under ``data/<DATASET>/``.
    """
    score_files = Path(f'data/{DATASET}').glob('best_scores_*.npz')
    records = []
    for score_file in score_files:
        records.append(build_metrics(score_file))
    return pd.DataFrame.from_records(records, index='path')

# Commented out because it takes a long time to compute; uncomment to run it
# all_metrics = build_results_matrix()
# all_metrics

Arrays are very memory intensive so we delete the original results

In [14]:
import gc
# best_sem_macridvae was already derived from the scores, so the full matrices can go.
# NOTE(review): `del` raises NameError if dmrl_results was never loaded (its
# loading cell depends on the commented-out DMRL_RES_PATH).
del sem_macrid_results, dmrl_results
gc.collect()

0

In [11]:
# NOTE(review): best_dmrl is only assigned in a commented-out line, so this
# cell raises NameError (see the recorded output below).
best_dmrl

NameError: name 'best_dmrl' is not defined

In [10]:
# Top-k item indices per user (unordered within each row).
best_sem_macridvae

array([[19728, 12427, 21117,  9377,  4883],
       [ 5066,  4883, 21117, 22245, 12427],
       [ 5066, 22245,  4883, 14528,  1286],
       ...,
       [12427, 21117, 19360, 16556,  4297],
       [ 8510,  9377,  9849, 20545,  8926],
       [21117,  2067,  9377,  3087, 16861]])

In [12]:
# Share of all top-k slots taken by each item, for the 20 most frequently
# recommended items.
best_counts_semmacridvae = pd.Series(best_sem_macridvae.flatten()).value_counts(normalize=True).to_frame('percentage').head(20)
best_counts_semmacridvae

Unnamed: 0,percentage
21117,0.111759
4883,0.10679
12427,0.099854
22245,0.093911
9377,0.071494
16861,0.054564
3087,0.027687
1286,0.015946
3279,0.013989
5066,0.012442


In [13]:
# best_counts_dmrl = pd.Series(best_dmrl.flatten()).value_counts(normalize=True).to_frame('percentage').head(20)
# best_counts_dmrl

Items that appear in both models' top-20 most recommended lists

In [22]:
# NOTE(review): best_counts_dmrl is only assigned in a commented-out cell, so
# this cell fails on a fresh kernel until that cell is re-enabled.
set(best_counts_dmrl.index) & set(best_counts_semmacridvae.index)

{1235, 12593, 34043}

In [14]:
# Load the product table for DATASET through the project helper and preview
# three random rows.
products = amazon_dataset.products_df(DATASET)
products.sample(n=3)

Unnamed: 0_level_0,asin,description,title,brand,main_cat,rank,price,image_slug,image_url,feature,category,tech_detail
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
192789,B00U2F8H6W,UK Released DVD/Blu-Ray item. It MAY NOT play ...,Banished,Myanna Buring,Movies & TV,"123,498 in Movies & TV (",,[517f+M9WAKL],[https://m.media-amazon.com/images/I/517f+M9WA...,[],"[Movies & TV, Genre for Featured Categories, R...",
192554,B00TPN7ET8,"Who wants to hire a three-time loser? No One, ...","Big Shot, The",Humphrey Bogart,Movies & TV,"63,486 in Movies & TV (",$15.31,[51mOQZgQxWL],[https://m.media-amazon.com/images/I/51mOQZgQx...,[],"[Movies & TV, Studio Specials, Warner Home Vid...",
28892,B00005RYLA,"In medieval Europe, crusading knights massacre...",The Church,Hugh Quarshie,Movies & TV,"141,054 in Movies & TV (",$9.98,[51VVB94XZVL],[https://m.media-amazon.com/images/I/51VVB94XZ...,[],"[Movies & TV, Genre for Featured Categories, F...",


In [34]:
def analyze_train_vs_predicted(best_results: npt.NDArray[np.integer]):
    """Print the train, test and predicted items of one randomly chosen user.

    Reads the notebook globals ``train``, ``test``, ``items`` and ``products``.
    The user is drawn with ``np.random.choice``, so the output changes between
    calls unless NumPy's global random state is seeded beforehand.

    Args:
        best_results: integer array with one row per user holding the item ids
            recommended to that user.
    """
    sample = np.random.choice(train.shape[0])

    # Only the sampled user's rows are needed, so read them straight from the
    # sparse matrices instead of building per-user sets for every user.
    train_item_ids = train[sample].nonzero()[1].tolist()
    test_item_ids = test[sample].nonzero()[1].tolist()
    # Drop repeated ids while keeping the order given in best_results.
    predicted_item_ids = list(dict.fromkeys(best_results[sample].tolist()))

    # `items` is indexed by ASIN: attach the product metadata, then re-index by item id.
    products_by_item_id = items.join(products.set_index('asin')).set_index('item_id')

    products_train = products_by_item_id.loc[train_item_ids]
    products_test = products_by_item_id.loc[test_item_ids]
    products_predicted = products_by_item_id.loc[predicted_item_ids]

    with pd.option_context('expand_frame_repr', False, 'display.max_colwidth', 800):
        print('User ID', sample)
        print('Items train')
        print('---------')
        print(products_train[['title', 'category']])
        print('Items test')
        print('---------')
        print(products_test[['title', 'category']])
        print('Items predicted')
        print('---------')
        print(products_predicted[['title', 'category']])

        

# Inspect one random user using the SEM-MacridVAE top-k item ids.
analyze_train_vs_predicted(best_sem_macridvae)

User ID 17249
Items train
---------
                                                                     title                                                                                            category
item_id                                                                                                                                                                       
3266                                                                  Lucy  [Movies & TV, Studio Specials, Universal Studios Home Entertainment, All Universal Studios Titles]
23948    Alien: Quadrilogy (Alien / Aliens / Alien 3 / Alien Resurrection)                [Movies & TV, Studio Specials, 20th Century Fox Home Entertainment, Action, General]
2067                                                          The Revenant                                                                      [Movies & TV, Blu-ray, Movies]
10395                                                Star Trek 4k Ultra HD               

Analyze product duplication

In [170]:
import re
import hashlib

# NOTE(review): this rebinds the global `products` (loaded for DATASET earlier)
# to the Clothing catalogue. analyze_dupicates() below loads each dataset into
# its own local variable, and the later title lookup by ASIN reads the global,
# so on a top-to-bottom run that lookup uses the Clothing table. Consider
# dropping or renaming this assignment.
products = amazon_dataset.products_df('Clothing_Shoes_and_Jewelry')

def gen_hash(s) -> str:
    """Return a hash that is equal for titles differing only in case, spacing or punctuation.

    The text is lower-cased and stripped, every run of non-word characters is
    removed, non-ASCII characters are dropped, and the remainder is hashed
    with MD5 (used here as a fingerprint, not for security).

    Args:
        s: the title; anything that is not a string is treated as missing.

    Returns:
        The hexadecimal MD5 digest, or ``'???'`` for a missing title.
    """
    # Missing values can arrive as None or as pandas' NaN (a float); neither
    # has .lower(), so every non-string is mapped to the placeholder.
    if not isinstance(s, str):
        return '???'

    cleaned = re.sub(r'\W+', '', s.lower().strip()).encode('ascii', errors='ignore')
    return hashlib.md5(cleaned).hexdigest()

def analyze_dupicates():
    """Print, per dataset, the percentage of distinct title hashes that occur more than once.

    Every dataset is loaded through ``amazon_dataset.products_df`` and its
    titles are fingerprinted with ``gen_hash``.
    """
    dataset_names = (
        'Clothing_Shoes_and_Jewelry',
        'Movies_and_TV',
        'Home_and_Kitchen',
        'Musical_Instruments',
    )
    for dataset in dataset_names:
        catalogue = amazon_dataset.products_df(dataset)
        hash_counts = catalogue['title'].apply(gen_hash).value_counts()
        # Distinct hashes shared by at least two products.
        repeated = (hash_counts > 1).sum()
        duplicates = repeated / len(hash_counts)
        print(f'{dataset}: {100*duplicates:.2f}%')

# Print the duplicated-title percentage of every dataset.
analyze_dupicates()

Clothing_Shoes_and_Jewelry: 4.77%
Movies_and_TV: 1.94%
Home_and_Kitchen: 0.50%
Musical_Instruments: 0.28%


In [91]:
# Column sums of the test matrix (interactions per item), 20 largest.
pd.Series(np.asarray(test.sum(axis=0))[0].astype(int)).sort_values(ascending=False).head(20)

18031    168
38308    165
26373    164
3520     163
8703     160
34043    153
28568    153
10284    152
6388     152
5372     146
1752     140
12593    140
13038    138
23510    131
33683    130
307      123
19838    118
35166     91
31411     82
6536      78
dtype: int64

In [21]:
# Column sums of the train matrix (interactions per item), largest first.
pd.Series(train.sum(axis=0).tolist()[0]).sort_values(ascending=False)

12593    617.0
33683    598.0
19838    590.0
307      569.0
10284    567.0
         ...  
31833      0.0
8552       0.0
8557       0.0
8558       0.0
21618      0.0
Length: 38493, dtype: float64

In [72]:
# Peek at the row at position 2 of the items table.
items.iloc[2]

Unnamed: 0    B0105V2DEY
item_id                2
Name: 2, dtype: object

# Generate recommendation explanations

In [35]:
# Map internal item ids back to the index labels of `items` (ASINs in the
# recorded output).
some_recommendations_itemids =  [9377]
some_asins = items.loc[items['item_id'].isin(some_recommendations_itemids)].index
some_asins

Index(['B00OV3VGP0'], dtype='object')

In [37]:
# Load the reviews for DATASET through the project helper and preview three
# random rows.
reviews = amazon_dataset.reviews_df(DATASET)
reviews.sample(n=3)

Unnamed: 0,id,asin,reviewerID,reviewerName,overall,text,reviewTime,summary,verified,vote
5088,214084,6300165086,A2A1NGUGKPH5B0,Janice L Browning,5.0,Great movie.,2017-11-14,Five Stars,True,
122284,7171234,B00XLX0Z62,AHXY8V6Q1J6X2,Robert Atkinson,5.0,Thanks,2017-06-12,Five Stars,True,
90932,5529234,B00AZI9JP4,A31VRYWMDZW4HH,trlwnc,4.0,"This is a neat movie about two young people falling in love. But think they can't. Along the way, they save a historic building and learn about their confusing past. Quite a fun punchline!",2017-03-23,... a neat movie about two young people falling in love. But think they can't,True,2.0


In [38]:
# Titles of the selected products.
# NOTE(review): the duplication-analysis cell rebinds `products` to the
# Clothing catalogue, so on a top-to-bottom run this lookup may not use the
# DATASET catalogue.
products.loc[products['asin'].isin(some_asins)]['title']

id
189488    John Wick
Name: title, dtype: object

In [43]:
def some_reviews_joined(reviews_df: pd.DataFrame, asins: pd.Series, n: int = 5):
    """Pick a few positive, reasonably long reviews of the given products.

    Args:
        reviews_df: reviews with (at least) the columns ``asin``, ``overall``
            and ``text``.
        asins: product identifiers whose reviews are eligible.
        n: maximum number of reviews to sample (default 5).

    Returns:
        The sampled review texts, each prefixed with ``'- '`` and joined by
        newlines. Fewer than ``n`` lines (possibly an empty string) are
        returned when fewer reviews qualify.
    """
    def is_long_enough(text) -> bool:
        # Missing texts show up as None or NaN (a float); neither can be split.
        return isinstance(text, str) and len(text.split()) > 5

    condition = (
        reviews_df['asin'].isin(asins) &
        (reviews_df['overall'] >= 4) &
        (~reviews_df['text'].isna()) &
        reviews_df['text'].apply(is_long_enough)
    )
    # Filter the frame that was passed in, not the notebook-level `reviews`.
    some_reviews = reviews_df.loc[condition]
    # sample() raises when asked for more rows than exist, so cap the request.
    sample_size = min(n, len(some_reviews))
    some_review_texts = some_reviews['text'].sample(n=sample_size).apply(lambda x: f'- {x}')
    return "\n".join(some_review_texts.to_list())

def prompt_for_completion(reviews_df: pd.DataFrame, asins: pd.Series):
    """Ask the OpenAI completion endpoint for a recommendation text based on sampled reviews.

    The prompt is printed before the request is sent. The API key is taken
    from the ``OPENAI_APIKEY`` environment variable.
    NOTE(review): this relies on the `openai.Completion` interface; confirm
    that the installed openai package version still provides it.

    Args:
        reviews_df: reviews table handed to ``some_reviews_joined``.
        asins: product identifiers whose reviews feed the prompt.

    Returns:
        The text of the single completion choice returned by the API.
    """
    prefix = "The item has the following comments about it"
    instruction = 'The suggestion for this item is:'
    reviews_joined = some_reviews_joined(reviews_df, asins)

    # Sections are separated by blank lines; the trailing empty element leaves
    # the prompt ending with a blank line.
    prompt = "\n\n".join([f"{prefix}:", reviews_joined, instruction, ""])
    print(prompt)

    openai.api_key = os.getenv("OPENAI_APIKEY")
    request = dict(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0,
        max_tokens=60,
        top_p=1,
        frequency_penalty=0.5,
        presence_penalty=0,
    )
    response = openai.Completion.create(**request)

    choices = response["choices"]
    assert len(choices) == 1
    return choices[0]["text"]

# Sends a request to the OpenAI API (needs network access and OPENAI_APIKEY)
# and prints the generated text.
print(prompt_for_completion(reviews, some_asins))

The item has the following comments about it:

- Received on time and in tact. Disk worked fine and movie was great. I love it.
- Well mounted production with lots of action, but to me the most interesting aspect was being immersed in the details of the bizarre criminal world that John had hoped to retire from.
- Movie i like so i bought it
- Fun thrilling movie.  Love the Hotel for Assassins.
- John Wick 1 or 2, both great movies!!

The suggestion for this item is:


This is a must-watch for any action movie fan! John Wick 1 and 2 are both great movies, full of thrilling action and an immersive criminal world. The production is well mounted and the details are fascinating. Plus, you'll love the Hotel for Assassins!
