In [1]:
import pandas as pd
import os
import sys
import numpy as np
import scipy

# remove
from tqdm.notebook import tqdm

# helpers
sys.path.append("/workspaces/ProductionRecommender/prod-reco/src/prod_reco")
from commons.recommender_utils import RecommenderUtils

In [5]:
df_ratings = catalog.load("ratings")
df_items = catalog.load("items")

2022-03-23 10:18:50,865 - kedro.io.data_catalog - INFO - Loading data from `ratings` (CSVDataSet)...


  return func(*args, **kwargs)


2022-03-23 10:19:00,020 - kedro.io.data_catalog - INFO - Loading data from `items` (CSVDataSet)...


In [6]:
# PARAMETERS
USER_ID = "userId"
ITEM_ID = "itemId"
RATING="rating"

item_min_bought = 8
user_min_bought = 8

## Data Engineering - Process Ratings

In [7]:
# ratings of 5 are 1, everything else is deleted
df_ratings = df_ratings[df_ratings[RATING] > 4]
df_ratings[RATING] = 1

# create utils object
utils = RecommenderUtils(user_id = USER_ID, item_id = ITEM_ID, rating=RATING)

# threshold interactions
df_txn = RecommenderUtils.threshold_interactions_df(df_ratings, USER_ID, ITEM_ID, user_min_bought, item_min_bought)
num_ratings, sparsity = utils.print_ratings_shape(df_txn)

# to sparse matrix
interactions, rid_to_idx, idx_to_rid, cid_to_idx, idx_to_cid = RecommenderUtils.df_to_matrix(df_txn, USER_ID, ITEM_ID, interaction_var=RATING)

num_users = interactions.shape[0]
num_items = interactions.shape[1]

Starting interactions info
Number of rows: 7212
Number of cols: 6432
Sparsity: 0.460%
Ending interactions info
Number of rows: 5387
Number of columns: 2620
Sparsity: 1.384%
Number of users: 5387
Number of items: 2620
Number of rows: (195359, 4)
Sparsity: 0.013841563730609597


### Persist mappings (idx, cid, rid) to pickle

In [8]:
rid_to_idx, idx_to_rid, cid_to_idx, idx_to_cid

({1: 0,
  3: 1,
  4: 2,
  5: 3,
  6: 4,
  7: 5,
  8: 6,
  9: 7,
  10: 8,
  11: 9,
  13: 10,
  14: 11,
  16: 12,
  17: 13,
  18: 14,
  19: 15,
  23: 16,
  24: 17,
  26: 18,
  30: 19,
  34: 20,
  35: 21,
  36: 22,
  37: 23,
  38: 24,
  40: 25,
  41: 26,
  43: 27,
  44: 28,
  45: 29,
  46: 30,
  47: 31,
  50: 32,
  51: 33,
  52: 34,
  53: 35,
  55: 36,
  56: 37,
  57: 38,
  58: 39,
  60: 40,
  61: 41,
  65: 42,
  66: 43,
  67: 44,
  68: 45,
  70: 46,
  73: 47,
  75: 48,
  76: 49,
  78: 50,
  81: 51,
  82: 52,
  86: 53,
  87: 54,
  88: 55,
  89: 56,
  90: 57,
  91: 58,
  92: 59,
  94: 60,
  96: 61,
  97: 62,
  98: 63,
  99: 64,
  101: 65,
  103: 66,
  104: 67,
  106: 68,
  107: 69,
  112: 70,
  114: 71,
  115: 72,
  116: 73,
  117: 74,
  121: 75,
  122: 76,
  123: 77,
  124: 78,
  125: 79,
  126: 80,
  128: 81,
  129: 82,
  130: 83,
  131: 84,
  135: 85,
  136: 86,
  137: 87,
  138: 88,
  139: 89,
  140: 90,
  141: 91,
  143: 92,
  144: 93,
  148: 94,
  149: 95,
  151: 96,
  152: 97,
  153

### Persist Interactions for candidate filtering

In [9]:
interactions

<5387x2620 sparse matrix of type '<class 'numpy.float64'>'
	with 195359 stored elements in Compressed Sparse Row format>

## Data Engineering - Process Items

In [10]:
cid_to_idx

{122: 0,
 185: 1,
 231: 2,
 292: 3,
 316: 4,
 329: 5,
 355: 6,
 356: 7,
 362: 8,
 364: 9,
 370: 10,
 377: 11,
 420: 12,
 466: 13,
 480: 14,
 520: 15,
 539: 16,
 586: 17,
 588: 18,
 589: 19,
 594: 20,
 616: 21,
 110: 22,
 151: 23,
 213: 24,
 1597: 25,
 1674: 26,
 3684: 27,
 4995: 28,
 6539: 29,
 8533: 30,
 8783: 31,
 27821: 32,
 34: 33,
 150: 34,
 153: 35,
 161: 36,
 165: 37,
 266: 38,
 317: 39,
 410: 40,
 500: 41,
 587: 42,
 590: 43,
 592: 44,
 595: 45,
 30: 46,
 32: 47,
 47: 48,
 326: 49,
 334: 50,
 412: 51,
 446: 52,
 527: 53,
 532: 54,
 538: 55,
 541: 56,
 562: 57,
 608: 58,
 919: 59,
 920: 60,
 923: 61,
 926: 62,
 1046: 63,
 1096: 64,
 1104: 65,
 1183: 66,
 1199: 67,
 1219: 68,
 1225: 69,
 1230: 70,
 1235: 71,
 1295: 72,
 260: 73,
 457: 74,
 858: 75,
 1193: 76,
 1196: 77,
 1197: 78,
 1277: 79,
 1304: 80,
 1584: 81,
 1653: 82,
 2396: 83,
 2571: 84,
 3578: 85,
 3994: 86,
 3996: 87,
 101: 88,
 599: 89,
 899: 90,
 903: 91,
 904: 92,
 908: 93,
 912: 94,
 913: 95,
 930: 96,
 951: 97,
 11

In [11]:
# reorder and filter df_items
df_items = df_items.set_index(ITEM_ID).loc[cid_to_idx]

# get metadata tags
df_items_feats = df_items["tags"]

# get movie names
df_item_names = df_items["movieName"]
idx_to_names = {cid_to_idx[k]:v for k,v in df_item_names.to_dict().items()}

In [12]:
# persist index to names
idx_to_names

{0: 'Boomerang (1992)',
 1: 'Net, The (1995)',
 2: 'Dumb & Dumber (1994)',
 3: 'Outbreak (1995)',
 4: 'Stargate (1994)',
 5: 'Star Trek: Generations (1994)',
 6: 'Flintstones, The (1994)',
 7: 'Forrest Gump (1994)',
 8: 'Jungle Book, The (1994)',
 9: 'Lion King, The (1994)',
 10: 'Naked Gun 33 1/3: The Final Insult (1994)',
 11: 'Speed (1994)',
 12: 'Beverly Hills Cop III (1994)',
 13: 'Hot Shots! Part Deux (1993)',
 14: 'Jurassic Park (1993)',
 15: 'Robin Hood: Men in Tights (1993)',
 16: 'Sleepless in Seattle (1993)',
 17: 'Home Alone (1990)',
 18: 'Aladdin (1992)',
 19: 'Terminator 2: Judgment Day (1991)',
 20: 'Snow White and the Seven Dwarfs (1937)',
 21: 'Aristocats, The (1970)',
 22: 'Braveheart (1995)',
 23: 'Rob Roy (1995)',
 24: 'Burnt by the Sun (Utomlyonnye solntsem) (1994)',
 25: 'Conspiracy Theory (1997)',
 26: 'Witness (1985)',
 27: 'Fabulous Baker Boys, The (1989)',
 28: 'Beautiful Mind, A (2001)',
 29: 'Pirates of the Caribbean: The Curse of the Black Pearl (2003)',
 3

In [13]:
# convert tags to list of dictionaries
dummies_items = df_items_feats.str.get_dummies(sep='|')

list_of_dict_item_features = [{} for _ in idx_to_cid]
for idx, row in tqdm(dummies_items.iterrows(), total=len(dummies_items)):
    dict_item_feat = {k:v for k,v in row.items() if v > 0}
    # cold start items are ignored, for now
    if idx in cid_to_idx:
        list_of_dict_item_features[cid_to_idx[idx]] = dict_item_feat

# use DictVectorizer to convert to sparse
from sklearn.feature_extraction import DictVectorizer

item_vec = DictVectorizer()
sp_item_feats = item_vec.fit_transform((list_of_dict_item_features))


# (3) format identity matrix and hstack with item feats
sp_items_eye = scipy.sparse.eye(sp_item_feats.shape[0])
sp_item_feats = scipy.sparse.hstack((sp_items_eye, sp_item_feats))

  0%|          | 0/2620 [00:00<?, ?it/s]

In [16]:
interactions, sp_item_feats

(<5387x2620 sparse matrix of type '<class 'numpy.float64'>'
 	with 195359 stored elements in Compressed Sparse Row format>,
 <2620x2639 sparse matrix of type '<class 'numpy.float64'>'
 	with 8858 stored elements in COOrdinate format>)

## Persist Popularity

In [17]:
# get popularity
df_item_pop = df_txn.groupby(ITEM_ID)[USER_ID].nunique().to_frame("num_users")

# extract year
series_year = df_items["movieName"].str.extract(r"(\([0-9]{4}\))", expand=False)
series_year = series_year.str.replace(r"\(|\)","", regex=True).astype(np.int32).to_frame("year")

# left join
df_item_rank = df_item_pop.join(series_year).join(df_items["movieName"])
# fill null years by minimum
df_item_rank["year"] = df_item_rank["year"].fillna(df_item_rank["year"].min())
# sort
df_item_rank = df_item_rank.sort_values(by=["year", "num_users"], ascending=[False, False])
df_item_rank[:40]

Unnamed: 0_level_0,num_users,year,movieName
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
58559,103,2008,"Dark Knight, The (2008)"
59315,66,2008,Iron Man (2008)
60069,56,2008,WALL·E (2008)
57669,18,2008,In Bruges (2008)
59784,16,2008,Kung Fu Panda (2008)
57368,14,2008,Cloverfield (2008)
58998,12,2008,Forgetting Sarah Marshall (2008)
60126,9,2008,Get Smart (2008)
60040,8,2008,"Incredible Hulk, The (2008)"
54286,96,2007,"Bourne Ultimatum, The (2007)"


In [31]:
df_item_rank[df_item_rank["movieName"].str.contains("Haunting")]

Unnamed: 0_level_0,num_users,year,movieName
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2719,8,1999,"Haunting, The (1999)"
2550,22,1963,"Haunting, The (1963)"


In [29]:
df_item_rank.sample(10)

Unnamed: 0_level_0,num_users,year,movieName
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2336,172,1998,Elizabeth (1998)
1241,40,1992,Dead Alive (Braindead) (1992)
2622,38,1999,"Midsummer Night's Dream, A (1999)"
1252,397,1974,Chinatown (1974)
50,1359,1995,"Usual Suspects, The (1995)"
6783,15,1939,"Rules of the Game, The (La Règle du jeu) (1939)"
40815,82,2005,Harry Potter and the Goblet of Fire (2005)
2664,62,1956,Invasion of the Body Snatchers (1956)
2023,76,1990,"Godfather: Part III, The (1990)"
2052,9,1993,Hocus Pocus (1993)


In [35]:
idx_to_names[2481]

'Haunting, The (1999)'

In [39]:
idx_to_names

{0: 'Boomerang (1992)',
 1: 'Net, The (1995)',
 2: 'Dumb & Dumber (1994)',
 3: 'Outbreak (1995)',
 4: 'Stargate (1994)',
 5: 'Star Trek: Generations (1994)',
 6: 'Flintstones, The (1994)',
 7: 'Forrest Gump (1994)',
 8: 'Jungle Book, The (1994)',
 9: 'Lion King, The (1994)',
 10: 'Naked Gun 33 1/3: The Final Insult (1994)',
 11: 'Speed (1994)',
 12: 'Beverly Hills Cop III (1994)',
 13: 'Hot Shots! Part Deux (1993)',
 14: 'Jurassic Park (1993)',
 15: 'Robin Hood: Men in Tights (1993)',
 16: 'Sleepless in Seattle (1993)',
 17: 'Home Alone (1990)',
 18: 'Aladdin (1992)',
 19: 'Terminator 2: Judgment Day (1991)',
 20: 'Snow White and the Seven Dwarfs (1937)',
 21: 'Aristocats, The (1970)',
 22: 'Braveheart (1995)',
 23: 'Rob Roy (1995)',
 24: 'Burnt by the Sun (Utomlyonnye solntsem) (1994)',
 25: 'Conspiracy Theory (1997)',
 26: 'Witness (1985)',
 27: 'Fabulous Baker Boys, The (1989)',
 28: 'Beautiful Mind, A (2001)',
 29: 'Pirates of the Caribbean: The Curse of the Black Pearl (2003)',
 3

In [34]:
cid_to_idx[2719]

2481

In [13]:
df_rank = catalog.load("item_rank")
df_rank

2022-03-21 15:56:04,876 - kedro.io.data_catalog - INFO - Loading data from `item_rank` (CSVDataSet)...


Unnamed: 0,num_users,year,movieName
0,8,2008,"Incredible Hulk, The (2008)"
1,9,2008,Get Smart (2008)
2,12,2008,Forgetting Sarah Marshall (2008)
3,14,2008,Cloverfield (2008)
4,16,2008,Kung Fu Panda (2008)
...,...,...,...
2615,39,1925,"Battleship Potemkin, The (Bronenosets Potyomki..."
2616,42,1925,"Gold Rush, The (1925)"
2617,67,1922,"Nosferatu (Nosferatu, eine Symphonie des Graue..."
2618,9,1921,"Kid, The (1921)"


## Factorization
### Train and test set

In [18]:
split_count = user_min_bought
split_fraction = 0.5
fraction = None
random_seed = 42
np.random.seed(random_seed)

train, test, test_users = RecommenderUtils.train_test_split_sparse(interactions, split_count, 
                                                                   split_fraction=split_fraction, fraction=fraction)

In [19]:
# create a train set where the train-only users have unknown recos
# this is for training set evaluation
eval_train = train.copy()
non_eval_users = list(set(range(train.shape[0])) - set(test_users))

eval_train = eval_train.tolil()
for u in non_eval_users:
    eval_train[u, :] = 0.0
eval_train = eval_train.tocsr()

In [20]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

epochs = 3
k = 5

list_train_prec = []
list_test_prec = []
warp_model = LightFM(no_components=50, loss='warp', random_state=random_seed)
for idx_epoch in range(epochs):
    warp_model.fit_partial(train, item_features=sp_item_feats, num_threads=2, epochs=1)    
    test_prec = precision_at_k(warp_model, test, train_interactions=train, k=k, item_features=sp_item_feats)
    train_prec = precision_at_k(warp_model, eval_train, train_interactions=None, k=k, item_features=sp_item_feats)
    
    test_prec = np.mean(test_prec)
    train_prec = np.mean(train_prec)
    
    print(f"Train: {train_prec}, Test: {test_prec}")
    
    list_test_prec.append(test_prec)
    list_train_prec.append(train_prec)

Train: 0.20356415212154388, Test: 0.19042138755321503
Train: 0.2421756237745285, Test: 0.21269723773002625
Train: 0.2638203203678131, Test: 0.22587712109088898


### Get Embeddings

In [26]:
B_v, V = warp_model.get_item_representations(features=sp_item_feats)
B_u, U = warp_model.get_user_representations()

In [27]:
V.shape

(2620, 50)

In [28]:
U.shape

(5387, 50)

In [24]:
interactions, sp_item_feats

(<5387x2620 sparse matrix of type '<class 'numpy.float64'>'
 	with 195359 stored elements in Compressed Sparse Row format>,
 <2620x2639 sparse matrix of type '<class 'numpy.float64'>'
 	with 8858 stored elements in COOrdinate format>)

### Produce Sample Recos

In [147]:
def sort_scores(scores, idx_to_cid, train_interactions, user_indexes, k=12,  remove_already_liked = True):
    batch_size = len(user_indexes)
    scores = scores.reshape(batch_size, -1)
    sorted_items = np.argpartition(-scores,k)
    
    # get interacted items
    M = train_interactions[user_indexes]
    bought_items = np.split(M.indices, M.indptr)[1:-1]
    
    list_recos_formatted = []
    list_bought_items = []
    
    top_k_items = sorted_items[:, :k*10]
    for idx, sublist in enumerate(top_k_items):
        # sort the smaller sublist
        sorted_sublist = sublist[np.argsort(scores[idx][sublist])]
        # transform to name
        if not remove_already_liked:
            top_k_items = [idx_to_cid[v] for v in sorted_sublist]
        else:
            top_k_items = [idx_to_cid[v] for v in sorted_sublist if v not in bought_items[idx]]

        # get interactions
        bought_items_names = [idx_to_cid[v] for v in bought_items[idx]]

        list_recos_formatted.append(top_k_items[:k])
        list_bought_items.append(bought_items_names)

    return list_recos_formatted, list_bought_items

In [108]:
# for a sample of users, yield recos
sample_users = [1, 10, 100]
users_coo = np.array(sample_users)
users_coo = np.repeat(users_coo, num_items)
items_coo = np.tile(np.arange(len(idx_to_cid)), len(sample_users))

scores = warp_model.predict(users_coo, items_coo, 
                           item_features=sp_item_feats, num_threads=2)

In [148]:
list_recos_formatted, list_bought_items = sort_scores(scores, idx_to_names, interactions, sample_users, remove_already_liked = False)

In [150]:
list_recos_formatted

[['Black Hawk Down (2001)',
  'Windtalkers (2002)',
  'Dirty Dozen, The (1967)',
  'Memphis Belle (1990)',
  'Big Red One, The (1980)',
  'Glory (1989)',
  'Apocalypse Now (1979)',
  'Great Raid, The (2005)',
  'Behind Enemy Lines (2001)',
  'Boat, The (Das Boot) (1981)',
  'We Were Soldiers (2002)',
  'Fighting Seabees, The (1944)'],
 ['Rumble in the Bronx (Hont faan kui) (1995)',
  'Star Wars: Episode IV - A New Hope (a.k.a. Star Wars) (1977)',
  'Star Trek III: The Search for Spock (1984)',
  'Rocketeer, The (1991)',
  "Logan's Run (1976)",
  'Sunshine (2007)',
  'Navy Seals (1990)',
  'Money Train (1995)',
  'Yojimbo (1961)',
  'Island, The (2005)',
  'Throne of Blood (Kumonosu jô) (1957)',
  'Munich (2005)'],
 ['Living Out Loud (1998)',
  'Prime (2005)',
  'Definitely, Maybe (2008)',
  'High Fidelity (2000)',
  'Cashback (2006)',
  'Family Stone, The (2005)',
  "Bridget Jones's Diary (2001)",
  'Good Year, A (2006)',
  'Adaptation (2002)',
  'Corrina, Corrina (1994)',
  'Family Ma

### Evaluate

### Convert to MLFlow Model for item to item prediction

# Scratch

In [135]:

# OLD!
#     if not remove_already_liked:
#         top_k_items = sorted_items[:, :k]
#         for idx, sublist in enumerate(top_k_items):
#             # sort the smaller sublist
#             sorted_sublist = sublist[np.argsort(scores[idx][sublist])]
#             # transform to name
#             top_k_items = [idx_to_cid[v] for v in sorted_sublist]
            
#             # get interactions
#             bought_items_names = [idx_to_cid[v] for v in bought_items[idx]]
            
#             list_recos_formatted.append(top_k_items)
#     else:
#         # a 
#         top_k_items = sorted_items[:, :k*10]
#         for idx, sublist in enumerate(top_k_items):
#             # sort the smaller sublist
#             sorted_sublist = sublist[np.argsort(scores[idx][sublist])]
#             # transform to name
#             top_k_items = [idx_to_cid[v] for v in sorted_sublist if v not in bought_items[idx]]
#             # limit to k
#             top_k_items = top_k_items[:k]
            
#             list_recos_formatted.append(top_k_items)
            
#     return list_recos_formatted