In [1]:
import pandas as pd
import numpy as np
import implicit
import scipy.sparse as sparse
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
df = pd.read_csv('2019-Nov-half.csv')
df.head()


interactions = df[['user_id', 'product_id', 'event_type']].copy()
interactions['rating'] = interactions['event_type'].apply(lambda x: 1 if x in ['cart', 'purchase'] else 0)

interactions = interactions[['user_id', 'product_id', 'rating']]

items = df[['product_id', 'category_id', 'category_code', 'brand', 'price']].drop_duplicates(subset=['product_id']).reset_index(drop=True)

# Quick checks
print('Interactions shape:', interactions.shape)
print('Items shape:', items.shape)

interactions.head(), items.head()

Interactions shape: (16875399, 3)
Items shape: (169578, 5)


(     user_id  product_id  rating
 0  558856683     1004775       0
 1  532364121    12708937       0
 2  512651494    24900193       0
 3  520037415     5100503       0
 4  566280860    26019863       0,
    product_id          category_id           category_code     brand   price
 0     1004775  2053013555631882655  electronics.smartphone    xiaomi  183.27
 1    12708937  2053013553559896355                     NaN  michelin   72.72
 2    24900193  2053013562183385881                     NaN       NaN    1.09
 3     5100503  2053013553375346967                     NaN    xiaomi   22.68
 4    26019863  2053013562837697343                     NaN       NaN   11.79)

In [None]:
import os
os.makedirs('processed_data', exist_ok=True)

print('Preparing id maps and creating train/test splits...')
user_ids = interactions['user_id'].unique()
item_ids = interactions['product_id'].unique()

user2idx = {u: i for i, u in enumerate(user_ids)}
item2idx = {p: i for i, p in enumerate(item_ids)}

interactions['user_idx'] = interactions['user_id'].map(user2idx)
interactions['item_idx'] = interactions['product_id'].map(item2idx)


import numpy as np

def leave_one_out(df, seed=None):
    if seed is not None:
        np.random.seed(seed)
    train_rows = []
    test_rows = []
    grouped = df.groupby('user_idx')
    for user, group in grouped:
        if len(group) == 1:
            train_rows.append(group.index.values[0])
        else:
            test_idx = np.random.choice(group.index.values)
            test_rows.append(test_idx)
            train_rows.extend([i for i in group.index.values if i != test_idx])
    return df.loc[train_rows].copy(), df.loc[test_rows].copy()

train_df, test_df = leave_one_out(interactions, seed=42)
print('Train interactions:', len(train_df), 'Test interactions:', len(test_df))

train_df.to_csv('processed_data/train_interactions.csv', index=False)
test_df.to_csv('processed_data/test_interactions.csv', index=False)
pd.DataFrame(list(user2idx.items()), columns=['user_id', 'user_idx']).to_csv('processed_data/user_mapping.csv', index=False)
pd.DataFrame(list(item2idx.items()), columns=['product_id', 'item_idx']).to_csv('processed_data/item_mapping.csv', index=False)

print('Saved train/test splits and mappings to processed_data/')

Preparing id maps and creating train/test splits...
Train interactions: 15129495 Test interactions: 1745904
Saved train/test splits and mappings to processed_data/


In [None]:
from scipy import sparse
import pandas as pd

print('Loading splits and mappings...')
df_train = pd.read_csv('processed_data/train_interactions.csv')
df_test = pd.read_csv('processed_data/test_interactions.csv')
user_map = pd.read_csv('processed_data/user_mapping.csv')
item_map = pd.read_csv('processed_data/item_mapping.csv')

Loading splits and mappings...


In [2]:
df_train = df_train.drop(columns=['user_idx', 'item_idx'], axis=1)
df_test = df_test.drop(columns=['user_idx', 'item_idx'], axis=1)

In [None]:
df_items = item_map.merge(items, on='product_id', how='left')

In [None]:
def precision_at_k(r, k):
    assert 1 <= k <= r.size
    return (np.asarray(r)[:k]).mean()

def average_precision_at_k(r, k):
    r = np.asarray(r)
    n_rel = r.sum()
    if n_rel == 0:
        return 0.
    vectorized_precision = np.vectorize(lambda i: precision_at_k(r, i))
    indices = np.arange(1, len(r) + 1)
    precisions = vectorized_precision(indices)
    score = np.sum(precisions * r)
    return score / min(k, n_rel)

def dcg_at_k(r, k):
    r = np.asarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)

    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

In [None]:
import numpy as np
from scipy.sparse import csr_matrix

user_items = {}
itemset = set()

for row in df_train.itertuples():
    user_items.setdefault(row[1], []).append(row[2])
    itemset.add(row[2])

itemset = np.sort(list(itemset))
item2idx = {item: i for i, item in enumerate(itemset)}
user_ids = {user: i for i, user in enumerate(user_items.keys())}

rows, cols, data = [], [], []

for user, items in user_items.items():
    u_idx = user_ids[user]
    for item in items:
        i_idx = item2idx[item]
        rows.append(u_idx)
        cols.append(i_idx)
        data.append(1)

sparse_matrix = csr_matrix((data, (rows, cols)), 
                           shape=(len(user_items), len(itemset)), 
                           dtype=np.int8)

user_item_matrix = sparse_matrix  
item_user_matrix = sparse_matrix.T

In [5]:
# Mapeo de user id a fila de la matriz sparse
user2row = {user_id: matrix_row for matrix_row, user_id in enumerate(user_items.keys())}
row2user = {matrix_row: user_id for user_id, matrix_row in user2row.items()}

# Mapeo de item id a columna de la matriz sparse
item2col = {item_id: matrix_col for matrix_col, item_id in enumerate(itemset)}
col2item = {matrix_col: item_id for item_id, matrix_col in item2col.items()}

In [40]:
user_items_test = {}

for row in df_test.itertuples():
    if row[1] not in user_items_test:
        user_items_test[row[1]] = []

    user_items_test[row[1]].append(row[2])

def evaluate_model(model, n):
  mean_ap = 0. # o MAP
  mean_ndcg = 0.
  for user_id in user_items_test.keys():
    user_row = user2row[user_id]
    rec = model.recommend(user_row, user_item_matrix[user_row], n)[0]
    rec = [col2item[col] for col in rec]
    rel_vector = np.isin(rec, user_items_test[user_id], assume_unique=True).astype(int)
    mean_ap += average_precision_at_k(rel_vector, n)
    mean_ndcg += ndcg_at_k(rel_vector, n)

  mean_ap /= len(user_items_test)
  mean_ndcg /= len(user_items_test)

  return mean_ap, mean_ndcg

In [22]:
def show_recommendations(model, user, n):
  recommendations = model.recommend(userid=user, user_items=user_item_matrix[user], N=n)[0]
  return items.loc[recommendations]['product_id']

In [23]:
def show_similar_movies(model, item, n=10):
  sim_items = model.similar_items(item, n)[0]
  return items.loc[sim_items]['product_id']

In [10]:
import implicit
model_als = implicit.als.AlternatingLeastSquares(factors=100, iterations=10, use_gpu=False)
model_als.fit(user_item_matrix)

  from .autonotebook import tqdm as notebook_tqdm
  check_blas_config()
100%|██████████| 10/10 [00:48<00:00,  4.81s/it]


In [25]:
show_recommendations(model_als, user=2, n=10)

18420    45602451
18293    60400014
1211     16600231
18407    26300705
18398    26204628
18514     7900440
1235     13200828
18517    38900074
1016      1480544
923      28720409
Name: product_id, dtype: int64

In [39]:
maprec, ndcg = evaluate_model()
print('map: {}\nndcg: {}'.format(maprec, ndcg))

map: 0.057643754834339936
ndcg: 0.06650317209662725
