# Simple Collaborative Filtering model using Matrix Factorization

In [2]:
from collections import defaultdict
import pandas as pd
import yaml
import glob
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
from surprise import SVD
from surprise.model_selection import cross_validate

# Load Config


In [None]:
%pwd

In [1]:
%cd ..

/Users/CPang/Ying/GNN-eCommerce


In [4]:
# Parse the project parameters from params.yaml; the relative path is resolved
# against the current working directory, so this must run from the repo root.
with open("params.yaml") as config_file:
    config = yaml.safe_load(config_file)

# Show the parsed configuration as the cell output
config

{'base': {'random_seed': 42},
 'data': {'cosmetic_shop': 'data/raw/cosmetic-shop-ecommerce-events/'},
 'training': {'event_type_weights': {'view': 0.01,
   'cart': 0.1,
   'remove_from_cart': -0.09,
   'purchase': 1.0}},
 'reports': None}

# Load Dataset and EDA

In [5]:
# Get CSV files list of cosmetic shop dataset (file names starting with "20").
# The list is sorted so that the concatenation order -- and therefore the row
# order of `all_events` -- does not depend on how the filesystem lists files.
path = config['data']['cosmetic_shop']
csv_files = sorted(glob.glob(path + "/20*.csv"))
if not csv_files:
    # Fail with an explicit message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError("No CSV files matching '20*.csv' found in {!r}".format(path))

# Lazily read each CSV file: the generator yields one DataFrame per file
df_list = (pd.read_csv(file) for file in csv_files)

# Concatenate all DataFrames into a single frame with a fresh 0..n-1 index
all_events = pd.concat(df_list, ignore_index=True)

In [6]:
# Print the (rows, columns) shape, then show 10 randomly sampled rows as the cell output
print(all_events.shape)
all_events.sample(10)

(20692840, 9)


Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
18439902,2019-10-14 07:49:19 UTC,view,5889092,1720400165430363096,,,3.65,560038527,d380fec1-4eee-4b40-940e-25ff5305571b
16072835,2019-11-28 10:55:53 UTC,remove_from_cart,5793077,1487580007776650194,,,0.75,478239488,f6d4d746-424a-4acb-9bc5-ac2ea7ac05a1
2834254,2020-01-22 11:03:23 UTC,cart,5786837,1783999068909863670,,smart,5.56,585185095,94cdcb42-9f6f-45d8-90b9-3770d39f3251
16844423,2019-10-02 13:20:46 UTC,view,5848407,1487580007675986893,,freedecor,0.79,533866480,7e922394-c369-4ae9-99a0-fc1b8495ba2d
15635685,2019-11-25 12:25:40 UTC,view,5886802,1487580013170524342,,,5.56,509522345,69bbbf16-fb51-4ec9-bf50-8aa14ea5c5cb
1298770,2020-01-11 21:36:56 UTC,view,5916489,1487580013279576251,,relouis,4.13,599906722,a55de4b7-0a87-4ae1-b5a0-f8f657764ef6
2691355,2020-01-21 11:33:24 UTC,view,5899164,1487580005293622112,,opi,26.95,586476743,e292d58c-ba70-4342-81cf-81c2a14fe5a8
2927998,2020-01-23 04:14:22 UTC,remove_from_cart,5857673,1487580007399162817,,artex,4.37,422178927,688e954e-668a-452a-a752-43af00635f0a
16733449,2019-10-02 00:06:39 UTC,view,5854330,1487580005134238553,,runail,2.38,555815917,e49be861-baf5-4331-b3b1-b4dc96ecc7b9
18092821,2019-10-11 07:57:09 UTC,purchase,5621738,1487580009605365797,,,0.3,558421695,e80202ce-0c0d-4d7e-b7b8-9fb4a8d24e8c


In [7]:
# Distinct users and products seen in the events
# (dropna=False keeps a missing id, if any, counted as one distinct value)
users_count = all_events['user_id'].nunique(dropna=False)
products_count = all_events['product_id'].nunique(dropna=False)

print("Unique users: ", users_count)
print("Unique products: ", products_count)

# Distinct event types, listed in order of first appearance
print("Event types: ", all_events['event_type'].unique().tolist())


Unique users:  1639358
Unique products:  54571
Event types:  ['view', 'cart', 'remove_from_cart', 'purchase']


In [30]:
# Column subset used while developing the code below. No rows are dropped here,
# so despite the `_mini` suffix this frame still holds every event.
all_events_mini = all_events[['user_id', 'product_id', 'event_type']]


In [31]:
def interaction_matrix(events, event_type_weights):
    """Aggregate raw events into one weighted row per (user_id, product_id) pair.

    Parameters
    ----------
    events : pandas.DataFrame
        Must contain the columns ``user_id``, ``product_id`` and ``event_type``.
        The frame is not modified.
    event_type_weights : dict
        Maps every event type occurring in ``events`` to a numeric weight.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``product_id``, ``event_type`` (list of the event
        types recorded for the pair) and ``weight``: the sum of the pair's
        event weights, replaced by 0.01 when the sum is negative and capped
        at 1.0.

    Raises
    ------
    KeyError
        If ``events`` contains an event type that has no entry in
        ``event_type_weights``.
    """
    pairs = events[['user_id', 'product_id', 'event_type']]

    # `Series.map` would silently turn unknown event types into NaN, so check
    # explicitly and keep failing with KeyError as a plain dict lookup does.
    known = pairs['event_type'].isin(list(event_type_weights))
    if not known.all():
        missing = pairs.loc[~known, 'event_type'].unique().tolist()
        raise KeyError("No weight configured for event type(s): {}".format(missing))

    # `assign` returns a new frame, so nothing is written onto a slice of the
    # caller's DataFrame (no SettingWithCopyWarning, no accidental mutation).
    weighted = pairs.assign(weight=pairs['event_type'].map(event_type_weights))

    # Built-in 'sum' aggregation avoids calling a Python lambda once per group
    i_matrix = weighted.groupby(['user_id', 'product_id'], as_index=False) \
        .agg({'event_type': list, 'weight': 'sum'})

    # Negative totals become 0.01 and totals above 1.0 are capped at 1.0
    total = i_matrix['weight'].astype(float)
    i_matrix['weight'] = total.mask(total < 0, 0.01).clip(upper=1.0)

    return i_matrix

In [32]:
# Build the interaction matrix from the events, using the per-event-type
# weights configured under `training.event_type_weights`
im = interaction_matrix(all_events_mini, config['training']['event_type_weights'])
# Show the resulting frame as the cell output
im

Unnamed: 0,user_id,product_id,event_type,weight
0,465496,5769989,[view],0.01
1,465496,5865524,[view],0.01
2,465496,5865526,[view],0.01
3,1120748,5240,[view],0.01
4,1180452,5881337,[view],0.01
...,...,...,...,...
10157403,622090043,5850628,[view],0.01
10157404,622090052,5688691,[view],0.01
10157405,622090052,5931986,[view],0.01
10157406,622090098,5650609,[view],0.01


In [33]:
# Persist the interaction matrix as CSV; the relative path is resolved against
# the current working directory.
# NOTE(review): presumably the target directory must already exist -- confirm.
im.to_csv('data/preprocessed/interaction_matrix.csv')

##### Reference code for relabeling user_nodes and item_nodes

In [None]:
# refer code for relabeling user_nodes, item_nodes
# NOTE(review): this cell has no execution count and `df` is not defined in the
# visible part of the notebook -- it looks like reference code that has not
# been adapted to this notebook yet.
from sklearn.model_selection import train_test_split

# change df to im??
# Split the rows 80/20 into train and test arrays with a fixed random_state
train, test = train_test_split(df.values, test_size=0.2, random_state=16)
# Wrap the split arrays back into DataFrames carrying the original column names
train_df = pd.DataFrame(train, columns=df.columns)
test_df = pd.DataFrame(test, columns=df.columns)

In [None]:
from sklearn import preprocessing as pp

# One label encoder per id column, fitted on the training rows only
le_user = pp.LabelEncoder()
le_item = pp.LabelEncoder()
# Store the encoded ids next to the raw ids.
# NOTE(review): the interaction matrix built in this notebook names its item
# column `product_id`; whether `train_df` has an `item_id` column depends on
# the undefined `df` -- confirm before running.
train_df['user_id_idx'] = le_user.fit_transform(train_df['user_id'].values)
train_df['item_id_idx'] = le_item.fit_transform(train_df['item_id'].values)

In [None]:
# Distinct user and item ids that occur in the training split
train_user_ids = train_df['user_id'].unique()
train_item_ids = train_df['item_id'].unique()

print('Unique train set user_ids/ item_ids:', len(train_user_ids), len(train_item_ids))

# Keep only test rows whose user AND item both occur in the training split,
# so the encoders fitted on the training ids can transform them
test_df = test_df[
  (test_df['user_id'].isin(train_user_ids)) & \
  (test_df['item_id'].isin(train_item_ids))
]
# `test` is the unfiltered split array, `test_df` the filtered frame
print('Size of test set before/ after(remove user/item nodes not in train set):', len(test), len(test_df))

In [None]:
# Encode the test ids with the encoders fitted on the training split.
# NOTE(review): `test_df` is the result of a boolean filter, so on pandas
# versions without copy-on-write these assignments presumably emit a
# SettingWithCopyWarning -- confirm when this cell is adapted.
test_df['user_id_idx'] = le_user.transform(test_df['user_id'].values)
test_df['item_id_idx'] = le_item.transform(test_df['item_id'].values)

In [11]:
from surprise import Dataset, Reader

# Declare the rating range as 0..1, then wrap the (user, item, rating) columns
# of the interaction matrix in a Surprise dataset.
reader = Reader(rating_scale=(0, 1))
surprise_dataset = Dataset.load_from_df(
    im[['user_id', 'product_id', 'weight']],
    reader,
)

In [None]:
from surprise.model_selection import KFold

# Seed both the SVD model and the fold shuffling with the seed configured in
# params.yaml, so that re-running the notebook reproduces the same metrics.
random_seed = config['base']['random_seed']

# Use the famous SVD algorithm.
algo = SVD(random_state=random_seed)

# Run 5-fold cross-validation and print results.
cross_validate(
    algo,
    surprise_dataset,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=random_seed),
    verbose=True,
)

In [29]:
from surprise.model_selection import KFold


def precision_recall_at_k(predictions, k=10, est_threshold=0.5, true_threshold=1.0):
    """Compute precision@k and recall@k separately for every user.

    Adapted from the Surprise FAQ:
    https://surprise.readthedocs.io/en/stable/FAQ.html

    Each prediction is unpacked as a 5-tuple whose first, third and fourth
    fields are the user id, the true rating and the estimated rating.
    An item is "relevant" when its true rating is >= ``true_threshold`` and
    "recommended" when its estimate is >= ``est_threshold``.

    Returns a ``(precisions, recalls)`` pair of dicts keyed by user id.
    A ratio whose denominator is zero is reported as 0.
    """
    # Collect (estimate, true rating) pairs per user
    ratings_by_user = defaultdict(list)
    for user, _item, actual, estimate, _details in predictions:
        ratings_by_user[user].append((estimate, actual))

    precisions = {}
    recalls = {}
    for user, pairs in ratings_by_user.items():
        # Highest estimates first; only the first k count as recommendations
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items among ALL of the user's predictions
        relevant_total = sum(actual >= true_threshold for _, actual in ranked)

        # Items in the top k whose estimate clears the recommendation threshold
        recommended_k = sum(estimate >= est_threshold for estimate, _ in top_k)

        # Items in the top k that are both relevant and recommended
        hits_k = sum(
            (actual >= true_threshold) and (estimate >= est_threshold)
            for estimate, actual in top_k
        )

        # Precision@k: share of recommended items that are relevant
        if recommended_k != 0:
            precisions[user] = hits_k / recommended_k
        else:
            precisions[user] = 0

        # Recall@k: share of relevant items that were recommended
        if relevant_total != 0:
            recalls[user] = hits_k / relevant_total
        else:
            recalls[user] = 0

    return precisions, recalls

# Seed the fold shuffling and the SVD model with the seed configured in
# params.yaml, so that the reported precision/recall values are reproducible.
random_seed = config['base']['random_seed']

num_folds = 5
kf = KFold(n_splits=num_folds, random_state=random_seed)
algo = SVD(random_state=random_seed)

# NOTE(review): not used in this cell, and it rebinds `train_test_split`, which
# an earlier cell imports from scikit-learn. Kept so that any later cell
# relying on the Surprise version keeps working.
from surprise.model_selection import train_test_split

# Per-fold precision@5 / recall@5, each averaged over all users of the fold
ps = []
rs = []
for trainset, testset in kf.split(surprise_dataset):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, est_threshold=0.5, true_threshold=1.0)

    # Precision and recall can then be averaged over all users
    ps.append(sum(precisions.values()) / len(precisions))
    rs.append(sum(recalls.values()) / len(recalls))

print("precisions: ", ps)
print("recalls: ", rs)
print("average precision: ", sum(ps) / len(ps) )
print("average recall: ", sum(rs) / len(rs) )

precisions:  [0.015518272623960766, 0.015485735595748647, 0.014833547912451717, 0.015699322242760324, 0.014385321100917432]
recalls:  [0.011799509480291825, 0.013162719870719129, 0.011683919252431299, 0.012499469877011468, 0.011978649535988989]
average precision:  0.015184439895167778
average recall:  0.012224853603288541


# Further Work
Use the event time to influence the weight of each event, i.e., more recent events should carry more weight.