# Simple Collaborative Filtering model using Matrix Factorization

In [1]:
from collections import defaultdict
import pandas as pd
import yaml
from surprise.model_selection import KFold
from surprise import Dataset, Reader, SVD

# Load Config

In [2]:
# Move the working directory up one level before opening config.yaml by its
# relative path (presumably from a notebooks/ subfolder to the repo root — confirm).
%cd ..
# Parse the YAML configuration into `config`; later cells read paths from it.
with open("config.yaml") as config_file:
    config = yaml.safe_load(config_file)

/Users/yingkang/4thBrain/GNN-eCommerce


# Load Interaction Matrix from csv

In [3]:
# Read the preprocessed user-item interaction table. The directory taken from
# config['data']['preprocessed'] is concatenated directly with the file name,
# so the configured value presumably ends with a path separator — confirm.
interaction_matrix = pd.read_csv(config['data']['preprocessed'] + "u_i_weight_0.01_0.1_-0.09.csv")
# Alternative input file: u_i_weight_0.15_0.35_-0.2.csv
# NOTE(review): the numeric suffixes presumably encode per-event-type weights — confirm.

In [5]:
# Work on the first 1000 rows only (presumably to keep experimentation fast).
# NOTE(review): the cross-validation output recorded in this notebook reports
# ~76k users per fold, which a 1000-row sample cannot produce; that run
# presumably used the full table — confirm.
mini = interaction_matrix.head(1000)
# Declare the rating scale of the 'weight' column as 0..1.
# NOTE(review): the input file name contains a negative value (-0.09); confirm
# that all weights actually lie within [0, 1].
reader = Reader(rating_scale=(0, 1))
# Build the Surprise dataset from the user_id / item_id / weight columns, in that order.
surprise_dataset = Dataset.load_from_df(mini[['user_id', 'item_id', 'weight']], reader)

In [6]:
# Split the dataset into 5 folds; for each split, fit a default-parameter SVD
# on the training part and predict the held-out part.
num_folds = 5
kf = KFold(n_splits=num_folds)
algo = SVD()

for trainset, testset in kf.split(surprise_dataset):
    algo.fit(trainset)
    # Rebound on every iteration: once the loop ends, `predictions` holds only
    # the last fold's predictions.
    predictions = algo.test(testset)


In [12]:
# Turn the prediction records into a table, discard the 'details' field, and
# give the four remaining columns readable names.
predictions = pd.DataFrame(predictions).drop(columns="details")
predictions.columns = ['userId', 'itemId', 'actual', 'prediction']
predictions.head()

Unnamed: 0,userId,itemId,actual,prediction
0,20554973,5676936,0.01,0.041316
1,20554973,5813483,0.01,0.041316
2,20554973,5650596,0.02,0.041316
3,19762782,5801431,0.01,0.205881
4,20554973,5847338,0.01,0.041316


In [14]:
# Reshape into a user x item matrix of predicted values; user/item pairs that
# have no prediction are filled with 0.
predictions = (
    predictions
    .pivot_table(values='prediction', index='userId', columns='itemId')
    .fillna(0)
)
predictions.shape

(36, 198)

In [12]:
# n-fold cross validation: fit SVD on each training fold, score the held-out
# fold with precision@k / recall@k, then average the per-fold means.
num_folds = 5
kf = KFold(n_splits=num_folds)
algo = SVD()

ps = []  # per-fold mean precision
rs = []  # per-fold mean recall
for trainset, testset in kf.split(surprise_dataset):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=10, est_threshold=0.5, true_threshold=1.0)

    # precision_recall_at_k leaves out users without any relevant item, so a
    # fold can yield empty dicts; averaging them would divide by zero.
    if not precisions:
        print("no users with relevant items in this fold; skipping")
        continue

    # Precision and recall can then be averaged over all users
    average_p = sum(precisions.values()) / len(precisions)
    average_r = sum(recalls.values()) / len(recalls)
    print("precision:", average_p, "| recall:", average_r, "| users count:", len(recalls))
    ps.append(average_p)
    rs.append(average_r)

# Only report the overall averages when at least one fold was scored.
if ps:
    print("average precision:", sum(ps) / len(ps), "| average recall:", sum(rs) / len(rs) )
else:
    print("no fold contained users with relevant items; metrics are undefined")

precision: 0.15431949972487433 | recall: 0.12702260807694613 | users count: 76127
precision: 0.15476842531803514 | recall: 0.12748756584243737 | users count: 76072
precision: 0.15401223197329858 | recall: 0.12714085528976968 | users count: 76003
precision: 0.15329944747097216 | recall: 0.12593473286268783 | users count: 76152
precision: 0.15532556263789318 | recall: 0.12743901449786119 | users count: 75865
average precision: 0.15434503342501468 | average recall: 0.12700495531394046


# Precision@K and Recall@K metric function

In [11]:
def precision_recall_at_k(predictions, k=10, est_threshold=0.5, true_threshold=1.0):
    """Compute precision@k and recall@k separately for every user.

    Ref: https://surprise.readthedocs.io/en/stable/FAQ.html

    Args:
        predictions: iterable of 5-item records unpacked as
            (user id, item id, true rating, estimated rating, details).
        k: how many of a user's highest-estimated items are inspected.
        est_threshold: an item counts as recommended when its estimate is
            greater than or equal to this value.
        true_threshold: an item counts as relevant when its true rating is
            greater than or equal to this value.

    Returns:
        A ``(precisions, recalls)`` pair of dicts keyed by user id. Users
        with no relevant item at all appear in neither dict.
    """
    # Collect the (estimate, true rating) pairs belonging to each user.
    ratings_by_user = defaultdict(list)
    for prediction in predictions:
        user, _item, actual, estimate, _details = prediction
        ratings_by_user[user].append((estimate, actual))

    precisions = {}
    recalls = {}
    for user, pairs in ratings_by_user.items():
        # Highest estimates first; ties keep their original relative order.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)

        # Relevant items are counted over everything the user rated, not
        # just the top k.
        relevant_total = sum(1 for _, actual in ranked if actual >= true_threshold)
        if relevant_total == 0:
            # Nothing relevant for this user: leave them out of both results.
            continue

        # True ratings of the top-k items that clear the estimate threshold.
        recommended = [
            actual for estimate, actual in ranked[:k] if estimate >= est_threshold
        ]
        recommended_total = len(recommended)
        hits = sum(1 for actual in recommended if actual >= true_threshold)

        # Precision@k is undefined when nothing was recommended; report 0.
        if recommended_total != 0:
            precisions[user] = hits / recommended_total
        else:
            precisions[user] = 0

        # Recall@k: share of the user's relevant items found in the top k.
        recalls[user] = hits / relevant_total

    return precisions, recalls


# Further Work
Use the event time to influence the weight of each event, i.e., more recent events should carry more weight.