In this notebook, you will obtain an introductory acquiantiance to baseline recommendation algorithms and evaluation methods.
The recommendation dataset we will be using is from a collection called MovieLens, which contains users’ movie ratings and is popular for implementing and testing recommender systems. The specific dataset we will be using for this lab is MovieLens 100K Dataset which contains 100,000 movie ratings from 943 users and a selection of 1682 movies. In recommendation research works, usually a larger version of this dataset, MovieLens 20M is used instead.
First, we import the necessary packages.


In [1]:
# import required libraries
# !pip install wget
import os
import os.path
import numpy as np
import pandas as pd
import random
import math
from math import sqrt
from heapq import nlargest
from tqdm import trange
from tqdm import tqdm
from pathlib import Path
from scipy.linalg import sqrtm
from dotmap import DotMap
from surprise import Reader, Dataset

import scipy
from scipy import stats
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import wget

Downloading the dataset and have a glance on its statistics

In [2]:
DATA_PATH = "/ssd003/projects/aieng/public/recsys_datasets/kasandr/de"

train_path = os.path.join(DATA_PATH, r'train_de.csv')
# test_path = os.path.join(DATA_PATH, r'test_de.csv')

df_train = pd.read_csv(train_path, delimiter = '\t')
# df_test = pd.read_csv(test_path, delimiter = '\t')


In [3]:
args = DotMap()

args.min_uc = 5
args.max_uc = 1000
args.min_mc = 5
args.min_us_prod_types = 10


def sample_by_user(df, sample_rate=0.05):
    user_list = list(set(df['userid']))
    sampled_users = random.sample(user_list, math.floor(len(user_list)*sample_rate))
    df = df[df['userid'].isin(sampled_users)]
    return df


def filter_min_mc(df):
    """removes the movie records that have been rated less frequent than minimun

    Args:
        df (pd.Dataframe): a pandas dataframe including uid, sid, rating, timestamp, 
        title, and genre)

    Returns:
        df (pd.Dataframe): the updated dataframe
    """
    if args.min_mc > 0:
            item_sizes = df.groupby('offerid').size()
            good_items = item_sizes.index[item_sizes >= args.min_mc]
            df = df[df['offerid'].isin(good_items)]
    return df

def filter_min_uc(df):
    """removes the user records that have rated less frequent than minimun

    Args:
        df (pd.Dataframe): a pandas dataframe including uid, sid, rating, timestamp, 
        title, and genre)

    Returns:
        df (pd.Dataframe): the updated dataframe
    """
    if args.min_uc > 0:
            user_sizes = df.groupby('userid').size()
            good_users = user_sizes.index[user_sizes >= args.min_uc]
            df = df[df['userid'].isin(good_users)]
    return df

def filter_max_uc(df):
    """removes the user records that have too many records

    Args:
        df (pd.Dataframe): a pandas dataframe including uid, sid, rating, timestamp, 
        title, and genre)

    Returns:
        df (pd.Dataframe): the updated dataframe
    """
    if args.max_uc > 0:
            user_sizes = df.groupby('userid').size()
            good_users = user_sizes.index[user_sizes <= args.max_uc]
            df = df[df['userid'].isin(good_users)]
    return df

def filter_min_user_product_types(df, prod_types=args.min_us_prod_types):
    if prod_types > 0:
            user_num_products = df[df['rating'] == 1].groupby('userid')['offerid'].agg(unique_count='nunique').reset_index()
            
            good_users = user_num_products[user_num_products['unique_count'] >= prod_types]['userid']
            df = df[df['userid'].isin(good_users)]
    return df

def filter_single_label(df):
    user_num_products = df.groupby('userid')['rating'].agg(rating_types='nunique').reset_index()
    good_users = user_num_products[user_num_products['rating_types'] >= 2]['userid']
    df = df[df['userid'].isin(good_users)]
    return df



def densify_index(df):
    """reassigns the user and movie ids to remove the gaps caused by deletions

    Args:
        df (pd.Dataframe): a pandas dataframe including uid, sid, rating, timestamp, 
        title, and genre)

    Returns:
        df (pd.Dataframe): the updated dataframe
    """
    umap = {u: i for i, u in enumerate(set(df['userid']), 1)}
    smap = {s: i for i, s in enumerate(set(df['offerid']), 1)}
    df['userid'] = df['userid'].map(umap)
    df['offerid'] = df['offerid'].map(smap)
    return df, umap, smap

In [4]:
# df_sampled = sample_by_user(df_train)
# df_sampled

In [5]:
def data_preprocessing(df):
    df = filter_min_mc(df)
    df = filter_min_uc(df)
    df = filter_max_uc(df)
    df = filter_min_user_product_types(df)
    df = filter_single_label(df)
    df, umap, smap = densify_index(df)

    df['utcdate']= pd.to_datetime(df['utcdate'])
    df['timestamp'] = df['utcdate'].astype('int64') // 10 ** 9

    df.rename(columns={'userid':'userID',
                        'offerid':'itemID'}, inplace=True)
    
    # reassign label to fit model format
#     df.loc[df['rating'] == 1, 'rating'] = 2
#     df.loc[df['rating'] == 0, 'rating'] = 1    

    df = df[['userID','itemID','rating','timestamp']]
    df = df.sort_values(by=['userID', 'timestamp'])

    return df

df2 = data_preprocessing(df_train)
df2

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,userID,itemID,rating,timestamp
2384648,1,21080,0,1464792760
4121032,1,6914,0,1464792760
5166752,1,20031,0,1464792760
5166753,1,2412,0,1464792760
5401241,1,17342,0,1464792760
...,...,...,...,...
10127061,418,6660,0,1465722712
12571164,418,16855,0,1465722712
14707063,418,17100,0,1465722712
14788638,418,14266,0,1465722712


In [6]:
rating_df = df2.copy()
rating_df_train, rating_df_test = train_test_split(rating_df, test_size=0.2)
rating_df_train, rating_df_val = train_test_split(rating_df_train, test_size=0.25)

def sort_data(df):
    return df.sort_values(by=['userID', 'timestamp'])

rating_df_train = sort_data(rating_df_train)
rating_df_test = sort_data(rating_df_test)
rating_df_val = sort_data(rating_df_val)

rating_df_train

Unnamed: 0,userID,itemID,rating,timestamp
5166753,1,2412,0,1464792760
5955282,1,6900,0,1464792760
12706217,1,13366,0,1464792760
12223720,1,9317,0,1464792760
5955295,1,15862,1,1464792859
...,...,...,...,...
10127061,418,6660,0,1465722712
1180755,418,18893,0,1465722712
3552123,418,9357,1,1465722712
1495560,418,14210,0,1465722712


In [7]:
data_threshold = len(rating_df_train.loc[rating_df_train.rating == 1])/len(rating_df_train.loc[rating_df_train.rating == 0])
print(data_threshold)

0.23209360635185958


In [8]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD, BaselineOnly
from surprise.model_selection import cross_validate
 
# Get minimum and maximum rating from the dataset
min_rating = rating_df_train.rating.min()
max_rating = rating_df_train.rating.max()
 
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(rating_df_train[['userID', 'itemID', 'rating']], reader)

In [9]:
svd = SVD(n_epochs=10)
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.3095  0.3021  0.3047  0.3052  0.3027  0.3048  0.0026  
MAE (testset)     0.2006  0.1928  0.1983  0.1963  0.1962  0.1968  0.0026  
Fit time          0.79    0.79    0.79    0.79    0.80    0.79    0.00    
Test time         0.04    0.15    0.04    0.04    0.04    0.06    0.04    


In [10]:
from surprise.model_selection import GridSearchCV
 
param_grid = {
  'n_factors': [20, 50, 100],
  'n_epochs': [5, 10, 20]
}
 
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=10)
gs.fit(data)
 
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.28102772108044655
{'n_factors': 100, 'n_epochs': 20}


In [11]:
from surprise.model_selection import train_test_split
 
# best hyperparameters
best_factor = gs.best_params['rmse']['n_factors']
best_epoch = gs.best_params['rmse']['n_epochs']

# best_factor = 100
# best_epoch = 20
 
# sample random trainset and testset
# test set is made of 20% of the ratings.
trainset, testset = train_test_split(data, test_size=.20)
 
# # We'll use the famous SVD algorithm.
# svd = SVD(n_factors=best_factor, n_epochs=best_epoch)
 
# # Train the algorithm on the trainset
# svd.fit(trainset)

In [12]:
from sklearn.metrics import ndcg_score

def precision_recall_at_k(predictions, k=10, threshold=None):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    ndcgs = dict()
    
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
        
        # NDCG@K
        # Note current implementation of NDCG is not accurate due to very few positive samples in testing set
        
        est_list, label_list = zip(*user_ratings)
        est_list = [list(est_list)]
        label_list = [list(label_list)]
        
        try:
            ndcgs[uid] = ndcg_score(est_list, label_list, k=k)
        except Exception:
            pass

    return precisions, recalls, ndcgs

In [13]:
from collections import defaultdict
from surprise.model_selection import KFold

kf = KFold(n_splits=5)
# algo = SVD()
algo = BaselineOnly()

prec_list = []
rec_list = []
ndcg_list = []
metrics_val = [1,3,5]

for i in metrics_val:
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls, ndcgs = precision_recall_at_k(predictions, k=i, threshold=data_threshold)

    #     # Precision and recall can then be averaged over all users
    #     print(sum(prec for prec in precisions.values()) / len(precisions))
    #     print(sum(rec for rec in recalls.values()) / len(recalls))

        prec_list.append(sum(prec for prec in precisions.values()) / len(precisions))
        rec_list.append(sum(rec for rec in recalls.values()) / len(recalls))
        ndcg_list.append(sum(ndcg for ndcg in ndcgs.values()) / len(ndcgs))
        
    
    print("average precision@{}:".format(i) + str(np.mean(prec_list)))
    print("average recall@{}:".format(i) + str(np.mean(rec_list)))
    print("average ndcg@{}:".format(i) + str(np.mean(ndcg_list)))    

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
average precision@1:0.6106420279362023
average recall@1:0.29303506796947665
average ndcg@1:0.8471005984246862
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
average precision@3:0.5787585267402294
average recall@3:0.4038735965654509
average ndcg@3:0.8714687675401814
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
average precision@5:0.5606893899184489
average recall@5:0.46599609476988696
average ndcg@5:0.8854021550927585


### random model baseline

In [23]:
algo.fit(trainset)
predictions = algo.test(testset)

Estimating biases using als...


In [28]:
user_est_true = defaultdict(list)
for uid, _, true_r, est, _ in predictions:
    user_est_true[uid].append((est, true_r))

In [75]:
est_list, label_list = zip(*user_est_true[1069])



In [66]:
ndcg_score(est_list, label_list)

0.8981556457588459