### Evaluation Metrics

In [1]:
import pandas as pd
import os
from pathlib import Path
import scipy
import pickle
import scipy.stats
from numpy import *
import numpy as np
from numpy import linalg as la
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
root = Path(".")

In [2]:
rating_data = pd.read_csv('ml-1m/ratings.dat', header=None, sep='::')
rating_data.columns = ['UserID', 'ItemID', 'Rating', 'Timestamp']
rating_data.drop(columns=['Timestamp'], axis=1, inplace=True)

  rating_data = pd.read_csv('ml-1m/ratings.dat', header=None, sep='::')


In [3]:
rating_data

Unnamed: 0,UserID,ItemID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [4]:
item_cnt = max(rating_data['ItemID']) + 1
users_cnt = max(rating_data['UserID']) + 1
item_cnt, users_cnt

(3953, 6041)

In [5]:
rating_train = rating_data.sample(frac=0.8, random_state=200)
rating_test = rating_data.drop(rating_train.index)

In [6]:
rating_test

Unnamed: 0,UserID,ItemID,Rating
0,1,1193,5
2,1,914,3
8,1,594,4
9,1,919,4
13,1,2918,4
...,...,...,...
1000201,6040,1080,4
1000202,6040,1089,4
1000204,6040,1091,1
1000207,6040,1096,4


In [7]:
matrix = np.zeros(shape=(users_cnt, item_cnt))
matrix.shape

(6041, 3953)

In [8]:
for row in rating_data.itertuples():
    matrix[row.UserID][row.ItemID] = row.Rating
actual_matrix = matrix
actual_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 3., 0., ..., 0., 0., 0.]])

### Precision at top K
Precision at k is the propotion of recommended items in the top k that are relevant

In [9]:
def topKPrecision(actual_mat,pred_mat,test_data,K):
    pred_ui_data = dict()
    for row in test_data.itertuples():
        if pred_ui_data.get(row.UserID) == None:
            pred_ui_data[row.UserID] = [(row.ItemID,pred_mat[row.UserID,row.ItemID])]
        else:
            pred_ui_data[row.UserID].append((row.ItemID,pred_mat[row.UserID,row.ItemID]))
    cnt = 0
    tot = 0
    for user in pred_ui_data.keys():
        pred_ui_data[user] = sorted(pred_ui_data[user],key=lambda x: x[-1],reverse = True)
        if(len(pred_ui_data[user]) >= K):
            cnt += 1
            tp = 0
            for item,pr in pred_ui_data[user][:K]:
                if actual_mat[user,item] >= 3.5:
                    tp += 1
            tot += tp/K
    if cnt > 0:
        return tot/cnt
    else:
        print("Error : very large K")

### Spearman Coefficient
Instead of calculating correlation over the raw item score,  we calculate it over the rank of the items ordered in the set. It the correlation of the ranks of items between two different sets.

In [10]:
def spearmanCoeff(actual_mat,pred_mat,test_data):
    pred_ui_data = dict()
    actual_ui_data = dict()
    for row in test_data.itertuples():
        if pred_ui_data.get(row.UserID) == None:
            pred_ui_data[row.UserID] = [(row.ItemID,pred_mat[row.UserID,row.ItemID]),]
            actual_ui_data[row.UserID] = [(row.ItemID,actual_mat[row.UserID,row.ItemID]),]
        else:
            pred_ui_data[row.UserID].append((row.ItemID,pred_mat[row.UserID,row.ItemID]))
            actual_ui_data[row.UserID].append((row.ItemID,actual_mat[row.UserID,row.ItemID]))
    
    cnt = 0
    tot = 0
    for user in pred_ui_data.keys():
        pred_ui_data[user] = sorted(pred_ui_data[user],key=lambda x: x[-1],reverse = True)
        actual_ui_data[user] = sorted(actual_ui_data[user],key=lambda x: x[-1],reverse = True)
        
        pred_ranking = dict()
        actual_ranking = dict()
        for i in range(len(pred_ui_data[user])):
            pred_ranking[pred_ui_data[user][i][0]] = i+1
            actual_ranking[actual_ui_data[user][i][0]] = i+1
        x = []
        y = []
        for key in pred_ranking.keys():
            x.append(pred_ranking[key])
            y.append(actual_ranking[key])
        if len(x) > 1 and len(y) > 1 :
            cnt += 1
            tot += pearsonr(x, y)[0]
    if cnt > 0:
        return tot/cnt
    else:
        print("Error : ....")

In [192]:
my_path = root / "Pickled_files" / "pred_matrix_collab"
dbfile = open(my_path, 'rb')     
pred_matrix = pickle.load(dbfile)
dbfile.close()

In [193]:
pred_matrix

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 4.8205123 , 4.39766519, ..., 4.70378116, 4.73979053,
        4.68459535],
       [0.        , 4.13735754, 3.        , ..., 3.27905294, 3.47972922,
        3.34977903],
       ...,
       [0.        , 4.60885315, 4.16276878, ..., 4.31833435, 4.38061175,
        4.28871652],
       [0.        , 3.51136871, 3.41617474, ..., 3.61797934, 3.6076365 ,
        3.56541669],
       [0.        , 3.        , 3.56061577, ..., 3.52780734, 3.44930387,
        3.49752001]])

In [194]:
actual_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 3., 0., ..., 0., 0., 0.]])

In [195]:
topKPrecision(actual_matrix,pred_matrix,rating_test,10)

0.6423736565286998

In [196]:
spearmanCoeff(actual_matrix,pred_matrix,rating_test)

0.042090353730770746

In [197]:
my_path = root / "Pickled_files" / "SVD"
dbfile = open(my_path, 'rb')     
pred_matrix = pickle.load(dbfile)
dbfile.close()

In [198]:
pred_matrix

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 4.34090659, 4.00957505, ..., 3.75676603, 3.7762496 ,
        3.76891274],
       [0.        , 2.67049888, 3.32927022, ..., 2.73296471, 2.9015349 ,
        2.76117535],
       ...,
       [0.        , 3.3217707 , 3.33221987, ..., 3.03125969, 3.09160169,
        3.03405526],
       [0.        , 2.9996204 , 3.3182429 , ..., 3.79183653, 3.29302465,
        3.82776389],
       [0.        , 3.        , 2.97345656, ..., 3.51274296, 3.05631152,
        3.42150004]])

In [199]:
topKPrecision(actual_matrix,pred_matrix,rating_test,10)

0.6107935056025595

In [200]:
spearmanCoeff(actual_matrix,pred_matrix,rating_test)

0.007064679196030974

In [201]:
my_path = root / "Pickled_files" / "SVD90"
dbfile = open(my_path, 'rb')     
pred_matrix = pickle.load(dbfile)
dbfile.close()

In [202]:
topKPrecision(actual_matrix,pred_matrix,rating_test,10)

0.6108621083924074

In [203]:
spearmanCoeff(actual_matrix,pred_matrix,rating_test)

0.009335124141536059

In [11]:
my_path = root / "Pickled_files" / "CUR90"
dbfile = open(my_path, 'rb')     
pred_matrix = pickle.load(dbfile)
dbfile.close()

In [12]:
topKPrecision(actual_matrix,pred_matrix,rating_test,10)

0.6075691744797624

In [13]:
spearmanCoeff(actual_matrix,pred_matrix,rating_test)

0.0010459932683929657

In [14]:
my_path = root / "Pickled_files" / "CUR100"
dbfile = open(my_path, 'rb')     
pred_matrix = pickle.load(dbfile)
dbfile.close()

In [15]:
topKPrecision(actual_matrix,pred_matrix,rating_test,10)

0.6119826206265719

In [16]:
spearmanCoeff(actual_matrix,pred_matrix,rating_test)

0.0033905042404267717

### Tabulated Summary
| Recommender System Technique | RMSE |Precision on top K|Spearman Correlation|Prediction Time|
|-----|--------|------|------|------|
|Collaborative Filtering|1.1558117182207637|0.6423736565286998|0.042090353730770746|------|
|Collaborative Filtering with baseline|0.9211736239738006|-------|------|------|
|SVD|1.1876982325501928|0.6107935056025595|0.007064679196030974|------|
|SVD with 90% energy|1.1767304502722231|0.6108621083924074|0.009335124141536059|------|
|CUR|1.1857676139882922|0.6119826206265719|0.0033905042404267717|------|
|CUR with 90% energy|1.1729375072496155|0.6075691744797624|0.0010459932683929657|------|