In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from math import sqrt
import sys, os
from contextlib import contextmanager
import matplotlib as mpl
import seaborn as sns
import sklearn

from scipy.spatial.distance import cosine
import sklearn.metrics as metrics
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation, cosine
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error

In [20]:
#make array
movies = np.asarray([[np.nan, 5, 1, 4],
                     [2, 5, 3, np.nan],
                     [4, np.nan, 5, 4],
                     [3, 5, 4, 5], ])

In [21]:
print(movies.shape) #dimensions of the array

(4, 4)


In [22]:
print(np.nanmean(movies, axis=0)) #displays the mean for each cloumn

[3.         5.         3.25       4.33333333]


In [23]:
pd.DataFrame(movies) #puting the data into a dataframe

Unnamed: 0,0,1,2,3
0,,5.0,1.0,4.0
1,2.0,5.0,3.0,
2,4.0,,5.0,4.0
3,3.0,5.0,4.0,5.0


In [30]:
#compute the similarities using cosine
import math
def cos_sim(v1, v2, metric='cosine'):
    if metric == 'correlation':
        v1 = v1 - np.nanmean(v1)
        v2 = v2 - np.nanmean(v2)
    "Compute Similarities for v1 and v2: (v1 dot v2)/{||v1||*||v2||)"
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        if np.isnan(x) or np.isnan(y): continue
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    return sumxy/math.sqrt(sumxx*sumyy)

def sim_matrix(M, dimension='user', metric='cosine'):
    N = movies.shape[0] if dimension == 'user' else movies.shape[1]
    sim = np.zeros([N,N])
    for i in range(N):
        for j in range(N):
            if i == j:
                sim[i,j] = 0
                continue
            if dimension == 'user':
                v1, v2 = movies[i,:], movies[j,:]
            else:
                v1, v2 = movies[:,i], movies[:,j]
            sim[i][j] = cos_sim(v1,v2,metric)
    return sim

In [33]:
sim_matrix(movies, 'item')

array([[0.        , 0.98058068, 0.99792889, 0.96827732],
       [0.98058068, 0.        , 0.90582163, 0.99388373],
       [0.99792889, 0.90582163, 0.        , 0.89927103],
       [0.96827732, 0.99388373, 0.89927103, 0.        ]])

In [38]:
cos_sim(movies[0,:], movies[2,:], 'cosine')

0.7954317200324987

In [32]:
sim_matrix(movies, '')

array([[0.        , 0.98058068, 0.99792889, 0.96827732],
       [0.98058068, 0.        , 0.90582163, 0.99388373],
       [0.99792889, 0.90582163, 0.        , 0.89927103],
       [0.96827732, 0.99388373, 0.89927103, 0.        ]])

In [39]:
cos_sim(movies[0,:], movies[2,:], 'correlation')

-0.9828721869343218

In [35]:
sim_matrix(movies, 'user', 'correlation')

array([[ 0.00000000e+00,  7.29537204e-01, -9.82872187e-01,
         7.27334067e-01],
       [ 7.29537204e-01,  0.00000000e+00,  2.16930458e-01,
         9.38952956e-01],
       [-9.82872187e-01,  2.16930458e-01,  0.00000000e+00,
        -1.83870401e-16],
       [ 7.27334067e-01,  9.38952956e-01, -1.83870401e-16,
         0.00000000e+00]])

In [36]:
sim_matrix(movies, 'item', 'correlation')

  return sumxy/math.sqrt(sumxx*sumyy)


array([[ 0.        ,         nan,  0.73645969, -0.4472136 ],
       [        nan,  0.        ,         nan,         nan],
       [ 0.73645969,         nan,  0.        ,  0.27701734],
       [-0.4472136 ,         nan,  0.27701734,  0.        ]])

In [42]:
#some tests
u_users, n_items = movies.shape #dimensions of the array
avg_ratings = np.nanmean(movies, axis=1)
sim_users = sim_matrix(movies, 'user', 'cosine')
print(sim_users[1])



[0.94174191 0.         0.99624059 0.98648766]


In [61]:

def user_cf(movies, metric='cosine'):
    pred = np.copy(movies)
    n_users, n_items = movies.shape #dimensions of the array
    avg_ratings = np.nanmean(movies, axis=1)
    sim_users = sim_matrix(movies, 'user', metric)
    for i in range(n_users):
        for j in range(n_items):
            if np.isnan(movies[i,j]):
                pred[i,j] = avg_ratings[i] + np.nansum(sim_users[i] * (movies[:,j] - avg_ratings)) / sum(sim_users[i])
    return pred

In [62]:
print("User-based CF (Cosine): \n" + str(pd.DataFrame(user_cf(movies, 'cosine'))))
print("User-based CF (Pearson): \n" + str(pd.DataFrame(user_cf(movies, 'correlation'))))

User-based CF (Cosine): 
          0         1    2         3
0  2.327225  5.000000  1.0  4.000000
1  2.000000  5.000000  3.0  3.687453
2  4.000000  5.677163  5.0  4.000000
3  3.000000  5.000000  4.0  5.000000
User-based CF (Pearson): 
          0    1    2         3
0  0.054298  5.0  1.0  4.000000
1  2.000000  5.0  3.0  3.926444
2  4.000000  6.0  5.0  4.000000
3  3.000000  5.0  4.0  5.000000


In [63]:
#item based
def item_cf(movies, metric='cosine'):
    pred = np.copy(movies)
    n_users, n_items = movies.shape
    avg_ratings = np.nanmean(movies, axis=0)
    sim_items = sim_matrix(movies, 'item', metric)
    for i in range(n_users):
        for j in range(n_items):
            if np.isnan(movies[i,j]):
                pred[i,j] = avg_ratings[j] + np.nansum(sim_items[j] * (movies[i,:] - avg_ratings)) / sum(sim_items[j])
    return pred


In [65]:
print("Item-based CF (Cosine): \n" + str(pd.DataFrame(item_cf(movies, 'cosine'))))
print("Item-based CF (Pearson): \n" + str(pd.DataFrame(item_cf(movies, 'correlation'))))

Item-based CF (Cosine): 
          0         1    2         3
0  2.128509  5.000000  1.0  4.000000
1  2.000000  5.000000  3.0  3.916376
2  4.000000  5.775782  5.0  4.000000
3  3.000000  5.000000  4.0  5.000000
Item-based CF (Pearson): 
     0    1    2    3
0  NaN  5.0  1.0  4.0
1  2.0  5.0  3.0  NaN
2  4.0  NaN  5.0  4.0
3  3.0  5.0  4.0  5.0
  return sumxy/math.sqrt(sumxx*sumyy)


In [66]:
movie_res = np.asarray([[4,5,1,4],
                        [2,5,3,3],
                        [4,4,5,4],
                        [5,5,4,5], ])

In [67]:
pd.DataFrame(movie_res)

Unnamed: 0,0,1,2,3
0,4,5,1,4
1,2,5,3,3
2,4,4,5,4
3,5,5,4,5


In [71]:
def evaluateRS(ratings, groundtruth, method='user_cf', metric='cosine'):
    if method == 'user_cf':
        prediction = user_cf(ratings, metric)
    else:
        prediction = item_cf(ratings, metric)
    MSE = mean_squared_error(prediction, groundtruth)
    RMSE = round(sqrt(MSE),3)
    print("RMSE using {0} approach ({2}) is: {1}".format(method, RMSE, metric))
    print(pd.DataFrame(prediction))
    return

In [72]:
evaluateRS(movies, movie_res, 'user_cf', 'cosine')
evaluateRS(movies, movie_res, 'user_cf', 'correlation')
evaluateRS(movies, movie_res, 'item_cf', 'cosine')
evaluateRS(movies, movie_res, 'item_cf', 'correlation')

RMSE using user_cf approach (cosine) is: 0.794
          0         1    2         3
0  2.327225  5.000000  1.0  4.000000
1  2.000000  5.000000  3.0  3.687453
2  4.000000  5.677163  5.0  4.000000
3  3.000000  5.000000  4.0  5.000000
RMSE using user_cf approach (correlation) is: 1.236
          0    1    2         3
0  0.054298  5.0  1.0  4.000000
1  2.000000  5.0  3.0  3.926444
2  4.000000  6.0  5.0  4.000000
3  3.000000  5.0  4.0  5.000000
RMSE using item_cf approach (cosine) is: 0.848
          0         1    2         3
0  2.128509  5.000000  1.0  4.000000
1  2.000000  5.000000  3.0  3.916376
2  4.000000  5.775782  5.0  4.000000
3  3.000000  5.000000  4.0  5.000000
  return sumxy/math.sqrt(sumxx*sumyy)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').