In [1]:
import pip
import pandas as pd
import numpy as np
import math
from sklearn import model_selection
from sklearn import metrics
from scipy import stats

In [2]:
########## Read files ##########

# movieId, title, genres
movies = pd.read_csv('data/movies.csv', sep=',',header=0)

# movieId, imdbId, tmdbId
links = pd.read_csv('data/links.csv', sep=',',header=0)

# userId, movieId, tag, timestamp
tags = pd.read_csv('data/tags.csv', sep=',',header=0)

# userId, movieId, rating, timestamp
ratings = pd.read_csv('data/ratings.txt', sep='\t',header=0)

# tagId, tag
genomeTags = pd.read_csv('data/genome-tags.csv', sep=',',header=0)

# movieId, tagId, relevance
genomeScores = pd.read_csv('data/genome-scores.txt', sep='\t',header=0)

In [3]:
########## Spliting into train and test ##########
train_ratings, test_ratings = model_selection.train_test_split(ratings, test_size = 0.33)

In [4]:
########## Investigating the data and making auxilary data structures ##########

trainUserIds = train_ratings.userId.unique()
trainUserIds.sort()
print ("%20s" % "Train user IDs: " + str((trainUserIds.shape, trainUserIds)))

testUserIds = test_ratings.userId.unique()
testUserIds.sort()
print ("%20s" % "Test user IDs: " + str((testUserIds.shape, testUserIds)))

trainMovieIds = train_ratings.movieId.unique()
trainMovieIds.sort()
print ("%20s" % "Train movie IDs: " + str((trainMovieIds.shape, trainMovieIds)))

testMovieIds = test_ratings.movieId.unique()
testMovieIds.sort()
print ("%20s" % "Test movie IDs: " + str((testMovieIds.shape, testMovieIds)))

userIds = ratings.userId.unique()
userIds.sort()
print ("%20s" % "User IDs: " + str((userIds.shape, userIds)))

movieIds = ratings.movieId.unique()
movieIds.sort()
print ("%20s" % "Movie IDs: " + str((movieIds.shape, movieIds)))

# We can use the UserId as an index of the matrix, but we cannot use movieId
# Let's make index to ID dictionary anyway for both for robustness

userCount = userIds.size
movieCount = movieIds.size

userIndexToIdDict = dict()
userIdToIndexDict = dict()
for i in range(userCount):
    userIndexToIdDict[i] = userIds[i]
    userIdToIndexDict[userIds[i]] = i

movieIndexToIdDict = dict()
movieIdToIndexDict = dict()
for i in range(movieCount):
    movieIndexToIdDict[i] = movieIds[i]
    movieIdToIndexDict[movieIds[i]] = i

    Train user IDs: ((7120,), array([   1,    2,    3, ..., 7118, 7119, 7120], dtype=int64))
     Test user IDs: ((7119,), array([   1,    2,    3, ..., 7118, 7119, 7120], dtype=int64))
   Train movie IDs: ((13044,), array([     1,      2,      3, ..., 130462, 130490, 130642], dtype=int64))
    Test movie IDs: ((11150,), array([     1,      2,      3, ..., 129350, 129354, 130490], dtype=int64))
          User IDs: ((7120,), array([   1,    2,    3, ..., 7118, 7119, 7120], dtype=int64))
         Movie IDs: ((14026,), array([     1,      2,      3, ..., 130462, 130490, 130642], dtype=int64))


In [5]:
########## Constructing user/movie rating matrices ##########

# Constructing a train matrix
trainMatrix = np.zeros((userCount, movieCount))
for rating in train_ratings.itertuples():
    trainMatrix[int(userIdToIndexDict[rating.userId]), int(movieIdToIndexDict[rating.movieId])] = rating.rating
    
# Constructing a test matrix
testMatrix = np.zeros((userCount, movieCount))
for rating in test_ratings.itertuples():
    testMatrix[int(userIdToIndexDict[rating.userId]), int(movieIdToIndexDict[rating.movieId])] = rating.rating

In [6]:
########## Calculating simmilarity matrices ##########

appliedMetrics = np.array(['cosine', 'euclidean'])
appliedMetricCount = appliedMetrics.shape[0]
userSimilarities = dict()

# cosine simmilarity:
## Here is an explanation why cosine distance was returning 0 instead of 1 (discussed on project presentation)
## Long story short it is a DISTANCE, not similarity (snippet from documentation):
##
## Signature: metrics.pairwise.cosine_distances(X, Y=None)
## Docstring:
## Compute cosine distance between samples in X and Y.
##
## Cosine distance is defined as 1.0 minus the cosine similarity.
userSimilarities[appliedMetrics[0]] = 1 - metrics.pairwise_distances(trainMatrix, metric = appliedMetrics[0])

# euclidean simmilarity
userSimilaritiyEuclidean = metrics.pairwise_distances(trainMatrix, metric = appliedMetrics[1])
userSimilarities[appliedMetrics[1]] = 1 - userSimilaritiyEuclidean / userSimilaritiyEuclidean.max()

In [7]:
########## Calculating the predictions ##########

# First algorithm, taken as a basic algorithm is from andjelkaz' website
def prediction_basic(matrix, user_similarity):
    x_mean = matrix.mean(axis = 1)
    matrix_diff = (matrix - x_mean[:, np.newaxis])
    p = x_mean[:, np.newaxis] + user_similarity.dot(matrix_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
    return p

def prediction_basic_with_threshold(matrix, user_similarity, threshmin):
    x_mean = matrix.mean(axis = 1)
    user_similarity = stats.threshold(user_similarity, threshmin=threshmin, threshmax=1, newval=0)
    matrix_diff = (matrix - x_mean[:, np.newaxis])
    p = x_mean[:, np.newaxis] + user_similarity.dot(matrix_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
    return p

def prediction_basic_no_mean(matrix, user_similarity):
    p = user_similarity.dot(matrix) / np.array([np.abs(user_similarity).sum(axis=1)]).T
    return p

def prediction_basic_no_mean_with_threshold(matrix, user_similarity, threshmin):
    user_similarity = stats.threshold(user_similarity, threshmin=threshmin, threshmax=1, newval=0)
    p = user_similarity.dot(matrix) / np.array([np.abs(user_similarity).sum(axis=1)]).T
    return p

def prediction_basic_k_nearest_neighbours(matrix2, user_similarity2, k):
    userSimilarity = np.matrix.copy(user_similarity2)
    for i in range(userSimilarity.shape[1]):
        userSimilarityForUser = userSimilarity[i]
        kNearestNeighbours = userSimilarityForUser.argsort()[-(k + 1):][::-1][1:]

        #print ("Count before: " + str(len(userSimilarityForUser.nonzero()[0])))
        userSimilarityForUserNew = np.zeros(userSimilarity.shape[0])
        userSimilarityForUserNew[kNearestNeighbours] = userSimilarityForUser[kNearestNeighbours]
        userSimilarity[i] = userSimilarityForUserNew
        #print ("Count after (expected " + str(k) + "): " + str(len(userSimilarityForUser.nonzero()[0])))
        
    p = userSimilarity.dot(matrix2) / np.array([np.abs(userSimilarity).sum(axis=1)]).T
    p[np.bitwise_not(np.isfinite(p))] = 0
    return p


userMoviePredictions = dict()

for appliedMetric in appliedMetrics:
    userSimilarity = userSimilarities[appliedMetric]
    
    if (appliedMetric == 'cosine'):
        threshmin = 0.5
    else:
        threshmin = 0.9

    userMoviePredictions[(appliedMetric, 'basic')] = prediction_basic(testMatrix, userSimilarity)
    userMoviePredictions[(appliedMetric, 'basicWithThreshold')] = prediction_basic_with_threshold(testMatrix, userSimilarity, threshmin)
    userMoviePredictions[(appliedMetric, 'basicNoMean')] = prediction_basic_no_mean(testMatrix, userSimilarity)
    userMoviePredictions[(appliedMetric, 'basicNoMeanWithThreshold')] = prediction_basic_no_mean_with_threshold(testMatrix, userSimilarity, threshmin)
    userMoviePredictions[(appliedMetric, 'basicKNearestNeighbours')] = prediction_basic_k_nearest_neighbours(testMatrix, userSimilarity, 20)


stats.threshold is deprecated in scipy 0.17.0
  if sys.path[0] == '':
stats.threshold is deprecated in scipy 0.17.0


In [8]:
########## Visualization of the recommendations ##########

# Picking a user (similar to Milan so he can evaluate the quality of the recommender system)
userId = 1000
userIndex = userIdToIndexDict[userId]

# Picking a metric ('cosine' or 'euclidean')
appliedMetric = 'cosine'

# Picking a method ('basic', 'basicWithThreshold', 'basicNoMean', 'basicNoMeanWithThreshold', 'basicKNearestNeighbours')
predictorMethod = 'basicKNearestNeighbours'


# Getting user ratings
userRatings = ratings.where(ratings.userId == userId)
userRatings = userRatings[~np.isnan(userRatings.userId)]

# Join with movies
userRatings = userRatings.join(movies.set_index('movieId'), on='movieId', how='inner')
userRatings = pd.DataFrame.sort_values(userRatings, by='rating', ascending=False)

userSimilarity = userSimilarities[appliedMetric]
userMoviePrediction = userMoviePredictions[appliedMetric, predictorMethod]

userMoviePredictionForUser = userMoviePrediction[userIndex]

maxMovieIndexes = userMoviePredictionForUser.argsort()
maxMovieIds = np.vectorize(movieIndexToIdDict.get)(maxMovieIndexes)
maxMovieScores = np.vectorize(userMoviePredictionForUser.__getitem__)(maxMovieIndexes)
recommendedMovies = pd.DataFrame({ 'movieIndex' : maxMovieIndexes, 'movieId' : maxMovieIds, 'score' : maxMovieScores })

recommendedMovies = recommendedMovies[recommendedMovies.movieId.isin(userRatings.movieId) == False]
recommendedMovies = recommendedMovies.join(movies.set_index('movieId'), on='movieId', how='inner')
recommendedMovies = recommendedMovies[-10:][::-1]

recommendedMovies

Unnamed: 0,movieId,movieIndex,score,title,genres
14023,318,312,1.385124,"Shawshank Redemption, The (1994)",Crime|Drama
14022,1196,1142,1.310222,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
14021,1210,1155,1.128426,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
14018,480,470,1.017276,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
14016,1610,1518,0.8368,"Hunt for Red October, The (1990)",Action|Adventure|Thriller
14015,5418,5136,0.823459,"Bourne Identity, The (2002)",Action|Mystery|Thriller
14014,34405,8947,0.809744,Serenity (2005),Action|Adventure|Sci-Fi
14013,3793,3580,0.797925,X-Men (2000),Action|Adventure|Sci-Fi
14011,1374,1314,0.717659,Star Trek II: The Wrath of Khan (1982),Action|Adventure|Sci-Fi|Thriller
14010,48394,9608,0.70365,"Pan's Labyrinth (Laberinto del fauno, El) (2006)",Drama|Fantasy|Thriller


In [9]:
########## Print ratings of the selected user ##########
userRatings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
150612,1000.0,60684,5.0,1325552000.0,Watchmen (2009),Action|Drama|Mystery|Sci-Fi|Thriller|IMAX
150602,1000.0,33794,5.0,1325553000.0,Batman Begins (2005),Action|Crime|IMAX
150609,1000.0,58559,5.0,1325553000.0,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
150595,1000.0,5952,5.0,1325553000.0,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
150588,1000.0,2959,5.0,1325552000.0,Fight Club (1999),Action|Crime|Drama|Thriller
150587,1000.0,2571,5.0,1325552000.0,"Matrix, The (1999)",Action|Sci-Fi|Thriller
150596,1000.0,7153,5.0,1325553000.0,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
150620,1000.0,79132,5.0,1325552000.0,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
150592,1000.0,4993,5.0,1325553000.0,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
150580,1000.0,1690,5.0,1325552000.0,Alien: Resurrection (1997),Action|Horror|Sci-Fi


In [10]:
########## Scoring different models ##########

def evaluation(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return np.sqrt(metrics.mean_squared_error(ground_truth, prediction))

evaluations = dict()

for userMoviePrediction in userMoviePredictions:
    evaluations[userMoviePrediction] = evaluation(userMoviePredictions[userMoviePrediction], testMatrix)

evaluations

('cosine', 'basic')
('cosine', 'basicWithThreshold')
('cosine', 'basicNoMean')
('cosine', 'basicNoMeanWithThreshold')
('cosine', 'basicKNearestNeighbours')
('euclidean', 'basic')
('euclidean', 'basicWithThreshold')
('euclidean', 'basicNoMean')
('euclidean', 'basicNoMeanWithThreshold')
('euclidean', 'basicKNearestNeighbours')


{('cosine', 'basic'): 3.468107817873527,
 ('cosine', 'basicKNearestNeighbours'): 3.2253938341383761,
 ('cosine', 'basicNoMean'): 3.4875786513388198,
 ('cosine', 'basicNoMeanWithThreshold'): 0.46654955884204791,
 ('cosine', 'basicWithThreshold'): 0.46639333970177721,
 ('euclidean', 'basic'): 3.5420104390722411,
 ('euclidean', 'basicKNearestNeighbours'): 3.5802202253852204,
 ('euclidean', 'basicNoMean'): 3.5674429965073302,
 ('euclidean', 'basicNoMeanWithThreshold'): 0.81396384068789074,
 ('euclidean', 'basicWithThreshold'): 0.81372394242190893}

########## ATENTION: experimental and possibly non-working code below ##########

In [11]:
trainMatrixCut = np.matrix.copy(trainMatrix[:1001, :2000])

for i in range(trainMatrixCut.shape[1]):
    movieRatings = trainMatrixCut[:, i]
    movieRatings = movieRatings[movieRatings.nonzero()]
    if (movieRatings.size == 0):
        continue
    mean = np.mean(movieRatings)
    for j in range(trainMatrixCut.shape[0]):
        if(trainMatrixCut[j, i] == 0):
            trainMatrixCut[j, i] = mean
            
U, s, V = np.linalg.svd(trainMatrixCut)
m = U.shape[0]
n = V.shape[0]
#S = np.hstack((np.diag(s), np.zeros((m, n - m))))
#S = (np.diag(s))
S = np.zeros((m, n))
S[:m, :m] = np.diag(s)

In [12]:
P = np.dot(U, np.sqrt(S[:, :m].T))
Q = np.dot(np.sqrt(S), V.T)

# We also tried different combinations of matrix sizes (because clearly the formulae matrix sizes are not compatible),
# but none seem to change the outcome

In [13]:
trainMatrix.shape, U.shape, S.shape, V.shape, P.shape, Q.shape

((7120, 14026),
 (1001, 1001),
 (1001, 2000),
 (2000, 2000),
 (1001, 1001),
 (1001, 2000))

In [14]:
trainMatrixCut

array([[ 3.97639485,  3.5       ,  3.171875  , ...,  3.73076923,
         4.        ,  3.5       ],
       [ 3.97639485,  3.23893805,  4.        , ...,  3.73076923,
         3.625     ,  3.5       ],
       [ 3.97639485,  3.23893805,  3.171875  , ...,  3.73076923,
         3.625     ,  3.5       ],
       ..., 
       [ 3.97639485,  3.23893805,  3.        , ...,  3.73076923,
         3.625     ,  3.5       ],
       [ 3.97639485,  3.23893805,  3.171875  , ...,  3.73076923,
         3.625     ,  3.5       ],
       [ 3.97639485,  2.5       ,  3.171875  , ...,  3.5       ,
         3.625     ,  3.        ]])

In [15]:
# We can see that SVD is working as expected
np.dot(U, np.dot(S, V))

array([[ 3.97639485,  3.5       ,  3.171875  , ...,  3.73076923,
         4.        ,  3.5       ],
       [ 3.97639485,  3.23893805,  4.        , ...,  3.73076923,
         3.625     ,  3.5       ],
       [ 3.97639485,  3.23893805,  3.171875  , ...,  3.73076923,
         3.625     ,  3.5       ],
       ..., 
       [ 3.97639485,  3.23893805,  3.        , ...,  3.73076923,
         3.625     ,  3.5       ],
       [ 3.97639485,  3.23893805,  3.171875  , ...,  3.73076923,
         3.625     ,  3.5       ],
       [ 3.97639485,  2.5       ,  3.171875  , ...,  3.5       ,
         3.625     ,  3.        ]])

In [16]:
# This does not seem to correctly approximate 
np.dot(Q, P)

ValueError: shapes (1001,2000) and (1001,1001) not aligned: 2000 (dim 1) != 1001 (dim 0)