In [1]:
import pip
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import metrics
from scipy import stats

In [2]:
# Read files

# movieId, title, genres
movies = pd.read_csv('data/movies.csv', sep=',',header=0)

# movieId, imdbId, tmdbId
links = pd.read_csv('data/links.csv', sep=',',header=0)

# userId, movieId, tag, timestamp
tags = pd.read_csv('data/tags.csv', sep=',',header=0)

# userId, movieId, rating, timestamp
ratings = pd.read_csv('data/ratings.txt', sep='\t',header=0)

# tagId, tag
genomeTags = pd.read_csv('data/genome-tags.csv', sep=',',header=0)

# movieId, tagId, relevance
genomeScores = pd.read_csv('data/genome-scores.txt', sep='\t',header=0)

In [3]:
# Spliting into train and test
train_ratings, test_ratings = model_selection.train_test_split(ratings, test_size = 0.33)

In [4]:
trainUserIds = train_ratings.userId.unique()
trainUserIds.sort()
print ("%20s" % "Train user IDs: " + str((trainUserIds.shape, trainUserIds)))

testUserIds = test_ratings.userId.unique()
testUserIds.sort()
print ("%20s" % "Test user IDs: " + str((testUserIds.shape, testUserIds)))

trainMovieIds = train_ratings.movieId.unique()
trainMovieIds.sort()
print ("%20s" % "Train movie IDs: " + str((trainMovieIds.shape, trainMovieIds)))

testMovieIds = test_ratings.movieId.unique()
testMovieIds.sort()
print ("%20s" % "Test movie IDs: " + str((testMovieIds.shape, testMovieIds)))

userIds = ratings.userId.unique()
userIds.sort()
print ("%20s" % "User IDs: " + str((userIds.shape, userIds)))

movieIds = ratings.movieId.unique()
movieIds.sort()
print ("%20s" % "Movie IDs: " + str((movieIds.shape, movieIds)))

# We can use the UserId as an index of the matrix, but we cannot use movieId
# Let's make index to ID dictionary anyway for both for robustness

userCount = userIds.size
movieCount = movieIds.size

userIndexToIdDict = dict()
userIdToIndexDict = dict()
for i in range(userCount):
    userIndexToIdDict[i] = userIds[i]
    userIdToIndexDict[userIds[i]] = i

movieIndexToIdDict = dict()
movieIdToIndexDict = dict()
for i in range(movieCount):
    movieIndexToIdDict[i] = movieIds[i]
    movieIdToIndexDict[movieIds[i]] = i

    Train user IDs: ((7120,), array([   1,    2,    3, ..., 7118, 7119, 7120], dtype=int64))
     Test user IDs: ((7120,), array([   1,    2,    3, ..., 7118, 7119, 7120], dtype=int64))
   Train movie IDs: ((13007,), array([     1,      2,      3, ..., 130073, 130219, 130490], dtype=int64))
    Test movie IDs: ((11214,), array([     1,      2,      3, ..., 130052, 130462, 130642], dtype=int64))
          User IDs: ((7120,), array([   1,    2,    3, ..., 7118, 7119, 7120], dtype=int64))
         Movie IDs: ((14026,), array([     1,      2,      3, ..., 130462, 130490, 130642], dtype=int64))


In [5]:
# Constructing a train matrix
trainMatrix = np.zeros((userCount, movieCount))
for rating in train_ratings.itertuples():
    trainMatrix[int(userIdToIndexDict[rating.userId]), int(movieIdToIndexDict[rating.movieId])] = rating.rating
    
# Constructing a test matrix
testMatrix = np.zeros((userCount, movieCount))
for rating in test_ratings.itertuples():
    testMatrix[int(userIdToIndexDict[rating.userId]), int(movieIdToIndexDict[rating.movieId])] = rating.rating

In [6]:
userSimilarity = metrics.pairwise_distances(trainMatrix, metric = 'cosine')

In [7]:
# matrica svih ocena za sve korisnike
def prediction_basic(matrix, user_similarity, number_of_users, number_of_items):
        
    x_mean = matrix.mean(axis = 1)
    
    matrix_diff = (matrix - x_mean[:, np.newaxis])
    p = x_mean[:, np.newaxis] + user_similarity.dot(matrix_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
                    
    return p

In [8]:
# matrica svih ocena za sve korisnike
def prediction_basic_with_threshold(matrix, user_similarity, number_of_users, number_of_items):
        
    x_mean = matrix.mean(axis = 1)
            
    user_similarity = stats.threshold(user_similarity, threshmin=0.99, threshmax=1, newval=0)
    
    matrix_diff = (matrix - x_mean[:, np.newaxis])
    p = x_mean[:, np.newaxis] + user_similarity.dot(matrix_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
                    
    return p

In [16]:
# matrica svih ocena za sve korisnike
def prediction_basic_no_mean(matrix, user_similarity, number_of_users, number_of_items):
    p = user_similarity.dot(matrix) / np.array([np.abs(user_similarity).sum(axis=1)]).T
    return p

In [97]:
# matrica svih ocena za sve korisnike
def prediction_basic_no_mean_with_threshold(matrix, user_similarity, number_of_users, number_of_items):
    user_similarity = stats.threshold(user_similarity, threshmin=0.99, threshmax=1, newval=0)
    p = user_similarity.dot(matrix) / np.array([np.abs(user_similarity).sum(axis=1)]).T
    return p

In [98]:
#userMoviePrediction_Basic = prediction_basic(testMatrix, userSimilarity, userCount, movieCount)
#userMoviePrediction_BasicWithThreshold = prediction_basic_with_threshold(testMatrix, userSimilarity, userCount, movieCount)
#userMoviePrediction_BasicNoMean = prediction_basic_no_mean(testMatrix, userSimilarity, userCount, movieCount)
userMoviePrediction_BasicNoMeanWithThreshold = prediction_basic_no_mean_with_threshold(testMatrix, userSimilarity, userCount, movieCount)

stats.threshold is deprecated in scipy 0.17.0
  This is separate from the ipykernel package so we can avoid doing imports until


In [99]:
# Picking a user (similar to Milan so he can evaluate the quality of the recommender system)
userId = 1890
userIndex = userIdToIndexDict[userId]

# Getting user ratings
userRatings = ratings.where(ratings.userId == userId)
userRatings = userRatings[~np.isnan(userRatings.userId)]

# Join with movies
userRatings = userRatings.join(movies.set_index('movieId'), on='movieId', how='inner')
userRatings = pd.DataFrame.sort_values(userRatings, by='rating', ascending=False)

In [100]:
#userMoviePrediction = userMoviePrediction_Basic
#userMoviePrediction = userMoviePrediction_BasicWithThreshold
#userMoviePrediction = userMoviePrediction_BasicNoMean
userMoviePrediction = userMoviePrediction_BasicNoMeanWithThreshold

userMoviePredictionForUser = userMoviePrediction[userIndex]

maxMovieIndexes = userMoviePredictionForUser.argsort()
maxMovieIds = np.vectorize(movieIndexToIdDict.get)(maxMovieIndexes)
maxMovieScores = np.vectorize(userMoviePredictionForUser.__getitem__)(maxMovieIndexes)
recommendedMovies = pd.DataFrame({ 'movieIndex' : maxMovieIndexes, 'movieId' : maxMovieIds, 'score' : maxMovieScores })

recommendedMovies = recommendedMovies[recommendedMovies.movieId.isin(userRatings.movieId) == False]
recommendedMovies = recommendedMovies.join(movies.set_index('movieId'), on='movieId', how='inner')
recommendedMovies = recommendedMovies[-10:][::-1]

In [101]:
userMoviePrediction.shape

(7120, 14026)

In [102]:
recommendedMovies

Unnamed: 0,movieId,movieIndex,score,title,genres
14021,3793,3580,0.122956,X-Men (2000),Action|Adventure|Sci-Fi
14020,912,874,0.122754,Casablanca (1942),Drama|Romance
14017,2424,2278,0.108103,You've Got Mail (1998),Comedy|Romance
14016,3578,3370,0.107888,Gladiator (2000),Action|Adventure|Drama
14015,79132,11743,0.103077,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
14012,2572,2423,0.102967,10 Things I Hate About You (1999),Comedy|Romance
14009,1261,1204,0.09135,Evil Dead II (Dead by Dawn) (1987),Action|Comedy|Fantasy|Horror
14007,1345,1286,0.089384,Carrie (1976),Drama|Fantasy|Horror|Thriller
14003,3148,2979,0.081502,"Cider House Rules, The (1999)",Drama
14002,236,230,0.081456,French Kiss (1995),Action|Comedy|Romance


In [103]:
userRatings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
275461,1890.0,4973,5.0,1.152946e+09,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",Comedy|Romance
275336,1890.0,29,5.0,1.152946e+09,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
275393,1890.0,1199,5.0,1.152946e+09,Brazil (1985),Fantasy|Sci-Fi
275462,1890.0,4979,5.0,1.152946e+09,"Royal Tenenbaums, The (2001)",Comedy|Drama
275473,1890.0,5902,5.0,1.152946e+09,Adaptation (2002),Comedy|Drama|Romance
275460,1890.0,4878,5.0,1.152946e+09,Donnie Darko (2001),Drama|Mystery|Sci-Fi|Thriller
275401,1890.0,1281,5.0,1.152946e+09,"Great Dictator, The (1940)",Comedy|Drama|War
275471,1890.0,5673,5.0,1.152947e+09,Punch-Drunk Love (2002),Comedy|Drama|Romance
275340,1890.0,50,5.0,1.152946e+09,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
275469,1890.0,5618,5.0,1.152946e+09,Spirited Away (Sen to Chihiro no kamikakushi) ...,Adventure|Animation|Fantasy


In [104]:
userMoviePrediction.shape, testMatrix.shape

((7120, 14026), (7120, 14026))

In [105]:
def evaluation(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return np.sqrt(metrics.mean_squared_error(ground_truth, prediction))

In [109]:
print (evaluation(userMoviePrediction_Basic, testMatrix))
print (evaluation(userMoviePrediction_BasicWithThreshold, testMatrix))
print (evaluation(userMoviePrediction_BasicNoMean, testMatrix))
print (evaluation(userMoviePrediction_BasicNoMeanWithThreshold, testMatrix))

3.53994733215
3.62723213243
3.56460240756
3.6598041661
