# Simple movie Recommender with matrix factorization 

First we import the necessary libraries (we will use nupy and pandas) and
read the data from the csv file into a pandas dataframe.

In [4]:
import pandas as pd
from datetime import datetime
import csv
import collections

In [5]:

df_ratings = pd.read_csv('data/ml-latest-small/ratings.csv')
df_movies = pd.read_csv('data/ml-latest-small/movies.csv')
df_links = pd.read_csv('data/ml-latest-small/links.csv')
df_cast = pd.read_csv('data/ml-latest-small/cast.csv')
df_tags = pd.read_csv('data/ml-latest-small/tags.csv')


TODO:
- matrix factorization algorithm
- different search and recomendation options
- simple console ui
- users backend (new users, new ratings, users adding new tags, and so on)
- frontend possibly
- 


Now, for the matrix factorization to work, we need to form a matrix X (users * movies), this matriks will show us with what rating and which movies a certain user rated.


In [3]:

print(df_ratings.groupby('movieId')['rating'].describe())

         count      mean       std  min   25%  50%  75%  max
movieId                                                     
1        247.0  3.872470  0.958981  1.0  3.00  4.0  5.0  5.0
2        107.0  3.401869  0.880714  1.5  3.00  3.0  4.0  5.0
3         59.0  3.161017  1.150115  0.5  2.25  3.0  4.0  5.0
4         13.0  2.384615  0.938835  1.0  1.50  3.0  3.0  3.5
5         56.0  3.267857  0.948512  1.0  3.00  3.0  4.0  5.0
...        ...       ...       ...  ...   ...  ...  ...  ...
161944     1.0  5.000000       NaN  5.0  5.00  5.0  5.0  5.0
162376     1.0  4.500000       NaN  4.5  4.50  4.5  4.5  4.5
162542     1.0  5.000000       NaN  5.0  5.00  5.0  5.0  5.0
162672     1.0  3.000000       NaN  3.0  3.00  3.0  3.0  3.0
163949     1.0  5.000000       NaN  5.0  5.00  5.0  5.0  5.0

[9066 rows x 8 columns]


In [7]:
#what tables we need from data
# - movie genres                                                ×
# - avg rating for each movie                                   ×
# - avg rating for each genre
# - rating count for each movie                                 ×
# - what movies user has rated and the ratings for each user    ×
# - how much movies has each user rated                         ×
# - movie references (everything connected to one movie id)

#matrix x for users and movies, used for recommendation system
df_X = df_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
np_X = df_X.to_numpy()
display(df_X)

#other matrices for features like recommending by genre, cast, etc.
#set of movie genres
movie_genres = set(genre.lower() 
                    for i in range(len(df_movies)) 
                    for genre in df_movies.loc[i, 'genres'].split('|'))


#avg rating for each movie
avgMovieRating = {movieId[0]: [movieId[1]['rating'].mean(), len(movieId[1])] 
                    for movieId in df_ratings.groupby('movieId')}


#avg rating for each genre
avgGenreRating = {movieId[0]: [movieId[1]['rating'].mean(), len(movieId[1])] 
                    for movieId in df_ratings.groupby('movieId')}


#movie references movieId: (genres + tags + cast)
list_movie_tag = list(zip(df_tags['movieId'], df_tags['tag']))


movie_tags = {movie[0]: movie[1] for movie in list_movie_tag}

movie_references = {movie[0]: (movie[1]['title'].values, movie[1]['cast'].values) 
                    for movie in pd.merge(df_movies, df_cast, on='movieId').groupby('movieId')}


for movieId in movie_tags:
    movie_references[movieId] = (movie_tags[movieId], movie_references[movieId][0], movie_references[movieId][1])

movie_cast = pd.merge(df_movies, df_cast, on='movieId')





movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The idea behind matrix factorization is to find two smaller matrices, that then multiply into the main matrix and this is why we call it matrix factorization.
So we need to fill the 2 smaller matrices with values, so when they multiply, they will give us the main matrix.
The important thing is, that the values at the same position int the initial matrix as well as in the new matrix
(meaning the one we got by multiplying the two smaller ones) are as close as possible.



In [118]:
import time

def funkSVD(X, k, iterations, lr, rp):
    funkSVD.__annotations__ = {'X': 'user-items matrix', 'k': 'num of latent factors', 'iterations': '', 'lr': 'learning rate', 'rp': 'regularization parameter'}
    
    rows, collumns = X.shape
    H = np.random.rand(rows, k)
    W = np.random.rand(k, collumns)
    print(H)
    print(W)
    
    start_time = time.time()

    for itr in range(iterations):
        progressIndicator = (itr/iterations) * 100
        print(f"\rCalculation progress: {progressIndicator:.0f}%", end="")

        for r in range(rows):
            for c in range(collumns):

                # if rating is valid, correct H and W
                if np_X[r, c] > 0: 
                    error = np_X[r, c] - np.dot(H[r,:],W[:,c])
                    for i in range(k):
                        H[r, i] += lr * (2 * error * W[i, c] - rp * H[r, i])
                        W[i, c] += lr * (2 * error * H[r, i] - rp * W[i, c])
        
        
        e = 0
        for r in range(rows):
            for c in range(collumns):
                if X[r, c] > 0:
                    e = e + pow(X[r, c] - np.dot(H[r,:],W[:,c]), 2)
                    for i in range(k):
                        e += (rp/2) * (pow(H[r, i],2) + pow(W[i, c],2))
        if e < 0.001:
            break
    
    exec_time = time.time() - start_time
    print(f"\Time elapsed: {exec_time} seconds")
    return np.dot(H, W)

Now if we run the algorithm we will have to wait quite a bit, because it is a lot of data to process, and each iteration will take longer, than the previous one because the number of undefined values '0' is geting smaller each iteration.

The time complexity is something like O(2 * iterations * rows * columns * k)

Thats why we will write a funtion, that returns us a sample of the data, so the algorithm will execute faster and the results will still be acceptable.
This will lower the number of rows and columns.
We must not forget, that the user for who we are generating the recommendations is in the sample.

In [9]:
df_users_contributions = df_ratings.groupby('userId')['rating'].count().reset_index(name='contributions')
df_users_contributions = df_users_contributions.sort_values(by=['contributions'], ascending=False)

df_movies_contributions = df_ratings.groupby('movieId')['rating'].count().reset_index(name='contributions')
df_movies_contributions = df_movies_contributions.sort_values(by=['contributions'], ascending=False)

#display(users_contributions)
#display(movies_contributions)

In [86]:
# get sample returns a sample of the main matrix, on which we then execute funkSVD
def getSample(n_users, n_movies, rcM, rcU, userId):

    # get set of users that did the most ratings 
    if (n_users or n_movies != 0):
        df_sample_users = df_users_contributions.head(n_users) if n_users > 0 else df_users_contributions.tail(abs(n_users))
        df_sample_movies = df_movies_contributions.head(n_movies) if n_movies > 0 else df_movies_contributions.tail(abs(n_movies))

    # an option to sample by rating (the sample matrix may not be square)
    elif (rcM or rcU != 0):
        df_sample_users = df_users_contributions[df_users_contributions['contributions'] >= rcU] if rcU > 0 else df_users_contributions[df_users_contributions['contributions'] <= abs(rcU)]
        df_sample_movies = df_movies_contributions[df_movies_contributions['contributions'] >= rcM] if rcM > 0 else df_movies_contributions[df_movies_contributions['contributions'] <= abs(rcM)]

    # if the user is not in the sample, we replace the last user (the one with the leas ratings) in the sample 
    # with the user we want to add      
    if userId not in df_sample_users['userId'].tolist():
        df_sample_users = df_sample_users.drop(df_sample_users.index[-1]).reset_index(drop=True)
        userId_row = df_users_contributions[df_users_contributions['userId'] == userId]
        df_sample_users = pd.concat([df_sample_users, userId_row], ignore_index=True)

    user_ids = df_sample_users['userId'].tolist()
    movie_ids = df_sample_movies['movieId'].tolist()

    filtered_ratings = df_ratings[(df_ratings['userId'].isin(user_ids)) & (df_ratings['movieId'].isin(movie_ids))]
    df_sample = pd.pivot_table(filtered_ratings, values='rating', index='userId', columns='movieId', fill_value=0)
    return df_sample.to_numpy(), df_sample


In [122]:
np_samplex, pd_samplex = getSample(10, 10, 0, 0, 600)

funksample = funkSVD(np_samplex, 12, 1000, 0.0002, 0.02)
display(pd_samplex)
display(pd.DataFrame(funksample, index=pd_samplex.index, columns=pd_samplex.columns))


[[0.81531615 0.36436471 0.56078772 0.87658056 0.92362846 0.34508219
  0.90483638 0.86307386 0.9023287  0.19115204 0.49528182 0.14866685]
 [0.29340951 0.83205863 0.78871437 0.68982346 0.14321777 0.19697417
  0.67913499 0.71005433 0.90621753 0.27613592 0.7439809  0.00878034]
 [0.32067046 0.54529455 0.69113925 0.3613323  0.07139538 0.54927102
  0.2958139  0.42719822 0.23945473 0.61645558 0.91637436 0.78645285]
 [0.04351461 0.68660521 0.89650488 0.29579813 0.9809502  0.64873777
  0.75335597 0.55848802 0.55521087 0.19661046 0.53677251 0.83947953]
 [0.32604768 0.08488789 0.49382143 0.2202277  0.58682532 0.88728414
  0.38115469 0.39436135 0.57375915 0.30842592 0.65862482 0.06486017]
 [0.18121496 0.601309   0.00636964 0.71359416 0.62054011 0.09505938
  0.15934694 0.17604718 0.99859001 0.17185044 0.11995056 0.61306115]
 [0.15177512 0.7621497  0.26023736 0.35205841 0.65200676 0.43194012
  0.2132112  0.79039325 0.85571512 0.72212964 0.21679615 0.07213859]
 [0.87034422 0.19866042 0.8429717  0.8231

movieId,1,260,296,318,356,480,527,589,593,2571
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15,2.0,5.0,5.0,2.0,1.0,3.0,4.0,4.0,5.0,5.0
73,5.0,4.5,5.0,5.0,5.0,4.0,5.0,3.0,4.5,4.5
311,3.0,4.0,3.0,4.5,5.0,4.5,5.0,4.5,2.0,4.0
380,4.0,4.0,5.0,4.0,5.0,4.0,0.0,4.0,5.0,5.0
452,3.5,4.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,2.0
468,4.0,3.5,3.5,3.5,3.0,2.5,0.0,0.0,3.0,3.0
547,3.5,0.0,5.0,5.0,2.0,3.0,5.0,0.0,5.0,3.5
564,4.0,2.0,5.0,0.0,3.0,5.0,4.0,5.0,5.0,3.0
624,5.0,5.0,5.0,0.0,3.0,3.0,0.0,3.0,5.0,2.0


movieId,1,260,296,318,356,480,527,589,593,2571
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15,3.715097,3.561293,4.182265,4.791422,3.716885,4.308909,4.791699,4.637375,4.446112,3.950607
73,4.006468,3.351695,4.439457,4.10707,3.916502,4.115093,4.991212,4.390804,4.323066,3.880407
311,3.809134,2.207357,3.572349,2.40077,2.873363,2.909641,4.224102,3.4494,3.608688,3.217692
380,4.046486,2.820426,4.424363,3.548475,3.245819,3.719706,4.491155,3.853731,3.826703,3.988675
452,3.134979,2.50085,3.949661,3.294547,2.83211,3.291596,4.069268,3.376203,3.800459,3.07993
468,2.364305,2.197589,2.774035,2.692985,2.451509,1.972613,2.818656,2.255532,2.727557,2.071839
547,2.884757,2.809608,3.742219,3.595047,3.002347,2.993827,3.610395,3.38501,3.699139,3.156763
564,4.374696,3.147203,4.197845,3.617026,3.513709,4.353614,5.404166,4.615708,4.747495,4.224231
624,3.981825,2.400244,4.165091,3.421287,2.780901,3.197768,3.952128,3.866505,3.578456,3.289057


Now all that is left is to test if the algorithm somewhat works.
Lets write a simple console UI and test it out.

TODO:
- ui 
- user database (adding new users, with 0 ratings, and so on)
- recommending on best avg ratings, genres, actors, tags (user inputs preferences and gets a recommendation)
- 

In [None]:
def addUser(userId):
    
    