In [1]:
import pandas as pd
import numpy as np
import matrix_factorization_utilities
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the ratings table; the preview further down shows user_id, movie_id and value columns.
R = pd.read_csv('movie_ratings_data_set.csv')

In [3]:
# Preview the first ten rating rows.
R.head(10)

Unnamed: 0,user_id,movie_id,value
0,1,28,4
1,1,26,4
2,1,9,4
3,1,1,4
4,1,14,4
5,1,13,5
6,2,2,5
7,2,15,4
8,2,1,5
9,2,21,5


In [4]:
# Report how many different user_id values occur in the ratings table.
print('Number of users in the dataset', len(R['user_id'].unique()))

Number of users in the dataset 100


In [5]:
# Pivot the long ratings table into a user x movie matrix; user/movie pairs
# without a rating show up as NaN. The aggregation is passed by name because
# handing a NumPy callable such as np.max to pivot_table is deprecated in
# recent pandas releases; 'max' selects the same aggregation.
rating_df = pd.pivot_table(R, index='user_id', columns='movie_id', aggfunc='max')

In [6]:
# Inspect the first rows of the user x movie rating matrix.
rating_df.head()

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
movie_id,1,2,3,4,5,6,7,8,9,10,...,25,26,27,28,29,30,31,32,33,34
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.0,,,,,,,,4.0,,...,,4.0,,4.0,,,,,,
2,5.0,5.0,,,,,,,,,...,,,,,,,3.0,,,4.0
3,4.0,4.0,5.0,,,,,,,,...,,,,,,,,,,
4,5.0,5.0,,5.0,5.0,,,,,,...,,,,,,,,,,
5,5.0,,,,,,,,5.0,,...,,,,,3.0,,3.0,2.0,5.0,5.0


In [7]:
# Load the movie catalogue, indexed by movie_id, and show its (rows, columns) shape.
movies_df = pd.read_csv('movies.csv', index_col='movie_id')
movies_df.shape

(34, 2)

In [47]:
# Preview the movie catalogue.
# NOTE(review): the saved output of this cell is stale. It shows the
# difference_score and CosineDistance columns, which are only added by cells
# further down, whereas the freshly loaded frame has just two columns.
# Re-run the notebook top to bottom to refresh it.
movies_df.head()

Unnamed: 0_level_0,title,genre,difference_score,CosineDistance
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,The Sheriff 1,"crime drama, western",6.547624,0.230622
2,The Big City Judge 1,legal drama,6.177561,0.201711
3,The Sheriff 2,"crime drama, western",5.496324,0.176868
4,Just a Regular Family,reality,0.0,0.0
5,The Big City Judge 2,legal drama,5.468398,0.193312


**Matrix Factorization**

For a lot of users, rating values are missing. Matrix factorization is used to predict the missing values. The rating matrix is factorized into two matrices, U and M. Since not all elements of the rating matrix are known, U and M have to be found iteratively.

* First, the elements of U and M are initialized with random values
* The product U·M gives R* (the approximate rating matrix)
* The elements of R* are compared with the corresponding known elements of the rating matrix, and the differences are used to calculate the cost
* The cost is minimized with an optimization function, which yields the predicted values for the missing elements

In [9]:
# Factorize the rating matrix into the two factor matrices U and M, using
# 15 latent features and a regularization amount of 0.1.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    rating_df.values,
    num_features=15,
    regularization_amount=0.1,
)

         Current function value: 32.504359
         Iterations: 3000
         Function evaluations: 4518
         Gradient evaluations: 4518


In [10]:
# Multiply the two factors to obtain a predicted rating for every user/movie pair.
predicted_ratings = U @ M

In [11]:
# Wrap the predictions in a frame that reuses the row and column labels of rating_df.
predicted_ratings_df = pd.DataFrame(
    predicted_ratings,
    index=rating_df.index,
    columns=rating_df.columns,
)

In [12]:
# Inspect the first rows of the predicted rating matrix.
predicted_ratings_df.head()

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
movie_id,1,2,3,4,5,6,7,8,9,10,...,25,26,27,28,29,30,31,32,33,34
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.003228,3.920804,4.155654,3.527607,3.920238,4.142495,2.655843,4.06599,4.010431,3.805024,...,3.79698,3.984577,3.329899,3.990534,3.457331,3.110349,2.724844,3.358118,3.039266,4.571924
2,4.970692,4.967963,4.445228,3.90403,4.830917,4.706778,3.394305,4.698824,4.531419,4.837337,...,4.314215,4.494277,4.414509,4.163279,3.462386,4.388291,2.99431,4.346959,4.013651,3.999406
3,4.017216,3.985712,4.955705,4.088404,4.5286,4.271119,2.81866,4.33764,4.491433,4.367656,...,4.080321,4.31688,3.878981,4.474773,3.205661,3.239724,2.235264,2.593159,3.444754,4.600936
4,4.991073,4.987244,4.928805,4.961957,5.010573,4.961638,3.945324,5.075958,4.951773,5.017124,...,4.062737,4.741855,4.502961,4.906618,3.116491,3.412357,2.009168,4.223538,4.171517,4.64439
5,4.980624,4.240558,5.460823,4.374178,5.161713,4.556249,3.148384,4.638848,4.982058,4.91839,...,4.507955,4.72359,3.737581,5.071973,3.017241,3.610066,2.996264,2.031793,4.980757,4.98731


**Finding Similar Movies and Recommending to User**

* The M matrix contains, for each movie, a vector describing the characteristics of that movie
* If a user has watched a movie, similar movies are found by comparing that movie's row of M with the other rows of M
* The movies with the lowest difference are recommended

For example, User 4 liked Movie 5 the most. For User 4, we can predict which other movies he may like. 


In [13]:
# Flip M so that each row (rather than each column) holds one movie's feature vector.
# NOTE(review): this rebinds M, so running the cell a second time flips it
# back; re-run the factorization cell first if the shape below looks wrong.
M = M.T
M.shape

(34, 15)

In [48]:
# Id of the reference movie for the similarity searches in the following cells.
movie_id = 5

In [49]:
# Fetch the catalogue row (title, genre, ...) of the reference movie by its id.
movie_information = movies_df.loc[movie_id]

In [50]:
# Announce which movie the similarity search is based on.
print('Finding movie similar to movie')
print('Movie:', movie_information['title'])
print('Genere:', movie_information['genre'])

Finding movie similar to movie
Movie: The Big City Judge 2
Genere: legal drama


In [51]:
# Find the reference movie's row in M by looking its id up among the movie
# columns of rating_df: M was factorized from rating_df.values, so row i of
# the transposed M belongs to the i-th movie column. Indexing with
# movie_id - 1 is only correct when the ids are exactly 1..N without gaps.
# Assumes M has already been transposed to one row per movie.
movie_position = rating_df.columns.get_level_values(-1).get_loc(movie_id)
current_movie_features = M[movie_position]

# Sum of absolute feature differences (L1 distance) between the reference
# movie and every movie; the reference movie itself scores 0.
difference = M - current_movie_features
absolute_difference = np.abs(difference)
total_difference = np.sum(absolute_difference, axis=1)

In [52]:
# Attach the scores by movie_id instead of by position: the i-th score belongs
# to the i-th movie column of rating_df, which is not guaranteed to be the
# i-th row of movies_df. Movies missing from the rating matrix get NaN.
movies_df['difference_score'] = pd.Series(
    total_difference,
    index=rating_df.columns.get_level_values(-1),
)

In [53]:
# Order the catalogue by ascending difference score, i.e. most similar movies first.
sorted_movie_list = movies_df.sort_values(by='difference_score')

In [54]:
print('If you watched', movie_information.title)
print('You may also like')
# Remove the reference movie by its id rather than assuming it is the first
# row of the sorted list, then show the five closest remaining titles.
print(sorted_movie_list['title'].drop(movie_id).iloc[:5])

If you watched The Big City Judge 2
You may also like
movie_id
10     Surrounded by Zombies 1
8     Sci-Fi Murder Detectives
9                  Biker Gangs
3                The Sheriff 2
26           Mafia Underground
Name: title, dtype: object


**Using Cosine as a measure of similarity**

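The cosine distance is calculated from the dot product between the feature vector of the selected movie and the feature vector of each other movie; the smaller the angle between two vectors, the smaller the distance.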

In [55]:
# Import the submodule explicitly: a bare `import scipy` is not guaranteed on
# every SciPy version to make scipy.spatial.distance available as an attribute.
import scipy.spatial.distance

# Cosine distance from the reference movie's feature vector to every row of M.
dists = [scipy.spatial.distance.cosine(current_movie_features, row) for row in M]

In [56]:
# Attach the distances by movie_id instead of by position: the i-th distance
# belongs to the i-th movie column of rating_df, which is not guaranteed to be
# the i-th row of movies_df. Movies missing from the rating matrix get NaN.
movies_df['CosineDistance'] = pd.Series(
    dists,
    index=rating_df.columns.get_level_values(-1),
)
movies_df.head()

Unnamed: 0_level_0,title,genre,difference_score,CosineDistance
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,The Sheriff 1,"crime drama, western",4.557269,0.125748
2,The Big City Judge 1,legal drama,5.105589,0.147439
3,The Sheriff 2,"crime drama, western",4.037083,0.099756
4,Just a Regular Family,reality,5.468398,0.193312
5,The Big City Judge 2,legal drama,0.0,0.0


In [57]:
# Order the catalogue by ascending cosine distance, i.e. most similar movies first.
sorted_movie_list2 = movies_df.sort_values(by='CosineDistance')

In [58]:
print('If you watched', movie_information.title)
print('You may also like')
# Remove the reference movie by its id rather than assuming it is the first
# row of the sorted list, then show the five closest remaining titles.
print(sorted_movie_list2['title'].drop(movie_id).iloc[:5])

If you watched The Big City Judge 2
You may also like
movie_id
10     Surrounded by Zombies 1
9                  Biker Gangs
8     Sci-Fi Murder Detectives
6            Attack on Earth 1
3                The Sheriff 2
Name: title, dtype: object
