In [372]:
import pandas as pd
import numpy as np

# load data

In [373]:
data = pd.io.parsers.read_csv('data/ratings.dat', 
    names=['user_id', 'movie_id', 'rating', 'time'],
    engine='python', delimiter='::')

movie_data = pd.io.parsers.read_csv('data/movies.dat',
    names=['movie_id', 'title', 'genre'],
    engine='python', delimiter='::', encoding='latin-1')
data

Unnamed: 0,user_id,movie_id,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [374]:
from numpy.random import RandomState
prng = RandomState(333)
usidx = prng.choice(6040, size=600,place=False)

TypeError: choice() got an unexpected keyword argument 'place'

In [None]:
print('original data :'+ str(data.shape))
test_users = data.user_id.value_counts()[usidx]
data_test = data.loc[data.user_id.isin(test_users)]
data = data.loc[~data.user_id.isin(test_users)]
print('training_data:'+ str(data.shape))

original data :(1000209, 4)
training_data:(952707, 4)


In [None]:
most_rated_movie_ids = data.movie_id.value_counts().index.tolist()
movie_data.loc[movie_data.movie_id.isin(most_rated_movie_ids[:20])]

Unnamed: 0,movie_id,title,genre
108,110,Braveheart (1995),Action|Drama|War
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
476,480,Jurassic Park (1993),Action|Adventure|Sci-Fi
523,527,Schindler's List (1993),Drama|War
585,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
589,593,"Silence of the Lambs, The (1991)",Drama|Thriller
604,608,Fargo (1996),Crime|Drama|Thriller
1081,1097,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
1178,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
1179,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance


In [355]:
ratings_mat = data.pivot_table(index=['user_id'],columns=['movie_id'],values='rating')
ratings_mat

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,2.0,,3.0,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [273]:
movie_ids = ratings_mat.columns.to_list()
users_ids = ratings_mat.index.to_list()

In [356]:
ratings_mat = ratings_mat.fillna(ratings_mat.mean(axis=0)).to_numpy().T

In [357]:
normalised_mat = ratings_mat - np.asarray([(np.mean(ratings_mat, 1))]).T

In [358]:
A = normalised_mat.T / np.sqrt(ratings_mat.shape[0] - 1)
U, S, V = np.linalg.svd(A)

In [359]:
def top_cosine_similarity(data, movie_idx, top_n=10):
    movie_row = data[movie_idx, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))+0.00000001 ##to resolve numerical issues
    similarity = np.dot(movie_row, data.T) / (magnitude[movie_idx] * magnitude)
    sort_indexes = np.argsort(-similarity)

    sort_indexes = sort_indexes[sort_indexes!=movie_idx]
    return sort_indexes[:top_n]

def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Recommendations for {0} Genre {1}\n'.format(
    movie_data[movie_data.movie_id == movie_id].title.values[0],movie_data[movie_data.movie_id == movie_id].genre.values[0]))
    return movie_data.loc[movie_data.movie_id.isin(top_indexes)][['title','genre']]

In [360]:
k = 10
movie_idx = 0 # (getting an id from movies.dat)
top_n = 10
sliced = V.T[:, :k] # representative data
indexes = top_cosine_similarity(sliced, movie_idx, top_n)
print_similar_movies(movie_data, movie_ids[movie_idx], np.array([movie_ids[i] for i in indexes]))


Recommendations for Toy Story (1995) Genre Animation|Children's|Comedy



Unnamed: 0,title,genre
33,Babe (1995),Children's|Comedy|Drama
584,Aladdin (1992),Animation|Children's|Comedy|Musical
591,Beauty and the Beast (1991),Animation|Children's|Musical
1163,"Grifters, The (1990)",Crime|Drama|Film-Noir
1444,City of Industry (1997),Crime|Thriller
1663,Midnight in the Garden of Good and Evil (1997),Comedy|Crime|Drama|Mystery
1831,"Children of Heaven, The (Bacheha-Ye Aseman) (1...",Drama
2285,"Rugrats Movie, The (1998)",Animation|Children's|Comedy
2314,Police Academy 6: City Under Siege (1989),Comedy
3044,End of Days (1999),Action|Thriller


In [361]:
k = 10
movie_idx = movie_ids.index(260) # (getting an id from movies.dat)
top_n = 10
sliced = V.T[:, :k] # representative data
indexes = top_cosine_similarity(sliced, movie_idx, top_n)
print(print_similar_movies(movie_data, movie_ids[movie_idx], np.array([movie_ids[i] for i in indexes])).to_markdown())

Recommendations for Star Wars: Episode IV - A New Hope (1977) Genre Action|Adventure|Fantasy|Sci-Fi

|      | title                                       | genre                           |
|-----:|:--------------------------------------------|:--------------------------------|
| 1177 | Up in Smoke (1978)                          | Comedy                          |
| 1179 | Princess Bride, The (1987)                  | Action|Adventure|Comedy|Romance |
| 1191 | Once Upon a Time in the West (1969)         | Western                         |
| 1270 | Some Kind of Wonderful (1987)               | Drama|Romance                   |
| 1395 | Walkabout (1971)                            | Drama                           |
| 2046 | Indiana Jones and the Temple of Doom (1984) | Action|Adventure                |
| 2521 | Hideous Kinky (1998)                        | Drama                           |
| 2558 | Endurance (1998)                            | Documentary|Drama               |
| 3469 | 

In [362]:
k = 10
movie_idx = movie_ids.index(110) # (getting an id from movies.dat)
top_n = 10
sliced = V.T[:, :k] # representative data
indexes = top_cosine_similarity(sliced, movie_idx, top_n)
print(print_similar_movies(movie_data, movie_ids[movie_idx], np.array([movie_ids[i] for i in indexes])).to_markdown())

Recommendations for Braveheart (1995) Genre Action|Drama|War

|      | title                            | genre              |
|-----:|:---------------------------------|:-------------------|
|   36 | Across the Sea of Time (1995)    | Documentary        |
|  315 | Shawshank Redemption, The (1994) | Drama              |
|  352 | Forrest Gump (1994)              | Comedy|Romance|War |
| 1073 | Reservoir Dogs (1992)            | Crime|Thriller     |
| 1612 | Life Less Ordinary, A (1997)     | Romance|Thriller   |
| 1958 | Mafia! (1998)                    | Comedy|Crime       |
| 2521 | Hideous Kinky (1998)             | Drama              |
| 3256 | Next Best Thing, The (2000)      | Comedy|Drama       |
| 3508 | Two Moon Juction (1988)          | Drama              |
| 3691 | Kentucky Fried Movie, The (1977) | Comedy             |


In [363]:
k = 10
movie_idx = movie_ids.index(480) # (getting an id from movies.dat)
top_n = 10
sliced = V.T[:, :k] # representative data
indexes = top_cosine_similarity(sliced, movie_idx, top_n)
print(print_similar_movies(movie_data, movie_ids[movie_idx], np.array([movie_ids[i] for i in indexes])).to_markdown())

Recommendations for Jurassic Park (1993) Genre Action|Adventure|Sci-Fi

|      | title                             | genre                           |
|-----:|:----------------------------------|:--------------------------------|
|  585 | Terminator 2: Judgment Day (1991) | Action|Sci-Fi|Thriller          |
|  770 | Independence Day (ID4) (1996)     | Action|Sci-Fi|War               |
| 1110 | Drop Dead Fred (1991)             | Comedy|Fantasy                  |
| 1179 | Princess Bride, The (1987)        | Action|Adventure|Comedy|Romance |
| 1181 | Brazil (1985)                     | Sci-Fi                          |
| 1195 | GoodFellas (1990)                 | Crime|Drama                     |
| 1219 | Local Hero (1983)                 | Comedy                          |
| 2692 | Iron Giant, The (1999)            | Animation|Children's            |
| 2846 | Risky Business (1983)             | Comedy                          |
| 3457 | Parenthood (1989)                 | Comedy|Drama  

In [364]:
k = 10
movie_idx = movie_ids.index(3751) # (getting an id from movies.dat)
top_n = 10
sliced = V.T[:, :k] # representative data
indexes = top_cosine_similarity(sliced, movie_idx, top_n)
print_similar_movies(movie_data, movie_ids[movie_idx], np.array([movie_ids[i] for i in indexes]))

Recommendations for Chicken Run (2000) Genre Animation|Children's|Comedy



Unnamed: 0,title,genre
775,Kingpin (1996),Comedy
1853,Whatever (1998),Drama
2613,Limbo (1999),Drama
2636,"Late August, Early September (Fin août, début ...",Drama
3110,Angela's Ashes (1999),Drama
3256,"Next Best Thing, The (2000)",Comedy|Drama
3547,Loser (2000),Comedy|Romance
3715,"Kid, The (2000)",Comedy
3796,Sunset Strip (2000),Comedy
3877,Get Carter (1971),Thriller


In [365]:
from tqdm import tqdm
def evaluate_accuracy(K,top_n=10):
    accuracy = []
    um = []
    recomend_m = []
    for tu in tqdm(test_users):
        tt = data_test.loc[data_test.user_id==tu]
        target_movie = tt.sort_values(by='rating', ascending=False).movie_id.to_list()[0]
        user_mean = tt.rating.median()
        if target_movie in movie_ids:
            movie_idx = movie_ids.index(target_movie) # (getting an id from movies.dat)
            sliced = V.T[:, :K] # representative data
            indexes = top_cosine_similarity(sliced, movie_idx, top_n)

            if tt.loc[tt.movie_id.isin(np.array([movie_ids[i] for i in indexes])) & (tt.movie_id!=target_movie)].shape[0]>1:
                recomended_mean = tt.loc[tt.movie_id.isin(np.array([movie_ids[i] for i in indexes])) & (tt.movie_id!=target_movie)].rating.mean()
                accuracy.append(recomended_mean>user_mean)
                um.append(user_mean)
                recomend_m.append(recomended_mean)
    return accuracy,um,recomend_m

In [377]:
accuracy,user_mean,recomend_mean = evaluate_accuracy(K=5,top_n=2)

100%|██████████| 700/700 [00:01<00:00, 351.55it/s]


In [378]:
np.array(accuracy).mean()

0.625

# references

https://grouplens.org/datasets/movielens/1m/

https://analyticsindiamag.com/singular-value-decomposition-svd-application-recommender-system/

https://towardsdatascience.com/beginners-guide-to-creating-an-svd-recommender-system-1fd7326d1f65