In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [38]:
movies = pd.read_csv('datasets/ml-latest-small/movies.csv')
ratings = pd.read_csv('datasets/ml-latest-small/ratings.csv')

In [52]:
rating_train, ratings_test = train_test_split(ratings, test_size=0.2, random_state=42)

In [53]:
rating_train.head()

Unnamed: 0,userId,movieId,rating,timestamp
80568,509,7347,3.0,1435994597
50582,326,71462,4.0,1322252335
8344,57,2115,3.0,965798155
99603,610,1127,4.0,1479544102
71701,462,2409,2.0,1174438249


In [54]:
ratings_test.head()

Unnamed: 0,userId,movieId,rating,timestamp
67037,432,77866,4.5,1335139641
42175,288,474,3.0,978465565
93850,599,4351,3.0,1498524542
6187,42,2987,4.0,996262677
12229,75,1610,4.0,1158989841


In [55]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [56]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [57]:
from collections import OrderedDict
class MovieRecSys:
    def __init__(self, movies, reviews):
        self.movies = self._get_movies(movies.copy())
        self.reviews = self._get_reviews(reviews.copy())
        self.movie_views = self._get_views()
        self.movie_avg_ratings = self._get_avg_ratings()

    def _get_movies(self, movies):
        movies['genres'] = movies['genres'].apply(lambda x: set(x.split('|')))
        movies_dict = movies.set_index('movieId').to_dict(orient='index')
        return movies_dict
    
    def _get_reviews(self, reviews):
        # Convert 'timestamp' to datetime
        reviews['timestamp'] = pd.to_datetime(reviews['timestamp'], unit='s')

        # Convert 'timestamp' to ISO format
        reviews['timestamp'] = reviews['timestamp'].dt.strftime('%Y-%m-%dT%H:%M:%S')
        return reviews
    
    def _get_views(self):
        reviews = self.reviews
        counts = reviews.groupby('movieId').count()['userId'].to_dict()
        ordered_counts = OrderedDict(sorted(counts.items(), key=lambda x: x[1], reverse=True))
        return ordered_counts
    
    def _get_avg_ratings(self):
        reviews = self.reviews
        counts = reviews.groupby('movieId')['rating'].mean().to_dict()
        ordered_counts = OrderedDict(sorted(counts.items(), key=lambda x: x[1], reverse=True))
        return ordered_counts
    
    def most_popular(self, top_k=10):
        # Get the top k most viewed movies
        top_k_movies = list(self.movie_views.items())[:top_k]
        response = list(map(lambda x: f"id: {x[0]: <10} {self.movies[x[0]]['title']: <80} counts: {x[1]}", top_k_movies))
        return response
    
    def top_rated(self, top_k=10):
        top_k_movies = list(self.movie_avg_ratings.items())[:top_k]
        response = list(map(lambda x: f"id: {x[0]: <10} {self.movies[x[0]]['title']: <80} counts: {x[1]}", top_k_movies))
        return response


In [58]:
mdb = MovieRecSys(movies, rating_train)
print('Most viewed:')
print(*mdb.most_popular(), sep='\n')
print('\nTop Rated:')
print(*mdb.top_rated(), sep='\n')

Most viewed:
id: 356        Forrest Gump (1994)                                                              counts: 276
id: 318        Shawshank Redemption, The (1994)                                                 counts: 257
id: 296        Pulp Fiction (1994)                                                              counts: 256
id: 593        Silence of the Lambs, The (1991)                                                 counts: 226
id: 2571       Matrix, The (1999)                                                               counts: 214
id: 260        Star Wars: Episode IV - A New Hope (1977)                                        counts: 194
id: 110        Braveheart (1995)                                                                counts: 189
id: 480        Jurassic Park (1993)                                                             counts: 185
id: 589        Terminator 2: Judgment Day (1991)                                                counts: 182
id: 1          

In [59]:
mdb.reviews

Unnamed: 0,userId,movieId,rating,timestamp
80568,509,7347,3.0,2015-07-04T07:23:17
50582,326,71462,4.0,2011-11-25T20:18:55
8344,57,2115,3.0,2000-08-09T05:15:55
99603,610,1127,4.0,2016-11-19T08:28:22
71701,462,2409,2.0,2007-03-21T00:50:49
...,...,...,...,...
6265,42,4005,4.0,2001-07-27T18:37:39
54886,364,141,4.0,1997-07-21T00:02:47
76820,480,6867,4.0,2007-05-14T17:19:31
860,6,981,3.0,1996-10-17T12:49:27


In [106]:
def euc_similarity(user_1_items, user_2_items):
    common_items = set(user_1_items.keys()).intersection(set(user_2_items.keys()))

    if len(common_items) == 0:
        # No common movies, return a similarity of 0
        return 0
    dist = np.sqrt(np.sum([(user_1_items[key] - user_2_items[key])**2 for key in common_items]))
    similarity = 1 / (1 + dist)
    return similarity

def pearson_similarity(user_1_items, user_2_items):

        common_items = set(user_1_items.keys()).intersection(set(user_2_items.keys()))
    
        if len(common_items) == 0:
            # No common movies, return a similarity of 0
            return 0
        ratings_user1 = [user_1_items[item] for item in common_items]
        ratings_user2 = [user_2_items[item] for item in common_items]

        if len(ratings_user1) < 3 or len(ratings_user2) < 3:
            # Not enough data points to calculate correlation, return 0
            return 0
        else:
            return np.corrcoef(ratings_user1, ratings_user2)[0, 1]

    

In [108]:
r = {'user1': {1: 1, 2: 2, 3: 1}, 'user3': {1:5, 2:2, 3:5}}
pearson_similarity(r, 'user1', 'user3')

[1, 2, 1]
[5, 2, 5]


-1.0

In [109]:
user1 = r['user1']
user2 = {1:1}
user3 = r['user3']

print(euc_similarity(user1, user2))
print(pearson_similarity(user1, user3))

1.0
-1.0


In [121]:
class SimilarityRecSys:
    def __init__(self, movies, reviews, similarity_function):
        self.movies = self._get_movies(movies.copy())
        self.reviews = self._get_reviews(reviews.copy())
        self.sim_function = similarity_function

    def _get_movies(self, movies):
        movies['genres'] = movies['genres'].apply(lambda x: set(x.split('|')))
        movies_dict = movies.set_index('movieId').to_dict(orient='index')
        return movies_dict
    
    def _get_reviews(self, reviews):
        # Convert 'timestamp' to datetime
        review_dict = {}
        for _, row in reviews.iterrows():
            user_id = row['userId']
            movie_id = int(row['movieId'])
            rating = row['rating']
            if user_id not in review_dict:
                review_dict[user_id] = {}
            review_dict[user_id][movie_id] = rating
        return review_dict
    
    def _sort_users(self, key_user):
        similarities = []
        for user in self.reviews:
            if user != key_user:
                similarities.append((user, self.sim_function(self.reviews[key_user], self.reviews[user])))
        return sorted(similarities, key=lambda x: x[1], reverse=True)

    def _get_rankings(self, similarities, key_user, similarity_threshold=0):
        movie_scores = {}
        sum_sim = {}
        for user, sim_score in similarities:
            if sim_score <= similarity_threshold:
                continue
            for movie_id, score in self.reviews[user].items():
                if self.reviews[key_user].get(movie_id, 0) == 0:
                    movie_scores.setdefault(movie_id, 0)
                    movie_scores[movie_id] += score*sim_score 
                    sum_sim.setdefault(movie_id, 0)
                    sum_sim[movie_id] += sim_score
        for movie_id, score in movie_scores.items():
            if not score > 0:
                print(movie_id, score)
        ranks = [(movie_id, score/sum_sim[movie_id]) for movie_id, score in movie_scores.items()]
        ranks = sorted(ranks, key=lambda x: x[1], reverse=True)
        return ranks

    def recommend_for_user(self, user, top_k=10):
        similarities = self._sort_users(user)
        movies_rankings = self._get_rankings(similarities, user)
        top_k_movies = movies_rankings[:top_k]
        response = list(map(lambda x: f"id: {x[0]: <10} {self.movies[x[0]]['title']: <80} counts: {x[1]}", top_k_movies))
        return response


In [122]:
src = SimilarityRecSys(movies, rating_train, euc_similarity)
users_test = ratings_test['userId'].unique()
src.recommend_for_user(users_test[0])

['id: 136556     Kung Fu Panda: Secrets of the Masters (2011)                                     counts: 5.000000000000001',
 'id: 6983       Jane Eyre (1944)                                                                 counts: 5.0',
 'id: 626        Thin Line Between Love and Hate, A (1996)                                        counts: 5.0',
 'id: 53         Lamerica (1994)                                                                  counts: 5.0',
 'id: 3567       Bossa Nova (2000)                                                                counts: 5.0',
 'id: 3940       Slumber Party Massacre III (1990)                                                counts: 5.0',
 'id: 3941       Sorority House Massacre (1986)                                                   counts: 5.0',
 'id: 3942       Sorority House Massacre II (1990)                                                counts: 5.0',
 'id: 3939       Slumber Party Massacre II (1987)                                         

In [124]:
print(movies.head(2))
print()
print(rating_train.head(2))

   movieId             title                                       genres
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy
1        2    Jumanji (1995)                   Adventure|Children|Fantasy

       userId  movieId  rating   timestamp
80568     509     7347     3.0  1435994597
50582     326    71462     4.0  1322252335


In [127]:
from scipy import sparse as sp

def get_reviews(reviews):
        review_dict = {}
        for _, row in reviews.iterrows():
            user_id = row['userId']
            movie_id = int(row['movieId'])
            rating = row['rating']
            if user_id not in review_dict:
                review_dict[user_id] = {}
            review_dict[user_id][movie_id] = rating
        return review_dict



In [129]:
ratings[ratings['userId']==1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
227,1,3744,4.0,964980694
228,1,3793,5.0,964981855
229,1,3809,4.0,964981220
230,1,4006,4.0,964982903


In [128]:
# Example: Create a COO matrix from the 'ratings' DataFrame
rows = ratings['userId'].values
cols = ratings['movieId'].values
values = ratings['rating'].values

# Create a sparse matrix in COO format
sparse_matrix_coo = sp.coo_matrix((values, (rows, cols))).tocsr()

print("Sparse Matrix (COO format):")
print(sparse_matrix_coo)

Sparse Matrix (COO format):
  (1, 1)	4.0
  (1, 3)	4.0
  (1, 6)	4.0
  (1, 47)	5.0
  (1, 50)	5.0
  (1, 70)	3.0
  (1, 101)	5.0
  (1, 110)	4.0
  (1, 151)	5.0
  (1, 157)	5.0
  (1, 163)	5.0
  (1, 216)	5.0
  (1, 223)	3.0
  (1, 231)	5.0
  (1, 235)	4.0
  (1, 260)	5.0
  (1, 296)	3.0
  (1, 316)	3.0
  (1, 333)	5.0
  (1, 349)	4.0
  (1, 356)	4.0
  (1, 362)	5.0
  (1, 367)	4.0
  (1, 423)	3.0
  (1, 441)	4.0
  :	:
  (610, 156371)	5.0
  (610, 156726)	4.5
  (610, 157296)	4.0
  (610, 158238)	5.0
  (610, 158721)	3.5
  (610, 158872)	3.5
  (610, 158956)	3.0
  (610, 159093)	3.0
  (610, 160080)	3.0
  (610, 160341)	2.5
  (610, 160527)	4.5
  (610, 160571)	3.0
  (610, 160836)	3.0
  (610, 161582)	4.0
  (610, 161634)	4.0
  (610, 162350)	3.5
  (610, 163937)	3.5
  (610, 163981)	3.5
  (610, 164179)	5.0
  (610, 166528)	4.0
  (610, 166534)	4.0
  (610, 168248)	5.0
  (610, 168250)	5.0
  (610, 168252)	5.0
  (610, 170875)	3.0


In [134]:
# Get non-empty column indices for the first row (row index 0)
# Get non-empty column indices for the first row (row index 0)
row_index = 1
non_empty_column_indices = sparse_matrix_coo.indices[
    sparse_matrix_coo.indptr[row_index]:sparse_matrix_coo.indptr[row_index + 1]
]

print(non_empty_column_indices)

[   1    3    6   47   50   70  101  110  151  157  163  216  223  231
  235  260  296  316  333  349  356  362  367  423  441  457  480  500
  527  543  552  553  590  592  593  596  608  648  661  673  733  736
  780  804  919  923  940  943  954 1009 1023 1024 1025 1029 1030 1031
 1032 1042 1049 1060 1073 1080 1089 1090 1092 1097 1127 1136 1196 1197
 1198 1206 1208 1210 1213 1214 1219 1220 1222 1224 1226 1240 1256 1258
 1265 1270 1275 1278 1282 1291 1298 1348 1377 1396 1408 1445 1473 1500
 1517 1552 1573 1580 1587 1617 1620 1625 1644 1676 1732 1777 1793 1804
 1805 1920 1927 1954 1967 2000 2005 2012 2018 2028 2033 2046 2048 2054
 2058 2078 2090 2093 2094 2096 2099 2105 2115 2116 2137 2139 2141 2143
 2161 2174 2193 2253 2268 2273 2291 2329 2338 2353 2366 2387 2389 2395
 2406 2414 2427 2450 2459 2470 2478 2492 2502 2528 2529 2542 2571 2580
 2596 2616 2617 2628 2640 2641 2644 2648 2654 2657 2692 2700 2716 2761
 2797 2826 2858 2872 2899 2916 2944 2947 2948 2949 2959 2985 2987 2991
 2993 

In [142]:
# Assuming sparse_matrix_csr is your CSR matrix
row_index = 1  # Replace with the desired row index

# Get the column indices of non-zero elements in the specified row
non_empty_column_indices = sparse_matrix_coo[row_index].indices

print(non_empty_column_indices)

[   1    3    6   47   50   70  101  110  151  157  163  216  223  231
  235  260  296  316  333  349  356  362  367  423  441  457  480  500
  527  543  552  553  590  592  593  596  608  648  661  673  733  736
  780  804  919  923  940  943  954 1009 1023 1024 1025 1029 1030 1031
 1032 1042 1049 1060 1073 1080 1089 1090 1092 1097 1127 1136 1196 1197
 1198 1206 1208 1210 1213 1214 1219 1220 1222 1224 1226 1240 1256 1258
 1265 1270 1275 1278 1282 1291 1298 1348 1377 1396 1408 1445 1473 1500
 1517 1552 1573 1580 1587 1617 1620 1625 1644 1676 1732 1777 1793 1804
 1805 1920 1927 1954 1967 2000 2005 2012 2018 2028 2033 2046 2048 2054
 2058 2078 2090 2093 2094 2096 2099 2105 2115 2116 2137 2139 2141 2143
 2161 2174 2193 2253 2268 2273 2291 2329 2338 2353 2366 2387 2389 2395
 2406 2414 2427 2450 2459 2470 2478 2492 2502 2528 2529 2542 2571 2580
 2596 2616 2617 2628 2640 2641 2644 2648 2654 2657 2692 2700 2716 2761
 2797 2826 2858 2872 2899 2916 2944 2947 2948 2949 2959 2985 2987 2991
 2993 

In [144]:
# Assuming sparse_matrix_csr is your CSR matrix
column_index = 1  # Replace with the desired column index

# Get the row indices of non-zero elements in the specified column
non_empty_row_indices = sparse_matrix_coo[:, column_index].nonzero()[0]

print(non_empty_row_indices)

[  1   5   7  15  17  18  19  21  27  31  32  33  40  43  44  45  46  50
  54  57  63  64  66  68  71  73  76  78  82  86  89  90  91  93  96  98
 103 107 112 119 121 124 130 132 134 135 137 140 141 144 145 151 153 155
 156 159 160 161 166 167 169 171 177 178 179 182 185 186 191 193 200 201
 202 206 213 214 216 217 219 220 223 226 229 232 233 234 239 240 247 249
 252 254 263 264 266 269 270 273 274 275 276 277 279 280 282 283 288 290
 291 292 293 298 304 307 314 322 323 328 330 332 334 336 337 339 341 347
 350 353 357 359 364 367 372 373 378 380 381 382 385 389 391 396 399 401
 411 412 414 420 422 432 434 436 438 443 448 451 453 456 460 462 468 469
 470 471 474 476 477 480 483 484 488 490 492 500 504 509 514 517 522 524
 525 528 529 533 534 541 544 550 555 559 560 561 562 567 570 572 573 579
 580 584 587 590 596 597 599 600 601 603 604 605 606 607 608 609 610]


In [138]:
# Assuming sparse_matrix_coo is your CSR matrix
column_index = 1  # Replace with the desired column index

# Get the column slice for the specified column
column_slice = sparse_matrix_coo[:, column_index]

# Find the non-zero indices in the column slice
non_empty_row_indices = column_slice.nonzero()[0]

print(non_empty_row_indices)


[  1   5   7  15  17  18  19  21  27  31  32  33  40  43  44  45  46  50
  54  57  63  64  66  68  71  73  76  78  82  86  89  90  91  93  96  98
 103 107 112 119 121 124 130 132 134 135 137 140 141 144 145 151 153 155
 156 159 160 161 166 167 169 171 177 178 179 182 185 186 191 193 200 201
 202 206 213 214 216 217 219 220 223 226 229 232 233 234 239 240 247 249
 252 254 263 264 266 269 270 273 274 275 276 277 279 280 282 283 288 290
 291 292 293 298 304 307 314 322 323 328 330 332 334 336 337 339 341 347
 350 353 357 359 364 367 372 373 378 380 381 382 385 389 391 396 399 401
 411 412 414 420 422 432 434 436 438 443 448 451 453 456 460 462 468 469
 470 471 474 476 477 480 483 484 488 490 492 500 504 509 514 517 522 524
 525 528 529 533 534 541 544 550 555 559 560 561 562 567 570 572 573 579
 580 584 587 590 596 597 599 600 601 603 604 605 606 607 608 609 610]


In [None]:
 R = sp.coo_array((ratings, (user_ids, item_ids)), shape=(num_users, num_items)).tocsr()

In [None]:
def ALS(user_ids, item_ids, ratings, num_users, num_items, num_dims=32, num_iters=10, eps=1e-7):
    R = scs.coo_array((ratings, (user_ids, item_ids)), shape=(num_users, num_items)).tocsr()
    X = np.random.randn(num_users, num_dims) #U
    Y = np.random.randn(num_items, num_dims) #V
    
    for t in tqdm(range(num_iters)):
        RY = R @ Y
        for u in range(num_users):
            relevant_items = item_ids[user_ids == u]
            Y_rel = Y[relevant_items]
            YY = Y_rel.reshape(-1, num_dims, 1) * Y_rel.reshape(-1, 1, num_dims)
            X[u] = np.linalg.inv(YY.sum(axis=0) + eps * np.eye(num_dims)) @ RY[u]

        RX = R.T @ X
        for i in range(num_items):
            relevant_users = user_ids[item_ids == i]
            X_rel = X[relevant_users]
            XX = X_rel.reshape(-1, num_dims, 1) * X_rel.reshape(-1, 1, num_dims)
            Y[i] = np.linalg.inv(XX.sum(axis=0) + eps * np.eye(num_dims)) @ RX[i]
    
    return R, X, Y

R, X, Y = ALS(train_ratings.userId, train_ratings.movieId, train_ratings.rating, num_users, num_movies, num_iters=10)

In [None]:
def _get_R(ratings):
    # Example: Create a COO matrix from the 'ratings' DataFrame
    rows = ratings['userId'].values
    cols = ratings['movieId'].values
    values = ratings['rating'].values

    # Create a sparse matrix in COO format
    R = sp.coo_matrix((values, (rows, cols))).tocsr()
    return R

def ALS(movies, ratings):
    R = _get_R(ratings)
    

In [243]:
class ALSRecSys:
    def __init__(self, movies, reviews):
        self.movies = self._get_movies(movies.copy())
        self.user_ids = self._get_unique_user_ids(reviews.copy())
        self.movie_ids = self._get_inique_movie_ids()
        self.u2idx = self._get_u2idx()
        self.m2idx = self._get_m2idx()
        self.idx2u = self._get_idx2u()
        self.idx2m = self._get_idx2m()
        self.R = self._get_R(reviews.copy())
        X, Y = self._ALS_fit()
        self.X = X
        self.Y = Y

    def _get_u2idx(self):
        u2idx = {user_id: idx for idx, user_id in enumerate(self.user_ids)}
        return u2idx
    
    def _get_idx2u(self):
        idx2u = {idx: user_id for idx, user_id in enumerate(self.user_ids)}
        return idx2u
    
    def _get_m2idx(self):
        m2idx = {movie_id: idx for idx, movie_id in enumerate(self.movie_ids)}
        return m2idx
    
    def _get_idx2m(self):
        idx2m = {idx: movie_id for idx, movie_id in enumerate(self.movie_ids)}
        return idx2m
    
    def _get_unique_user_ids(self, reviews):
        unique_user_ids = reviews['userId'].unique()
        return unique_user_ids
    
    def _get_inique_movie_ids(self):
        unique_movie_ids = list(self.movies.keys())
        return unique_movie_ids
    
    def _get_movies(self, movies):
        movies['genres'] = movies['genres'].apply(lambda x: set(x.split('|')))
        movies_dict = movies.set_index('movieId').to_dict(orient='index')
        return movies_dict
    
    def _get_R(self, reviews):
        # Example: Create a COO matrix from the 'ratings' DataFrame
        users = reviews['userId'].values
        items = reviews['movieId'].values
        values = reviews['rating'].values

        mapped_users = [self.u2idx[user_id] for user_id in users]
        mapped_movies = [self.m2idx[item_id] for item_id in items]
        num_users = len(self.user_ids)
        num_items = len(self.movie_ids)
        # Create a sparse matrix in COO format
        #R = sp.coo_array((values, (mapped_users, mapped_movies)), shape=(num_users, num_items)).tocsr()
        R = sp.coo_matrix((values, (mapped_users, mapped_movies))).tocsr()
        return R
    
    def _get_relevant_users(self, item):
        # Get the row indices of non-zero elements in the specified column
        relevant_users = self.R[:, item].nonzero()[0]
        return relevant_users
    
    def _get_relevant_items(self, user):
        # Get the column indices of non-zero elements in the specified row
        relevant_items = self.R[user].indices
        return relevant_items

    def _ALS_fit(self, num_dims=32, num_iters=10, eps=1e-7):
        R = self.R
        num_users = len(self.user_ids)
        num_items = len(self.movie_ids)
        X = np.random.randn(num_users, num_dims) #Users
        Y = np.random.randn(num_items, num_dims) #Items
        
        for t in range(num_iters):
            print(f"R dims: {R.shape} Y dims: {Y.shape} X dims: {X.shape}")
            RY = R @ Y
            for u in range(num_users):
                relevant_items = self._get_relevant_items(u)
                Y_rel = Y[relevant_items]
                YY = Y_rel.reshape(-1, num_dims, 1) * Y_rel.reshape(-1, 1, num_dims)
                X[u] = np.linalg.inv(YY.sum(axis=0) + eps * np.eye(num_dims)) @ RY[u]

            RX = R.T @ X
            for i in range(num_items):
                relevant_users = self._get_relevant_users(i)
                X_rel = X[relevant_users]
                XX = X_rel.reshape(-1, num_dims, 1) * X_rel.reshape(-1, 1, num_dims)
                Y[i] = np.linalg.inv(XX.sum(axis=0) + eps * np.eye(num_dims)) @ RX[i]
        
        return X, Y
    
    def _get_rankings(self, user_id_mapped):
        print(f'Shapes X[i].T: {self.X[user_id_mapped].T.shape} Y shape: {self.Y.shape}')
        rankings = self.Y @ self.X[user_id_mapped].T
        rankings = self.X[user_id_mapped] @ self.Y.T
        return rankings
    
    def get_rankings(self, user_id):
        user_id_mapped = self.u2idx[user_id]
        rankings = self._get_rankings(user_id_mapped)
        rankings = [(self.idx2m[idx], score) for idx, score in enumerate(rankings)]
        rankings = sorted(rankings, key=lambda x: x[1], reverse=True)
        return rankings
    
    def recommend(self, user_id, top_k=10):
        user_id_mapped = self.u2idx[user_id]
        rankings = self._get_rankings(user_id_mapped)
        viewed_items = set(self._get_relevant_items(user_id_mapped))

        # Filter out viewed items
        non_viewed_rankings = [(idx, score) for idx, score in enumerate(rankings) if idx not in viewed_items]

        # Get the top k items with the highest score
        top_k_recommendations = sorted(non_viewed_rankings, key=lambda x: x[1], reverse=True)[:top_k]

        recommendations = [(self.idx2m[idx], score) for idx, score in top_k_recommendations]

        response = list(map(lambda x: f"id: {x[0]: <10} {self.movies[x[0]]['title']: <80} score: {x[1]}", recommendations))
        return response

In [244]:
als = ALSRecSys(movies, ratings)
als.recommend(1)

R dims: (610, 9742) Y dims: (9742, 32) X dims: (610, 32)
R dims: (610, 9742) Y dims: (9742, 32) X dims: (610, 32)
R dims: (610, 9742) Y dims: (9742, 32) X dims: (610, 32)
R dims: (610, 9742) Y dims: (9742, 32) X dims: (610, 32)
R dims: (610, 9742) Y dims: (9742, 32) X dims: (610, 32)
R dims: (610, 9742) Y dims: (9742, 32) X dims: (610, 32)
R dims: (610, 9742) Y dims: (9742, 32) X dims: (610, 32)
R dims: (610, 9742) Y dims: (9742, 32) X dims: (610, 32)
R dims: (610, 9742) Y dims: (9742, 32) X dims: (610, 32)
R dims: (610, 9742) Y dims: (9742, 32) X dims: (610, 32)
Shapes X[i].T: (32,) Y shape: (9742, 32)


['id: 6281       Phone Booth (2002)                                                               score: 98.27369180867166',
 "id: 2160       Rosemary's Baby (1968)                                                           score: 95.93369854401197",
 'id: 4874       K-PAX (2001)                                                                     score: 45.9573825094219',
 'id: 104841     Gravity (2013)                                                                   score: 43.84108713570221',
 'id: 31696      Constantine (2005)                                                               score: 43.19189194035022',
 'id: 6534       Hulk (2003)                                                                      score: 30.12738345126688',
 'id: 2393       Star Trek: Insurrection (1998)                                                   score: 27.682782550992385',
 'id: 152081     Zootopia (2016)                                                                  score: 25.9367499897449',
 

In [249]:
als.recommend(300)

Shapes X[i].T: (32,) Y shape: (9742, 32)


["id: 2160       Rosemary's Baby (1968)                                                           score: 965.3535742772376",
 'id: 31696      Constantine (2005)                                                               score: 624.7645069398019',
 'id: 68237      Moon (2009)                                                                      score: 609.6812883372057',
 'id: 2393       Star Trek: Insurrection (1998)                                                   score: 589.28215881504',
 'id: 1911       Dr. Dolittle (1998)                                                              score: 580.154236221196',
 'id: 1644       I Know What You Did Last Summer (1997)                                           score: 543.3274495414448',
 'id: 2944       Dirty Dozen, The (1967)                                                          score: 419.5538433574059',
 "id: 4015       Dude, Where's My Car? (2000)                                                     score: 315.87792956370436",
 '

In [235]:
als.recommend(274)

Shapes X[i].T: (5,) Y shape: (9742, 5)


['id: 2937       Palm Beach Story, The (1942)                                                     score: 160.2108893185041',
 'id: 90890      Jack and Jill (2011)                                                             score: 79.32818924605239',
 "id: 70599      Time Traveler's Wife, The (2009)                                                 score: 69.85317618517057",
 'id: 3858       Cecil B. DeMented (2000)                                                         score: 66.44841301775092',
 "id: 1633       Ulee's Gold (1997)                                                               score: 60.82619460302493",
 'id: 5632       Bloody Sunday (2002)                                                             score: 56.03790582127367',
 'id: 5298       Human Nature (2001)                                                              score: 50.10362518094819',
 'id: 3871       Shane (1953)                                                                     score: 48.03182905547998',


In [232]:
ratings[ratings['userId']==274].sort_values('rating', ascending=False)

Unnamed: 0,userId,movieId,rating,timestamp
40138,274,7438,5.0,1171172659
40555,274,79132,5.0,1284685228
40497,274,68157,5.0,1253913692
39889,274,4262,5.0,1171409417
39720,274,2959,5.0,1171172709
...,...,...,...,...
40532,274,74075,1.0,1288172742
40305,274,43904,1.0,1172023625
40082,274,6763,1.0,1172030093
39529,274,1760,0.5,1171932613


In [227]:
print(ratings['userId'].value_counts())

userId
414    2698
599    2478
474    2108
448    1864
274    1346
       ... 
442      20
569      20
320      20
576      20
53       20
Name: count, Length: 610, dtype: int64


In [162]:
print(ratings['movieId'].unique())

[     1      3      6 ... 160836 163937 163981]


In [199]:
als = ALSRecSys(movies, ratings)
als.recommend(1)

R dims: (610, 9742) Y dims: (9742, 5) X dims: (610, 5)
R dims: (610, 9742) Y dims: (9742, 5) X dims: (610, 5)
R dims: (610, 9742) Y dims: (9742, 5) X dims: (610, 5)
R dims: (610, 9742) Y dims: (9742, 5) X dims: (610, 5)
R dims: (610, 9742) Y dims: (9742, 5) X dims: (610, 5)
R dims: (610, 9742) Y dims: (9742, 5) X dims: (610, 5)
R dims: (610, 9742) Y dims: (9742, 5) X dims: (610, 5)
R dims: (610, 9742) Y dims: (9742, 5) X dims: (610, 5)
R dims: (610, 9742) Y dims: (9742, 5) X dims: (610, 5)
R dims: (610, 9742) Y dims: (9742, 5) X dims: (610, 5)


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 9742 is different from 5)

In [176]:
print(als.R)

  (0, 0)	4.0
  (0, 2)	4.0
  (0, 5)	4.0
  (0, 43)	5.0
  (0, 46)	5.0
  (0, 62)	3.0
  (0, 89)	5.0
  (0, 97)	4.0
  (0, 124)	5.0
  (0, 130)	5.0
  (0, 136)	5.0
  (0, 184)	5.0
  (0, 190)	3.0
  (0, 197)	5.0
  (0, 201)	4.0
  (0, 224)	5.0
  (0, 257)	3.0
  (0, 275)	3.0
  (0, 291)	5.0
  (0, 307)	4.0
  (0, 314)	4.0
  (0, 320)	5.0
  (0, 325)	4.0
  (0, 367)	3.0
  (0, 384)	4.0
  :	:
  (609, 9256)	5.0
  (609, 9264)	4.5
  (609, 9274)	4.0
  (609, 9286)	5.0
  (609, 9292)	3.5
  (609, 9297)	3.5
  (609, 9300)	3.0
  (609, 9306)	3.0
  (609, 9322)	3.0
  (609, 9325)	2.5
  (609, 9330)	4.5
  (609, 9335)	3.0
  (609, 9342)	3.0
  (609, 9357)	4.0
  (609, 9359)	4.0
  (609, 9366)	3.5
  (609, 9389)	3.5
  (609, 9390)	3.5
  (609, 9392)	5.0
  (609, 9433)	4.0
  (609, 9434)	4.0
  (609, 9461)	5.0
  (609, 9462)	5.0
  (609, 9463)	5.0
  (609, 9503)	3.0


In [190]:
print(als.X.shape, als.Y.shape)
pred = np.array(als.X @ als.Y.T)
print(np.max(pred), np.min(pred))

(610, 5) (9742, 5)
3777.745317209904 -3846.367989291386


In [191]:
print(pred.shape)

(610, 9742)


In [192]:
print(pred[0])

[4.68833641 4.19056635 3.74694788 ... 0.72456087 0.72456087 0.18346809]


In [194]:
print(als.R[0, :].todense())

[[4. 0. 4. ... 0. 0. 0.]]


In [188]:
R_dense = als.R.todense()

d = np.array(R_dense - pred)
print(np.max(d), np.min(d))
rmse = np.sqrt(np.mean(d**2))
print(rmse)

3565.971597518866 -3230.014870364964
17.943103145890976
