In [40]:
import pandas as pd
import numpy as np

In [41]:
df = pd.read_csv('Users-Movies Matrix.csv', index_col='User_ID') #read user-movies matrix with user ids as indices
ratings = df.values

In [42]:
def matrix_factorization(R, P, Q, k=5, steps=100, alpha=0.02, beta=0.01, threshold=0.01):
    Q = Q.T # Transposes the item-feature matrix Q for easier matrix multiplication with P.
    prev_e = float('inf') # Initializes prev_e to infinity, which will be used to track the change in the error for early stopping.
    for step in range(steps): # Begins a loop that will run for a number of iterations specified by steps.
        for i in range(len(R)): 
            for j in range(len(R[i])): # These nested loops iterate over each element in the rating matrix R.
                if R[i][j] > 0: # Checks if the current rating is greater than 0 (indicating an actual rating, as opposed to a missing rating).
                    eij = R[i][j] - np.dot(P[i, :], Q[:, j]) # Calculates the error (eij) for the current rating by subtracting the predicted rating (obtained by the dot product of the user's and item's latent feature vectors) from the actual rating.
                    for r in range(k): # Iterates over each latent feature.
                        P[i][r] += alpha * (2 * eij * Q[r][j] - beta * P[i][r])
                        Q[r][j] += alpha * (2 * eij * P[i][r] - beta * Q[r][j])
                        # Updates the user and item latent feature matrices (P and Q) using gradient descent. This includes a term for the error (eij) and a regularization term controlled by beta.

        eR = np.dot(P, Q) # Calculates the complete predicted rating matrix by multiplying the updated user and item latent feature matrices.
        e = 0 # Initializes the total error for the current iteration to zero.
        for i in range(len(R)): # The nested loops again iterate over each rating in R.
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e += pow(R[i][j] - np.dot(P[i, :], Q[:, j]), 2) # Accumulates the squared error for each actual rating in R.
                    for r in range(k):
                        e += (beta / 2) * (pow(P[i][r], 2) + pow(Q[r][j], 2)) # Adds regularization terms to the total error.

        # Check for early stopping
        if step > 0 and abs(prev_e - e) < threshold: # Checks if the change in error is less than the threshold, indicating convergence.
            print(f"Stopping early at step {step}")
            break
        prev_e = e # Updates prev_e with the total error from the current iteration for comparison in the next iteration.

    return P, Q.T # Returns the final user-feature matrix P and the transposed item-feature matrix Q (transposed back to its original form).


In [43]:
'''
n_users = ratings.shape[0]
n_movies = ratings.shape[1]
k = 20  # Number of latent features

np.random.seed(0)
user_matrix = np.random.normal(scale=1./k, size=(n_users, k))
movie_matrix = np.random.normal(scale=1./k, size=(n_movies, k))
'''


'\nn_users = ratings.shape[0]\nn_movies = ratings.shape[1]\nk = 20  # Number of latent features\n\nnp.random.seed(0)\nuser_matrix = np.random.normal(scale=1./k, size=(n_users, k))\nmovie_matrix = np.random.normal(scale=1./k, size=(n_movies, k))\n'

In [44]:
df_subset = df.head(10) # taking small subset of users to test
ratings_subset = df_subset.values
n_users_subset = ratings_subset.shape[0]  # Number of users in the subset
n_movies = ratings_subset.shape[1]        # Number of movies
k = 5                                    # Number of latent features to extract

np.random.seed(0)
user_matrix_subset = np.random.normal(scale=1./k, size=(n_users_subset, k))
movie_matrix_subset = np.random.normal(scale=1./k, size=(n_movies, k))


In [45]:
P_subset, Q_subset = matrix_factorization(ratings_subset, user_matrix_subset, movie_matrix_subset, k)


In [51]:
predicted_ratings_subset = np.dot(P_subset, Q_subset.T)
predicted_ratings_subset = np.clip(predicted_ratings_subset, 1, 5)



In [52]:
movie_titles = df.columns.tolist()
user_id = 6
user_index = df.index.get_loc(user_id)
user_predicted_ratings = predicted_ratings_subset[user_index]

In [53]:
movies_with_predictions = list(zip(movie_titles, user_predicted_ratings))
sorted_movies_with_predictions = sorted(movies_with_predictions, key=lambda x: x[1], reverse=True)
sorted_movies_with_predictions

[('Beverly Hills Cop', 5.0),
 ('Sixteen Candles', 5.0),
 ('Training Day', 5.0),
 ('U.S. Marshals', 5.0),
 ('Chocolat', 4.941720928671315),
 ('Joy Ride', 4.903885252620871),
 ('Three Musketeers', 4.888544294443255),
 ('Harold and Kumar Go to White Castle', 4.865901935976154),
 ('Dawn of the Dead', 4.856367991298006),
 ('The Missing', 4.841239169187098),
 ('Shanghai Noon', 4.809981803954141),
 ('Ray', 4.803864434618918),
 ("Charlotte's Web", 4.802507401458694),
 ('Hercules', 4.731019843020174),
 ('Reservoir Dogs', 4.729777786533682),
 ('The Dead Zone: Season 2', 4.715258152509358),
 ('Rookie of the Year', 4.687948829341757),
 ('The Bourne Supremacy', 4.640423116778324),
 ('The Longest Yard', 4.607753677507673),
 ('Speed', 4.585863518989189),
 ("The Wizard of Oz: Collector's Edition", 4.576208211501693),
 ('Braveheart', 4.574362211356598),
 ('Free Willy', 4.5682257293103685),
 ('Bend It Like Beckham', 4.562757420310416),
 ('Ever After: A Cinderella Story', 4.520364542439153),
 ('Signs', 4

In [54]:
user_original_ratings = df.loc[user_id].values
user_original_ratings = df.loc[user_id].values

unrated_movies_with_predictions = [(movie, rating) for movie, rating in sorted_movies_with_predictions if user_original_ratings[movie_titles.index(movie)] == 0]


In [55]:
N = 10  # Number of top recommendations to extract
top_recommendations = unrated_movies_with_predictions[:N]

print(f"Top {N} movie recommendations for User ID {user_id}:")
for movie, predicted_rating in top_recommendations:
    print(f"Movie: {movie}, Predicted Rating: {predicted_rating:.2f}")


Top 10 movie recommendations for User ID 6:
Movie: Beverly Hills Cop, Predicted Rating: 5.00
Movie: Sixteen Candles, Predicted Rating: 5.00
Movie: Training Day, Predicted Rating: 5.00
Movie: U.S. Marshals, Predicted Rating: 5.00
Movie: Chocolat, Predicted Rating: 4.94
Movie: Joy Ride, Predicted Rating: 4.90
Movie: Harold and Kumar Go to White Castle, Predicted Rating: 4.87
Movie: Dawn of the Dead, Predicted Rating: 4.86
Movie: The Missing, Predicted Rating: 4.84
Movie: Shanghai Noon, Predicted Rating: 4.81
