In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import altair as alt

In [None]:
# prompt to Claude: Can you generate a user-movie rating matrix? users = ['Alice', 'Bob', 'Charlie', 'David'] movies = ['Parasite', 'Train to Busan', 'Inception', 'The Wolf of Wall Street', 'The Matrix', 'Forrest Gump', 'Avengers: Endgame', 'Spider-Man: Into the Spider-Verse', 'La La Land', 'Get Out', 'Dunkirk', 'Baby Driver', 'Blade Runner 2049', 'Arrival', 'The Grand Budapest Hotel'] Add 5 more users with desired genre. I want to use the matrix for testing my simple algorithm.
# Define users and movies
users = ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Heidi', 'Ivan']
movies = ['Parasite', 'Train to Busan', 'Inception', 'The Wolf of Wall Street', 'The Matrix', 'Forrest Gump', 'Avengers: Endgame', 'Spider-Man: Into the Spider-Verse', 'La La Land', 'Get Out', 'Dunkirk', 'Baby Driver', 'Blade Runner 2049', 'Arrival', 'The Grand Budapest Hotel']

# Genres each user likes; a user rates a movie only if they share a genre with it
user_preferences = {
    'Alice': ['Action', 'Sci-Fi'],
    'Bob': ['Drama', 'Comedy'],
    'Charlie': ['Horror', 'Thriller'],
    'David': ['Action', 'Adventure'],
    'Eve': ['Romance', 'Comedy'],
    'Frank': ['Action', 'Sci-Fi'],
    'Grace': ['Drama', 'Mystery'],
    'Heidi': ['Animation', 'Family'],
    'Ivan': ['Crime', 'Thriller']
}

# Genres each movie belongs to
movie_genres = {
    'Parasite': ['Drama', 'Thriller'],
    'Train to Busan': ['Horror', 'Action'],
    'Inception': ['Sci-Fi', 'Action'],
    'The Wolf of Wall Street': ['Biography', 'Crime', 'Drama'],
    'The Matrix': ['Sci-Fi', 'Action'],
    'Forrest Gump': ['Drama', 'Romance'],
    'Avengers: Endgame': ['Action', 'Adventure', 'Sci-Fi'],
    'Spider-Man: Into the Spider-Verse': ['Animation', 'Action', 'Adventure'],
    'La La Land': ['Comedy', 'Drama', 'Romance'],
    'Get Out': ['Horror', 'Mystery'],
    'Dunkirk': ['Action', 'Drama', 'History'],
    'Baby Driver': ['Action', 'Crime'],
    'Blade Runner 2049': ['Sci-Fi', 'Mystery'],
    'Arrival': ['Drama', 'Sci-Fi'],
    'The Grand Budapest Hotel': ['Adventure', 'Comedy', 'Drama']
}

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same synthetic
# ratings. A local Generator is used so the global NumPy random state is untouched.
RATING_SEED = 42
rating_rng = np.random.default_rng(RATING_SEED)

# Create rating matrix (rows: users, columns: movies); 0 means "not rated" for now
rating_matrix = np.zeros((len(users), len(movies)))

# Populate rating matrix: one random rating per (user, movie) pair with a shared genre
for i, user in enumerate(users):
    liked_genres = set(user_preferences[user])  # invariant across the movie loop
    for j, movie in enumerate(movies):
        if liked_genres.intersection(movie_genres[movie]):
            rating_matrix[i, j] = rating_rng.integers(3, 6)  # Rating between 3 and 5 (upper bound is exclusive)

# Printed before the NaN conversion, so unrated cells show up as 0 here
print("User-Movie Rating Matrix:")
print(rating_matrix)

# Real ratings are >= 3, so anything below 0.5 is an unrated cell; mark it as missing.
rating_matrix[rating_matrix < 0.5] = np.nan
# `ratings` is an alias of the same array (not a copy); later cells read `ratings`.
ratings = rating_matrix

In [None]:

# Number of latent factors (width of the user and movie factor matrices)
k = 2

# RMSE history: matrix_factorization() appends one value per step and the
# learning-curve plot reads it. Re-assigned here so re-running this cell
# starts from an empty history.
mses = []

# Root mean squared error restricted to the cells where an error is defined
def rmse(ratings, predicted_ratings):
    """Return the RMSE between ``ratings`` and ``predicted_ratings``.

    Cells whose error is NaN (e.g. unrated cells stored as NaN) are left out
    of the mean, so only defined errors contribute.
    """
    residuals = ratings - predicted_ratings
    defined = ~np.isnan(residuals)
    mean_squared = np.mean(np.square(residuals[defined]))
    return np.sqrt(mean_squared)

# Function to perform matrix factorization
def matrix_factorization(ratings, k, steps=10000, gamma=0.001):
    """Factorize a partially observed rating matrix as ``P @ Q.T`` with SGD.

    Only cells of ``ratings`` that are not NaN contribute to the updates.
    No regularization term is applied.

    Parameters
    ----------
    ratings : 2-D array indexed as ``ratings[i, j]``; NaN marks a missing rating.
    k : number of latent factors (columns of ``P`` and ``Q``).
    steps : number of full passes over the observed cells.
    gamma : learning rate of each update.

    Returns
    -------
    (P, Q) : ``P`` has shape ``(m, k)`` and ``Q`` has shape ``(n, k)``, where
        ``(m, n)`` is ``ratings.shape``.

    Side effects
    ------------
    Appends the RMSE after every pass to the module-level list ``mses`` and
    prints it every 1000 passes.
    """
    m, n = ratings.shape
    P = np.random.rand(m, k)  # User factor matrix
    Q = np.random.rand(n, k)  # Movie factor matrix

    # The set of observed cells never changes, so find it once (in the same
    # row-major order as a nested i/j loop) instead of re-testing every cell
    # for NaN on every pass.
    observed = [
        (i, j)
        for i in range(m)
        for j in range(n)
        if not np.isnan(ratings[i, j])
    ]

    for step in range(steps):
        for i, j in observed:
            err = ratings[i, j] - np.dot(P[i, :], Q[j, :])
            # Both gradients must be evaluated at the same (P[i], Q[j]) point.
            # Snapshot P[i] first; otherwise Q[j] would be updated with the
            # already-modified user factors.
            p_before = P[i, :].copy()
            P[i, :] += gamma * err * Q[j, :]
            Q[j, :] += gamma * err * p_before

        mses.append(rmse(ratings, np.dot(P, Q.T)))
        if step % 1000 == 0:
            print(f"Step: {step}, RMSE: {mses[-1]:.4f}")

    return P, Q

# Perform matrix factorization
P, Q = matrix_factorization(ratings, k)

# Predict ratings for every (user, movie) pair, including the unrated ones
predicted_ratings = np.dot(P, Q.T)

# Learning curve. `mses` holds the values returned by rmse(), so the axis is
# labelled RMSE rather than MSE.
plt.plot(mses)
plt.xlabel("iteration")
plt.ylabel("RMSE")
plt.show()

In [None]:
# prompt example: Can you visualize the predicted_rating matrix using altair? I want to see user ids and movie names. Can you use different colors depending on the predicted_rating or rating?


# Long-format table with one row per (user, movie) pair. Users are repeated in
# blocks and movies are cycled, which matches the row-major order of flatten().
n_users, n_movies = len(users), len(movies)
df = pd.DataFrame(
    {
        "user_id": np.repeat(users, n_movies),
        "movie": np.tile(movies, n_users),
        "rating": ratings.flatten(),
        "predicted_rating": predicted_ratings.flatten(),
    }
)

# Scatter plot: users on the x-axis, movies on the y-axis, colored by the given rating
rating_scatter = alt.Chart(df).mark_circle().encode(
    alt.X("user_id:N"),
    alt.Y("movie:N"),
    alt.Color("rating:Q"),
    tooltip=["user_id", "movie", "rating", "predicted_rating"],
)
rating_scatter.interactive()


In [None]:
# Heatmap of the model output: users on the x-axis, movies on the y-axis,
# cell color taken from predicted_rating
predicted_heatmap = alt.Chart(df).mark_rect().encode(
    alt.X("user_id:N"),
    alt.Y("movie:N"),
    alt.Color("predicted_rating:Q"),
    tooltip=["user_id", "movie", "rating", "predicted_rating"],
)
predicted_heatmap.interactive()


In [None]:
# Visualize the movie embedding: one point per movie, positioned by its two
# latent factors (the two column names below require k == 2).
# A dedicated frame name is used instead of rebinding `df`, which holds the
# ratings table read by the rating charts.
movie_embedding_df = pd.DataFrame(Q, columns=["x", "y"]).join(
    pd.DataFrame(movies, columns=["title"])
)
movie_embedding_chart = alt.Chart(movie_embedding_df).mark_circle().encode(
    x='x',  # first latent factor
    y='y',  # second latent factor
    tooltip=['title'],
)

movie_embedding_chart.interactive()

In [None]:
# Visualize the user embedding: one point per user, positioned by its two
# latent factors (the two column names below require k == 2).
# The frame is named after what it holds (user factors from P) and does not
# rebind `df`, which holds the ratings table read by the rating charts.
user_embedding_df = pd.DataFrame(P, columns=["x", "y"]).join(
    pd.DataFrame(users, columns=["user"])
)
user_embedding_chart = alt.Chart(user_embedding_df).mark_circle().encode(
    x='x',  # first latent factor
    y='y',  # second latent factor
    tooltip=['user'],  # these are user names, so the field is labelled "user"
)

user_embedding_chart.interactive()

# Appendix: Get user_rating_matrix from Hugging Face

In [None]:

!pip install datasets
from datasets import load_dataset
dataset = load_dataset("ashraq/movielens_ratings", split="validation")
# check data here: https://huggingface.co/datasets/ashraq/movielens_ratings

# prompt: Can you extract user-rating matrix from dataset?
# I manually change from userid to user_id

user_rating_matrix = dataset.map(lambda x: {'user_id': x["user_id"], 'item_id': x["movie_id"], 'rating': x["rating"]})
user_rating_matrix = user_rating_matrix.to_pandas()
user_rating_matrix = user_rating_matrix.pivot_table(index='user_id', columns='item_id', values='rating').fillna(0)

In [None]:
# Display the pivoted matrix (rows: user_id, columns: item_id; 0 comes from fillna(0))
user_rating_matrix

# Appendix: Nonnegative matrix factorization (todo)