# Project work part 2

In [16]:
import numpy as np
import polars as pl

# read data
links_df = pl.read_csv("../data/links.csv", schema={"movieId": pl.Int32, "imdb": pl.Int32, "tmdbId": pl.Int32})
movies_df = (
    pl.read_csv("../data/movies.csv", schema={"movieId": pl.Int32, "title": pl.String, "genres": pl.String})
    .with_row_index()
    .with_columns(
        pl.col("index")
        .cast(pl.Int32)
    )
)
ratings_df = pl.read_csv("../data/ratings.csv", schema={"userId": pl.Int32, "movieId": pl.Int32, "rating": pl.Float32, "timestamp": pl.Int32})
tags_df = pl.read_csv("../data/tags.csv", schema={"userId": pl.Int32, "movieId": pl.Int32, "tag": pl.String, "timestamp": pl.Int32})

In [17]:
from sklearn.metrics.pairwise import nan_euclidean_distances

def get_top_n_most_similar_users_euclidean(user_id, n):
    rating_vectors_df = (
        ratings_df
        .drop("timestamp")
        .pivot("movieId", index="userId")
    )
    
    user_ratings = rating_vectors_df.filter(pl.col("userId") == user_id).drop("userId").to_numpy()
    
    rating_vectors = (
        rating_vectors_df
        .drop("userId")
        .to_numpy()
    )
    
    # Calculate distance
    distances = nan_euclidean_distances(rating_vectors, user_ratings).reshape(-1)
    
    # Find nan values and count the amount of common movies between user and all users
    nans_in_user_ratings = np.isnan(user_ratings[0])
    nans_in_rating_vectors = np.isnan(rating_vectors)
    common_rated_movies = 1 - np.logical_or(nans_in_user_ratings, nans_in_rating_vectors)
    common_rated_movies_count = np.sum(common_rated_movies, axis=1).astype(np.float32) 
    
    # Ignore users withe less than 5 common movies
    common_rated_movies_count[common_rated_movies_count < 5] = np.nan 
    
    # Count average distances
    average_distances = distances / common_rated_movies_count
    
    # Max scaling
    max_average_distance = np.nanmax(average_distances)
    
    # Distance to similarity. Nan values are set to zero
    similarities = np.nan_to_num(1 - (average_distances / max_average_distance))
    
    distance_df = (
        pl.DataFrame(
            {
                "userId": rating_vectors_df.select("userId"),
                "similarity": similarities
            }
        )
        .sort("similarity", descending=True)
        .limit(n)
    )
    
    return (
        distance_df
        .select("userId")
        .to_numpy()
        .reshape(-1)
    ), (
        distance_df
        .select("similarity")
        .to_numpy()
        .reshape(-1)
    )

In [18]:
# Define some helper functions to extract data from the dataframes

# ratings
def get_ratings_for_users(user_ids):
    return (
        ratings_df
        .drop("timestamp")
        .sort("movieId")
        .pivot("userId", index="movieId")
        .drop("movieId")
        .select([str(user_id) for user_id in user_ids])
        .transpose(include_header=True, header_name="userId", column_names=user_ids)
        .drop("userId")
        .to_numpy()
    )

# ratings mean
def get_mean_ratings_for_users(user_ids):
    return (
        ratings_df
        .drop("timestamp")
        .sort("movieId")
        .pivot("userId", index="movieId")
        .drop("movieId")
        .select([str(user_id) for user_id in user_ids.tolist()])
        .select(pl.all().mean())
        .transpose(include_header=True, header_name="userId", column_names=["ratings_mean"])
        .cast({"userId": pl.Int16})
        .filter(pl.col("userId").is_in(user_ids))
        .select("ratings_mean")
        .to_numpy()
        .reshape(-1)
    )

# movie indices for movie ids
def get_movie_indices_for_movie_ids(movie_ids):
    return (
        ratings_df
        .select("movieId")
        .unique()
        .sort("movieId")
        .with_row_index()
        .filter(pl.col("movieId").is_in(movie_ids))
        .select("index")
        .to_numpy()
        .reshape(-1)
    )

# single user rating mean
def get_user_rating_mean(user_id):
    return get_mean_ratings_for_users(np.array([user_id]))[0]

# candidate movie ids
def get_candidate_movie_ids(user_id, similar_user_ids):
    rated_by_user = (
        ratings_df
        .filter(pl.col("userId") == user_id)
        .select("movieId")
        .to_numpy()
        .reshape(-1)
    )

    return (
        ratings_df
        .filter(
            pl.col("userId").is_in(similar_user_ids), # only include ratings from similar users
            pl.col("movieId").is_in(rated_by_user).not_() # only include movies that the user has not rated
        )
        .group_by("movieId")
        .agg()
        .select("movieId")
        .to_numpy()
        .reshape(-1)
    )

In [19]:
def predict_movies(user_ratings_mean, similar_users_ratings, similar_users_ratings_mean, similarity_scores, candidate_movie_indices, num_of_movies):
    """
    The prediction function from course slides.

    :param user_ratings_mean: mean of user ratings
    :param similar_users_ratings: sparse numpy 2D matrix of similar users ratings. Each row is a user and each column is a movie
    :param similar_users_ratings_mean: numpy array of mean ratings of the similar users
    :param similarity_scores: numpy array of similarity scores between the user and the similar users
    :param candidate_movie_indices: numpy array of movie indices that the similar users have rated
    :param num_of_movies: number of movies to recommend
    :return: top ranked candidate movie indices, and their predicted ratings
    """
    
    # Compute ratings only for the candidate movies to save computation
    candidate_similar_users_ratings = np.take(similar_users_ratings, candidate_movie_indices, axis=1)

    # Subtract mean rating from each rating, (r_b,p - r_b)
    delta_ratings = candidate_similar_users_ratings - similar_users_ratings_mean.reshape(similar_users_ratings_mean.shape[0], 1)

    # Sum sim(a,b) * (r_b,p - r_b)
    numerator = np.sum(similarity_scores.reshape(-1, 1) * delta_ratings, axis=0)

    # Sum |sim(a,b)|, taking absolute value to avoid negative bias even though it should not be possible
    denominator = np.sum(np.abs(similarity_scores))

    # Sum sim(a,b) * (r_b,p - r_b) / Sum |sim(a,b)|
    bias = numerator / denominator

    # pred(a,p) = r_a + bias
    predicted_ratings = user_ratings_mean + bias

    # Rank the movies by predicted rating
    indices = np.flip(np.argsort(predicted_ratings))
    predicted_movie_indices = candidate_movie_indices[indices][:num_of_movies]
    sorted_ratings = predicted_ratings[indices][:num_of_movies]

    return predicted_movie_indices, sorted_ratings

In [20]:
def predict_movies_for_user(user_id, num_of_movies):
    similar_user_ids, similarity_scores = get_top_n_most_similar_users_euclidean(user_id, 50)
    
    user_ratings_mean = get_user_rating_mean(user_id)
    # print(user_ratings_mean)
    similar_users_ratings = np.nan_to_num(get_ratings_for_users(similar_user_ids))
    # print(similar_users_ratings)
    similar_users_ratings_mean = get_mean_ratings_for_users(similar_user_ids)
    # print(similar_users_ratings_mean)
    candidate_movie_ids = get_candidate_movie_ids(user_id, similar_user_ids)

    candidate_movie_indices = get_movie_indices_for_movie_ids(candidate_movie_ids)

    return predict_movies(user_ratings_mean, similar_users_ratings, similar_users_ratings_mean, similarity_scores, candidate_movie_indices, num_of_movies)

Group recommendation function to recommend next movie to watch for the group. We use average aggregation of the movie recommendations for the group of users since it gives better top results.

In [26]:
# Keep track of previously watched movies and ratings
watched_movies_ratings = pl.DataFrame(
    data={
        "movie_index": [],
        "rating": [],
        "userId": []
    },
    schema={
        "movie_index": pl.Int32,
        "rating": pl.Float32,
        "userId": pl.Int32
    }
)

In [22]:
def recommend_next_movie_to_watch(user_ids, least_satisfied_user_ids):
    """
    Average aggregated movie recommendations for a group of users.
    :return: movie ids 
    """    
    
    # Find movie recommendations for all users
    n_movies = 500
    predicted_movie_indices = np.ndarray(shape=(len(user_ids), n_movies), dtype=np.int32)
    predicted_ratings = np.ndarray(shape=(len(user_ids), n_movies), dtype=np.float32)
    
    for index, user_id in enumerate(user_ids):
        user_movie_indices, ratings = predict_movies_for_user(user_id, n_movies)
        predicted_movie_indices[index] = user_movie_indices
        predicted_ratings[index] = ratings
    
    # Combine the predictions into a dataframe
    predictions_df = (
        pl.DataFrame({
             "movie_index": predicted_movie_indices.reshape(-1),
             "predicted_rating": predicted_ratings.reshape(-1),
             "userId": np.array([[user_id] * n_movies for user_id in user_ids]).reshape(-1)
        })
        .unique()
        )
    
    # Find movies which have rating for all users in the group
    movie_ratings_count = (
        predictions_df
        .group_by("movie_index")
        .agg(
            pl.count("predicted_rating")
            .alias("ratings_count")
        )
        .filter(pl.col("ratings_count") >= 5)
        .sort("ratings_count", descending=True)
    )
    
    # filter predictions_df to only include movies that have been rated by all users in the group
    predictions_df = (
        predictions_df
        .filter(
            pl.col("movie_index")
            .is_in(movie_ratings_count.select("movie_index")
            )
        )
        .sort("movie_index")
        .pivot("movie_index", index="userId")
    )
    
    # Calculate the average rating for each movie for least satisfied users
    average_ratings_df = (
        predictions_df
        .filter(pl.col("userId").is_in(least_satisfied_user_ids))
        .drop("userId")
        .mean()
    )
    
    # find the top 10 movies based on the average rating
    recommended_movie_index= (
        average_ratings_df
        .transpose(include_header=True, header_name="index", column_names=["avg_rating"])
        .with_columns(
            pl.col("index")
            .cast(pl.Int32)
        )
        .sort("avg_rating", descending=True)
        .filter(pl.col("index").is_in((watched_movies_ratings.select("movie_index").unique().to_series().to_list())).not_())
        .limit(1)
        .join(movies_df, on="index", how="left")
        .select("index")
        .to_numpy()
        .reshape(-1)
        [0]
    )
    
    predicted_user_ratings = (
        predictions_df
        .sort("userId")
        .select([str(recommended_movie_index)])
        .to_numpy()
        .reshape(-1)
    )
    
    return recommended_movie_index, predicted_user_ratings

In [23]:
def set_movie_watched(movie_index, user_ratings, user_ids):
    new_rows = pl.DataFrame(
        data={
            "movie_index": [movie_index] * len(user_ids),
            "rating": user_ratings,
            "userId": user_ids
        },
        schema={
            "movie_index": pl.Int32,
            "rating": pl.Float32,
            "userId": pl.Int32
        }
    )
    
    watched_movies_ratings.extend(new_rows)

In [27]:
def find_least_satisfied_users():
    average_ratings_df = (
        # Find average scores of movies which have been watched by the group
        watched_movies_ratings
        .group_by("userId")
        .agg(
            pl.col("rating").mean().alias("avg_rating")
        )
        .sort("avg_rating", descending=True)
    )
    print(average_ratings_df)
    
    sorted_users = average_ratings_df.select("userId").to_numpy().reshape(-1)
    
    # Boundary user is the user who is boundary to the least satisfied subgroup of users. 
    boundary_user_id = (
        average_ratings_df
        # Calculate delta ratings
        .with_columns(
            next_rating=pl.col("avg_rating").shift(-1)
        )
        .drop_nulls()
        .with_columns(
            delta_rating=pl.col("avg_rating") - pl.col("next_rating")
        )
        # Sort by delta rating to find the boundary user
        .select("userId", "delta_rating")
        .sort("delta_rating", descending=True)
        .limit(1)
        .select("userId")
        .to_numpy()
        .reshape(-1)
        [0]
    )
    
    # Boundary user is left out of the subgroup
    least_satisfied_users = sorted_users[np.where(sorted_users == boundary_user_id)[0][0]+1:]
    
    return least_satisfied_users


In [28]:
# Create a group of users by picking the first 5 userIds
user_ids = [2, 3, 4, 5, 6]

least_satisfied_users = user_ids
for i in range(10):
    # get recommendations for the group of users
    movie_index, user_ratings = recommend_next_movie_to_watch(user_ids, least_satisfied_users)
    set_movie_watched(movie_index, user_ratings, user_ids)
    least_satisfied_users = find_least_satisfied_users()
    print(f"Least satisfied users after round {i+1}: {least_satisfied_users}")
    print()

shape: (5, 2)
┌────────┬────────────┐
│ userId ┆ avg_rating │
│ ---    ┆ ---        │
│ i32    ┆ f32        │
╞════════╪════════════╡
│ 4      ┆ 3.62391    │
│ 2      ┆ 3.041938   │
│ 3      ┆ 2.912777   │
│ 5      ┆ 2.342556   │
│ 6      ┆ 2.251716   │
└────────┴────────────┘
Least satisfied users after round 1: [2 3 5 6]

shape: (5, 2)
┌────────┬────────────┐
│ userId ┆ avg_rating │
│ ---    ┆ ---        │
│ i32    ┆ f32        │
╞════════╪════════════╡
│ 4      ┆ 3.691359   │
│ 2      ┆ 3.261549   │
│ 3      ┆ 2.618451   │
│ 5      ┆ 2.350739   │
│ 6      ┆ 2.101254   │
└────────┴────────────┘
Least satisfied users after round 2: [3 5 6]

shape: (5, 2)
┌────────┬────────────┐
│ userId ┆ avg_rating │
│ ---    ┆ ---        │
│ i32    ┆ f32        │
╞════════╪════════════╡
│ 4      ┆ 3.563083   │
│ 2      ┆ 2.988851   │
│ 3      ┆ 2.640081   │
│ 5      ┆ 2.207835   │
│ 6      ┆ 2.024376   │
└────────┴────────────┘
Least satisfied users after round 3: [2 3 5 6]

shape: (5, 2)
┌────────┬