# Project work part 4

Copy necessary parts from part 1, 2 and 3.

To run the code install all imported packages and run all cells in order.

In [49]:
import numpy as np
import polars as pl

# read data
links_df = pl.read_csv("../data/links.csv", schema={"movieId": pl.Int32, "imdb": pl.Int32, "tmdbId": pl.Int32})
movies_df = (
    pl.read_csv("../data/movies.csv", schema={"movieId": pl.Int32, "title": pl.String, "genres": pl.String})
    .with_row_index()
    .with_columns(
        pl.col("index")
        .cast(pl.Int32)
    )
)
ratings_df = pl.read_csv("../data/ratings.csv", schema={"userId": pl.Int32, "movieId": pl.Int32, "rating": pl.Float32, "timestamp": pl.Int32})

To produce counterfactual explanations we define helper functions in order to modify user histories. When this function is used and the model is retrained, the recommendations might change. Counterfactual recommendations can then be derived by comparing the recommendations before and after the changing of group history. 

In [76]:
counterfactual_explanation_ratings_df = ratings_df

def remove_movie_from_user_histories(user_ids, movie_id):
    """
    Removes a movie from the user histories.
    
    :param user_ids: list of user ids 
    :param movie_id: movie to remove
    :return: 
    """
    
    global counterfactual_explanation_ratings_df
    
    for user_id in user_ids:
        counterfactual_explanation_ratings_df = (
            ratings_df
            .filter(
                (pl.col("userId") != user_id) | (pl.col("movieId") != movie_id)
            )
        ) 
        
    affected_users = (
        ratings_df
        .filter(pl.col("userId").is_in(user_ids) & (pl.col("movieId") == movie_id))
        .select("userId")
        .to_numpy()
        .reshape(-1)
        .tolist()
    )
    
    movie_title = (
        movies_df
        .filter(pl.col("movieId") == movie_id)
        .select("title")
        .to_numpy()
        .reshape(-1)
        [0]
    )
    
    print(f"Removed movie '{movie_title}' with id {movie_id} from user histories.")
    print(f"Affected users: {affected_users}")
        
def reset_ratings():
    """
    Resets the ratings to the original state.
    """
    
    global counterfactual_explanation_ratings_df
    counterfactual_explanation_ratings_df = ratings_df

More stuff copied from part 1, 2 and 3.

We remove the diversity score calculation and sequential recommendation functions in order to focus solely on the counterfactual explanations.

In [53]:
from sklearn.metrics.pairwise import nan_euclidean_distances

def get_top_n_most_similar_users_euclidean(user_id, n):
    rating_vectors_df = (
        counterfactual_explanation_ratings_df
        .drop("timestamp")
        .pivot("movieId", index="userId")
    )
    
    user_ratings = rating_vectors_df.filter(pl.col("userId") == user_id).drop("userId").to_numpy()
    
    rating_vectors = (
        rating_vectors_df
        .drop("userId")
        .to_numpy()
    )
    
    # Calculate distance
    distances = nan_euclidean_distances(rating_vectors, user_ratings).reshape(-1)
    
    # Find nan values and count the amount of common movies between user and all users
    nans_in_user_ratings = np.isnan(user_ratings[0])
    nans_in_rating_vectors = np.isnan(rating_vectors)
    common_rated_movies = 1 - np.logical_or(nans_in_user_ratings, nans_in_rating_vectors)
    common_rated_movies_count = np.sum(common_rated_movies, axis=1).astype(np.float32) 
    
    # Ignore users withe less than 5 common movies
    common_rated_movies_count[common_rated_movies_count < 5] = np.nan 
    
    # Count average distances
    average_distances = distances / common_rated_movies_count
    
    # Max scaling
    max_average_distance = np.nanmax(average_distances)
    
    # Distance to similarity. Nan values are set to zero
    similarities = np.nan_to_num(1 - (average_distances / max_average_distance))
    
    distance_df = (
        pl.DataFrame(
            {
                "userId": rating_vectors_df.select("userId"),
                "similarity": similarities
            }
        )
        .sort("similarity", descending=True)
        .limit(n)
    )
    
    return (
        distance_df
        .select("userId")
        .to_numpy()
        .reshape(-1)
    ), (
        distance_df
        .select("similarity")
        .to_numpy()
        .reshape(-1)
    )

In [54]:
# Define some helper functions to extract data from the dataframes

# ratings
def get_ratings_for_users(user_ids):
    return (
        counterfactual_explanation_ratings_df
        .drop("timestamp")
        .sort("movieId")
        .pivot("userId", index="movieId")
        .drop("movieId")
        .select([str(user_id) for user_id in user_ids])
        .transpose(include_header=True, header_name="userId", column_names=user_ids)
        .drop("userId")
        .to_numpy()
    )

# ratings mean
def get_mean_ratings_for_users(user_ids):
    return (
        counterfactual_explanation_ratings_df
        .drop("timestamp")
        .sort("movieId")
        .pivot("userId", index="movieId")
        .drop("movieId")
        .select([str(user_id) for user_id in user_ids.tolist()])
        .select(pl.all().mean())
        .transpose(include_header=True, header_name="userId", column_names=["ratings_mean"])
        .cast({"userId": pl.Int16})
        .filter(pl.col("userId").is_in(user_ids))
        .select("ratings_mean")
        .to_numpy()
        .reshape(-1)
    )

# movie indices for movie ids
def get_movie_indices_for_movie_ids(movie_ids):
    return (
        counterfactual_explanation_ratings_df
        .select("movieId")
        .unique()
        .sort("movieId")
        .with_row_index()
        .filter(pl.col("movieId").is_in(movie_ids))
        .select("index")
        .to_numpy()
        .reshape(-1)
    )

# single user rating mean
def get_user_rating_mean(user_id):
    return get_mean_ratings_for_users(np.array([user_id]))[0]

# candidate movie ids
def get_candidate_movie_ids(user_id, similar_user_ids):
    rated_by_user = (
        counterfactual_explanation_ratings_df
        .filter(pl.col("userId") == user_id)
        .select("movieId")
        .to_numpy()
        .reshape(-1)
    )

    return (
        counterfactual_explanation_ratings_df
        .filter(
            pl.col("userId").is_in(similar_user_ids), # only include ratings from similar users
            pl.col("movieId").is_in(rated_by_user).not_() # only include movies that the user has not rated
        )
        .group_by("movieId")
        .agg()
        .select("movieId")
        .to_numpy()
        .reshape(-1)
    )

In [55]:
def predict_movies(user_ratings_mean, similar_users_ratings, similar_users_ratings_mean, similarity_scores, candidate_movie_indices, num_of_movies):
    """
    The prediction function from course slides.

    :param user_ratings_mean: mean of user ratings
    :param similar_users_ratings: sparse numpy 2D matrix of similar users ratings. Each row is a user and each column is a movie
    :param similar_users_ratings_mean: numpy array of mean ratings of the similar users
    :param similarity_scores: numpy array of similarity scores between the user and the similar users
    :param candidate_movie_indices: numpy array of movie indices that the similar users have rated
    :param num_of_movies: number of movies to recommend
    :return: top ranked candidate movie indices, and their predicted ratings
    """
    
    # Compute ratings only for the candidate movies to save computation
    candidate_similar_users_ratings = np.take(similar_users_ratings, candidate_movie_indices, axis=1)

    # Subtract mean rating from each rating, (r_b,p - r_b)
    delta_ratings = candidate_similar_users_ratings - similar_users_ratings_mean.reshape(similar_users_ratings_mean.shape[0], 1)

    # Sum sim(a,b) * (r_b,p - r_b)
    numerator = np.sum(similarity_scores.reshape(-1, 1) * delta_ratings, axis=0)

    # Sum |sim(a,b)|, taking absolute value to avoid negative bias even though it should not be possible
    denominator = np.sum(np.abs(similarity_scores))

    # Sum sim(a,b) * (r_b,p - r_b) / Sum |sim(a,b)|
    bias = numerator / denominator

    # pred(a,p) = r_a + bias
    predicted_ratings = user_ratings_mean + bias

    # Rank the movies by predicted rating
    indices = np.flip(np.argsort(predicted_ratings))
    predicted_movie_indices = candidate_movie_indices[indices][:num_of_movies]
    sorted_ratings = predicted_ratings[indices][:num_of_movies]

    return predicted_movie_indices, sorted_ratings

In [56]:
def predict_movies_for_user(user_id, num_of_movies):
    similar_user_ids, similarity_scores = get_top_n_most_similar_users_euclidean(user_id, 50)
    user_ratings_mean = get_user_rating_mean(user_id)
    similar_users_ratings = np.nan_to_num(get_ratings_for_users(similar_user_ids))
    similar_users_ratings_mean = get_mean_ratings_for_users(similar_user_ids)
    candidate_movie_ids = get_candidate_movie_ids(user_id, similar_user_ids)
    candidate_movie_indices = get_movie_indices_for_movie_ids(candidate_movie_ids)

    return predict_movies(user_ratings_mean, similar_users_ratings, similar_users_ratings_mean, similarity_scores, candidate_movie_indices, num_of_movies)

In [61]:
def recommend_next_movies_to_watch(user_ids):
    """
    Average aggregated movie recommendations for a group of users taking diversity into account.
    
    :param user_ids: group of users 
    :return: movie indices, ratings, and genres of the recommended movies
    """
    
    # Find movie recommendations for all users
    n_movies = 100
    predicted_movie_indices = np.ndarray(shape=(len(user_ids), n_movies), dtype=np.int32)
    predicted_ratings = np.ndarray(shape=(len(user_ids), n_movies), dtype=np.float32)
    
    for index, user_id in enumerate(user_ids):
        user_movie_indices, ratings = predict_movies_for_user(user_id, n_movies)
        predicted_movie_indices[index] = user_movie_indices
        predicted_ratings[index] = ratings
    
    # Combine the predictions into a dataframe
    predictions_df = (
        pl.DataFrame({
             "movie_index": predicted_movie_indices.reshape(-1),
             "predicted_rating": predicted_ratings.reshape(-1),
             "userId": np.array([[user_id] * n_movies for user_id in user_ids]).reshape(-1)
        })
        .unique()
        )
    
    # Find movies which have rating for all users in the group
    movie_ratings_count = (
        predictions_df
        .group_by("movie_index")
        .agg(
            pl.count("predicted_rating")
            .alias("ratings_count")
        )
        .filter(pl.col("ratings_count") >= 5)
        .sort("ratings_count", descending=True)
    )
    
    # filter predictions_df to only include movies that have been rated by all users in the group
    predictions_df = (
        predictions_df
        .filter(
            pl.col("movie_index")
            .is_in(movie_ratings_count.select("movie_index")
            )
        )
        .sort("movie_index")
        .pivot("movie_index", index="userId")
    )
    
    # Calculate the average rating for each movie for least satisfied users
    average_ratings_df = (
        predictions_df
        .filter(pl.col("userId").is_in(user_ids))
        .drop("userId")
        .mean()
    )
    
    # find the top 30 candidate movies to predict based on the average rating of the subgroup
    candidate_movie_indices = (
        average_ratings_df
        .transpose(include_header=True, header_name="index", column_names=["avg_rating"])
        .with_columns(
            pl.col("index")
            .cast(pl.Int32)
        )
        .sort("avg_rating", descending=True)
        .limit(30)
        .join(movies_df, on="index", how="left")
        .select("index")
        .to_numpy()
        .reshape(-1)
    )
    
    # Find genres of the candidate movies
    movie_genres = (
        movies_df
        .filter(pl.col("index").is_in(candidate_movie_indices))
        .select(["movieId", "genres"])
        .with_columns(genres=pl.col("genres").str.split("|"))
        .explode("genres")
    )
    
    # Keep track of recommended genres
    recommended_genres = set()
    
    # Split the candidate movies into two lists: recommended and discarded
    discarded_movie_ids = []
    recommended_movie_ids = []
    
    candidate_movie_ids = movie_genres.select("movieId").unique().to_numpy().reshape(-1)
    for movie_id in candidate_movie_ids:
        # Check if the movie has a genre that is not yet in the recommended list
        genres = movie_genres.filter(pl.col("movieId") == movie_id).select("genres").to_numpy().reshape(-1)
        len_before = len(recommended_genres)
        recommended_genres.update(genres)
        len_after = len(recommended_genres)
        if len_before < len_after:
            recommended_movie_ids.append(movie_id)
        else:
            discarded_movie_ids.append(movie_id)
            
    # Fill the list with the best average scored movies in the discarded list of movies
    if len(recommended_movie_ids) < 5:
        recommended_movie_ids.extend(discarded_movie_ids[:5-len(recommended_movie_ids)])
    
    if len(recommended_movie_ids) > 5:
        recommended_movie_ids = recommended_movie_ids[:5]
    
    # Update recommended_genres to contain only the genres of the recommended movies
    recommended_genres = movie_genres.filter(pl.col("movieId").is_in(recommended_movie_ids)).select("genres").unique().to_numpy().reshape(-1)

    # Movie ids back to indices 
    recommended_movie_indices = (
        movies_df
        .filter(pl.col("movieId").is_in(recommended_movie_ids))
        .select("index")
        .to_numpy()
        .reshape(-1)
    )
    
    # Find predicted ratings for the recommended movie for all users
    predicted_user_ratings = (
        predictions_df
        .sort("userId")
        .select([str(x) for x in recommended_movie_indices])
        .to_numpy()
        .T
    )
    
    return recommended_movie_indices, predicted_user_ratings, recommended_genres

To produce counterfactual explanations we need to have the user histories to play with. Let's find group history of rated movies.

In [58]:
# Create a group of users by picking the first 5 userIds
user_ids = [1, 2, 3, 4, 5]

group_history = (
    ratings_df
    .filter(pl.col("userId").is_in(user_ids))
    .drop("timestamp")
    .group_by("movieId")
    .agg(pl.len().alias("count"), pl.col("userId"), pl.col("rating"))
    .sort("count", descending=True)
)

print(group_history)

top_common_movies = (
    group_history
    .select("movieId")
    .limit(10)
    .to_numpy()
    .reshape(-1)
)

print(top_common_movies)

shape: (481, 4)
┌─────────┬───────┬───────────┬─────────────────┐
│ movieId ┆ count ┆ userId    ┆ rating          │
│ ---     ┆ ---   ┆ ---       ┆ ---             │
│ i32     ┆ u32   ┆ list[i32] ┆ list[f32]       │
╞═════════╪═══════╪═══════════╪═════════════════╡
│ 296     ┆ 3     ┆ [1, 4, 5] ┆ [3.0, 1.0, 5.0] │
│ 457     ┆ 3     ┆ [1, 4, 5] ┆ [5.0, 5.0, 4.0] │
│ 527     ┆ 3     ┆ [1, 3, 5] ┆ [5.0, 0.5, 5.0] │
│ 608     ┆ 3     ┆ [1, 4, 5] ┆ [5.0, 5.0, 3.0] │
│ 919     ┆ 2     ┆ [1, 4]    ┆ [5.0, 5.0]      │
│ …       ┆ …     ┆ …         ┆ …               │
│ 2019    ┆ 1     ┆ [4]       ┆ [2.0]           │
│ 3793    ┆ 1     ┆ [1]       ┆ [5.0]           │
│ 7991    ┆ 1     ┆ [3]       ┆ [5.0]           │
│ 91529   ┆ 1     ┆ [2]       ┆ [3.5]           │
│ 923     ┆ 1     ┆ [1]       ┆ [5.0]           │
└─────────┴───────┴───────────┴─────────────────┘
[ 296  457  527  608  919 1291  232  357  235  260]


Let's test our method by finding out if removing a movie from the group history affects the recommendations. We recommend 5 movies for a group of 5 users.

We take the top 10 most common movies from the user histories and remove them one by one. We then check if the recommendations change. The first round is the original recommendations. The rating history is reset after each round. 

In [80]:
original_recommendations = None
for i in range(11):
    print(f"Round {i}")
    
    if i > 0:
        # Remove movie from user histories
        remove_movie_from_user_histories(user_ids, top_common_movies[i-1])
        print()
        
    # get recommendations for the group of users
    movie_indices, user_ratings, recommended_genres = recommend_next_movies_to_watch(user_ids)
    movie_ids = (
        movies_df
        .filter(pl.col("index").is_in(movie_indices))
        .select("movieId")
        .to_numpy()
        .reshape(-1)
        .tolist()
    )
    
    if i == 0:
        print("Original recommendations")
        original_recommendations = movie_ids
    
    # reset user histories
    reset_ratings()
    
    # get recommended movie titles
    recommended_movie_titles = (
        movies_df
        .filter(pl.col("index").is_in(movie_indices))
        .drop("index")
    )
    
    print(f"Recommended movies: {recommended_movie_titles}")
    # print(f"Recommended movie ids: {movie_ids}")
    # print(f"Original movie ids: {original_recommendations}")
    if i > 0:
        not_recommended_ids = list(set(original_recommendations).difference(set(movie_ids)))
        if len(not_recommended_ids) == 0:
            print(f"Removing movie id {top_common_movies[i-1]} did not affect the recommendations.")
        else:
            not_recommended_movie_titles = (
                movies_df
                .filter(pl.col("movieId").is_in(not_recommended_ids))
                .drop("index")
            )
            print(f"Movies not recommended {not_recommended_movie_titles}")
            print()
            print(f"Removing movie id {top_common_movies[i-1]} from the group history caused movie ids {not_recommended_ids} not to be recommended.")
        
    print()

Round 0
Original recommendations
Recommended movies: shape: (5, 3)
┌─────────┬─────────────────────────────────┬─────────────────────────────┐
│ movieId ┆ title                           ┆ genres                      │
│ ---     ┆ ---                             ┆ ---                         │
│ i32     ┆ str                             ┆ str                         │
╞═════════╪═════════════════════════════════╪═════════════════════════════╡
│ 293     ┆ Léon: The Professional (a.k.a.… ┆ Action|Crime|Drama|Thriller │
│ 1199    ┆ Brazil (1985)                   ┆ Fantasy|Sci-Fi              │
│ 1220    ┆ Blues Brothers, The (1980)      ┆ Action|Comedy|Musical       │
│ 4298    ┆ Rififi (Du rififi chez les hom… ┆ Crime|Film-Noir|Thriller    │
│ 7386    ┆ Ten Commandments, The (1956)    ┆ Adventure|Drama             │
└─────────┴─────────────────────────────────┴─────────────────────────────┘

Round 1
Removed movie 'Pulp Fiction (1994)' with id 296 from user histories.
Affected users: [1,

From our experimenatation we can see that removing a movie from the group history can affect the recommendations alot. For example in round 2 removing movie 'Fugitive, The (1993)' with id 457 from user histories caused movie ids 7386, 1220, 293 and 1193 not to be recommended. This is a significant change in the recommendations. This change is likely due to the fact that 3 of 5 users had watched the movie which is a majority of the group. This change affected the found similar users for these users and thus the collaborative filtering based recommendation changed. It is worth noting that the only movie (id 4298) that still stayed in the recommendations list rose from number 4 to 3. 