# Project work part 2

In [45]:
import numpy as np
import polars as pl

# read data
links_df = pl.read_csv("../data/links.csv", schema={"movieId": pl.Int32, "imdb": pl.Int32, "tmdbId": pl.Int32})
movies_df = (
    pl.read_csv("../data/movies.csv", schema={"movieId": pl.Int32, "title": pl.String, "genres": pl.String})
    .with_row_index()
    .with_columns(
        pl.col("index")
        .cast(pl.Int32)
    )
)
ratings_df = pl.read_csv("../data/ratings.csv", schema={"userId": pl.Int32, "movieId": pl.Int32, "rating": pl.Float32, "timestamp": pl.Int32})
tags_df = pl.read_csv("../data/tags.csv", schema={"userId": pl.Int32, "movieId": pl.Int32, "tag": pl.String, "timestamp": pl.Int32})

print("Number of ratings: ", ratings_df.height)

# Divide the dataset to two parts based on the timestamp 
# January 1, 1, 2016, 00:00:00
ratings_df_before = (
    ratings_df
    .with_columns(
        rating = pl.when(pl.col("timestamp") >= 1451599200).then(None).otherwise(pl.col("rating")),
        timestamp = pl.when(pl.col("timestamp") >= 1451599200).then(None).otherwise(pl.col("timestamp"))
    )
)

print("Number of ratings in before:", ratings_df_before.height)

Number of ratings:  100836
Number of ratings in before: 100836


In [39]:
print(ratings_df)
print(movies_df)

shape: (100_836, 4)
┌────────┬─────────┬────────┬────────────┐
│ userId ┆ movieId ┆ rating ┆ timestamp  │
│ ---    ┆ ---     ┆ ---    ┆ ---        │
│ i32    ┆ i32     ┆ f32    ┆ i32        │
╞════════╪═════════╪════════╪════════════╡
│ 1      ┆ 1       ┆ 4.0    ┆ 964982703  │
│ 1      ┆ 3       ┆ 4.0    ┆ 964981247  │
│ 1      ┆ 6       ┆ 4.0    ┆ 964982224  │
│ 1      ┆ 47      ┆ 5.0    ┆ 964983815  │
│ 1      ┆ 50      ┆ 5.0    ┆ 964982931  │
│ …      ┆ …       ┆ …      ┆ …          │
│ 610    ┆ 166534  ┆ 4.0    ┆ 1493848402 │
│ 610    ┆ 168248  ┆ 5.0    ┆ 1493850091 │
│ 610    ┆ 168250  ┆ 5.0    ┆ 1494273047 │
│ 610    ┆ 168252  ┆ 5.0    ┆ 1493846352 │
│ 610    ┆ 170875  ┆ 3.0    ┆ 1493846415 │
└────────┴─────────┴────────┴────────────┘
shape: (9_742, 4)
┌───────┬─────────┬─────────────────────────────────┬─────────────────────────────────┐
│ index ┆ movieId ┆ title                           ┆ genres                          │
│ ---   ┆ ---     ┆ ---                             ┆ -

In [40]:
from sklearn.metrics.pairwise import nan_euclidean_distances

def get_top_n_most_similar_users_euclidean(user_id, n):
    rating_vectors_df = (
        ratings_df_before
        .drop("timestamp")
        .pivot("movieId", index="userId")
    )
    
    user_ratings = rating_vectors_df.filter(pl.col("userId") == user_id).drop("userId").to_numpy()
    
    rating_vectors = (
        rating_vectors_df
        .drop("userId")
        .to_numpy()
    )
    
    # Calculate distance
    distances = nan_euclidean_distances(rating_vectors, user_ratings).reshape(-1)
    
    # Find nan values and count the amount of common movies between user and all users
    nans_in_user_ratings = np.isnan(user_ratings[0])
    nans_in_rating_vectors = np.isnan(rating_vectors)
    common_rated_movies = 1 - np.logical_or(nans_in_user_ratings, nans_in_rating_vectors)
    common_rated_movies_count = np.sum(common_rated_movies, axis=1).astype(np.float32) 
    
    # Ignore users withe less than 5 common movies
    common_rated_movies_count[common_rated_movies_count < 5] = np.nan 
    
    # Count average distances
    average_distances = distances / common_rated_movies_count
    
    # Max scaling
    max_average_distance = np.nanmax(average_distances)
    
    # Distance to similarity. Nan values are set to zero
    similarities = np.nan_to_num(1 - (average_distances / max_average_distance))
    
    distance_df = (
        pl.DataFrame(
            {
                "userId": rating_vectors_df.select("userId"),
                "similarity": similarities
            }
        )
        .sort("similarity", descending=True)
        .limit(n)
    )
    
    return (
        distance_df
        .select("userId")
        .to_numpy()
        .reshape(-1)
    ), (
        distance_df
        .select("similarity")
        .to_numpy()
        .reshape(-1)
    )

In [41]:
# Define some helper functions to extract data from the dataframes

# ratings
def get_ratings_for_users(user_ids):
    return (
        ratings_df_before
        .drop("timestamp")
        .sort("movieId")
        .pivot("userId", index="movieId")
        .drop("movieId")
        .select([str(user_id) for user_id in user_ids])
        .transpose(include_header=True, header_name="userId", column_names=user_ids)
        .drop("userId")
        .to_numpy()
    )

print(get_ratings_for_users(np.array([1, 2, 3])))


# ratings mean
def get_mean_ratings_for_users(user_ids):
    return (
        ratings_df_before
        .drop("timestamp")
        .sort("movieId")
        .pivot("userId", index="movieId")
        .drop("movieId")
        .select([str(user_id) for user_id in user_ids.tolist()])
        .select(pl.all().mean())
        .transpose(include_header=True, header_name="userId", column_names=["ratings_mean"])
        .cast({"userId": pl.Int16})
        .filter(pl.col("userId").is_in(user_ids))
        .select("ratings_mean")
        .to_numpy()
        .reshape(-1)
    )

# movie indices for movie ids
def get_movie_indices_for_movie_ids(movie_ids):
    return (
        ratings_df
        .select("movieId")
        .unique()
        .sort("movieId")
        .with_row_index()
        .filter(pl.col("movieId").is_in(movie_ids))
        .select("index")
        .to_numpy()
        .reshape(-1)
    )

# single user rating mean
def get_user_rating_mean(user_id):
    return get_mean_ratings_for_users(np.array([user_id]))[0]

# candidate movie ids
def get_candidate_movie_ids(user_id, similar_user_ids):
    rated_by_user = (
        ratings_df_before
        .filter(pl.col("userId") == user_id)
        .select("movieId")
        .to_numpy()
        .reshape(-1)
    )

    return (
        ratings_df_before
        .filter(
            pl.col("userId").is_in(similar_user_ids), # only include ratings from similar users
            pl.col("movieId").is_in(rated_by_user).not_() # only include movies that the user has not rated
        )
        .group_by("movieId")
        .agg()
        .select("movieId")
        .to_numpy()
        .reshape(-1)
    )

[[ 4. nan  4. ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]


In [42]:
def predict_movies(user_ratings_mean, similar_users_ratings, similar_users_ratings_mean, similarity_scores, candidate_movie_indices, num_of_movies):
    """
    The prediction function from course slides.

    :param user_ratings_mean: mean of user ratings
    :param similar_users_ratings: sparse numpy 2D matrix of similar users ratings. Each row is a user and each column is a movie
    :param similar_users_ratings_mean: numpy array of mean ratings of the similar users
    :param similarity_scores: numpy array of similarity scores between the user and the similar users
    :param candidate_movie_indices: numpy array of movie indices that the similar users have rated
    :param num_of_movies: number of movies to recommend
    :return: top ranked candidate movie indices, and their predicted ratings
    """
    
    # Compute ratings only for the candidate movies to save computation
    candidate_similar_users_ratings = np.take(similar_users_ratings, candidate_movie_indices, axis=1)

    # Subtract mean rating from each rating, (r_b,p - r_b)
    delta_ratings = candidate_similar_users_ratings - similar_users_ratings_mean.reshape(similar_users_ratings_mean.shape[0], 1)

    # Sum sim(a,b) * (r_b,p - r_b)
    numerator = np.sum(similarity_scores.reshape(-1, 1) * delta_ratings, axis=0)

    # Sum |sim(a,b)|, taking absolute value to avoid negative bias even though it should not be possible
    denominator = np.sum(np.abs(similarity_scores))

    # Sum sim(a,b) * (r_b,p - r_b) / Sum |sim(a,b)|
    bias = numerator / denominator

    # pred(a,p) = r_a + bias
    predicted_ratings = user_ratings_mean + bias

    # Rank the movies by predicted rating
    indices = np.flip(np.argsort(predicted_ratings))
    predicted_movie_indices = candidate_movie_indices[indices][:num_of_movies]
    sorted_ratings = predicted_ratings[indices][:num_of_movies]

    return predicted_movie_indices, sorted_ratings

In [43]:
def predict_movies_for_user(user_id, num_of_movies):
    similar_user_ids, similarity_scores = get_top_n_most_similar_users_euclidean(user_id, 50)
    
    user_ratings_mean = get_user_rating_mean(user_id)
    # print(user_ratings_mean)
    similar_users_ratings = np.nan_to_num(get_ratings_for_users(similar_user_ids))
    # print(similar_users_ratings)
    similar_users_ratings_mean = get_mean_ratings_for_users(similar_user_ids)
    # print(similar_users_ratings_mean)
    candidate_movie_ids = get_candidate_movie_ids(user_id, similar_user_ids)

    candidate_movie_indices = get_movie_indices_for_movie_ids(candidate_movie_ids)

    return predict_movies(user_ratings_mean, similar_users_ratings, similar_users_ratings_mean, similarity_scores, candidate_movie_indices, num_of_movies)

In [44]:
# Predict movies for user 1

predicted_movie_indices, predicted_ratings = predict_movies_for_user(1, 10)

# Get movie titles
predicted_movie_titles = (
    movies_df
    .filter(pl.col("index").is_in(predicted_movie_indices))
    .select("title")
    .to_numpy()
    .reshape(-1)
)

print("Predicted movie titles for user 1:")
print(predicted_movie_titles)



Predicted movie titles for user 1:
['Twelve Monkeys (a.k.a. 12 Monkeys) (1995)'
 'Shawshank Redemption, The (1994)' 'Blade Runner (1982)'
 'Terminator 2: Judgment Day (1991)' 'Godfather, The (1972)'
 'Die Hard (1988)' "Fathers' Day (1997)" 'Iron Giant, The (1999)'
 'Body Heat (1981)' 'White Water Summer (1987)']
