# Project work part 2

In [None]:
import numpy as np
import polars as pl

# Load the four CSV files, pinning each column's dtype via an explicit schema
links_df = pl.read_csv(
    "../data/links.csv",
    schema={"movieId": pl.Int32, "imdb": pl.Int32, "tmdbId": pl.Int32},
)
movies_df = (
    pl.read_csv(
        "../data/movies.csv",
        schema={"movieId": pl.Int32, "title": pl.String, "genres": pl.String},
    )
    # Add a row-number column named "index" and store it as Int32
    .with_row_index()
    .with_columns(pl.col("index").cast(pl.Int32))
)
ratings_df = pl.read_csv(
    "../data/ratings.csv",
    schema={
        "userId": pl.Int32,
        "movieId": pl.Int32,
        "rating": pl.Float32,
        "timestamp": pl.Int32,
    },
)
tags_df = pl.read_csv(
    "../data/tags.csv",
    schema={
        "userId": pl.Int32,
        "movieId": pl.Int32,
        "tag": pl.String,
        "timestamp": pl.Int32,
    },
)

print("Number of ratings: ", ratings_df.height)

# Split the ratings chronologically into two evenly sized parts:
# the first half holds the oldest ratings, the second half the remaining rows.
ratings_df = ratings_df.sort("timestamp")
ratings_df1 = ratings_df.head(ratings_df.height // 2)
ratings_df2 = ratings_df.tail(ratings_df.height - ratings_df.height // 2)

#print(ratings_df1)
#print(ratings_df2)



Number of ratings:  100836


In [None]:
# NOTE(review): this list is rebound by the `user_ids, similarity_scores = ...`
# assignment at the end of this cell before anything in the cell reads it.
user_ids = [1, 2, 3, 4, 5]

from sklearn.metrics.pairwise import nan_euclidean_distances


def get_top_n_most_similar_users_euclidean(user_id, n, *, min_common_movies=5, include_self=False):
    """Return the ``n`` users most similar to ``user_id`` and their similarity scores.

    Similarity is derived from scikit-learn's ``nan_euclidean_distances`` between
    the users' rating vectors (built from the module-level ``ratings_df``). Each
    distance is divided by the number of movies both users rated, max-scaled,
    and turned into a similarity with ``1 - scaled_distance``.

    Args:
        user_id: The ``userId`` whose neighbours are looked up.
        n: Maximum number of neighbours to return.
        min_common_movies: Users sharing fewer rated movies than this with
            ``user_id`` get similarity 0. Values below 1 are treated as 1.
        include_self: When True, ``user_id`` itself is kept in the ranking
            (it has distance 0 to itself, so it ranks first). By default the
            target user is left out, since a user is not their own neighbour.

    Returns:
        A tuple ``(user_ids, similarities)`` of two 1-D NumPy arrays, ordered by
        descending similarity and holding at most ``n`` entries each.

    Raises:
        ValueError: If ``user_id`` has no ratings in ``ratings_df``.
    """
    # One row per user, one column per movie; the NaN-aware steps below rely on
    # unrated movies showing up as NaN in the NumPy matrices.
    rating_vectors_df = (
        ratings_df
        .drop("timestamp")
        .pivot("movieId", index="userId")
    )

    user_ratings = (
        rating_vectors_df
        .filter(pl.col("userId") == user_id)
        .drop("userId")
        .to_numpy()
    )
    if user_ratings.shape[0] == 0:
        # Fail early with a clear message instead of an opaque error from the
        # distance computation on a zero-row query matrix.
        raise ValueError(f"userId {user_id!r} has no ratings in ratings_df")

    rating_vectors = rating_vectors_df.drop("userId").to_numpy()

    # Distance from every user to the target user
    distances = nan_euclidean_distances(rating_vectors, user_ratings).reshape(-1)

    # Count, per user, the movies rated by both that user and the target user
    rated_by_both = ~(np.isnan(user_ratings[0]) | np.isnan(rating_vectors))
    common_rated_movies_count = rated_by_both.sum(axis=1).astype(np.float32)

    # Ignore users with too few common movies (NaN propagates through the division)
    threshold = max(min_common_movies, 1)
    common_rated_movies_count[common_rated_movies_count < threshold] = np.nan

    # Average distance per commonly rated movie
    average_distances = distances / common_rated_movies_count

    # Max scaling, guarded against "no eligible user" and "all distances are 0"
    comparable = ~np.isnan(average_distances)
    max_average_distance = average_distances[comparable].max() if comparable.any() else 0.0

    # Distance to similarity. NaN values are set to zero
    if max_average_distance > 0:
        similarities = np.nan_to_num(1 - (average_distances / max_average_distance))
    else:
        # Every eligible user is at distance 0, i.e. maximally similar
        similarities = np.where(comparable, 1.0, 0.0)

    distance_df = pl.DataFrame(
        {
            "userId": rating_vectors_df["userId"],
            "similarity": similarities,
        }
    )
    if not include_self:
        distance_df = distance_df.filter(pl.col("userId") != user_id)
    distance_df = distance_df.sort("similarity", descending=True).limit(n)

    return (
        distance_df
        .select("userId")
        .to_numpy()
        .reshape(-1)
    ), (
        distance_df
        .select("similarity")
        .to_numpy()
        .reshape(-1)
    )

# Look up the 50 users most similar to userId 1, then show their ids and scores
user_ids, similarity_scores = get_top_n_most_similar_users_euclidean(user_id=1, n=50)
print(user_ids)
print(similarity_scores)












shape: (5, 2)
┌────────┬────────┐
│ userId ┆ rating │
│ ---    ┆ ---    │
│ i64    ┆ f64    │
╞════════╪════════╡
│ 1      ┆ 0.0    │
│ 2      ┆ 0.0    │
│ 3      ┆ 0.0    │
│ 4      ┆ 0.0    │
│ 5      ┆ 0.0    │
└────────┴────────┘
