# Project Work Part 1

##### a)

In [32]:
import polars as pl

# read data
links_df = pl.read_csv("../data/links.csv", schema={"movieId": pl.Int32, "imdb": pl.Int32, "tmdbId": pl.Int32})
movies_df = pl.read_csv("../data/movies.csv", schema={"movieId": pl.Int32, "title": pl.String, "genres": pl.String})
ratings_df = pl.read_csv("../data/ratings.csv", schema={"userId": pl.Int32, "movieId": pl.Int32, "rating": pl.Float32, "timestamp": pl.Int32})
tags_df = pl.read_csv("../data/tags.csv", schema={"userId": pl.Int32, "movieId": pl.Int32, "tag": pl.String, "timestamp": pl.Int32})

print("Number of ratings: ", ratings_df.height)

Number of ratings:  100836


In [21]:
links_df.head()

movieId,imdb,tmdbId
i32,i32,i32
1,114709,862
2,113497,8844
3,113228,15602
4,114885,31357
5,113041,11862


In [22]:
movies_df.head()

movieId,title,genres
i32,str,str
1,"""Toy Story (1995)""","""Adventure|Animation|Children|C…"
2,"""Jumanji (1995)""","""Adventure|Children|Fantasy"""
3,"""Grumpier Old Men (1995)""","""Comedy|Romance"""
4,"""Waiting to Exhale (1995)""","""Comedy|Drama|Romance"""
5,"""Father of the Bride Part II (1…","""Comedy"""


In [23]:
ratings_df.head()

userId,movieId,rating,timestamp
i32,i32,f32,i32
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931


In [24]:
tags_df.head()

userId,movieId,tag,timestamp
i32,i32,str,i32
2,60756,"""funny""",1445714994
2,60756,"""Highly quotable""",1445714996
2,60756,"""will ferrell""",1445714992
2,89774,"""Boxing story""",1445715207
2,89774,"""MMA""",1445715200


##### b)

In [112]:
import scipy as sp
import numpy as np


#ratings_user_1 = ratings_df.filter(pl.col("userId") <= 2).sort(["userId", "movieId"])
#ratings_user_1

# create vectors of ratings for each user filling missing values with 0
ratings_matrix = ratings_df.drop("timestamp").pivot("movieId", index="userId")

# take values as numpy matrix and fill nan values with 0
ratings_matrix = np.nan_to_num(ratings_matrix)
ratings_matrix = ratings_matrix.astype(np.float32)

# Check that the matrix has the correct shape
print('Amount of users:', ratings_df.select("userId").n_unique())
print('Amount of movies:', movies_df.height)
print('Shape of the ratings matrix', ratings_matrix.shape)

# There seems to be few movies with no ratings

Amount of users: 610
Amount of movies: 9742
Shape of the ratings matrix (610, 9725)


In [113]:
# Take first user as a query user
query_user = ratings_matrix[0, :]

# Calculate pearson correlation between query user and all other users
correlations = np.apply_along_axis(lambda x: sp.stats.pearsonr(query_user, x)[0], 1, ratings_matrix)
correlations

array([0.99999992, 0.02054232, 0.05445295, 0.17740924, 0.121138  ,
       0.10520456, 0.14406792, 0.12661765, 0.0564219 , 0.00287683,
       0.11978652, 0.01295446, 0.07999541, 0.09735831, 0.14226418,
       0.14983032, 0.24035334, 0.18632076, 0.29343477, 0.13807823,
       0.12484106, 0.03804229, 0.08624568, 0.12730217, 0.07447641,
       0.06184138, 0.19816509, 0.16731617, 0.11407638, 0.07237451,
       0.11558805, 0.11120556, 0.12012325, 0.05991513, 0.04548557,
       0.04317944, 0.07669367, 0.0892666 , 0.24167046, 0.06645147,
       0.07791727, 0.21008516, 0.09003302, 0.06141142, 0.2735247 ,
       0.06225817, 0.03965395, 0.05174323, 0.04767906, 0.06789644,
       0.09151871, 0.03198276, 0.034361  , 0.05086542, 0.02951776,
       0.06535052, 0.26730701, 0.09607459, 0.13165022, 0.02994366,
       0.05399139, 0.07222425, 0.1468491 , 0.24140243, 0.03190201,
       0.16736853, 0.05053994, 0.20319876, 0.07057356, 0.04153149,
       0.0401237 , 0.08600496, 0.02775419, 0.03727638, 0.05999

In [126]:
# get top 10 indexes
top_10_indexes = np.argsort(correlations)[-11:-1]
print('top 10 similar indexes', top_10_indexes)
print()

# get top 10 similar users
top_10_similar_users = ratings_matrix[top_10_indexes, :]
print('top 10 similar user rating vectors:')
print(top_10_similar_users)

top 10 similar indexes [26 67 41 90 16 63 38 56 44 18]

top 10 similar user rating vectors:
[[27.   3.   0.  ...  0.   0.   0. ]
 [68.   2.5  2.  ...  0.   0.   0. ]
 [42.   0.   4.  ...  0.   0.   0. ]
 ...
 [57.   5.   0.  ...  0.   0.   0. ]
 [45.   4.   0.  ...  0.   0.   0. ]
 [19.   4.   3.  ...  0.   0.   0. ]]
