In [1]:
import os
import time
import requests
import numpy as np
import pyarrow as pa
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from fuzzywuzzy import process
import matplotlib.pyplot as plt
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.feature_extraction.text import CountVectorizer


In [None]:
# Load the three input tables with the pyarrow CSV engine: the filtered
# ratings, the upgraded movie metadata, and the original MovieLens movie list.
ratings = pd.read_csv(
    'upgraded_movielens_latest/filtered_ratings.csv',
    engine='pyarrow',
)
movies = pd.read_csv(
    'upgraded_movielens_latest/upgraded_movies.csv',
    engine='pyarrow',
)
movies_w_title = pd.read_csv(
    'movielens_latest/movies.csv',
    engine='pyarrow',
)

In [36]:
# The original movie list is only used for its non-genre columns.
movies_w_title = movies_w_title.drop('genres', axis=1)
# Keep only the rows whose movieId also appears in the upgraded `movies` table.
valid_movie_ids = set(movies['movieId'])
titles = movies_w_title.loc[movies_w_title['movieId'].isin(valid_movie_ids)]
# Store movie ids as integers in the upgraded table.
movies['movieId'] = movies['movieId'].astype(int)

In [None]:
# Show the movieId dtype of each table so mismatches are visible before merging.
for label, frame in (
    ("Movies 'movieId' dtype:", movies),
    ("Ratings 'movieId' dtype:", ratings),
    ("Titles 'movieId' dtype:", titles),
):
    print(label, frame['movieId'].dtype)

In [38]:
# Order both movie tables by ascending movieId (ascending is the default).
movies = movies.sort_values(by='movieId')
titles = titles.sort_values(by='movieId')

In [None]:
# Basic size of the ratings table: rows, distinct movies, distinct users.
n_ratings = ratings.shape[0]
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()

print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")
print(f"Number of unique users: {n_users}")
print(f"Average number of ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average number of ratings per movie: {round(n_ratings/n_movies, 2)}")

In [None]:
# Mean over every individual rating.
print(f"Mean global rating: {round(ratings['rating'].mean(),3)}.")

# Mean of the per-user means (each user weighs the same, however many
# movies they rated).
mean_ratings = ratings['rating'].groupby(ratings['userId']).mean()
print(f"Mean rating per user: {round(mean_ratings.mean(),2)}.")

In [None]:
# Per-movie mean rating (one-column frame indexed by movieId); note that this
# rebinds `mean_ratings`, which previously held the per-user means.
mean_ratings = ratings[['movieId', 'rating']].groupby('movieId').mean()
# movieId with the smallest plain mean rating.
lowest_rated = mean_ratings['rating'].idxmin()
movies.loc[movies['movieId'] == lowest_rated]

In [None]:
# Look up the lowest-rated movie in the titles table.
titles.loc[titles['movieId'] == lowest_rated]

In [None]:
# movieId with the largest plain mean rating, then its metadata row.
highest_rated = mean_ratings['rating'].idxmax()
print(highest_rated)
movies.loc[movies['movieId'] == highest_rated]

In [None]:
# Look up the highest-rated movie in the titles table.
titles.loc[titles['movieId'] == highest_rated]

In [None]:
# Every individual rating behind the highest mean, to see how many there are.
ratings.loc[ratings['movieId'] == highest_rated]

In [None]:
"""suppose for movieId 1 , it will group all the ratings with movieId 1.
 In this case that group has 76813 ratings. The aggregation is then applied to those 76813 ratings: `count` returns
 len(group), which is 76813 in this example, and `mean` returns their average, which is 3.893508."""

# One row per movieId with two columns: 'count' (number of ratings) and
# 'mean' (plain average rating).
ratings_by_movie = ratings.groupby('movieId')['rating']
movie_stats = ratings_by_movie.agg(['count', 'mean'])
movie_stats.head()

In [None]:
# Priors for the Bayesian average, both taken across movies:
#   C - mean number of ratings a movie has
#   m - mean of the per-movie mean ratings
C = movie_stats.loc[:, 'count'].mean()
m = movie_stats.loc[:, 'mean'].mean()

print(f"Average number of ratings for a given movie: {C:.2f}")
print(f"Average rating for a given movie: {m:.2f}")

def bayesian_avg(ratings, prior_count=None, prior_mean=None):
    """Return the Bayesian (shrunken) average of a group of ratings.

    The result is ``(prior_count * prior_mean + sum) / (prior_count + count)``:
    the observed ratings are pooled with ``prior_count`` pseudo-ratings of
    value ``prior_mean``, so groups with few ratings are pulled towards the
    prior.

    Args:
        ratings: object exposing ``.sum()`` and ``.count()`` (e.g. a pandas
            Series holding one movie's ratings).
        prior_count: weight of the prior, expressed as a number of
            pseudo-ratings. Defaults to the notebook-level ``C``.
        prior_mean: value of the prior rating. Defaults to the notebook-level
            ``m``.

    Returns:
        The shrunken average as a float.
    """
    # Fall back to the notebook globals so existing calls such as
    # ``.agg(bayesian_avg)`` keep working unchanged.
    if prior_count is None:
        prior_count = C
    if prior_mean is None:
        prior_mean = m
    return (prior_count * prior_mean + ratings.sum()) / (prior_count + ratings.count())

In [None]:
# Sanity check: a movie with only two 5-star ratings should land well below 5
# once the prior is mixed in.
lamerica = pd.Series(data=[5, 5])
bayesian_avg(lamerica)

In [None]:
# Apply bayesian_avg to each movie's ratings, then turn the movieId index
# back into a regular column.
bayesian_avg_ratings = ratings.groupby('movieId')['rating'].agg(bayesian_avg)
bayesian_avg_ratings = bayesian_avg_ratings.reset_index()
bayesian_avg_ratings.head()

In [None]:
# The aggregated column is still called 'rating'; label it for what it holds.
bayesian_columns = ['movieId', 'bayesian_avg']
bayesian_avg_ratings.columns = bayesian_columns
bayesian_avg_ratings.head()

In [51]:
# Attach the Bayesian average to the per-movie count/mean table.
movie_stats = pd.merge(movie_stats, bayesian_avg_ratings, on='movieId')

In [None]:
# Preview the first six rows of the merged statistics.
movie_stats.head(n=6)

In [None]:
# Add the movie metadata columns, then show the two movies with the highest
# Bayesian average.
movie_stats = pd.merge(movie_stats, movies, on='movieId')
movie_stats.sort_values(by='bayesian_avg', ascending=False).head(2)

In [None]:
# The five movies with the lowest Bayesian average (ascending is the default).
movie_stats.sort_values(by='bayesian_avg').head()

In [55]:
# `ratings_copy` keeps a reference to the frame as it is before 'timestamp' is
# removed; the drop below returns a new frame and only rebinds `ratings`.
ratings_copy = ratings
# errors='ignore' keeps this cell re-runnable: on a second execution `ratings`
# no longer has a 'timestamp' column and a strict drop would raise KeyError.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')
# Build a ratings table in which each (userId, movieId) row carries the
# movie's Bayesian average instead of the user's own rating. merge() is an
# inner join by default, so rows whose movieId is absent from movie_stats are
# not carried over.
bayesian_ratings = (
    ratings_copy
    .merge(movie_stats[["movieId", "bayesian_avg"]], on='movieId')
    .drop(columns=['timestamp', 'rating'], errors='ignore')
    .sort_values('userId', ascending=True)
    .rename(columns={'bayesian_avg': 'rating'})
)

In [None]:
# Preview the Bayesian-average ratings table.
bayesian_ratings.head(5)

In [57]:
def create_X(df):
    """Build a sparse user-by-movie rating matrix from a long-format frame.

    Args:
        df: DataFrame with 'userId', 'movieId' and 'rating' columns, one row
            per rating.

    Returns:
        A 5-tuple ``(X, user_mapper, movie_mapper, user_inv_mapper,
        movie_inv_mapper)`` where

        * ``X`` is a ``csr_matrix`` of shape (n_users, n_movies) holding the
          ratings (repeated (user, movie) pairs are summed by the sparse
          constructor),
        * ``user_mapper`` / ``movie_mapper`` map an id to its row / column
          index (ids are indexed in sorted order),
        * ``user_inv_mapper`` / ``movie_inv_mapper`` map an index back to
          the id.

    Also prints the number of users, the number of movies and ``X.shape``.
    """
    # One pass per id column: np.unique returns the sorted distinct ids and,
    # through return_inverse, each row's position in that sorted array. That
    # position is exactly the row/column index, so no per-row dict lookup is
    # needed.
    user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    movie_ids, item_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)

    M = len(user_ids)
    N = len(movie_ids)
    print(M)
    print(N)

    user_mapper = dict(zip(user_ids, range(M)))
    movie_mapper = dict(zip(movie_ids, range(N)))

    user_inv_mapper = dict(zip(range(M), user_ids))
    movie_inv_mapper = dict(zip(range(N), movie_ids))

    X = csr_matrix((df["rating"].to_numpy(), (user_index, item_index)), shape=(M, N))
    print(X.shape)

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper



In [None]:
# Sparse matrix and id <-> index lookups built from the users' own ratings.
(
    X,
    user_mapper,
    movie_mapper,
    user_inv_mapper,
    movie_inv_mapper,
) = create_X(ratings)

In [None]:
# Same construction, but every rating is the movie's Bayesian average.
(
    X_b,
    user_mapper_b,
    movie_mapper_b,
    user_inv_mapper_b,
    movie_inv_mapper_b,
) = create_X(bayesian_ratings)

In [None]:
# Fixed seed for TruncatedSVD's randomized solver: without it the factors
# (and therefore the .npy files written from them) change on every run.
SVD_SEED = 42

print("X.shape",X.shape)


# Factorise the transposed (movie-major) matrix so each row of Q is a
# 300-dimensional embedding of one movie.
svd = TruncatedSVD(n_components=300, n_iter=10, random_state=SVD_SEED)
Q = svd.fit_transform(X.T)
print("Q.shape",Q.shape)


# Same factorisation for the Bayesian-average matrix. `svd` is rebound here,
# so after this cell it refers to the model fitted on X_b.
svd = TruncatedSVD(n_components=300, n_iter=10, random_state=SVD_SEED)
Q_b = svd.fit_transform(X_b.T)
print("Q_b.shape",Q_b.shape)

In [62]:
# np.save does not create missing parent directories, so make sure the two
# output folders exist before writing the movie embeddings.
os.makedirs('data', exist_ok=True)
os.makedirs('bayesian_data', exist_ok=True)
np.save('data/Q.npy', Q)
np.save('bayesian_data/Q_b.npy', Q_b)

In [None]:
# Use a separately named movie-major view instead of rebinding X: overwriting
# X with its transpose flips the orientation seen by every later cell (their
# "per user" counts would be taken over movie rows) and flips it back again
# each time this cell is re-run.
X_movies = X.T
neighbour_ids = []

# Row index of movieId 1 in the movie-major matrix.
movie_ind = movie_mapper[1]
print(movie_ind)
# Indexing a sparse matrix yields a 1 x n_users sparse row, which is already
# the 2-D shape kneighbors expects, so no ndarray reshape is needed.
movie_vec = X_movies[movie_ind]
print(movie_vec.shape)
kNN = NearestNeighbors(n_neighbors=11, algorithm="brute", metric='cosine')
kNN.fit(X_movies)
neighbour = kNN.kneighbors(movie_vec, return_distance=False)
# Only the first five of the returned neighbours are inspected here.
for i in range(0,5):
    n = neighbour.item(i)
    print("n = ",n)
    neighbour_ids.append(movie_inv_mapper[n])
    print("movie_inv_mapper[n]   ",movie_inv_mapper[n])
# Drop the first hit, which is expected to be the query movie itself.
neighbour_ids.pop(0)
print(neighbour_ids)

In [None]:
# NOTE(review): the "per user" counts below assume X has one row per user
# (the orientation create_X returns) - confirm no earlier cell has rebound X
# to its transpose before trusting these numbers.
n_total = X.shape[0]*X.shape[1]   # number of cells in the matrix
n_ratings = X.nnz                 # number of stored entries
sparsity = n_ratings/n_total      # fraction of cells that hold a rating
n_ratings_per_user = X.getnnz(axis=1)   # stored entries in each row

print("n_total",n_total)
print("n_ratings",n_ratings)
print(f"Matrix sparsity: {round(sparsity*100,2)}%")
print(f"Most active user rated {n_ratings_per_user.max()} movies.")
print(f"Least active user rated {n_ratings_per_user.min()} movies.")

In [None]:
# Fill statistics for the Bayesian-average matrix. Every quantity is taken
# from X_b itself; mixing in X.shape or X.getnnz reports numbers for the
# wrong matrix.
n_total = X_b.shape[0]*X_b.shape[1]
print("n_total",n_total)
n_ratings = X_b.nnz
print("n_ratings",n_ratings)
# nnz / total, i.e. the fraction of cells that hold a rating.
sparsity = n_ratings/n_total
print(f"Matrix sparsity: {round(sparsity*100,2)}%")
# Stored entries per row of X_b (create_X puts users on the rows). Kept under
# its own name so the per-user counts computed from X are not overwritten.
n_ratings_per_user_b = X_b.getnnz(axis=1)
print(f"Most active user rated {n_ratings_per_user_b.max()} movies.")
print(f"Least active user rated {n_ratings_per_user_b.min()} movies.")

In [None]:
# Stored entries per column of X.
# NOTE(review): labelled "per movie" on the assumption that X has movies on
# its columns (the create_X orientation) - confirm X was not transposed earlier.
n_ratings_per_movie = X.getnnz(axis=0)

# Two side-by-side density plots: ratings per user and ratings per movie.
fig, (ax_user, ax_movie) = plt.subplots(1, 2, figsize=(16,4))
panels = (
    (ax_user, n_ratings_per_user, "Number of Ratings Per User", "number of ratings per user"),
    (ax_movie, n_ratings_per_movie, "Number of Ratings Per Movie", "number of ratings per movie"),
)
for ax, counts, title, xlabel in panels:
    sns.kdeplot(counts, fill=True, ax=ax)
    ax.set_xlim(0)   # counts are non-negative; start the axis at zero
    ax.set_title(title, fontsize=14)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("density")
plt.show()