# Hands-on clustering #2:
## Recommender system using Non-negative Matrix Factorization: MovieLens

## 2. Data exploration

#### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

import seaborn as sns
sns.set_theme(style="darkgrid")

### 2.1. Load the dataset

In [2]:
# Read the (user, movie, rating) triplets from the tab-separated ratings file.
# Only the first three fields are kept; the file has no header row, so the
# column names are supplied explicitly.
names = ["User ID", "Movie ID", "Rating"]
ratings_df = pd.read_csv(
    "Data/ml-100k/u.data",
    sep="\t",
    usecols=[0, 1, 2],
    names=names,
)

FileNotFoundError: [Errno 2] No such file or directory: 'Data/ml-100k/u.data'

### 2.2 Understand the data

#### Display first few rows

In [None]:
# Preview the first rows of the ratings table (rendered as the cell output).
ratings_df.head()

#### Check data dimensions

In [None]:
# One row per rating; users and movies are counted by their distinct IDs.
n_ratings = len(ratings_df)
n_users = ratings_df["User ID"].nunique(dropna=False)
n_items = ratings_df["Movie ID"].nunique(dropna=False)

for message, count in (
    ("Total number of ratings in the dataset: %i", n_ratings),
    ("Number of persons who rated movies: %i", n_users),
    ("Number of rated movies: %i", n_items),
):
    print(message % count)

#### Statistics of the ratings

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the Rating column.
summary = ratings_df["Rating"].describe()
# Bare expression so the notebook renders the summary as the cell output.
summary

### 2.3. Histogram of the ratings

In [None]:
# Bin edges 1..6 combined with align="left" centre each bar on an integer rating.
rating_values = ratings_df["Rating"]
rating_bins = range(1, 7)
plt.hist(rating_values, bins=rating_bins, align="left")
plt.grid(True)
plt.ylabel("Frequency")
plt.xlabel("Rating")

##### Interpretation

- We might want to normalize across users, to ensure that they all share the same rating scale.
- However, we need to be careful about the transformation we apply to the data. Indeed, standardizing the data (zero mean, unit variance) would break its non-negativity, and we could no longer apply NMF.
- For this study, no normalization will be applied.

### 2.4. Ratings per user

In [None]:
# Number of ratings given by each user, ordered by User ID 1..n_users.
# A single value_counts() pass replaces one full scan of the ratings column
# per user; IDs in users_list that never occur get a count of 0.
# NOTE: this rebinds n_ratings, which previously held the total number of
# ratings (a scalar); from here on it is the per-user count array.
users_list = range(1, n_users + 1)
n_ratings = (
    ratings_df["User ID"]
    .value_counts()
    .reindex(users_list, fill_value=0)
    .to_numpy()
)

# Histogram with fixed-width bins spanning the observed range of counts.
binwidth = 10
bins = np.arange(n_ratings.min(), n_ratings.max() + binwidth, binwidth)
plt.figure(figsize=(10,5))
plt.hist(n_ratings, bins=bins, align="left")
plt.xlabel("Number of ratings")
plt.ylabel("Frequency: number of users with this much ratings")
plt.show()

#### Statistics

In [None]:
# Wrap the per-user rating counts in a DataFrame to get describe() statistics.
n_ratings_df = pd.DataFrame(n_ratings)
summary = n_ratings_df.describe()

# Bare expression so the notebook renders the summary as the cell output.
summary

##### Interpretation

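- The average number of ratings per user is about 106 (100,000 ratings for 943 users); the distribution is heavily right-skewed, so most users rated far fewer movies than that.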
- The minimum is twenty (that's how the dataset is built)
- Some users rated many movies (more than 700)

### 2.5. Average ratings per movie

In [None]:
# Mean rating of each movie, ordered by Movie ID 1..n_items.
# A single groupby pass replaces one full scan of the ratings per movie; IDs in
# movies_list without any rating come out as NaN, exactly as the mean of an
# empty selection did before.
movies_list = range(1, n_items + 1)
mean_ratings = (
    ratings_df.groupby("Movie ID")["Rating"]
    .mean()
    .reindex(movies_list)
    .to_numpy()
)

# Histogram with 0.1-wide bins spanning the observed range of averages.
binwidth = 0.1
bins = np.arange(mean_ratings.min(), mean_ratings.max() + binwidth, binwidth)
plt.figure(figsize=(10,5))
plt.hist(mean_ratings, bins=bins, align="mid")
plt.xlabel("Average rating")
plt.ylabel("Frequency: number of movies with this average rating")
plt.show()

##### Interpretation

- Spikes on exact values: several movies had unanimous ratings.
- Spikes on 1 and 5: Some users might have a binary rating scale (like or dislike).

#### Find movies with ratings 1 and 5

In [None]:
# Loading movies titles
movie_titles = pd.read_csv("Data/ml-100k/u.item", sep="|", header=None, usecols=[1], 
                           encoding='iso-8859-1', names=["Title"])

bad_movies, good_movies = [], []
for i in range(len(mean_ratings)):
    if mean_ratings[i] == 1:
        bad_movies.append(movie_titles["Title"][i])
    if mean_ratings[i] == 5:
        good_movies.append(movie_titles["Title"][i])

print("Example of movie with rating of 1:", np.random.choice(bad_movies))
print("Example of movie with rating of 5:", np.random.choice(good_movies))

## 3. Applying NMF

### 3.1. Build the user-item matrix

In [None]:
from scipy import sparse

# Pull the three columns of the rating triplets out as plain arrays.
users, movies, rating = (
    ratings_df[column].values for column in ("User ID", "Movie ID", "Rating")
)

# IDs are used directly as (row, column) coordinates, hence the extra leading
# row/column (index 0) in the sparse matrix.
matrix_sparse = sparse.csr_matrix(
    (rating, (users, movies)),
    shape=(n_users+1, n_items+1),
)

# Densify and drop the unused index-0 row and column, so that R[u - 1, m - 1]
# holds the rating of user u for movie m (0 where there is no rating).
R = matrix_sparse.toarray()[1:, 1:].copy()

print("Verify rating of user 196 for movie 242 (first row in data file): ", R[195, 241])

### 3.2. Sparsity of user-item matrix

In [None]:
# Sparsity = share of entries of R that are zero, i.e. user-movie pairs without a rating.
filled_fraction = np.count_nonzero(R) / R.size
sparsity = 1 - filled_fraction
print("Sparsity of R: %0.2f%%" % (sparsity * 100))

##### Interpretation

Only 6.30% of the user-item pairs have a rating (i.e. the matrix is 93.70% sparse). Missing values are filled with zeros, but these zeros are not ratings on the same scale as the others: they are simply empty entries. Actual ratings range from 1 to 5.

### 3.3 Apply NMF with 20 components

In [None]:
from sklearn.decomposition import NMF

# Factorize R (users x movies) into W (users x 20) and H (20 x movies).
# random_state is fixed so that the factorization is reproducible across runs.
nmf = NMF(n_components=20, init="nndsvda", max_iter=int(1e3), random_state=0)

# fit_transform returns the W learned jointly with H during fitting; calling
# fit() and then transform() would solve for W a second time with H held fixed.
W = nmf.fit_transform(R)
H = nmf.components_

print("Number of iterations: ", nmf.n_iter_)
print("Movies features shape (transpose(H)): ", H.T.shape)
print("Users features shape (W): ", W.shape)
print("Reconstruction of R shape (W.H): ", W.dot(H).shape)

##### Interpretation

The algorithm converged, the shapes of the matrices are as expected.

### 3.4. Average reconstruction error

In [None]:
# Reconstruct the full rating matrix from the two factors.
R_pred = W.dot(H)

# Root-mean-square error over all entries of R: ||R - R_pred||_F / sqrt(N).
# Dividing the Frobenius norm by N itself (instead of sqrt(N)) does not yield
# an average of the per-entry errors.
rmse = np.linalg.norm(R - R_pred) / np.sqrt(R.size)
print("Average reconstruction error (RMSE): ", rmse)

### 3.5. Clip values outside rating scale

In [None]:
print(R)

# Rebuild the predictions and force them into the valid rating range [1, 5].
R_pred = np.clip(W.dot(H), 1., 5.)

# Integer view for display only (the fractional part is dropped by the cast).
print(R_pred.astype(np.int8))

##### Interpretation

We want to make recommendations to the users. In other words, we want to estimate the rating a movie would get when this rating does not exist. This way, we give a valid rating to all of the movies in the new reconstructed matrix.

## 4. Make recommendations

In [None]:
n_recommendations = 5
# Zero-based row index into R / R_pred (row k holds User ID k + 1).
user_idx = 12
ratings_user, predictions_user = R[user_idx, :], R_pred[user_idx, :]

# --- Movies the user already rated, best rating first ---
print("Preferred movies for user %i:" % user_idx)
print("\nTitle | Rating\n-------------------------------")
favorite_movies_index = np.argsort(-ratings_user)
for rank in range(n_recommendations):
    index = favorite_movies_index[rank]
    print(movie_titles["Title"][index], " | ", ratings_user[index])

# --- Movies without a rating (stored as 0), highest predicted rating first ---
unseen_indices = np.where(ratings_user == 0)[0]
predictions_unseen = predictions_user[unseen_indices]
print("\n\nRecommended movies for user %i:" % user_idx)
print("\nTitle | Rating\n-------------------------------")
predicted_movies_index = np.argsort(-predictions_unseen)
for rank in range(n_recommendations):
    # Map the position within the unseen subset back to a column of R.
    index = unseen_indices[predicted_movies_index[rank]]
    print(movie_titles["Title"][index], " | ", predictions_user[index])

In [None]:
# Hand-made profile for a new user, as a column vector with one entry per movie.
# Keys are zero-based movie indices, values are the ratings given;
# every movie not listed keeps the value 0.
my_profile = {
    0: 4, 1: 4, 10: 1, 15: 3, 27: 4, 34: 1, 49: 1, 55: 1, 61: 1,
    68: 5, 70: 4, 81: 4, 87: 2, 94: 4, 120: 2, 171: 1, 173: 4,
    175: 1, 182: 1, 194: 2, 203: 5, 209: 5, 221: 1, 234: 2,
    312: 3, 317: 3, 322: 3, 342: 1, 378: 1, 379: 1, 392: 3,
    404: 2, 422: 4, 542: 4,
}

my_ratings = np.zeros((1682,1), dtype=int)
for movie_idx, stars in my_profile.items():
    my_ratings[movie_idx] = stars

In [None]:
# Append the new user's ratings as an extra (last) row of the user-item matrix.
newR = np.r_[R, my_ratings.T]

# Factorize the EXTENDED matrix so that the new user gets a row in newW.
# (Fitting on R would leave the new user out of the model entirely, and the
# last row of the reconstruction would belong to the last original user.)
# random_state is fixed so that the factorization is reproducible across runs.
new_nmf = NMF(n_components=20, init="nndsvda", max_iter=int(1e3), random_state=0)
newW = new_nmf.fit_transform(newR)
newH = new_nmf.components_

# Report on the new model, not on the one fitted earlier on R.
print("Number of iterations: ", new_nmf.n_iter_)
print("Movies features shape (transpose(H)): ", newH.T.shape)
print("Users features shape (W): ", newW.shape)
print("Reconstruction of R shape (W.H): ", newW.dot(newH).shape)

In [None]:
print(newR)

# Rebuild the predictions from the new factors and force them into [1, 5].
newR_pred = np.clip(newW.dot(newH), 1., 5.)

# Integer view for display only (the fractional part is dropped by the cast).
print(newR_pred.astype(np.int8))

In [None]:
n_recommendations = 5
# -1 selects the last row of newR / newR_pred; in newR that is the appended profile.
user_idx = -1
ratings_user, predictions_user = newR[user_idx, :], newR_pred[user_idx, :]

# --- Movies the user already rated, best rating first ---
print("Preferred movies for user %i:" % user_idx)
print("\nTitle | Rating\n-------------------------------")
favorite_movies_index = np.argsort(-ratings_user)
for rank in range(n_recommendations):
    index = favorite_movies_index[rank]
    print(movie_titles["Title"][index], " | ", ratings_user[index])

# --- Movies without a rating (stored as 0), highest predicted rating first ---
unseen_indices = np.where(ratings_user == 0)[0]
predictions_unseen = predictions_user[unseen_indices]
print("\n\nRecommended movies for user %i:" % user_idx)
print("\nTitle | Rating\n-------------------------------")
predicted_movies_index = np.argsort(-predictions_unseen)
for rank in range(n_recommendations):
    # Map the position within the unseen subset back to a column of newR.
    index = unseen_indices[predicted_movies_index[rank]]
    print(movie_titles["Title"][index], " | ", predictions_user[index])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Rows of H.T are the per-movie feature vectors; compare every pair of movies.
movie_features = H.T
movies_sim = cosine_similarity(movie_features)

# Show the movie-by-movie similarity matrix as an image.
plt.imshow(movies_sim)
plt.show()

In [None]:
n_recommendations = 5

# Pick a random movie as the one "currently being watched".
watch_index = np.random.randint(0, len(movie_titles["Title"]))
print("Currently watching: ", movie_titles["Title"][watch_index])

# Rank all movies by similarity to the watched one, most similar first.
movies_similarities = movies_sim[watch_index, :]
ranked = np.argsort(-movies_similarities)

# Drop the watched movie explicitly rather than assuming it is ranked first:
# with ties (or an all-zero feature vector) it may sit elsewhere in the
# ordering, in which case skipping position 0 would discard a genuine
# neighbour and could recommend the watched movie itself.
suggestions = ranked[ranked != watch_index][:n_recommendations]

print("\n\nRecommended movies for user:")
print("\nTitle | Similarity\n-------------------------------")
for i in suggestions:
    print(movie_titles["Title"][i], " | ", movies_similarities[i])

In [None]:
# Print the movie-to-movie cosine similarity matrix as text.
print(movies_sim)