1-Imports

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, root_mean_squared_error
import joblib

2-Sample Data

In [4]:
# Load the ratings table from the CSV file in the working directory.
df = pd.read_csv('./rating.csv')

# Show the first rows, then the summary statistics, in that order.
for preview in (df.head(), df.describe()):
    print(preview)

   userId  movieId  rating            timestamp
0       1        2     3.5  2005-04-02 23:53:47
1       1       29     3.5  2005-04-02 23:31:16
2       1       32     3.5  2005-04-02 23:33:39
3       1       47     3.5  2005-04-02 23:32:07
4       1       50     3.5  2005-04-02 23:29:40
             userId       movieId        rating
count  2.000026e+07  2.000026e+07  2.000026e+07
mean   6.904587e+04  9.041567e+03  3.525529e+00
std    4.003863e+04  1.978948e+04  1.051989e+00
min    1.000000e+00  1.000000e+00  5.000000e-01
25%    3.439500e+04  9.020000e+02  3.000000e+00
50%    6.914100e+04  2.167000e+03  3.500000e+00
75%    1.036370e+05  4.770000e+03  4.000000e+00
max    1.384930e+05  1.312620e+05  5.000000e+00


In [5]:
# Number of columns in the user and movie embedding matrices, respectively.
users_embedding_size = 50
movies_embedding_size = 50

3-Create Id-To-Index mapping

In [6]:
# Drop rows without a rating from ALL three columns at once, so that
# `users`, `movies` and `ratings` stay row-aligned for the later masking step.
rated = df.dropna(subset=['rating'])

# User and movie ids are stored as uint32.
users = rated['userId'].astype(np.uint32)
unique_users = users.unique()
movies = rated['movieId'].astype(np.uint32)
unique_movies = movies.unique()

# Ratings are fractional (e.g. 3.5, minimum 0.5). Casting them to an unsigned
# integer type would silently truncate 3.5 -> 3 and 0.5 -> 0, so keep floats.
ratings = rated['rating'].astype(np.float32)

# Map each raw id to its position in `unique_users` / `unique_movies`
# (order of first appearance); that position is the row of the id's
# vector in the corresponding embedding matrix.
indexed_users = {uid: idx for idx, uid in enumerate(unique_users)}
indexed_movies = {mid: idx for idx, mid in enumerate(unique_movies)}

print(indexed_users[1000])
print('users shape:', users.shape, 'movies shape:', movies.shape)
print('unique users shape: ', unique_users.shape)
print('unique movies shape: ', unique_movies.shape)

999
users shape: (20000263,) movies shape: (20000263,)
unique users shape:  (138493,)
unique movies shape:  (26744,)


4-Initialize random embedding matrices

In [7]:
# Fixed seed so the random embeddings (and every result derived from them)
# are identical after a kernel restart.
EMBEDDING_SEED = 42
embedding_rng = np.random.default_rng(EMBEDDING_SEED)

# One row per unique user / movie, values drawn uniformly from [-1.0, 1.0).
users_embedding = embedding_rng.uniform(
    low=-1.0, high=1.0, size=(len(unique_users), users_embedding_size))
movies_embedding = embedding_rng.uniform(
    low=-1.0, high=1.0, size=(len(unique_movies), movies_embedding_size))

print(users_embedding.shape)
print(movies_embedding.shape)

(138493, 50)
(26744, 50)


5-Formation of X and Y and concatenation

In [None]:
"""5-Formation of X and Y and concatenation (IMPROVED)"""

# The boolean mask computed from `users`/`movies` is applied to `ratings`
# below, so all three must describe the same rows.
if not (len(users) == len(movies) == len(ratings)):
    raise ValueError(
        f"users ({len(users)}), movies ({len(movies)}) and ratings "
        f"({len(ratings)}) must have the same number of rows"
    )

# 1. Map IDs to embedding-row indices for all rows at once.
# `indexed_users` / `indexed_movies` map unique_users[i] -> i, so looking an id
# up in an Index built from the unique arrays yields the same position, with
# -1 for ids that are not present. This avoids a Python-level loop per row.
user_indices = pd.Index(unique_users).get_indexer(users.to_numpy())
movie_indices = pd.Index(unique_movies).get_indexer(movies.to_numpy())

# Filter out any cold-start items/users (where index is -1)
valid_indices = (user_indices != -1) & (movie_indices != -1)
user_indices_valid = user_indices[valid_indices]
movie_indices_valid = movie_indices[valid_indices]
ratings_valid = ratings[valid_indices]

# 2. Use NumPy advanced indexing to fetch the embedding row of every rating.
user_vectors = users_embedding[user_indices_valid]
movie_vectors = movies_embedding[movie_indices_valid]

# 3. Concatenate user and movie vectors side by side to form the feature matrix.
# NOTE: with ~20M rows and 50 + 50 float64 columns this is roughly 16 GB.
X = np.concatenate((user_vectors, movie_vectors), axis=1)

# The filtered ratings are the regression target.
Y = ratings_valid

print(f"Optimized X shape: {X.shape}, Optimized Y shape: {Y.shape}")

6-Splitting data

In [None]:
# Requires X and Y from the "Formation of X and Y" cell; run that cell first.
print(Y.shape)
print(X.shape)

# 80/20 train/test split. A fixed random_state makes the shuffle, and hence
# the reported metrics, reproducible across kernel restarts.
SPLIT_SEED = 42
trainX, testX, trainY, testY = train_test_split(
    X, Y, train_size=.8, test_size=.2, random_state=SPLIT_SEED)

NameError: name 'Y' is not defined

7-Training

In [None]:
# Fixed random_state so weight initialisation and batch sampling are
# repeatable; all other hyper-parameters stay at the scikit-learn defaults.
model = MLPRegressor(activation='relu', random_state=42)
model.fit(trainX, trainY)

8-Testing

In [None]:
# Predict ratings for the held-out rows.
y_pred = model.predict(testX)

# Compute both metrics, then report them (R^2 first, RMSE second).
score_r2, rmse_score = (
    r2_score(testY, y_pred),
    root_mean_squared_error(testY, y_pred),
)
print(f'r2 score = {score_r2}')
print(f'RMSE score = {rmse_score}')

9-Saving Model

In [None]:
# Destination path for the serialized estimator.
filename = 'MLPRegressor_model.joblib'

# Persist the fitted model to disk with joblib.
joblib.dump(value=model, filename=filename)