# 1. Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

# 2- Importing the dataset

In [None]:
ratings_df = pd.read_csv('./rating.csv')
movies_df = pd.read_csv('./movie.csv')

ratings_df = ratings_df.drop(columns=['timestamp'])
movies_df = movies_df.drop(columns=['title'])

print(ratings_df.head(1))
print(movies_df.head(1))

   userId  movieId  rating
0       1        2     3.5
   movieId                                       genres
0        1  Adventure|Animation|Children|Comedy|Fantasy


In [None]:
ratings_df.describe()

Unnamed: 0,userId,movieId,rating
count,1358684.0,1358684.0,1358684.0
mean,4594.321,8610.676,3.526341
std,2660.662,19051.14,1.052535
min,1.0,1.0,0.5
25%,2295.0,904.0,3.0
50%,4510.0,2146.0,4.0
75%,6899.0,4638.0,4.0
max,9197.0,130768.0,5.0


In [None]:
ratings_df = ratings_df.merge(movies_df, on='movieId')
ratings_df['genres'] = ratings_df['genres'].str.split('|')

# 3. Model params

In [35]:
users_embedding_size, movies_embedding_size = 25, 25
genre_embedding_size = 5

# 4. Converting Movie/User IDs to Indexes

In [None]:
def genre_pooling(genre_df):
    genre_dict = {}
    id = 1

    # First, create a mapping of genres to unique IDs
    for genres in genre_df:
        for genre in genres:
            if genre not in genre_dict:
                genre_dict[genre] = id
                id += 1

    # Convert genres into their respective unique IDs
    converted_genre = genre_df.apply(lambda genres: [genre_dict[genre] for genre in genres])

    # Apply pooling: mean of the genre IDs for each movie
    pooled_genre = converted_genre.apply(np.mean)

    return pooled_genre


In [25]:
pooled_genres = genre_pooling(ratings_df['genres'])
pooled_genres = np.array(pooled_genres).reshape(-1, 1)

# 5. Data splitting

In [26]:
print(len(ratings_df), pooled_genres.shape)
train_user, test_user, train_movie, test_movie, train_genre, test_genre, train_y, test_y = train_test_split(
    ratings_df['userId'], ratings_df['movieId'], pooled_genres, ratings_df['rating'],
    test_size=0.2, random_state=42
)

print(pooled_genres.shape)
# del ratings_df
# del movies_df
# del pooled_genres

1358684 (1358684, 1)
(1358684, 1)


In [27]:
user_encoder = LabelEncoder()
user_enc = user_encoder.fit_transform(ratings_df['userId'])

movie_encoder = LabelEncoder()
movie_enc = movie_encoder.fit_transform(ratings_df['movieId'])

num_users = len(user_enc)
num_movies = len(movie_enc)

# 6. Input and Embedding layers creation

In [36]:
user_input = tf.keras.Input(shape=(1,), name='user_input')
movie_input = tf.keras.Input(shape=(1,), name='movie_input')
genre_input = tf.keras.Input(shape=(1,), name='genre_input')


user_embedding = tf.keras.layers.Embedding(
    input_dim=num_users + 1,  # +1 because IDs start from 1
    output_dim=users_embedding_size,
    embeddings_initializer='uniform',
    name='user_embedding',
)(user_input)

movie_embedding = tf.keras.layers.Embedding(
    input_dim=num_movies + 1,
    output_dim=movies_embedding_size,
    embeddings_initializer='uniform',
    name='movie_embedding',
)(movie_input)

# 7. Layers Setup

In [37]:

# Flatten embeddings to 2D
user_vec = tf.keras.layers.Flatten()(user_embedding)
movie_vec = tf.keras.layers.Flatten()(movie_embedding)
# Concatenate user and movie vectors
concat = tf.keras.layers.Concatenate()([user_vec, movie_vec, genre_input])

# Dense layers
x = tf.keras.layers.Dense(
    128,
    activation='relu',)(concat)
x = tf.keras.layers.Dense(
    32,
    activation='relu',)(x)
x = tf.keras.layers.Dense(
    16,
    activation='relu',)(x)
output = tf.keras.layers.Dense(1, activation='linear')(x)

# 8. Model compilation

In [38]:
model = tf.keras.Model(inputs=[user_input, movie_input, genre_input], outputs=output)
optimizer = tf.keras.optimizers.Adam(learning_rate=.0005)

# 5. Compile model
model.compile(
    optimizer=optimizer,
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)

# 9. Model training

In [40]:

model.summary()

# 6. Train model
history = model.fit(
    [train_user, train_movie, train_genre],
    train_y,
    validation_split=0.1,
    epochs=20,
    batch_size=256,
    verbose=1
)


Epoch 1/20
[1m   17/15286[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:50:58[0m 1s/step - loss: 2.7386 - root_mean_squared_error: 1.6538

KeyboardInterrupt: 

# 10. Model Evaluation

In [None]:
# 7. Evaluate
pred_y = model.predict([test_user, test_movie, test_genre])
r2 = r2_score(test_y, pred_y)
rmse = root_mean_squared_error(test_y, pred_y)
print(f'R² score: {r2:.3f}, RMSE: {rmse:.3f}')


InvalidArgumentError: Graph execution error:

Detected at node functional_1/movie_embedding_1/GatherV2 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start

  File "/usr/local/lib/python3.12/dist-packages/tornado/platform/asyncio.py", line 211, in start

  File "/usr/lib/python3.12/asyncio/base_events.py", line 645, in run_forever

  File "/usr/lib/python3.12/asyncio/base_events.py", line 1999, in _run_once

  File "/usr/lib/python3.12/asyncio/events.py", line 88, in _run

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 499, in process_one

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 730, in execute_request

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/ipkernel.py", line 383, in do_execute

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/zmqshell.py", line 528, in run_cell

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "/tmp/ipython-input-1141661372.py", line 2, in <cell line: 0>

  File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 566, in predict

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 260, in one_step_on_data_distributed

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 250, in one_step_on_data

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 105, in predict_step

  File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.12/dist-packages/keras/src/layers/layer.py", line 936, in __call__

  File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.12/dist-packages/keras/src/ops/operation.py", line 58, in __call__

  File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/usr/local/lib/python3.12/dist-packages/keras/src/models/functional.py", line 183, in call

  File "/usr/local/lib/python3.12/dist-packages/keras/src/ops/function.py", line 177, in _run_through_graph

  File "/usr/local/lib/python3.12/dist-packages/keras/src/models/functional.py", line 648, in call

  File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.12/dist-packages/keras/src/layers/layer.py", line 936, in __call__

  File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.12/dist-packages/keras/src/ops/operation.py", line 58, in __call__

  File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/embedding.py", line 150, in call

  File "/usr/local/lib/python3.12/dist-packages/keras/src/ops/numpy.py", line 5581, in take

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/numpy.py", line 2274, in take

indices[1,0] = 43928 is not in [0, 22063)
	 [[{{node functional_1/movie_embedding_1/GatherV2}}]] [Op:__inference_one_step_on_data_distributed_2778]

# 11. Model Save

In [None]:
model.save('cf_mlp_embedding.h5')