In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dot, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, Nadam, SGD, RMSprop
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [1]:
import pandas as pd
import numpy as np

def load_movielens_1m_data(filepath):
    """
    Load the MovieLens 1M dataset.

    Args:
    filepath (str): The path to the dataset file.

    Returns:
    Tuple: Returns number of users, number of movies, and the ratings matrix.
    """
    # Column labels for the dataset
    column_names = ['user_id', 'movie_id', 'rating', 'timestamp']
    
    # Load the dataset
    data = pd.read_csv(filepath, sep='::', header=None, names=column_names, engine='python')

    # Determine the number of users and movies
    num_users = data['user_id'].max()
    num_movies = data['movie_id'].max()

    # Initialize a matrix to store the ratings
    ratings_matrix = np.zeros((num_users, num_movies))

    # Fill the matrix with ratings
    for row in data.itertuples():
        ratings_matrix[row.user_id - 1, row.movie_id - 1] = row.rating

    return num_users, num_movies, ratings_matrix

In [2]:
from sklearn.model_selection import train_test_split

def split_ratings_matrix(ratings_matrix, test_size=0.2):
    """
    Split the ratings matrix into training and testing sets.

    Args:
    ratings_matrix (numpy.ndarray): The ratings matrix to split.
    test_size (float): The proportion of the dataset to include in the test split.

    Returns:
    Tuple: Training and testing ratings matrices.
    """
    # Flatten the matrix to get all [user, movie, rating] interactions
    user_ids, movie_ids = np.where(ratings_matrix > 0)
    ratings = ratings_matrix[user_ids, movie_ids]
    
    # Split the data
    train_indices, test_indices = train_test_split(range(len(ratings)), test_size=test_size, random_state=42)
    
    # Create training and testing matrices
    train_matrix = np.zeros_like(ratings_matrix)
    test_matrix = np.zeros_like(ratings_matrix)
    
    # Fill the matrices
    train_matrix[user_ids[train_indices], movie_ids[train_indices]] = ratings[train_indices]
    test_matrix[user_ids[test_indices], movie_ids[test_indices]] = ratings[test_indices]
    
    return train_matrix, test_matrix

In [4]:
filepath = './data/MovieLens_1M/movielens_1m_dataset.dat'
num_users, num_movies, ratings_matrix = load_movielens_1m_data(filepath)
train_matrix, test_matrix = split_ratings_matrix(ratings_matrix)

In [7]:
print(f"Number of users: {num_users}")
print(f"Number of movies: {num_movies}")

print(f"Shape of the train ratings matrix: {train_matrix.shape}")
print(f"Shape of the test ratings matrix: {test_matrix.shape}")

# Calculate the density of the full ratings matrix
total_possible_ratings = num_users * num_movies
actual_ratings = np.count_nonzero(ratings_matrix)
density = (actual_ratings / total_possible_ratings) * 100
print(f"Density of the full ratings matrix: {density:.2f}%")

# Verify the train-test split
train_ratings_count = np.count_nonzero(train_matrix)
test_ratings_count = np.count_nonzero(test_matrix)
print(f"Number of ratings in the entire ML1M dataset: {actual_ratings}")
print(f"Number of ratings in the training set: {train_ratings_count}")
print(f"Number of ratings in the testing set: {test_ratings_count}")

# Calculate and print the density of the train and test matrices
train_density = (train_ratings_count / total_possible_ratings) * 100
test_density = (test_ratings_count / total_possible_ratings) * 100
print(f"Density of the training matrix: {train_density:.2f}%")
print(f"Density of the testing matrix: {test_density:.2f}%")


Number of users: 6040
Number of movies: 3952
Shape of the train ratings matrix: (6040, 3952)
Shape of the test ratings matrix: (6040, 3952)
Density of the full ratings matrix: 4.19%
Number of ratings in the entire ML1M dataset: 1000209
Number of ratings in the training set: 800167
Number of ratings in the testing set: 200042
Density of the training matrix: 3.35%
Density of the testing matrix: 0.84%


In [12]:
# Define the dimension of the latent space for user and movie embeddings
latent_dim = 32  # This is the size of the latent space

# Define input layers for users and movies
user_input_layer = Input(shape=(1,), name='user_input_layer')
movie_input_layer = Input(shape=(1,), name='movie_input_layer')

# Embedding layers for users and movies
# These layers map users and movies into the latent space
user_embedding = Embedding(input_dim=num_users, output_dim=latent_dim, input_length=1, name='user_embedding')(user_input_layer)
movie_embedding = Embedding(input_dim=num_movies, output_dim=latent_dim, input_length=1, name='movie_embedding')(movie_input_layer)

# Flatten the embeddings
# Flattening is required to convert the 2D embedding output to 1D for further processing
flattened_user_embedding = Flatten(name='flattened_user_embedding')(user_embedding)
flattened_movie_embedding = Flatten(name='flattened_movie_embedding')(movie_embedding)

# Dot product of user and movie embeddings in the latent space
# This operation captures the interaction between users and movies
interaction_layer = Dot(axes=1, name='interaction_layer')([flattened_user_embedding, flattened_movie_embedding])

# Define the model architecture with input and output layers
mf_model = Model(inputs=[user_input_layer, movie_input_layer], outputs=interaction_layer, name='matrix_factorization_mse_model')

def rmse(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

# Compile the model with additional metrics
mf_model.compile(
    loss='mean_squared_error', 
    optimizer=Nadam(learning_rate=0.001),
    metrics=['mean_absolute_error', rmse]  # Include MAE and custom RMSE as additional metrics
)

# Display the model's architecture
mf_model.summary()

Model: "matrix_factorization_mse_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_input_layer (InputLay  [(None, 1)]                  0         []                            
 er)                                                                                              
                                                                                                  
 movie_input_layer (InputLa  [(None, 1)]                  0         []                            
 yer)                                                                                             
                                                                                                  
 user_embedding (Embedding)  (None, 1, 32)                193280    ['user_input_layer[0][0]']    
                                                                     

In [14]:
# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_rmse', patience=2, restore_best_weights=True)

# Extracting the indices of non-zero ratings for training and validation
train_user_ids, train_item_ids = train_matrix.nonzero()
train_ratings = train_matrix[train_user_ids, train_item_ids]

test_user_ids, test_item_ids = test_matrix.nonzero()
test_ratings = test_matrix[test_user_ids, test_item_ids]

# Training the model with validation data and early stopping
mf_model.fit(
    [train_user_ids, train_item_ids], 
    train_ratings,
    epochs=10,
    batch_size=64,
    shuffle=True,
    validation_data=([test_user_ids, test_item_ids], test_ratings),
    callbacks=[early_stopping]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10


<keras.src.callbacks.History at 0x7f2b6d9436d0>

In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Model prediction: providing the model with user and item indices to predict the ratings
test_predictions = mf_model.predict([test_user_ids, test_item_ids]).flatten()

# These are the true ratings that users have given to movies
test_actuals = test_matrix[test_matrix.nonzero()].flatten()

# Compute RMSE (Root Mean Squared Error)
rmse = np.sqrt(mean_squared_error(test_actuals, test_predictions))

# Compute MAE (Mean Absolute Error)
mae = mean_absolute_error(test_actuals, test_predictions)

# Compute MSE (Mean Squared Error)
mse = mean_squared_error(test_actuals, test_predictions)

# Print the computed metrics
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")

RMSE: 0.9020478690175736
MAE: 0.7056276965498413
MSE: 0.8136903579991456


In [16]:
import numpy as np
import math

def precision_at_k(actual, predicted, k):
    """Calculate Precision at K."""
    act_set = set(actual)
    pred_set = set(predicted[:k])
    if not pred_set:
        return 0
    return len(act_set & pred_set) / float(k)

def recall_at_k(actual, predicted, k):
    """Calculate Recall at K."""
    act_set = set(actual)
    pred_set = set(predicted[:k])
    if not act_set:
        return 0
    return len(act_set & pred_set) / float(len(act_set))

def ndcg_at_k(actual, predicted, k):
    """Calculate nDCG at K."""
    actual = set(actual)
    if not actual:
        return 0

    dcg = sum((int(pred in actual) / math.log2(idx + 2)) for idx, pred in enumerate(predicted[:k]))
    idcg = sum((1.0 / math.log2(idx + 2)) for idx in range(min(len(actual), k)))
    return dcg / idcg if idcg > 0 else 0

def get_user_actual_items(user_id, ratings_matrix):
    """Get list of items rated by a user."""
    return [i for i, rating in enumerate(ratings_matrix[user_id]) if rating > 0]

def get_user_predicted_items(user_id, model, num_items):
    """Get list of predicted item rankings for a user."""
    user_vector = np.full((num_items,), user_id)
    item_vector = np.arange(num_items)
    predictions = model.predict([user_vector, item_vector]).flatten()
    return np.argsort(-predictions)

In [17]:
# Parameters
K = 10  # Example: top 10 recommendations

# Lists to store metric values for each user
user_precisions = []
user_recalls = []
user_ndcgs = []

# Loop through each user to calculate metrics
for user_id in range(num_users):
    actual_items = get_user_actual_items(user_id, test_matrix)
    predicted_items = get_user_predicted_items(user_id, mf_model, num_movies)

    user_precisions.append(precision_at_k(actual_items, predicted_items, K))
    user_recalls.append(recall_at_k(actual_items, predicted_items, K))
    user_ndcgs.append(ndcg_at_k(actual_items, predicted_items, K))

# Calculate average metric values
average_precision = sum(user_precisions) / len(user_precisions) if user_precisions else 0
average_recall = sum(user_recalls) / len(user_recalls) if user_recalls else 0
average_ndcg = sum(user_ndcgs) / len(user_ndcgs) if user_ndcgs else 0

# Print average metrics
print(f"Average Precision@{K}: {average_precision}")
print(f"Average Recall@{K}: {average_recall}")
print(f"Average NDCG@{K}: {average_ndcg}")

Average Precision@10: 0.03937086092715141
Average Recall@10: 0.015120661355868014
Average NDCG@10: 0.04144546371125939


In [1]:
# import numpy as np
# from sklearn.metrics import ndcg_score
# from joblib import Parallel, delayed

# def batch_predict(model, num_users, num_items):
#     """Predict ratings for all users and items in batches."""
#     user_vector = np.repeat(np.arange(num_users), num_items)
#     item_vector = np.tile(np.arange(num_items), num_users)
#     predictions = model.predict([user_vector, item_vector]).reshape(num_users, num_items)
#     return predictions

# def compute_metrics_for_user(user_id, predictions, test_matrix, k=10):
#     """Compute precision, recall, and ndcg for a single user."""
#     true_items = np.where(test_matrix[user_id] > 0)[0]
#     predicted_ranking = np.argsort(-predictions[user_id])[:k]
    
#     true_set = set(true_items)
#     pred_set = set(predicted_ranking)
    
#     precision = len(true_set & pred_set) / k
#     recall = len(true_set & pred_set) / len(true_set) if true_set else 0
#     ndcg = ndcg_score([true_items], [predicted_ranking], k=k)
    
#     return precision, recall, ndcg

# predictions = batch_predict(mf_model, num_users, num_movies)

# results = Parallel(n_jobs=-1)(delayed(compute_metrics_for_user)(user_id, predictions, test_matrix, K) for user_id in range(num_users))

# # Unpack results
# precisions, recalls, ndcgs = zip(*results)

# # Calculate average metric values
# average_precision = np.mean(precisions)
# average_recall = np.mean(recalls)
# average_ndcg = np.mean(ndcgs)

# # Print average metrics
# print(f"Average Precision@{K}: {average_precision}")
# print(f"Average Recall@{K}: {average_recall}")
# print(f"Average NDCG@{K}: {average_ndcg}")