In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dot, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, Nadam, SGD, RMSprop
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [4]:
def load_movielens_100k_data(base_path='./', delimiter='\t'):
    """
    Load the MovieLens 100K dataset.

    Args:
    base_path (str): The base path to the dataset files.
    delimiter (str): The delimiter used in the dataset files.

    Returns:
    Tuple: Returns number of users, number of movies, training and testing rating matrices.
    """
    # Load training and testing data
    train_data = pd.read_csv(base_path + 'movielens_100k_u1.base', sep=delimiter, header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])
    test_data = pd.read_csv(base_path + 'movielens_100k_u1.test', sep=delimiter, header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])

    # Determine the number of users and movies from the dataset
    num_users = max(train_data['user_id'].max(), test_data['user_id'].max())
    num_movies = max(train_data['movie_id'].max(), test_data['movie_id'].max())

    # Initialize matrices to store the ratings
    train_ratings_matrix = np.zeros((num_users, num_movies))
    test_ratings_matrix = np.zeros((num_users, num_movies))

    # Fill the matrices with ratings from the data
    for row in train_data.itertuples():
        train_ratings_matrix[row.user_id - 1, row.movie_id - 1] = row.rating
    for row in test_data.itertuples():
        test_ratings_matrix[row.user_id - 1, row.movie_id - 1] = row.rating

    return num_users, num_movies, train_ratings_matrix, test_ratings_matrix

In [5]:
# Load and preprocess MovieLens 100K data
num_users, num_movies, train_ratings_matrix, test_ratings_matrix = load_movielens_100k_data('./data/MovieLens_100K/')

# Printing some details for understanding
print("Number of users:", num_users)
print("Number of movies:", num_movies)

# Displaying the shape of the training and testing matrices
print("Shape of Training Ratings Matrix:", train_ratings_matrix.shape)
print("Shape of Testing Ratings Matrix:", test_ratings_matrix.shape)

# Displaying a small part of the training matrix (for example, the first 5 users and movies)
print("Sample of Training Ratings Matrix (first 5 users and movies):\n", train_ratings_matrix[:5, :5])

# Displaying a small part of the testing matrix (for example, the first 5 users and movies)
print("Sample of Testing Ratings Matrix (first 5 users and movies):\n", test_ratings_matrix[:5, :5])

Number of users: 943
Number of movies: 1682
Shape of Training Ratings Matrix: (943, 1682)
Shape of Testing Ratings Matrix: (943, 1682)
Sample of Training Ratings Matrix (first 5 users and movies):
 [[5. 3. 4. 3. 3.]
 [4. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Sample of Testing Ratings Matrix (first 5 users and movies):
 [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [4. 3. 0. 0. 0.]]


In [6]:
# Define the dimension of the latent space for user and movie embeddings
latent_dim = 32  # This is the size of the latent space

# Define input layers for users and movies
user_input_layer = Input(shape=(1,), name='user_input_layer')
movie_input_layer = Input(shape=(1,), name='movie_input_layer')

# Embedding layers for users and movies
# These layers map users and movies into the latent space
user_embedding = Embedding(input_dim=num_users, output_dim=latent_dim, input_length=1, name='user_embedding')(user_input_layer)
movie_embedding = Embedding(input_dim=num_movies, output_dim=latent_dim, input_length=1, name='movie_embedding')(movie_input_layer)

# Flatten the embeddings
# Flattening is required to convert the 2D embedding output to 1D for further processing
flattened_user_embedding = Flatten(name='flattened_user_embedding')(user_embedding)
flattened_movie_embedding = Flatten(name='flattened_movie_embedding')(movie_embedding)

# Dot product of user and movie embeddings in the latent space
# This operation captures the interaction between users and movies
interaction_layer = Dot(axes=1, name='interaction_layer')([flattened_user_embedding, flattened_movie_embedding])

# Define the model architecture with input and output layers
mf_model = Model(inputs=[user_input_layer, movie_input_layer], outputs=interaction_layer, name='matrix_factorization_mse_model')

def rmse(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

# Compile the model with additional metrics
mf_model.compile(
    loss='mean_squared_error', 
    optimizer=Nadam(learning_rate=0.001),
    metrics=['mean_absolute_error', rmse]  # Include MAE and custom RMSE as additional metrics
)

# Display the model's architecture
mf_model.summary()


Model: "matrix_factorization_mse_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_input_layer (InputLay  [(None, 1)]                  0         []                            
 er)                                                                                              
                                                                                                  
 movie_input_layer (InputLa  [(None, 1)]                  0         []                            
 yer)                                                                                             
                                                                                                  
 user_embedding (Embedding)  (None, 1, 32)                30176     ['user_input_layer[0][0]']    
                                                                     

In [7]:
# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Extracting the indices of non-zero ratings for training and validation
train_user_ids, train_item_ids = train_ratings_matrix.nonzero()
train_ratings = train_ratings_matrix[train_user_ids, train_item_ids]

test_user_ids, test_item_ids = test_ratings_matrix.nonzero()
test_ratings = test_ratings_matrix[test_user_ids, test_item_ids]

# Training the model with validation data and early stopping
mf_model.fit(
    [train_user_ids, train_item_ids], 
    train_ratings,
    epochs=50,
    batch_size=64,
    shuffle=True,
    validation_data=([test_user_ids, test_item_ids], test_ratings),
    callbacks=[early_stopping]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50


<keras.src.callbacks.History at 0x7feddcc81d10>

In [9]:
# Making recommendations for a new user or item (this part is optional and depends on your application)
# new_user_id = 0  # Replace with the ID of the new user
# user_ids = np.full(num_movies, new_user_id)
# item_ids = np.arange(num_movies)

# Predictions for the new user and all items
# recommendations = model.predict([user_ids, item_ids])

# Sorting recommendations in descending order to get top recommendations
# sorted_indices = np.argsort(recommendations, axis=0)[::-1]

# Getting the top recommended item IDs
# top_item_ids = sorted_indices[:10, 0]

# Printing the top recommended item IDs
# print(f"Top Recommendations for New User ({new_user_id}):")
# print(top_item_ids)

In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Generate predictions for the test set
# 'test_user_ids' and 'test_item_ids' contain the indices of all non-zero entries in the test ratings matrix
test_user_ids = test_ratings_matrix.nonzero()[0]
test_item_ids = test_ratings_matrix.nonzero()[1]

# Model prediction: providing the model with user and item indices to predict the ratings
test_predictions = mf_model.predict([test_user_ids, test_item_ids]).flatten()

# Extract actual ratings from the test set
# These are the true ratings that users have given to movies
test_actuals = test_ratings_matrix[test_ratings_matrix.nonzero()].flatten()

# Compute RMSE (Root Mean Squared Error)
rmse = np.sqrt(mean_squared_error(test_actuals, test_predictions))

# Compute MAE (Mean Absolute Error)
mae = mean_absolute_error(test_actuals, test_predictions)

# Compute MSE (Mean Squared Error)
mse = mean_squared_error(test_actuals, test_predictions)

# Print the computed metrics
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")

RMSE: 0.9351804967450156
MAE: 0.7327724791778717
MSE: 0.8745625614922541


In [11]:
import numpy as np
import math

def precision_at_k(actual, predicted, k):
    """Calculate Precision at K."""
    act_set = set(actual)
    pred_set = set(predicted[:k])
    if not pred_set:
        return 0
    return len(act_set & pred_set) / float(k)

def recall_at_k(actual, predicted, k):
    """Calculate Recall at K."""
    act_set = set(actual)
    pred_set = set(predicted[:k])
    if not act_set:
        return 0
    return len(act_set & pred_set) / float(len(act_set))

def ndcg_at_k(actual, predicted, k):
    """Calculate nDCG at K."""
    actual = set(actual)
    if not actual:
        return 0

    dcg = sum((int(pred in actual) / math.log2(idx + 2)) for idx, pred in enumerate(predicted[:k]))
    idcg = sum((1.0 / math.log2(idx + 2)) for idx in range(min(len(actual), k)))
    return dcg / idcg if idcg > 0 else 0

def get_user_actual_items(user_id, ratings_matrix):
    """Get list of items rated by a user."""
    return [i for i, rating in enumerate(ratings_matrix[user_id]) if rating > 0]

def get_user_predicted_items(user_id, model, num_items):
    """Get list of predicted item rankings for a user."""
    user_vector = np.full((num_items,), user_id)
    item_vector = np.arange(num_items)
    predictions = model.predict([user_vector, item_vector]).flatten()
    return np.argsort(-predictions)  # returns indices of items in descending order of predicted rating

In [12]:
# Parameters
K = 10  # Example: top 10 recommendations

# Lists to store metric values for each user
user_precisions = []
user_recalls = []
user_ndcgs = []

# Loop through each user to calculate metrics
for user_id in range(num_users):
    actual_items = get_user_actual_items(user_id, test_ratings_matrix)
    predicted_items = get_user_predicted_items(user_id, mf_model, num_movies)

    user_precisions.append(precision_at_k(actual_items, predicted_items, K))
    user_recalls.append(recall_at_k(actual_items, predicted_items, K))
    user_ndcgs.append(ndcg_at_k(actual_items, predicted_items, K))

# Calculate average metric values
average_precision = sum(user_precisions) / len(user_precisions) if user_precisions else 0
average_recall = sum(user_recalls) / len(user_recalls) if user_recalls else 0
average_ndcg = sum(user_ndcgs) / len(user_ndcgs) if user_ndcgs else 0

# Print average metrics
print(f"Average Precision@{K}: {average_precision}")
print(f"Average Recall@{K}: {average_recall}")
print(f"Average NDCG@{K}: {average_ndcg}")

Average Precision@10: 0.03054082714740194
Average Recall@10: 0.008186207209281865
Average NDCG@10: 0.02773379038499734


In [12]:
import numpy as np

# Function to calculate Precision@K, Recall@K, and NDCG@K
def calculate_ranking_metrics(predictions, true_ratings, k=10, relevance_threshold=4):
    """
    Calculate Precision@K, Recall@K, and NDCG@K with relevance threshold.
    
    Args:
        predictions (np.array): Predicted scores for items, shape (num_users, num_items).
        true_ratings (np.array): True ratings, shape (num_users, num_items).
        k (int): Number of top recommendations to evaluate.
        relevance_threshold (float): Threshold above which items are considered relevant.
        
    Returns:
        dict: Precision@K, Recall@K, and NDCG@K scores averaged across all users.
    """
    precisions, recalls, ndcgs = [], [], []

    # Define a function to calculate DCG; used for both DCG@K and IDCG@K
    def dcg(scores):
        return np.sum((2**scores - 1) / np.log2(np.arange(2, scores.size + 2)))

    for user_predictions, user_true_ratings in zip(predictions, true_ratings):
        # Determine which items are considered relevant for this user
        relevant_items = user_true_ratings > relevance_threshold
        
        # If no items are relevant for this user, skip to avoid zero division
        if not np.any(relevant_items):
            continue
        
        # Rank items based on the predicted scores
        top_k_indices = np.argsort(user_predictions)[-k:]
        
        # Calculate precision, recall
        num_relevant_in_top_k = np.sum(relevant_items[top_k_indices])
        num_relevant_total = np.sum(relevant_items)
        
        precision_at_k = num_relevant_in_top_k / k
        recall_at_k = num_relevant_in_top_k / num_relevant_total
        
        # Calculate NDCG
        top_k_relevance = relevant_items[top_k_indices].astype(int)
        top_k_dcg = dcg(top_k_relevance)
        ideal_dcg = dcg(np.sort(relevant_items)[-k:][::-1].astype(int))  # Sort by true relevance
        
        ndcg_at_k = top_k_dcg / ideal_dcg if ideal_dcg > 0 else 0
        
        precisions.append(precision_at_k)
        recalls.append(recall_at_k)
        ndcgs.append(ndcg_at_k)
    
    # Calculate the average across all users for whom metrics were computed
    metrics = {
        "Precision@K": np.mean(precisions) if precisions else 0,
        "Recall@K": np.mean(recalls) if recalls else 0,
        "NDCG@K": np.mean(ndcgs) if ndcgs else 0
    }
    
    return metrics

# Define evaluation parameters
k = 10
relevance_threshold = 4

# Generate predictions for the test set
test_user_ids = test_ratings_matrix.nonzero()[0]
test_item_ids = test_ratings_matrix.nonzero()[1]

# Model prediction: providing the model with user and item indices to predict the ratings
test_predictions = mf_model.predict([test_user_ids, test_item_ids])

# Calculate ranking metrics (Precision@K, Recall@K, and NDCG@K)
metrics = calculate_ranking_metrics(test_predictions, test_ratings_matrix, k=k, relevance_threshold=relevance_threshold)

# Print the computed metrics
print(f"Precision@{k}: {metrics['Precision@K']:.4f}")
print(f"Recall@{k}: {metrics['Recall@K']:.4f}")
print(f"NDCG@{k}: {metrics['NDCG@K']:.4f}")

Precision@10: 0.0029
Recall@10: 0.0020
NDCG@10: 0.0067
