In [34]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf

def load_and_preprocess_movielens_100k(base_path='./', delimiter='\t', batch_size=64):
    """
    Load the MovieLens 100K u1 split and build batched TensorFlow datasets for a user-based autoencoder.

    Each ratings file is pivoted into a dense (num_users, num_movies) float32 matrix in which
    entries without a rating are 0. Every row (one user) is emitted as an (input, target) tuple
    whose two elements are identical.

    Args:
        base_path (str): Directory containing `movielens_100k_u1.base` and `movielens_100k_u1.test`.
            A trailing path separator is optional.
        delimiter (str): The delimiter used in the dataset files.
        batch_size (int): The size of batches to produce.

    Returns:
        tuple: (num_users, num_movies, train_dataset, test_dataset). The counts are plain ints taken
        from the largest IDs seen in either file; the training dataset is shuffled and batched, the
        test dataset is batched in file-independent user-ID order.

    Raises:
        ValueError: If a file contains a user or movie ID smaller than 1 (IDs are treated as 1-based).
    """
    column_names = ['user_id', 'movie_id', 'rating', 'timestamp']

    def read_ratings(filename):
        # os.path.join tolerates base_path with or without a trailing separator.
        return pd.read_csv(os.path.join(base_path, filename), sep=delimiter, header=None, names=column_names)

    train_data = read_ratings('movielens_100k_u1.base')
    test_data = read_ratings('movielens_100k_u1.test')

    # Matrix dimensions come from the largest ID in either split so both matrices share one shape.
    num_users = int(max(train_data['user_id'].max(), test_data['user_id'].max()))
    num_movies = int(max(train_data['movie_id'].max(), test_data['movie_id'].max()))

    def to_dense_matrix(ratings):
        user_ids = ratings['user_id'].to_numpy()
        movie_ids = ratings['movie_id'].to_numpy()
        # An ID below 1 would become a negative index and silently write into the wrong row/column.
        if (user_ids < 1).any() or (movie_ids < 1).any():
            raise ValueError('user_id and movie_id must be 1-based positive integers')

        # Allocate float32 up front so no per-element cast is needed inside the tf.data pipeline.
        matrix = np.zeros((num_users, num_movies), dtype=np.float32)
        # Shift the 1-based IDs to 0-based indices and scatter all ratings in one vectorized assignment.
        matrix[user_ids - 1, movie_ids - 1] = ratings['rating'].to_numpy()
        return matrix

    train_ratings_matrix = to_dense_matrix(train_data)
    test_ratings_matrix = to_dense_matrix(test_data)

    # Each element is (input, target) with identical content, as an autoencoder expects.
    train_dataset = tf.data.Dataset.from_tensor_slices((train_ratings_matrix, train_ratings_matrix))
    test_dataset = tf.data.Dataset.from_tensor_slices((test_ratings_matrix, test_ratings_matrix))

    # A buffer as large as the dataset gives a full shuffle of the users; only training is shuffled.
    shuffle_buffer_size = num_users  # Adjust based on available memory
    train_dataset = train_dataset.shuffle(buffer_size=shuffle_buffer_size).batch(batch_size)
    test_dataset = test_dataset.batch(batch_size)

    return num_users, num_movies, train_dataset, test_dataset

In [35]:
# Run configuration: where the split files live and how many users go into one batch.
base_path = './data/MovieLens_100K/'  # Directory holding the MovieLens 100K split files
batch_size = 64  # Tune to the available memory and the model's needs

# Build the batched train/test pipelines and pick up the rating-matrix dimensions.
num_users, num_movies, train_dataset, test_dataset = load_and_preprocess_movielens_100k(
    base_path=base_path,
    batch_size=batch_size,
)

# Report the dataset dimensions.
print(f'Number of users: {num_users}')
print(f'Number of movies: {num_movies}')

# Peek at a single training batch; inputs and targets should both be (batch_size, num_movies).
train_preview = train_dataset.take(1)
for inputs, targets in train_preview:
    print(f'Shape of training data inputs: {inputs.shape}')
    print(f'Shape of training data targets: {targets.shape}')

# Same check for a single test batch.
test_preview = test_dataset.take(1)
for inputs, targets in test_preview:
    print(f'Shape of testing data inputs: {inputs.shape}')
    print(f'Shape of testing data targets: {targets.shape}')

Number of users: 943
Number of movies: 1682
Shape of training data inputs: (64, 1682)
Shape of training data targets: (64, 1682)
Shape of testing data inputs: (64, 1682)
Shape of testing data targets: (64, 1682)


In [60]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
import tensorflow.keras.backend as K

def root_mean_squared_error(y_true, y_pred):
    """
    Root mean squared error over every element of the batch, usable as a Keras metric.

    Implemented with core TensorFlow ops rather than the legacy `keras.backend` math helpers,
    which are not available in Keras 3. The function name is kept because Keras derives the
    logged metric name from it (e.g. `val_root_mean_squared_error`).

    Args:
        y_true: Ground-truth values; cast to the dtype of `y_pred` before subtraction.
        y_pred: Predicted values with the same shape as `y_true`.

    Returns:
        Scalar tensor: sqrt(mean((y_pred - y_true) ** 2)) taken over all elements.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Align dtypes so integer or float64 targets do not break the subtraction.
    y_true = tf.cast(y_true, y_pred.dtype)
    return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))

def build_user_autoencoder(num_movies, encoding_dim=128):
    """
    Assemble and compile the user-based autoencoder.

    The network maps a length-`num_movies` vector through two Dense layers of width
    `encoding_dim` and back to `num_movies` outputs; every layer uses a linear activation.

    Args:
        num_movies (int): Width of the input and output layers.
        encoding_dim (int): Width of the two hidden Dense layers (the bottleneck size).

    Returns:
        A Keras Model compiled with the Adam optimizer, MSE loss, and RMSE/MAE metrics.
    """
    ratings_in = Input(shape=(num_movies,), name='Input_Layer')

    # (units, layer name) for each Dense stage, applied in this order.
    dense_stack = (
        (encoding_dim, 'Encoder_Layer'),
        (encoding_dim, 'Decoder_Layer'),
        (num_movies, 'Output_Layer'),
    )

    tensor = ratings_in
    for units, layer_name in dense_stack:
        tensor = Dense(units, activation='linear', name=layer_name)(tensor)

    model = Model(inputs=ratings_in, outputs=tensor, name='User_Autoencoder')
    model.compile(
        optimizer='adam',
        loss='mse',
        metrics=[root_mean_squared_error, 'mae'],
    )
    return model


# Bottleneck width used for this run (passed explicitly instead of the builder's default).
encoding_dim = 500
autoencoder_model = build_user_autoencoder(
    num_movies=num_movies,
    encoding_dim=encoding_dim,
)

# Print the layer-by-layer architecture and parameter counts as a sanity check.
autoencoder_model.summary()

Model: "User_Autoencoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_Layer (InputLayer)    [(None, 1682)]            0         
                                                                 
 Encoder_Layer (Dense)       (None, 500)               841500    
                                                                 
 Decoder_Layer (Dense)       (None, 500)               250500    
                                                                 
 Output_Layer (Dense)        (None, 1682)              842682    
                                                                 
Total params: 1934682 (7.38 MB)
Trainable params: 1934682 (7.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [61]:
from tensorflow.keras.callbacks import EarlyStopping


# Stop once validation RMSE has failed to improve for 5 consecutive epochs and roll the model
# back to its best weights. The monitored metric is a custom function, so the direction is
# stated explicitly instead of relying on Keras' name-based auto-detection.
early_stopping = EarlyStopping(
    monitor='val_root_mean_squared_error',
    mode='min',
    patience=5,
    restore_best_weights=True,
)

# Upper bound only; early stopping is expected to end training well before this.
epochs = 1000

# NOTE(review): the test split is used as validation data here, so early stopping and
# restore_best_weights select the model on the same data that is later reported as the
# test score. Consider carving a validation split out of the training data.
history = autoencoder_model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=epochs,
    callbacks=[early_stopping],
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [62]:
# Score the whole test split with a single predict() call. Keras consumes only the input half
# of each (input, target) batch, and one call avoids the per-batch overhead and progress-bar
# noise of invoking predict() inside a Python loop.
all_predictions = autoencoder_model.predict(test_dataset, verbose=0)

# test_dataset is batched without shuffling, so iterating it again yields the targets in the
# same row order as the predictions above.
all_true_ratings = np.concatenate(
    [batch_targets.numpy() for _, batch_targets in test_dataset],
    axis=0,
)

# NOTE(review): test_dataset uses the test ratings as both input and target, so the numbers
# below measure how well the model reconstructs ratings it was shown, not held-out prediction.

# Keep only rated entries (unrated items are stored as zeros in the rating matrix).
rated_indices = np.nonzero(all_true_ratings)
filtered_predictions = all_predictions[rated_indices]
filtered_true_ratings = all_true_ratings[rated_indices]

# Calculate MSE, RMSE, and MAE for rated items from a single error vector.
errors = filtered_predictions - filtered_true_ratings
mse = np.mean(errors ** 2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(errors))

print(f"Test MSE: {mse}")
print(f"Test RMSE: {rmse}")
print(f"Test MAE: {mae}")

Test MSE: 2.1091272830963135
Test RMSE: 1.452283501625061
Test MAE: 1.1573264598846436


In [69]:
import numpy as np

def calculate_ranking_metrics(predictions, true_ratings, k=10, relevance_threshold=4):
    """
    Calculate Precision@K, Recall@K, and NDCG@K using binary relevance.

    An item is relevant for a user when its true rating is strictly greater than
    `relevance_threshold` (with the default of 4, a rating of exactly 4 is not relevant).
    Users without any relevant item are skipped and do not contribute to the averages.

    Args:
        predictions (array-like): Predicted scores for items, shape (num_users, num_items).
        true_ratings (array-like): True ratings, shape (num_users, num_items).
        k (int): Number of top recommendations to evaluate; must be at least 1.
        relevance_threshold (float): Threshold above which items are considered relevant.

    Returns:
        dict: "Precision@K", "Recall@K", and "NDCG@K" averaged across the evaluated users
        (all 0 when no user has a relevant item).

    Raises:
        ValueError: If `k` is smaller than 1, if the two inputs differ in shape, or if
            non-empty inputs are not 2-D.
    """
    # k == 0 would make `[-k:]` select every item and divide by zero, so reject it up front.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    predictions = np.asarray(predictions)
    true_ratings = np.asarray(true_ratings)
    # zip() would silently truncate to the shorter input, hiding misaligned data.
    if predictions.shape != true_ratings.shape:
        raise ValueError(
            f"predictions and true_ratings must have the same shape, "
            f"got {predictions.shape} and {true_ratings.shape}"
        )

    precisions, recalls, ndcgs = [], [], []

    if predictions.size > 0:
        if predictions.ndim != 2:
            raise ValueError(
                f"expected 2-D arrays of shape (num_users, num_items), got {predictions.ndim}-D"
            )

        # Position discounts 1 / log2(rank + 1) for ranks 1..k; identical for every user.
        discounts = 1.0 / np.log2(np.arange(2, k + 2))

        for user_predictions, user_true_ratings in zip(predictions, true_ratings):
            relevant_items = user_true_ratings > relevance_threshold
            num_relevant_total = int(np.count_nonzero(relevant_items))

            # Recall and NDCG are undefined without relevant items, so skip this user.
            if num_relevant_total == 0:
                continue

            # Indices of the k highest predicted scores, best first.
            top_k_indices = np.argsort(user_predictions)[-k:][::-1]
            hits = relevant_items[top_k_indices]
            num_relevant_in_top_k = int(np.count_nonzero(hits))

            # With binary relevance the gain 2**rel - 1 equals rel, so DCG is the sum of the
            # discounts at the hit positions; the ideal ranking puts all relevant items first.
            top_k_dcg = float(np.sum(hits * discounts[:hits.size]))
            ideal_dcg = float(np.sum(discounts[:min(k, num_relevant_total)]))

            precisions.append(num_relevant_in_top_k / k)
            recalls.append(num_relevant_in_top_k / num_relevant_total)
            ndcgs.append(top_k_dcg / ideal_dcg if ideal_dcg > 0 else 0.0)

    # Calculate the average across all users for whom metrics were computed
    metrics = {
        "Precision@K": float(np.mean(precisions)) if precisions else 0.0,
        "Recall@K": float(np.mean(recalls)) if recalls else 0.0,
        "NDCG@K": float(np.mean(ndcgs)) if ndcgs else 0.0,
    }

    return metrics

# Example usage:
# Evaluate the top-10 recommendations, counting ratings above 4 as relevant.
k = 10
relevance_threshold = 4
metrics = calculate_ranking_metrics(
    predictions=all_predictions,
    true_ratings=all_true_ratings,
    k=k,
    relevance_threshold=relevance_threshold,
)

# One line per metric, four decimal places each.
print(
    f"Precision@{k}: {metrics['Precision@K']:.4f}",
    f"Recall@{k}: {metrics['Recall@K']:.4f}",
    f"NDCG@{k}: {metrics['NDCG@K']:.4f}",
    sep="\n",
)

Precision@10: 0.4988
Recall@10: 0.7053
NDCG@10: 0.8224
