In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dot, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, Nadam, SGD, RMSprop
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [26]:
def load_and_binarize_ml1m(filepath, threshold=4):
    """
    Load the MovieLens 1M dataset, binarize ratings, and correctly handle user and movie IDs.
    
    Args:
    - filepath (str): Path to the MovieLens 1M dataset.
    - threshold (int): Threshold for binarizing ratings (ratings >= threshold are positive).
    
    Returns:
    - ratings_matrix (np.ndarray): Binarized ratings matrix.
    - user_id_mapping (dict): Mapping of original user IDs to matrix indices.
    - movie_id_mapping (dict): Mapping of original movie IDs to matrix indices.
    """
    # Load dataset
    df = pd.read_csv(filepath, sep='::', engine='python', names=['user_id', 'movie_id', 'rating', 'timestamp'])
    
    # Binarize ratings
    df['rating'] = (df['rating'] >= threshold).astype(int)
    
    # Create mappings for user and movie IDs
    user_ids = df['user_id'].unique()
    movie_ids = df['movie_id'].unique()
    user_id_mapping = {user_id: index for index, user_id in enumerate(user_ids)}
    movie_id_mapping = {movie_id: index for index, movie_id in enumerate(movie_ids)}
    
    # Initialize ratings matrix
    num_users, num_movies = len(user_ids), len(movie_ids)
    ratings_matrix = np.zeros((num_users, num_movies), dtype=int)
    
    # Fill ratings matrix using mappings
    for row in df.itertuples(index=False):
        user_index = user_id_mapping[row.user_id]
        movie_index = movie_id_mapping[row.movie_id]
        ratings_matrix[user_index, movie_index] = row.rating
    
    return ratings_matrix, user_id_mapping, movie_id_mapping

In [27]:
def split_ratings(ratings_matrix, test_ratio=0.2, random_state=42):
    """
    Split ratings into training and testing sets by masking a percentage of ratings.
    
    Args:
    - ratings_matrix (np.ndarray): Full binarized ratings matrix.
    - test_ratio (float): Fraction of ratings to use as the test set.
    - random_state (int): Seed for reproducibility.
    
    Returns:
    - train_matrix (np.ndarray): Training set ratings matrix.
    - test_matrix (np.ndarray): Test set ratings matrix.
    """
    np.random.seed(random_state)
    mask = np.random.rand(*ratings_matrix.shape) < test_ratio
    train_matrix = np.copy(ratings_matrix)
    test_matrix = np.copy(ratings_matrix)

    # Apply mask
    train_matrix[mask] = 0
    test_matrix[~mask] = 0

    return train_matrix, test_matrix

In [29]:
# Load, binarize, and split the dataset
ratings_matrix, user_id_mapping, movie_id_mapping = load_and_binarize_ml1m('./data/MovieLens_1M/movielens_1m_dataset.dat')
train_matrix, test_matrix = split_ratings(ratings_matrix)

# Print the shapes of the matrices
print("Shape of the full ratings matrix:", ratings_matrix.shape)
print("Shape of the training matrix:", train_matrix.shape)
print("Shape of the test matrix:", test_matrix.shape)

# Calculate and print the density of the full ratings matrix
non_zero_ratings = np.count_nonzero(ratings_matrix)
total_possible_ratings = ratings_matrix.size
density = (non_zero_ratings / total_possible_ratings) * 100
print(f"Density of the full ratings matrix: {density:.2f}%")

# Calculate and print the number of positive ratings in the training and test sets
train_positives = np.count_nonzero(train_matrix)
test_positives = np.count_nonzero(test_matrix)
print(f"Number of positive ratings in the training set: {train_positives}")
print(f"Number of positive ratings in the test set: {test_positives}")

# Calculate and print the distribution of ratings across users and movies in the training set
print(f"Average number of ratings per user in the training set: {np.mean(np.count_nonzero(train_matrix, axis=1)):.2f}")
print(f"Average number of ratings per movie in the training set: {np.mean(np.count_nonzero(train_matrix, axis=0)):.2f}")

# Verify that every user and movie has at least one rating in the training set
users_with_ratings_train = np.any(train_matrix > 0, axis=1).sum()
movies_with_ratings_train = np.any(train_matrix > 0, axis=0).sum()
print(f"Number of users with at least one rating in the training set: {users_with_ratings_train} / {train_matrix.shape[0]}")
print(f"Number of movies with at least one rating in the training set: {movies_with_ratings_train} / {train_matrix.shape[1]}")


Shape of the full ratings matrix: (6040, 3706)
Shape of the training matrix: (6040, 3706)
Shape of the test matrix: (6040, 3706)
Density of the full ratings matrix: 2.57%
Number of positive ratings in the training set: 460208
Number of positive ratings in the test set: 115073
Average number of ratings per user in the training set: 76.19
Average number of ratings per movie in the training set: 124.18
Number of users with at least one rating in the training set: 6038 / 6040
Number of movies with at least one rating in the training set: 3490 / 3706


In [37]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
from tensorflow.keras.optimizers import Adam

latent_dim = 32
num_users, num_movies = ratings_matrix.shape

# Inputs
user_input = Input(shape=(1,), name='user_input', dtype='int32')
item_input = Input(shape=(1,), name='item_input', dtype='int32')

# Embeddings
user_embedding = Embedding(num_users, latent_dim, name='user_embedding')(user_input)
item_embedding = Embedding(num_movies, latent_dim, name='item_embedding')(item_input)

# Flatten embeddings
user_vec = Flatten()(user_embedding)
item_vec = Flatten()(item_embedding)

# Dot product of user and item embeddings
dot_product = Dot(axes=1)([user_vec, item_vec])

# Output layer
output = Dense(1, activation='sigmoid')(dot_product)

# Model
binary_model = Model(inputs=[user_input, item_input], outputs=output)

# Compile the model
binary_model.compile(optimizer=Adam(0.001), loss='binary_crossentropy')


In [39]:
# Training data
train_user_ids, train_item_ids = np.where(train_matrix > 0)
train_labels = train_matrix[train_user_ids, train_item_ids]

# Test data
test_user_ids, test_item_ids = np.where(test_matrix > 0)
test_labels = test_matrix[test_user_ids, test_item_ids]

In [40]:
binary_model.fit([train_user_ids, train_item_ids], train_labels, 
          epochs=5, batch_size=64, 
          validation_split=0.1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f6e56711450>

In [44]:
import numpy as np
from sklearn.metrics import ndcg_score

def precision_at_k(true_matrix, predicted_scores_matrix, k=10):
    precisions = []
    
    for user_id in range(true_matrix.shape[0]):
        true_items = np.where(true_matrix[user_id] > 0)[0]
        top_k_predictions = np.argsort(-predicted_scores_matrix[user_id])[:k]
        
        tp = len(set(true_items) & set(top_k_predictions))
        if len(true_items) > 0:
            precisions.append(tp / min(k, len(true_items)))
        else:
            precisions.append(0.0)
    
    return np.mean(precisions)

def recall_at_k(true_matrix, predicted_scores_matrix, k=10):
    recalls = []
    
    for user_id in range(true_matrix.shape[0]):
        true_items = np.where(true_matrix[user_id] > 0)[0]
        top_k_predictions = np.argsort(-predicted_scores_matrix[user_id])[:k]
        
        tp = len(set(true_items) & set(top_k_predictions))
        if len(true_items) > 0:
            recalls.append(tp / len(true_items))
        else:
            recalls.append(0.0)
    
    return np.mean(recalls)

def calculate_ndcg_at_k(true_matrix, predicted_scores_matrix, k=10):
    ndcg_scores = []
    
    for user_id in range(true_matrix.shape[0]):
        true_items = true_matrix[user_id]
        scores = predicted_scores_matrix[user_id]
        ndcg = ndcg_score([true_items], [scores], k=k)
        ndcg_scores.append(ndcg)
    
    return np.mean(ndcg_scores)

In [42]:
def generate_prediction_matrix(model, num_users, num_movies):
    user_indices, item_indices = np.where(np.ones((num_users, num_movies)))
    predictions = model.predict([user_indices, item_indices]).flatten()
    prediction_matrix = predictions.reshape(num_users, num_movies)
    return prediction_matrix

predicted_scores_matrix = generate_prediction_matrix(binary_model, num_users, num_movies)



In [48]:
k = 10
precision = precision_at_k(test_matrix, predicted_scores_matrix, k=k)
recall = recall_at_k(test_matrix, predicted_scores_matrix, k=k)
ndcg = calculate_ndcg_at_k(test_matrix, predicted_scores_matrix, k=k)

print(f"Precision at {k}: {precision:.4f}")
print(f"Recall at {k}: {recall:.4f}")
print(f"NDCG at {k}: {ndcg:.4f}")

Precision at 10: 0.0170
Recall at 10: 0.0115
NDCG at 10: 0.0138
