In [12]:
import numpy as np
import pandas as pd

def load_and_preprocess_data(base_path='./', delimiter='\t', threshold=3):
    """
    Load MovieLens data and preprocess it by binarizing the ratings.

    Args:
    base_path (str): Base path to the dataset files.
    delimiter (str): Delimiter used in the dataset files.
    threshold (int): Threshold rating to decide likes and dislikes.

    Returns:
    Tuple: Number of users, number of movies, binarized training and testing rating matrices.
    """
    # Load training and testing data
    train_data = pd.read_csv(base_path + 'movielens_100k_u1.base', sep=delimiter, header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])
    test_data = pd.read_csv(base_path + 'movielens_100k_u1.test', sep=delimiter, header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])

    # Determine the number of users and movies
    num_users = max(train_data['user_id'].max(), test_data['user_id'].max())
    num_movies = max(train_data['movie_id'].max(), test_data['movie_id'].max())

    # Initialize matrices to store binarized ratings
    train_ratings = np.zeros((num_users, num_movies))
    test_ratings = np.zeros((num_users, num_movies))

    # Fill the matrices with binarized ratings
    for row in train_data.itertuples():
        train_ratings[row.user_id - 1, row.movie_id - 1] = 1 if row.rating >= threshold else 0
    for row in test_data.itertuples():
        test_ratings[row.user_id - 1, row.movie_id - 1] = 1 if row.rating >= threshold else 0

    return num_users, num_movies, train_ratings, test_ratings

# Load and preprocess data
num_users, num_movies, train_ratings_binarized, test_ratings_binarized = load_and_preprocess_data('./data/MovieLens_100K/')

In [13]:
print("Shape of train_ratings:", train_ratings_binarized.shape)
print("Shape of test_ratings:", test_ratings_binarized.shape)

Shape of train_ratings: (943, 1682)
Shape of test_ratings: (943, 1682)


In [14]:
def check_distribution(ratings_matrix):
    positive_count = np.sum(ratings_matrix >= 1)
    negative_count = np.sum(ratings_matrix == 0)
    total_count = positive_count + negative_count
    print(f"Positive samples: {positive_count} ({positive_count / total_count * 100:.2f}%)")
    print(f"Negative samples: {negative_count} ({negative_count / total_count * 100:.2f}%)")

print("Training set distribution:")
check_distribution(train_ratings_binarized)

print("\nTesting set distribution:")
check_distribution(test_ratings_binarized)

Training set distribution:
Positive samples: 66103 (4.17%)
Negative samples: 1520023 (95.83%)

Testing set distribution:
Positive samples: 16417 (1.04%)
Negative samples: 1569709 (98.96%)


In [21]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.metrics import AUC, Precision, Recall, F1Score

def build_binary_recommendation_model(num_users, num_movies, latent_dim=32):
    """
    Build a binary classification recommendation model.

    Args:
    - num_users (int): The total number of users in the dataset.
    - num_movies (int): The total number of movies in the dataset.
    - latent_dim (int): The number of dimensions in the embedding space.

    Returns:
    - Model: A Keras model instance.
    """
    # User and movie input layers
    user_input = Input(shape=(1,), name='user_input')
    movie_input = Input(shape=(1,), name='movie_input')

    # Embedding layers for users and movies
    user_embedding = Embedding(input_dim=num_users, output_dim=latent_dim, name='user_embedding')(user_input)
    movie_embedding = Embedding(input_dim=num_movies, output_dim=latent_dim, name='movie_embedding')(movie_input)

    # Flatten the embeddings and compute the dot product
    user_vector = Flatten(name='flattened_user_embedding')(user_embedding)
    movie_vector = Flatten(name='flattened_movie_embedding')(movie_embedding)
    interaction = Dot(axes=1, name='interaction_layer')([user_vector, movie_vector])

    # Output layer with a sigmoid activation function for binary classification
    output = Dense(1, activation='sigmoid', name='output_layer')(interaction)

    # Compile the model with additional metrics
    model = Model(inputs=[user_input, movie_input], outputs=output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Nadam(learning_rate=0.001),
        metrics=['accuracy', Precision(), Recall(), F1Score()]  
    )

    return model

# Instantiate the model with additional metrics
binary_model = build_binary_recommendation_model(num_users, num_movies)
binary_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 movie_input (InputLayer)    [(None, 1)]                  0         []                            
                                                                                                  
 user_embedding (Embedding)  (None, 1, 32)                30176     ['user_input[0][0]']          
                                                                                                  
 movie_embedding (Embedding  (None, 1, 32)                53824     ['movie_input[0][0]']         
 )                                                                                          

 Flatten)                                                                                         
                                                                                                  
 flattened_movie_embedding   (None, 32)                   0         ['movie_embedding[0][0]']     
 (Flatten)                                                                                        
                                                                                                  
 interaction_layer (Dot)     (None, 1)                    0         ['flattened_user_embedding[0][
                                                                    0]',                          
                                                                     'flattened_movie_embedding[0]
                                                                    [0]']                         
                                                                                                  
 output_la

In [22]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)

# Extracting the indices of non-zero ratings for training and validation
train_user_ids, train_item_ids = train_ratings_binarized.nonzero()
train_ratings = train_ratings_binarized[train_user_ids, train_item_ids]

test_user_ids, test_item_ids = test_ratings_binarized.nonzero()
test_ratings = test_ratings_binarized[test_user_ids, test_item_ids]


# Training the model
binary_model.fit(
    [train_user_ids, train_item_ids],
    train_ratings,
    epochs=50,
    batch_size=64,
    shuffle=True,
    validation_data=([test_user_ids, test_item_ids], test_ratings),
    callbacks=[early_stopping],
)

# Check the number of positive and negative samples in the test set
positive_samples = np.sum(test_ratings_binarized == 1)
negative_samples = np.sum(test_ratings_binarized == 0)

print("Number of positive samples in the test set:", positive_samples)
print("Number of negative samples in the test set:", negative_samples)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Number of positive samples in the test set: 16417
Number of negative samples in the test set: 1569709


In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Generate predictions for the test set
test_predictions_probs = binary_model.predict([test_user_ids, test_item_ids]).flatten()
test_predictions = (test_predictions_probs > 0.5).astype(int)

# Calculate evaluation metrics
accuracy = accuracy_score(test_ratings, test_predictions)
precision = precision_score(test_ratings, test_predictions)
recall = recall_score(test_ratings, test_predictions)
f1 = f1_score(test_ratings, test_predictions)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
