In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterSampler


df = pd.read_parquet('../data/processed/preprocessed_data_with_embeddings_and_ohe.parquet')


# This script reduces the size of the training set by sampling a fraction (10%) and uses RandomizedSearchCV to test different hyperparameters.

In [2]:
# Prepare the feature matrix and target vector
X = df.drop(columns=['rating'])
y = df['rating']

# Create mappings for userId and movieId to continuous indices
user_mapping = {user_id: idx for idx, user_id in enumerate(X['userId'].unique())}
movie_mapping = {movie_id: idx for idx, movie_id in enumerate(X['movieId'].unique())}

# Map userId and movieId to continuous indices
X['userId'] = X['userId'].map(user_mapping)
X['movieId'] = X['movieId'].map(movie_mapping)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)



In [3]:
# Get the number of unique users and items
num_users = len(user_mapping)
num_items = len(movie_mapping)

def create_model(embedding_dim=20, dropout_rate=0.2, dense_units=64, learning_rate=0.001, regularization=0.01):
    user_input = Input(shape=(1,), name='user_input')
    item_input = Input(shape=(1,), name='item_input')
    
    user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding', embeddings_regularizer=l2(regularization))(user_input)
    item_embedding = Embedding(input_dim=num_items, output_dim=embedding_dim, name='item_embedding', embeddings_regularizer=l2(regularization))(item_input)
    
    user_flat = Flatten()(user_embedding)
    item_flat = Flatten()(item_embedding)
    
    concat = Concatenate()([user_flat, item_flat])
    
    fc1 = Dense(dense_units, activation='relu')(concat)
    drop1 = Dropout(dropout_rate)(fc1)
    fc2 = Dense(dense_units // 2, activation='relu')(drop1)
    drop2 = Dropout(dropout_rate)(fc2)
    fc3 = Dense(dense_units // 4, activation='relu')(drop2)
    output = Dense(1)(fc3)
    
    model = Model(inputs=[user_input, item_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mae')
    
    return model



In [4]:
# Define the hyperparameter grid
param_dist = {
    'embedding_dim': [10, 20, 30],
    'dropout_rate': [0.2, 0.3, 0.4],
    'dense_units': [32, 64, 128],
    'learning_rate': [0.001, 0.01, 0.1],
    'regularization': [0.001, 0.01, 0.1]
}

# Use a smaller subset of the training data
X_train_sample = X_train.sample(frac=0.1, random_state=42)
y_train_sample = y_train.loc[X_train_sample.index]

train_user_input_sample = X_train_sample['userId']
train_item_input_sample = X_train_sample['movieId']

best_score = float('inf')
best_params = None
best_model = None

for params in ParameterSampler(param_dist, n_iter=10, random_state=42):
    model = create_model(**params)
    history = model.fit([train_user_input_sample, train_item_input_sample], y_train_sample, epochs=30, batch_size=256, verbose=0, validation_split=0.2)
    
    val_score = np.min(history.history['val_loss'])
    if val_score < best_score:
        best_score = val_score
        best_params = params
        best_model = model

In [5]:
# Display the best parameters and score
print(f'Best parameters: {best_params}')
print(f'Best score: {best_score}')

# Evaluate the best model on the full test set
y_pred = best_model.predict([X_test['userId'], X_test['movieId']])
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'R^2 Score: {r2:.4f}')

Best parameters: {'regularization': 0.001, 'learning_rate': 0.001, 'embedding_dim': 20, 'dropout_rate': 0.2, 'dense_units': 32}
Best score: 0.8069910407066345
[1m41185/41185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 3ms/step
RMSE: 0.9888
MAE: 0.7279
R^2 Score: 0.1260




In [6]:
from sklearn.model_selection import KFold

# 5-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_rmse = []
cv_mae = []
cv_r2 = []

for train_index, val_index in kf.split(X_train_sample):
    X_train_fold, X_val_fold = X_train_sample.iloc[train_index], X_train_sample.iloc[val_index]
    y_train_fold, y_val_fold = y_train_sample.iloc[train_index], y_train_sample.iloc[val_index]

    train_user_input_fold = X_train_fold['userId']
    train_item_input_fold = X_train_fold['movieId']
    val_user_input_fold = X_val_fold['userId']
    val_item_input_fold = X_val_fold['movieId']

    model = create_model(**best_params)
    model.fit([train_user_input_fold, train_item_input_fold], y_train_fold, epochs=30, batch_size=256, verbose=0)
    
    y_val_pred = model.predict([val_user_input_fold, val_item_input_fold])
    
    cv_rmse.append(mean_squared_error(y_val_fold, y_val_pred, squared=False))
    cv_mae.append(mean_absolute_error(y_val_fold, y_val_pred))
    cv_r2.append(r2_score(y_val_fold, y_val_pred))

print(f'5-Fold CV RMSE: {np.mean(cv_rmse):.4f} ± {np.std(cv_rmse):.4f}')
print(f'5-Fold CV MAE: {np.mean(cv_mae):.4f} ± {np.std(cv_mae):.4f}')
print(f'5-Fold CV R^2: {np.mean(cv_r2):.4f} ± {np.std(cv_r2):.4f}')


[1m2472/2472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step




[1m2472/2472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step




[1m2472/2472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step




[1m2472/2472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step




[1m2472/2472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step
5-Fold CV RMSE: 0.9716 ± 0.0107
5-Fold CV MAE: 0.7331 ± 0.0046
5-Fold CV R^2: 0.1552 ± 0.0176


