In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
import math
import matplotlib.pyplot as plt
import os # For checking file paths
import time # To time model training

# --- Configuration ---
# Input data: two preprocessed rating CSVs, each paired with the label used
# in log output and on the plots.
CSV_FILE_PATH_1 = './Datasets/ratings_preprocessed_D1.csv'
DATASET_NAME_1 = "Dataset 1"

CSV_FILE_PATH_2 = './Datasets/ratings_preprocessed_D4.csv'
DATASET_NAME_2 = "Dataset 2"

# Column names read from each CSV.
USER_COL = 'userId'
ITEM_COL = 'movieId'

# --- Model hyperparameters ---
ENCODING_DIM = 64            # width of the bottleneck ('embedding') layer
DENSE_LAYERS = [512, 256]    # encoder widths; the decoder mirrors them in reverse
ACTIVATION = 'selu'
OUT_ACTIVATION = 'sigmoid'
DROPOUT_RATE = 0.1
# NOTE(review): the captured run below shows the loss jumping from ~0.10 to
# ~2.34 between epochs 4 and 5 -- a smaller rate may be worth trying.
LEARNING_RATE = 0.01
EPOCHS = 15
BATCH_SIZE = 64

# --- Evaluation ---
K = 10  # cut-off for HR@K / NDCG@K

# --- Plot output ---
SAVE_PLOTS = True
PLOT_SAVE_DIR = "./Training Results"
COMPARISON_PLOT_FILENAME = "dae_comparison_metrics.png"
INDIVIDUAL_LOSS_PLOT_PREFIX = "dae_loss_curve"

# Make sure the output directory exists before anything tries to save into it.
if SAVE_PLOTS:
    if not os.path.exists(PLOT_SAVE_DIR):
        os.makedirs(PLOT_SAVE_DIR)
        print(f"Created directory for saving plots: {PLOT_SAVE_DIR}")

# --- Helper Functions ---

def _split_user_items(items):
    """Split one user's item list into ``(train_items, test_items)``.

    A user with fewer than two interactions keeps everything for training and
    gets an empty test list. Otherwise ``train_test_split`` holds out 20% of
    the items with a fixed seed (so the split is reproducible); if that fails
    or yields no test items, the last item is held out instead.
    """
    if len(items) < 2:
        return items, []
    try:
        train_items, test_items = train_test_split(items, test_size=0.2, random_state=42)
    except ValueError:
        train_items, test_items = items[:-1], items[-1:]
    if not test_items:
        train_items, test_items = items[:-1], items[-1:]
    return train_items, test_items


def load_and_preprocess_for_dae(csv_path, user_col, item_col):
    """Load an interaction CSV and build per-user train/test data for the DAE.

    The file is reduced to unique ``(user_col, item_col)`` pairs, both columns
    are label-encoded to contiguous integer ids, and each user's items are
    split into a training part and a held-out part.

    Args:
        csv_path: Path of the CSV file to read.
        user_col: Name of the user-id column.
        item_col: Name of the item-id column.

    Returns:
        ``None`` if the file is missing, a column is absent, or loading fails
        (an error message is printed in each case). Otherwise a dict with:

        * ``n_users`` / ``n_items`` -- number of distinct encoded ids.
        * ``user_encoder`` / ``item_encoder`` -- the fitted ``LabelEncoder``s.
        * ``user_vectors_train`` -- float32 ``(n_users, n_items)`` matrix with
          1.0 at every training interaction.
        * ``user_vectors_train_target`` -- independent copy of the above.
        * ``user_vectors_test_input`` -- independent copy of the above (the
          held-out items are not present in it).
        * ``train_interactions`` / ``test_interactions`` -- ``defaultdict``s
          mapping encoded user id to a list of encoded item ids.
        * ``users_in_test`` -- encoded ids of users with held-out items.
    """
    print(f"\n--- Processing Dataset for DAE: {csv_path} ---")
    print("Loading data...")
    if not os.path.exists(csv_path):
        print(f"Error: CSV file not found at {csv_path}")
        return None

    try:
        df = pd.read_csv(csv_path)
        print(f"Data loaded successfully. Shape: {df.shape}")
        df = df[[user_col, item_col]].copy().drop_duplicates()
    except KeyError as e:
        print(f"Error: Column {e} not found in {csv_path}.")
        return None
    except Exception as e:
        # Deliberately broad: any load failure skips this dataset instead of
        # aborting the whole run.
        print(f"Error loading or processing {csv_path}: {e}")
        return None

    print("Preprocessing data (Label Encoding)...")
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()
    df['user_id_encoded'] = user_encoder.fit_transform(df[user_col])
    df['item_id_encoded'] = item_encoder.fit_transform(df[item_col])

    n_users = df['user_id_encoded'].nunique()
    n_items = df['item_id_encoded'].nunique()
    print(f"Number of unique users (encoded): {n_users}")
    print(f"Number of unique items (encoded): {n_items}")

    user_item_interactions = df.groupby('user_id_encoded')['item_id_encoded'].apply(list).to_dict()

    print("Splitting data (per user)...")
    train_interactions = defaultdict(list)  # encoded {user: [items]} used for masking
    test_interactions = defaultdict(list)   # encoded {user: [items]} used as ground truth
    user_vectors_train = np.zeros((n_users, n_items), dtype=np.float32)
    users_in_test = []

    # LabelEncoder emits ids in [0, n_classes), so every id is a valid
    # row/column index and a whole item list can be written in one assignment.
    for user_id, items in user_item_interactions.items():
        train_items, test_items = _split_user_items(items)
        if train_items:
            train_interactions[user_id].extend(train_items)
            user_vectors_train[user_id, train_items] = 1.0
        if test_items:
            test_interactions[user_id].extend(test_items)
            users_in_test.append(user_id)

    print(f"Users with items in test set: {len(users_in_test)}")

    # The inference input and the reconstruction target both equal the
    # training matrix; separate copies keep later in-place edits isolated.
    user_vectors_test_input = user_vectors_train.copy()
    user_vectors_train_target = user_vectors_train.copy()

    return {
        "n_users": n_users, "n_items": n_items,
        "user_encoder": user_encoder, "item_encoder": item_encoder,
        "user_vectors_train": user_vectors_train,
        "user_vectors_train_target": user_vectors_train_target,
        "user_vectors_test_input": user_vectors_test_input,
        "train_interactions": train_interactions,
        "test_interactions": test_interactions,
        "users_in_test": users_in_test,
    }

def build_autoencoder(n_items, encoding_dim, dense_layers, activation, out_activation, dropout_rate):
    """Assemble the symmetric dense autoencoder as a Keras functional model.

    Args:
        n_items: Size of the input and output vectors.
        encoding_dim: Width of the bottleneck layer (named ``'embedding'``).
        dense_layers: Encoder layer widths, outermost first; the decoder uses
            the same widths in reverse order.
        activation: Activation for every hidden layer and the bottleneck.
        out_activation: Activation of the final reconstruction layer.
        dropout_rate: Rate for the dropout applied to the input and after
            every hidden dense layer.

    Returns:
        An uncompiled ``keras.Model`` named ``'Autoencoder'``.
    """
    inputs = keras.Input(shape=(n_items,))

    # Encoder: dropout directly on the input, then Dense + Dropout per width.
    hidden = layers.Dropout(dropout_rate)(inputs)
    for width in dense_layers:
        hidden = layers.Dense(width, activation=activation)(hidden)
        hidden = layers.Dropout(dropout_rate)(hidden)

    # Bottleneck (no dropout after it).
    bottleneck = layers.Dense(encoding_dim, activation=activation, name='embedding')(hidden)

    # Decoder: mirror of the encoder stack.
    hidden = bottleneck
    for width in reversed(dense_layers):
        hidden = layers.Dense(width, activation=activation)(hidden)
        hidden = layers.Dropout(dropout_rate)(hidden)

    reconstruction = layers.Dense(n_items, activation=out_activation)(hidden)
    return keras.Model(inputs, reconstruction, name='Autoencoder')

def get_recommendations_dae(user_id_encoded, user_vector_input, model, train_items_set, k):
    """Return up to ``k`` unseen item indices ranked by the model's scores.

    Args:
        user_id_encoded: Encoded user id. Unused; kept so existing callers
            keep working.
        user_vector_input: 1-D interaction vector for the user; it is given a
            leading batch axis before being passed to ``model.predict``.
        model: Object whose ``predict(batch, verbose=0)`` returns one score
            row per input row.
        train_items_set: Item indices the user has already interacted with;
            these are never recommended. Indices outside the score vector are
            ignored.
        k: Maximum number of recommendations.

    Returns:
        A list of item indices, best first. It holds fewer than ``k`` entries
        when the user has fewer than ``k`` unseen items, so already-seen items
        are never used as padding.
    """
    scores = np.asarray(
        model.predict(np.expand_dims(user_vector_input, axis=0), verbose=0)[0]
    )
    n_scores = scores.shape[0]

    # Mark seen items. The lower bound matters: a negative index would
    # otherwise wrap around and hide an unrelated item.
    unseen = np.ones(n_scores, dtype=bool)
    seen_indices = [idx for idx in train_items_set if 0 <= idx < n_scores]
    if seen_indices:
        unseen[seen_indices] = False

    # Rank only the unseen candidates, highest score first.
    candidates = np.flatnonzero(unseen)
    ranked = candidates[np.argsort(scores[candidates])[::-1]]
    return ranked[:k].tolist()

def calculate_metrics(recommendations_dict, test_interactions_dict, users_to_eval, k):
    """Compute mean Hit Rate@K and NDCG@K over the evaluable users.

    A user is evaluated only if they have at least one held-out item in
    ``test_interactions_dict`` and an entry in ``recommendations_dict``.
    Only the first ``k`` recommendations of each user are scored, so a longer
    list cannot inflate the "@K" metrics.

    Args:
        recommendations_dict: ``{user_id: ranked item ids}``, best first.
        test_interactions_dict: ``{user_id: held-out item ids}``.
        users_to_eval: Iterable of user ids to consider.
        k: Rank cut-off.

    Returns:
        ``(avg_hit_rate, avg_ndcg, evaluated_user_count)``; both averages are
        0.0 when no user could be evaluated.
    """
    # The per-rank discount 1 / log2(rank + 2) does not depend on the user.
    discounts = [1.0 / math.log2(rank + 2) for rank in range(max(k, 0))]

    hits = 0
    total_ndcg = 0.0
    evaluated_user_count = 0

    for user_id in users_to_eval:
        held_out = test_interactions_dict.get(user_id)
        if not held_out or user_id not in recommendations_dict:
            continue

        evaluated_user_count += 1
        relevant = set(held_out)
        top_k = list(recommendations_dict[user_id])[:len(discounts)]

        # Discounted gain of every relevant item inside the top-k window.
        gains = [
            discount
            for item_id, discount in zip(top_k, discounts)
            if item_id in relevant
        ]
        if gains:
            hits += 1

        # Ideal ranking: all relevant items (up to k) at the top positions.
        idcg = sum(discounts[:len(relevant)])
        if idcg > 0:
            total_ndcg += sum(gains) / idcg

    if evaluated_user_count == 0:
        return 0.0, 0.0, 0
    return (
        hits / evaluated_user_count,
        total_ndcg / evaluated_user_count,
        evaluated_user_count,
    )

# --- Training and Evaluation Function for DAE (No Validation during Fit) ---
def train_evaluate_dae(dataset_name, data_dict, model_params, eval_params, save_plots_config):
    """Train the DAE on one dataset, plot its loss curve and report HR/NDCG.

    Args:
        dataset_name: Label used in log lines, the plot title and the saved
            plot's file name.
        data_dict: Output of ``load_and_preprocess_for_dae``.
        model_params: Dict with ``encoding_dim``, ``dense_layers``,
            ``activation``, ``out_activation``, ``dropout_rate``,
            ``learning_rate``, ``epochs`` and ``batch_size``.
        eval_params: Dict with the rank cut-off under ``'k'``.
        save_plots_config: Dict with ``'save'`` (bool), ``'dir'`` and
            ``'prefix'``; when ``'save'`` is false the plot is shown instead.

    Returns:
        ``{'HR@K': hit_rate, 'NDCG@K': ndcg}``.
    """
    print(f"\n--- Training and Testing DAE: {dataset_name} ---")
    start_time = time.time()

    # Unpack everything up front so a missing key fails before any training.
    n_items = data_dict['n_items']
    train_inputs = data_dict['user_vectors_train']
    train_targets = data_dict['user_vectors_train_target']
    users_in_test = data_dict['users_in_test']
    test_inputs = data_dict['user_vectors_test_input']
    train_interactions = data_dict['train_interactions']
    test_interactions = data_dict['test_interactions']

    k = eval_params['k']
    epochs = model_params['epochs']
    batch_size = model_params['batch_size']
    learning_rate = model_params['learning_rate']

    # --- Build ---
    print("Building DAE model...")
    tf.keras.backend.clear_session()
    model = build_autoencoder(
        n_items,
        model_params['encoding_dim'],
        model_params['dense_layers'],
        model_params['activation'],
        model_params['out_activation'],
        model_params['dropout_rate'],
    )

    # --- Train (no validation split, no callbacks) ---
    print("Training DAE model...")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
    )
    history = model.fit(
        train_inputs,
        train_targets,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[],
        verbose=1,
    )
    # Measured from function entry, so it also covers building the model.
    training_time = time.time() - start_time
    print(f"DAE Training finished. Time: {training_time:.2f}s")

    # --- Per-dataset training-loss curve ---
    fig_loss, ax_loss = plt.subplots(figsize=(10, 5))
    ax_loss.plot(history.history['loss'], label='Training Loss', marker='o')
    ax_loss.set_title(f'DAE Model Training Loss - {dataset_name}')
    ax_loss.set_ylabel('Loss (Binary Crossentropy)')
    ax_loss.set_xlabel('Epoch')
    ax_loss.legend(loc='best')
    ax_loss.grid(True)

    if not save_plots_config['save']:
        plt.show()
    else:
        # Replace anything non-alphanumeric so the name is file-system safe.
        safe_name = "".join(ch if ch.isalnum() else "_" for ch in dataset_name)
        f_path = os.path.join(
            save_plots_config['dir'],
            f"{save_plots_config['prefix']}_{safe_name}.png",
        )
        try:
            fig_loss.savefig(f_path, bbox_inches='tight', dpi=150)
            print(f"Saved DAE loss plot to: {f_path}")
        except Exception as e:
            print(f"Error saving DAE loss plot: {e}")
        finally:
            # Release the figure whether or not the save worked.
            plt.close(fig_loss)

    # --- Test Model ---
    print("\nTesting DAE model...")
    loss_history = history.history.get('loss')
    train_loss = loss_history[-1] if loss_history else float('nan')

    # Reconstruction loss, evaluated on the training matrix itself.
    test_loss_recon_train = "N/A"
    if train_inputs.shape[0] > 0:
        try:
            loss_val = model.evaluate(
                train_inputs, train_targets, batch_size=batch_size, verbose=0
            )
            test_loss_recon_train = f"{loss_val:.4f}"
        except Exception as e:
            print(f"Error evaluating recon loss: {e}")
            test_loss_recon_train = "Error"

    print(f"Final Training Loss: {train_loss:.4f}")
    print(f"Test Loss: {test_loss_recon_train}")

    # --- Evaluate HR@K, NDCG@K ---
    print(f"\nCalculating DAE HR@{k} and NDCG@{k}...")
    n_input_rows = test_inputs.shape[0]
    valid_test_users_for_eval = [
        uid
        for uid in users_in_test
        if uid in test_interactions and test_interactions[uid] and uid < n_input_rows
    ]
    print(f"Testing HR/NDCG on {len(valid_test_users_for_eval)} users.")

    eval_start_time = time.time()
    dae_recs = {
        uid: get_recommendations_dae(
            uid,
            test_inputs[uid],
            model,
            set(train_interactions.get(uid, [])),
            k,
        )
        for uid in valid_test_users_for_eval
    }
    # Only consumed by the disabled timing line in the summary below.
    eval_time = time.time() - eval_start_time

    hit_rate, ndcg, eval_count = calculate_metrics(
        dae_recs, test_interactions, valid_test_users_for_eval, k
    )

    print("\n--- DAE Test Summary ---")
    print(f"Dataset: {dataset_name}")
    #print(f"Training Time: {training_time:.2f} seconds")
    #print(f"Evaluation Time (Rec Gen): {eval_time:.2f} seconds")
    print(f"Final Training Loss: {train_loss:.4f}")
    print(f"Test Loss: {test_loss_recon_train}")
    print(f"Evaluated HR/NDCG on {eval_count} users.")
    print(f"Hit Rate @{k}: {hit_rate:.4f}")
    print(f"NDCG @{k}: {ndcg:.4f}")
    print("-----------------------------")

    # Only the ranking metrics feed the comparison plot.
    return {'HR@K': hit_rate, 'NDCG@K': ndcg}


# --- Main Execution ---
results = {}
dataset_names = [DATASET_NAME_1, DATASET_NAME_2]
csv_paths = [CSV_FILE_PATH_1, CSV_FILE_PATH_2]

# Bundle the module-level settings into the dicts train_evaluate_dae expects.
model_params = dict(
    encoding_dim=ENCODING_DIM,
    dense_layers=DENSE_LAYERS,
    activation=ACTIVATION,
    out_activation=OUT_ACTIVATION,
    dropout_rate=DROPOUT_RATE,
    learning_rate=LEARNING_RATE,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)
eval_params = dict(k=K)
save_plots_config = dict(
    save=SAVE_PLOTS,
    dir=PLOT_SAVE_DIR,
    prefix=INDIVIDUAL_LOSS_PLOT_PREFIX,
)

# Train and evaluate one model per dataset, keeping only HR and NDCG.
for name, path in zip(dataset_names, csv_paths):
    data = load_and_preprocess_for_dae(path, USER_COL, ITEM_COL)
    if not data:
        # Loading failed: record zeros so the dataset still has an entry.
        results[name] = {'HR@K': 0.0, 'NDCG@K': 0.0}
        continue
    dae_results = train_evaluate_dae(name, data, model_params, eval_params, save_plots_config)
    results[name] = dae_results

# --- Plot Comparison (HR and NDCG Only) ---
print("\n--- Plotting DAE Comparison (HR & NDCG Only) ---")

if results:
    # Keep only datasets that actually produced an HR@K entry.
    valid_dataset_names = [
        name for name in dataset_names if 'HR@K' in results.get(name, {})
    ]
    if valid_dataset_names:
        metrics_to_plot = ['HR@K', 'NDCG@K']
        metric_titles = {
            'HR@K': f'Hit Rate @{K}',
            'NDCG@K': f'NDCG @{K}'
        }

        n_datasets = len(valid_dataset_names)
        x = np.arange(n_datasets)
        width = 0.5  # one bar per dataset

        # One subplot per metric, side by side.
        num_metrics = len(metrics_to_plot)
        fig_comp, axes = plt.subplots(1, num_metrics, figsize=(6 * num_metrics, 5))
        if num_metrics == 1:
            axes = [axes]  # plt.subplots returns a bare Axes for a single plot

        for ax, metric in zip(axes, metrics_to_plot):
            metric_values = [
                results[d_name].get(metric, 0) for d_name in valid_dataset_names
            ]
            bars = ax.bar(x, metric_values, width)
            ax.bar_label(bars, padding=3, fmt='%.4f')

            ax.set_ylabel(metric_titles[metric])
            ax.set_title(f'DAE {metric_titles[metric]} Comparison')
            ax.set_xticks(x)
            ax.set_xticklabels(valid_dataset_names)
            ax.grid(True, axis='y', linestyle='--', alpha=0.7)

            # 20% headroom above the tallest bar, never below 0.02.
            max_val = max(metric_values, default=0)
            ax.set_ylim(0, max(max_val * 1.2, 0.02))

        fig_comp.tight_layout()

        if not SAVE_PLOTS:
            plt.show()
        else:
            save_path = os.path.join(PLOT_SAVE_DIR, COMPARISON_PLOT_FILENAME)
            try:
                fig_comp.savefig(save_path, bbox_inches='tight', dpi=150)
                print(f"Saved comparison plot (HR & NDCG) to: {save_path}")
            except Exception as e:
                print(f"Error saving comparison plot: {e}")
            finally:
                # Release the figure whether or not the save worked.
                plt.close(fig_comp)
    else:
        print("No valid results available to plot.")
else:
    print("No results available to plot.")

print("\nComparison plotting complete.")


--- Processing Dataset for DAE: ./Datasets/ratings_preprocessed_D1.csv ---
Loading data...
Data loaded successfully. Shape: (100781, 27)
Preprocessing data (Label Encoding)...
Number of unique users (encoded): 610
Number of unique items (encoded): 9686
Splitting data (per user)...
Users with items in test set: 610

--- Training and Testing DAE: Dataset 1 ---
Building DAE model...
Training DAE model...
Epoch 1/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 102ms/step - loss: 0.6329
Epoch 2/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 106ms/step - loss: 0.5072
Epoch 3/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 104ms/step - loss: 0.2784
Epoch 4/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step - loss: 0.0972
Epoch 5/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step - loss: 2.3416
Epoch 6/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step - l