In [6]:
import pandas as pd
import numpy as np
# No LabelEncoder needed for this SVD approach
from sklearn.model_selection import train_test_split
from collections import defaultdict
import math
import matplotlib.pyplot as plt
import os # For checking file paths
import time # To time model training

# Import Surprise library components
from surprise import Dataset, Reader, SVD

# --- Configuration ---
# --- Dataset 1 ---
CSV_FILE_PATH_1 = './Datasets/ratings_preprocessed_D1.csv'
DATASET_NAME_1 = "Dataset 1"

# --- Dataset 2 ---
CSV_FILE_PATH_2 = './Datasets/ratings_preprocessed_D4.csv'
DATASET_NAME_2 = "Dataset 2"

USER_COL = 'userId'
ITEM_COL = 'movieId'
RATING_COL_IMPLICIT = 'rating'

# SVD (from Surprise) Model Hyperparameters
SVD_N_FACTORS = 64 # Number of latent factors
SVD_N_EPOCHS = 15  # Number of epochs for SVD training
SVD_LR_ALL = 0.005 # Learning rate for SVD
SVD_REG_ALL = 0.02  # Regularization term for SVD

# Evaluation Parameters
K = 10

# --- File Saving Configuration ---
SAVE_PLOTS = True
PLOT_SAVE_DIR = "./Training Results"
COMPARISON_PLOT_FILENAME = "svd_comparison_metrics.png"

# Create plot save directory if it doesn't exist.
if SAVE_PLOTS and not os.path.exists(PLOT_SAVE_DIR):
    os.makedirs(PLOT_SAVE_DIR)
    print(f"Created directory for saving plots: {PLOT_SAVE_DIR}")

# --- Helper Functions ---

def load_and_preprocess_for_svd(csv_path, user_col, item_col):
    """Loads data, preprocesses for SVD, and splits into train/test dicts."""
    print(f"\n--- Processing Dataset for SVD: {csv_path} ---")
    print("Loading data...")
    if not os.path.exists(csv_path):
         print(f"Error: CSV file not found at {csv_path}")
         return None

    try:
        df = pd.read_csv(csv_path)
        print(f"Data loaded successfully. Shape: {df.shape}")
        df = df[[user_col, item_col]].copy().drop_duplicates()
        df[RATING_COL_IMPLICIT] = 1.0 # Implicit feedback
    except KeyError as e:
        print(f"Error: Column {e} not found in {csv_path}.")
        return None
    except Exception as e:
        print(f"Error loading or processing {csv_path}: {e}")
        return None

    n_users_orig = df[user_col].nunique()
    n_items_orig = df[item_col].nunique()
    print(f"Number of unique users (original IDs): {n_users_orig}")
    print(f"Number of unique items (original IDs): {n_items_orig}")

    # --- Train/Test Split (Per User) - No Validation Set ---
    print("Splitting data (per user) into Train/Test...")
    train_interactions_dict = defaultdict(list) # user_id_orig -> [item_id_orig]
    test_interactions_dict = defaultdict(list)  # user_id_orig -> [item_id_orig]
    train_data_surprise_list = []               # list for Surprise: (user, item, rating)
    users_in_test_set = set()

    grouped = df.groupby(user_col)
    for user_orig_id, group in grouped:
        items_orig = list(group[item_col].unique())
        if len(items_orig) < 2:
            train_items_orig = items_orig
            test_items_orig = []
        else:
            try: # Use same random_state for consistency if comparing splits
                train_items_orig, test_items_orig = train_test_split(items_orig, test_size=0.2, random_state=42)
            except ValueError: train_items_orig, test_items_orig = items_orig[:-1], items_orig[-1:]
            if not test_items_orig and len(items_orig) > 0: train_items_orig, test_items_orig = items_orig[:-1], items_orig[-1:]

        if train_items_orig:
            train_interactions_dict[user_orig_id].extend(train_items_orig)
            for item_id_orig in train_items_orig:
                 train_data_surprise_list.append((user_orig_id, item_id_orig, 1.0))
        if test_items_orig:
            test_interactions_dict[user_orig_id].extend(test_items_orig)
            users_in_test_set.add(user_orig_id)

    users_in_test = list(users_in_test_set)
    print(f"Users with items in test set: {len(users_in_test)}")

    # --- Prepare Data for Surprise ---
    if not train_data_surprise_list:
        print("Error: No training data generated. Cannot proceed.")
        return None

    train_df_surprise = pd.DataFrame(train_data_surprise_list, columns=[user_col, item_col, RATING_COL_IMPLICIT])
    reader = Reader(rating_scale=(1, 1))
    try:
        surprise_dataset = Dataset.load_from_df(train_df_surprise[[user_col, item_col, RATING_COL_IMPLICIT]], reader)
        trainset = surprise_dataset.build_full_trainset() # Build trainset ONLY from train interactions
        print(f"Surprise trainset built: {trainset.n_users} users, {trainset.n_items} items.")
        all_raw_item_ids_list = df[item_col].unique().tolist() # Get all original item IDs
    except Exception as e:
        print(f"Error creating Surprise dataset or trainset: {e}")
        return None

    return {
        "n_users_orig": n_users_orig, "n_items_orig": n_items_orig,
        "trainset_surprise": trainset,
        "train_interactions_dict": train_interactions_dict,
        "test_interactions_dict": test_interactions_dict,
        "users_in_test": users_in_test,
        "all_raw_item_ids": all_raw_item_ids_list
    }

# --- SVD Recommendation Function ---
def get_recommendations_svd(user_orig_id, trainset, svd_model, k, all_raw_item_ids):
    """Gets top K recommendations for a user using Surprise SVD model."""
    try: trainset.to_inner_uid(user_orig_id) # Check if user is known
    except ValueError: return [] # User not in trainset

    predictions = []
    user_train_items = set()
    try: # Get items user interacted with in training
        user_inner_id = trainset.to_inner_uid(user_orig_id)
        user_train_items = set(trainset.to_raw_iid(inner_iid) for inner_iid, _ in trainset.ur[user_inner_id])
    except ValueError: pass

    for item_raw_id in all_raw_item_ids:
        if item_raw_id not in user_train_items:
             pred = svd_model.predict(uid=user_orig_id, iid=item_raw_id)
             if not pred.details.get('was_impossible', False):
                 predictions.append((item_raw_id, pred.est))

    predictions.sort(key=lambda x: x[1], reverse=True)
    top_k_raw_ids = [item_raw_id for item_raw_id, score in predictions[:k]]
    return top_k_raw_ids

# --- Common Metric Calculation Function ---
def calculate_metrics(recommendations_dict, test_interactions_dict, users_to_eval, k):
    """Calculates HR@K and NDCG@K."""
    hits, total_ndcg, evaluated_user_count = 0, 0.0, 0
    for user_id in users_to_eval: # Original ID
        if user_id not in test_interactions_dict or not test_interactions_dict[user_id]: continue
        if user_id not in recommendations_dict: continue

        evaluated_user_count += 1
        recommended_indices = recommendations_dict[user_id] # Original item IDs
        test_items_set = set(test_interactions_dict[user_id]) # Original item IDs

        hit = any(item_id in test_items_set for item_id in recommended_indices)
        if hit: hits += 1

        dcg = sum(1.0 / math.log2(i + 2) for i, item_id in enumerate(recommended_indices) if item_id in test_items_set)
        idcg = sum(1.0 / math.log2(i + 2) for i in range(min(k, len(test_items_set))))
        total_ndcg += (dcg / idcg) if idcg > 0 else 0.0

    avg_hit_rate = hits / evaluated_user_count if evaluated_user_count > 0 else 0.0
    avg_ndcg = total_ndcg / evaluated_user_count if evaluated_user_count > 0 else 0.0
    return avg_hit_rate, avg_ndcg, evaluated_user_count

# --- Training and Evaluation Function for SVD ---
def train_evaluate_svd(dataset_name, data_dict, model_params, eval_params):
    """Trains and tests the SVD model."""
    print(f"\n--- Training and Testing SVD: {dataset_name} ---")
    start_time = time.time()

    trainset = data_dict['trainset_surprise']
    users_in_test = data_dict['users_in_test'] # Original user IDs
    test_interactions_dict = data_dict['test_interactions_dict'] # Original IDs
    all_raw_item_ids = data_dict['all_raw_item_ids']

    k = eval_params['k']
    n_factors = model_params['svd_n_factors']
    n_epochs = model_params['svd_n_epochs']
    lr_all = model_params['svd_lr_all']
    reg_all = model_params['svd_reg_all']

    # Build and Train Model (No validation set used during fit)
    print("Building and Training SVD model...")
    svd_model = SVD(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all, random_state=42, verbose=False)
    svd_model.fit(trainset) # Train on the prepared trainset
    print("SVD Training finished.")
    training_time = time.time() - start_time

    # Evaluate HR@K, NDCG@K on Test Set
    print(f"\nCalculating SVD HR@{k} and NDCG@{k}...")
    svd_recs = {}
    valid_test_users_for_eval = [uid for uid in users_in_test
                                 if uid in test_interactions_dict and test_interactions_dict[uid]
                                 and trainset.knows_user(uid)] # Ensure user was in trainset
    print(f"Evaluating on {len(valid_test_users_for_eval)} users present in trainset and test split.")

    eval_start_time = time.time()
    for user_id in valid_test_users_for_eval: # user_id is original ID
         svd_recs[user_id] = get_recommendations_svd(user_id, trainset, svd_model, k, all_raw_item_ids)
    eval_time = time.time() - eval_start_time

    hit_rate, ndcg, eval_count = calculate_metrics(svd_recs, test_interactions_dict, valid_test_users_for_eval, k)

    print("\n--- SVD Testing Summary ---")
    print(f"Dataset: {dataset_name}")
    #print(f"Training Time: {training_time:.2f} seconds")
    #print(f"Evaluation Time (Rec Generation): {eval_time:.2f} seconds")
    print(f"Tested HR/NDCG on {eval_count} users.")
    print(f"Hit Rate @{k}: {hit_rate:.4f}")
    print(f"NDCG @{k}: {ndcg:.4f}")
    print("-----------------------------")

    # Return metrics needed for plotting (HR, NDCG) - Training time removed from return as not plotted
    return {'HR@K': hit_rate, 'NDCG@K': ndcg, 'TrainTime': training_time} # Keep TrainTime for filtering results if needed


# --- Main Execution ---
results = {}
dataset_names = [DATASET_NAME_1, DATASET_NAME_2]
csv_paths = [CSV_FILE_PATH_1, CSV_FILE_PATH_2]

model_params = { # SVD params
    'svd_n_factors': SVD_N_FACTORS, 'svd_n_epochs': SVD_N_EPOCHS,
    'svd_lr_all': SVD_LR_ALL, 'svd_reg_all': SVD_REG_ALL
}
eval_params = {'k': K}

for name, path in zip(dataset_names, csv_paths):
    data = load_and_preprocess_for_svd(path, USER_COL, ITEM_COL)
    if data:
        svd_results = train_evaluate_svd(name, data, model_params, eval_params)
        results[name] = svd_results
    else:
        # Ensure default dict has keys expected later, even if values are default
        results[name] = {'HR@K': 0.0, 'NDCG@K': 0.0, 'TrainTime': -1.0} # Use -1 for time to indicate failure


# --- Plot Comparison ---
print("\n--- Plotting SVD Comparison (HR & NDCG Only) ---")

if not results:
    print("No results available to plot.")
else:
    # Filter ensures datasets processed successfully (TrainTime >= 0)
    valid_dataset_names = [name for name in dataset_names if name in results and results[name]['TrainTime'] >= 0]
    if not valid_dataset_names:
        print("No valid results available to plot.")
    else:
        # Metrics to plot: HR@K and NDCG@K only
        metrics_to_plot = ['HR@K', 'NDCG@K']
        metric_titles = {
            'HR@K': f'Hit Rate @{K}',
            'NDCG@K': f'NDCG @{K}',
            # 'TrainTime': 'Training Time (seconds)' # Removed
            # 'TrainLoss': 'Training Loss' # Cannot plot training loss curve for SVD
        }

        n_datasets = len(valid_dataset_names)
        x = np.arange(n_datasets)
        width = 0.5 # Width for single bars per dataset

        num_metrics = len(metrics_to_plot) # Should be 2
        fig_comp, axes = plt.subplots(1, num_metrics, figsize=(6 * num_metrics, 5)) # Creates 1 row, 2 columns
        if num_metrics == 1: axes = [axes] # Make iterable (good practice, though should be 2)

        for i, metric in enumerate(metrics_to_plot):
            ax = axes[i]
            metric_values = [results[d_name].get(metric, 0) for d_name in valid_dataset_names]
            bars = ax.bar(x, metric_values, width)
            fmt_str = '%.4f' # Format for HR and NDCG
            ax.bar_label(bars, padding=3, fmt=fmt_str)

            ax.set_ylabel(metric_titles[metric])
            ax.set_title(f'SVD {metric_titles[metric]} Comparison')
            ax.set_xticks(x)
            ax.set_xticklabels(valid_dataset_names)
            ax.grid(True, axis='y', linestyle='--', alpha=0.7)
            max_val = max(metric_values) if metric_values else 0
            ax.set_ylim(0, max(max_val * 1.2, 0.02)) # Dynamic y-axis with padding

        fig_comp.tight_layout() # Adjust layout

        # Save or Show the plot
        if SAVE_PLOTS:
            save_path = os.path.join(PLOT_SAVE_DIR, COMPARISON_PLOT_FILENAME)
            try:
                fig_comp.savefig(save_path, bbox_inches='tight', dpi=150)
                print(f"Saved comparison plot to: {save_path}")
                plt.close(fig_comp) # Close the figure after saving
            except Exception as e:
                print(f"Error saving comparison plot: {e}")
                plt.show() # Show if saving failed
        else:
            plt.show() # Show the plot if not saving

print("\nComparison plotting complete.")


--- Processing Dataset for SVD: ./Datasets/ratings_preprocessed_D1.csv ---
Loading data...
Data loaded successfully. Shape: (100781, 27)
Number of unique users (original IDs): 610
Number of unique items (original IDs): 9686
Splitting data (per user) into Train/Test...
Users with items in test set: 610
Surprise trainset built: 610 users, 8943 items.

--- Training and Testing SVD: Dataset 1 ---
Building and Training SVD model...
SVD Training finished.

Calculating SVD HR@10 and NDCG@10...
Evaluating on 609 users present in trainset and test split.

--- SVD Testing Summary ---
Dataset: Dataset 1
Tested HR/NDCG on 609 users.
Hit Rate @10: 0.4122
NDCG @10: 0.0845
-----------------------------

--- Processing Dataset for SVD: ./Datasets/ratings_preprocessed_D4.csv ---
Loading data...
Data loaded successfully. Shape: (1000209, 26)
Number of unique users (original IDs): 6040
Number of unique items (original IDs): 3706
Splitting data (per user) into Train/Test...
Users with items in test set: 