In [4]:
import hashlib
import os
import pickle
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    LabelEncoder,
    StandardScaler,
    OrdinalEncoder,
    RobustScaler,
)

**Plan:**

1. Import SelectKBest and scoring functions
2. Add method to DataPreprocessor to analyze features
3. Visualize results using matplotlib
4. Return top K features with their scores


In [19]:
class DataPreprocessor:
    """Encode, scale, split and persist features for the recommender.

    The label/ordinal encoders, TF-IDF vectorizers and scaler are fitted on
    training data (``encode_features_train`` / ``fit``) and then re-used
    unchanged on validation, test or inference data
    (``encode_features_transform`` / ``transform``).
    """

    # Numeric input columns: copied through unchanged by the encode methods
    # and standardised by ``feature_engineering``.
    NUMERICAL_COLS = [
        "age",
        "duration",
        "acousticness",
        # "danceability",
        # "energy",
        "key",
        # "loudness",
        "mode",
        "speechiness",
        "instrumentalness",
        "liveness",
        # "valence",
        "tempo",
        "time_signature",
        "explicit",
        "dance_valence",
        "energy_loudness",
    ]

    # Attribute names of the fitted objects persisted by ``save_preprocessors``
    # and restored by ``load_preprocessors`` (one ``<name>.pkl`` file each).
    _PREPROCESSOR_NAMES = (
        "gender_encoder",
        "artist_tfidf_vectorizer",
        "genre_tfidf_vectorizer",
        "music_tfidf_vectorizer",
        "release_year_encoder",
        "scaler",
    )

    def __init__(
        self,
        test_size=0.2,
        random_state=42,
        max_artist_features=3995,
        max_genre_features=21,
        max_music_features=5784,
    ):
        """
        Create the (unfitted) encoders, vectorizers and scaler.

        Args:
            test_size (float): Fraction of rows reserved for the test split.
            random_state (int): Seed passed to ``train_test_split``.
            max_artist_features (int): ``max_features`` of the artist TF-IDF.
            max_genre_features (int): ``max_features`` of the genre TF-IDF.
            max_music_features (int): ``max_features`` of the music TF-IDF.
        """
        self.test_size = test_size
        self.random_state = random_state
        self.gender_encoder = LabelEncoder()
        self.artist_tfidf_vectorizer = TfidfVectorizer(max_features=max_artist_features)
        self.genre_tfidf_vectorizer = TfidfVectorizer(max_features=max_genre_features)
        self.music_tfidf_vectorizer = TfidfVectorizer(max_features=max_music_features)
        self.release_year_encoder = OrdinalEncoder()
        self.scaler = StandardScaler()

    def load_data(self, filepath):
        """
        Load data from a CSV file.

        Args:
            filepath (str): Path to the CSV file.

        Returns:
            pd.DataFrame: Loaded data.
        """
        return pd.read_csv(filepath)

    # NOTE(review): user-ID hashing is disabled. HybridRecommender.forward still
    # takes a ``user_id_hashed`` input, so whoever builds the model tensors has
    # to supply it separately. Before re-enabling the helper below, note that an
    # MD5 hexdigest has 32 hex characters (16 bytes): ``range(0, 64, 2)`` runs
    # past the end, ``int("", 16)`` raises ValueError and the function would
    # return the all-zero vector for every user.
    #
    # def hash_user_id(self, user_id):
    #     """
    #     Convert user_id to 32-dimensional numeric tensor.
    #     Returns zero vector for null/empty values.
    #     """
    #     # Handle null/empty values
    #     if pd.isna(user_id) or str(user_id).strip() == "":
    #         return np.zeros(32)

    #     # Hash valid user_id
    #     hash_hex = hashlib.md5(str(user_id).encode()).hexdigest()

    #     # Convert hex string to 32 integers (2 chars per integer)
    #     try:
    #         return np.array([int(hash_hex[i : i + 2], 16) for i in range(0, 64, 2)])
    #     except ValueError:
    #         # Return zero vector if conversion fails
    #         return np.zeros(32)

    @staticmethod
    def _as_column(series):
        """Return the values of ``series`` as an ``(n, 1)`` array.

        ``OrdinalEncoder`` only accepts 2-D input, so a single column has to be
        reshaped before it is fitted or transformed.
        """
        return series.to_numpy().reshape(-1, 1)

    @staticmethod
    def _tfidf_frame(matrix, prefix, index):
        """Densify a TF-IDF matrix into a frame named ``<prefix>_tfidf_<i>``.

        The frame re-uses ``index`` so that a later ``pd.concat(axis=1)`` lines
        rows up by label instead of silently introducing NaN rows when the
        input frame does not have a default ``RangeIndex``.
        """
        columns = [f"{prefix}_tfidf_{i}" for i in range(matrix.shape[1])]
        return pd.DataFrame(matrix.toarray(), columns=columns, index=index)

    def fit(self, data):
        """
        Fit all preprocessors on training data.

        Uses the same source columns as the ``encode_features_*`` methods
        (``artist_name`` / ``music``) so that objects fitted here can be used
        by ``encode_features_transform``.

        Args:
            data (pd.DataFrame): Training rows.

        Returns:
            DataPreprocessor: ``self``, to allow chaining.
        """
        self.gender_encoder.fit(data["gender"])
        self.release_year_encoder.fit(self._as_column(data["release_year"]))
        self.artist_tfidf_vectorizer.fit(data["artist_name"])
        self.genre_tfidf_vectorizer.fit(data["genre"])
        self.music_tfidf_vectorizer.fit(data["music"])
        self.scaler.fit(data[self.NUMERICAL_COLS])
        return self

    def transform(self, data, is_training=False):
        """
        Encode ``gender`` and ``release_year`` only.

        Args:
            data (pd.DataFrame): Rows to encode.
            is_training (bool): When True the two encoders are re-fitted on
                ``data`` before encoding; otherwise the fitted state is used.

        Returns:
            pd.DataFrame: ``gender_encoded`` and ``release_year_encoded``
            columns, indexed like ``data``.
        """
        year_column = self._as_column(data["release_year"])
        if is_training:
            gender = self.gender_encoder.fit_transform(data["gender"])
            year = self.release_year_encoder.fit_transform(year_column)
        else:
            gender = self.gender_encoder.transform(data["gender"])
            year = self.release_year_encoder.transform(year_column)

        # TODO: the TF-IDF and numerical features are not produced here; use
        # encode_features_train / encode_features_transform for the full set.
        return pd.DataFrame(
            {
                "gender_encoded": gender,
                "release_year_encoded": np.asarray(year).ravel(),
            },
            index=data.index,
        )

    def _encode_features(self, data, fit):
        """Shared implementation of the two public ``encode_features_*`` methods.

        Adds ``gender_encoded`` / ``release_year_encoded`` to ``data`` in place
        and returns a new frame laid out as: the two encoded columns, music
        TF-IDF, artist TF-IDF, genre TF-IDF, then ``NUMERICAL_COLS``.
        """
        # Checked up front: every later step indexes this column.
        if "release_year" not in data.columns:
            raise ValueError("release_year column missing from input data")

        def apply(encoder, values):
            return encoder.fit_transform(values) if fit else encoder.transform(values)

        data["gender_encoded"] = apply(self.gender_encoder, data["gender"])
        data["release_year_encoded"] = np.asarray(
            apply(self.release_year_encoder, self._as_column(data["release_year"]))
        ).ravel()

        artist_tfidf = apply(self.artist_tfidf_vectorizer, data["artist_name"])
        genre_tfidf = apply(self.genre_tfidf_vectorizer, data["genre"])
        music_tfidf = apply(self.music_tfidf_vectorizer, data["music"])

        return pd.concat(
            [
                data[["gender_encoded", "release_year_encoded"]],
                self._tfidf_frame(music_tfidf, "music", data.index),
                self._tfidf_frame(artist_tfidf, "artist", data.index),
                self._tfidf_frame(genre_tfidf, "genre", data.index),
                data[self.NUMERICAL_COLS],
            ],
            axis=1,
        )

    def encode_features_train(self, data):
        """
        Fit the encoders/vectorizers on ``data`` and return the encoded frame.

        The output has exactly the same columns, in the same order, as
        ``encode_features_transform``.

        Args:
            data (pd.DataFrame): Training rows. Two helper columns
                (``gender_encoded``, ``release_year_encoded``) are added to it.

        Returns:
            pd.DataFrame: Encoded features indexed like ``data``.

        Raises:
            ValueError: If ``release_year`` is missing from ``data``.
        """
        return self._encode_features(data, fit=True)

    def encode_features_transform(self, data):
        """
        Transform test/inference data with the already fitted preprocessors.

        Args:
            data (pd.DataFrame): Rows to encode. Two helper columns
                (``gender_encoded``, ``release_year_encoded``) are added to it.

        Returns:
            pd.DataFrame: Encoded features indexed like ``data``.

        Raises:
            ValueError: If ``release_year`` is missing from ``data``.
        """
        return self._encode_features(data, fit=False)

    def feature_engineering(self, data_encoded, fit=True):
        """
        Standardise the numerical features and log-transform ``plays``.

        ``data_encoded`` is modified in place and also returned. Because the
        ``plays`` column (when present) is replaced by ``log1p(plays)``,
        calling this twice on the same frame applies the log twice.

        Args:
            data_encoded (pd.DataFrame): Frame containing ``NUMERICAL_COLS``
                and, optionally, a ``plays`` column.
            fit (bool): When True (default, the historical behaviour) the
                scaler is re-fitted on ``data_encoded``. Pass False for
                validation/test/inference data so the statistics learned on
                the training set are re-used instead of being overwritten.

        Returns:
            pd.DataFrame: The same frame, scaled.
        """
        numerical_features = list(self.NUMERICAL_COLS)

        # log1p tames the skew of raw play counts; it is computed before the
        # scaling step and written back afterwards.
        target = None
        if "plays" in data_encoded.columns:
            target = np.log1p(data_encoded["plays"])

        scale = self.scaler.fit_transform if fit else self.scaler.transform
        data_encoded[numerical_features] = scale(data_encoded[numerical_features])

        if target is not None:
            data_encoded["plays"] = target

        return data_encoded

    def split_data(self, data_encoded, target_column="plays", val_size=0.1):
        """
        Split into train/validation/test sets, stratified on binned targets.

        Args:
            data_encoded (pd.DataFrame): Features plus the target column.
            target_column (str): Name of the target column.
            val_size (float): Fraction of all rows used for validation.

        Returns:
            tuple: ``(train_features, val_features, test_features,
            train_target, val_target, test_target)``.
        """
        features = data_encoded.drop(columns=[target_column])
        target = data_encoded[target_column]

        # Continuous targets cannot be stratified directly, so bin them into
        # deciles; fall back to equal-width bins if quantile binning fails.
        try:
            bins = pd.qcut(target, q=10, labels=False, duplicates="drop")
        except ValueError:
            bins = pd.cut(target, bins=10, labels=False)

        holdout_size = self.test_size + val_size
        train_features, temp_features, train_target, temp_target = train_test_split(
            features,
            target,
            test_size=holdout_size,
            random_state=self.random_state,
            stratify=bins,
        )

        # Second split: the share of the hold-out set that becomes the test
        # set is chosen so the overall test fraction equals ``self.test_size``.
        val_features, test_features, val_target, test_target = train_test_split(
            temp_features,
            temp_target,
            test_size=self.test_size / holdout_size,
            random_state=self.random_state,
            stratify=pd.qcut(temp_target, q=5, labels=False, duplicates="drop"),
        )

        return (
            train_features,
            val_features,
            test_features,
            train_target,
            val_target,
            test_target,
        )

    def save_preprocessors(self, directory="models/"):
        """
        Pickle every preprocessor into ``directory`` as ``<name>.pkl``.

        The directory is created if needed, and it may be given with or
        without a trailing path separator.

        Args:
            directory (str): Target directory.
        """
        if directory:
            os.makedirs(directory, exist_ok=True)

        for name in self._PREPROCESSOR_NAMES:
            with open(os.path.join(directory, f"{name}.pkl"), "wb") as f:
                pickle.dump(getattr(self, name), f)

    def load_preprocessors(self, directory="models/"):
        """
        Restore every preprocessor written by ``save_preprocessors``.

        Args:
            directory (str): Directory holding the ``<name>.pkl`` files.
        """
        for name in self._PREPROCESSOR_NAMES:
            # SECURITY: unpickling executes arbitrary code; only load files
            # that this project wrote itself.
            with open(os.path.join(directory, f"{name}.pkl"), "rb") as f:
                setattr(self, name, pickle.load(f))

    # def analyze_feature_importance(self, data_encoded, target, k=10):
    #     """
    #     Analyze and visualize the K most important features using f_regression.

    #     Args:
    #         data_encoded: DataFrame of encoded features
    #         target: Series of target values (plays)
    #         k: Number of top features to select
    #     """
    #     # Remove non-numeric columns and target
    #     feature_cols = data_encoded.select_dtypes(include=["float64", "int64"]).columns
    #     X = data_encoded[feature_cols]

    #     # Initialize and fit SelectKBest
    #     selector = SelectKBest(score_func=f_regression, k=k)
    #     selector.fit(X, target)

    #     # Get scores and feature names
    #     scores = pd.DataFrame(
    #         {"Feature": feature_cols, "Score": selector.scores_}
    #     ).sort_values("Score", ascending=False)

    #     # Plot top K features
    #     plt.figure(figsize=(12, 6))
    #     sns.barplot(data=scores.head(k), x="Score", y="Feature")
    #     plt.title(f"Top {k} Features by F-Regression Score")
    #     plt.tight_layout()
    #     plt.show()

    #     return scores.head(k)

    # def analyze_class_distribution(self, target, n_bins=5, plot=True):
    #     """
    #     Analyze and visualize target distribution with proper binning
    #     """
    #     try:
    #         # Try qcut first
    #         try:
    #             bins = pd.qcut(
    #                 target,
    #                 q=n_bins,
    #                 labels=[f"Bin {i+1}" for i in range(n_bins)],
    #                 duplicates="drop",
    #             )
    #         except ValueError:
    #             # Fallback to cut if qcut fails
    #             bins = pd.cut(
    #                 target, bins=n_bins, labels=[f"Bin {i+1}" for i in range(n_bins)]
    #             )

    #         distribution = Counter(bins)

    #         if plot:
    #             plt.figure(figsize=(10, 6))
    #             sns.countplot(x=bins)
    #             plt.title("Target Variable Distribution (Plays)")
    #             plt.xlabel("Play Count Bins")
    #             plt.ylabel("Frequency")
    #             plt.xticks(rotation=45)

    #             # Add value labels on bars
    #             for i, v in enumerate(distribution.values()):
    #                 plt.text(i, v, str(v), ha="center", va="bottom")

    #             plt.tight_layout()
    #             plt.show()

    #         # Calculate and display statistics
    #         max_class = max(distribution.values())
    #         min_class = min(distribution.values())
    #         imbalance_ratio = max_class / min_class

    #         print("\nDistribution Statistics:")
    #         print(f"Total samples: {len(target)}")
    #         print(f"Imbalance ratio: {imbalance_ratio:.2f}")
    #         print("\nBin Ranges:")

    #         # Get bin edges without accessing categories
    #         bin_edges = pd.IntervalIndex(bins.unique()).to_tuples()
    #         for i, (lower, upper) in enumerate(bin_edges):
    #             print(f"Bin {i+1}: ({lower:.2f}, {upper:.2f}]")

    #         return distribution

    #     except Exception as e:
    #         print(f"Error analyzing distribution: {str(e)}")
    #         return None

    # def balance_classes(self, X, y, method="smote", sampling_strategy="auto"):
    #     """Balance classes using SMOTE or undersampling"""
    #     if method.lower() == "smote":
    #         sampler = SMOTE(sampling_strategy=sampling_strategy)
    #     else:
    #         sampler = RandomUnderSampler(sampling_strategy=sampling_strategy)

    #     X_resampled, y_resampled = sampler.fit_resample(X, y)
    #     return X_resampled, y_resampled

    # def inspect_tfidf_features(self, feature_name="music"):
    #     """Inspect TF-IDF vectorizer features"""
    #     vectorizer = getattr(self, f"{feature_name}_tfidf_vectorizer")

    #     # Get vocabulary and feature names
    #     vocab = vectorizer.vocabulary_
    #     feature_names = vectorizer.get_feature_names_out()

    #     print(f"\n{feature_name.title()} TF-IDF Features:")
    #     print(f"Total features: {len(feature_names)}")
    #     print(f"Max features setting: {vectorizer.max_features}")
    #     print("\nTop 10 features by vocabulary index:")
    #     for word, idx in sorted(vocab.items(), key=lambda x: x[1])[:10]:
    #         print(f"{word}: {idx}")

    #     return vocab, feature_names


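# Usage example (the analysis helpers it calls are commented out in the class above, so re-enable them before running it):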
"""
preprocessor = DataPreprocessor()
data = preprocessor.load_data("your_data.csv")

# Check class distribution
distribution = preprocessor.analyze_class_distribution(data['plays'])

# Balance classes if needed
X_resampled, y_resampled = preprocessor.balance_classes(
    data_encoded, 
    data['plays'],
    method='smote'
)

# Inspect TF-IDF features
vocab, features = preprocessor.inspect_tfidf_features('music')
"""

'\npreprocessor = DataPreprocessor()\ndata = preprocessor.load_data("your_data.csv")\n\n# Check class distribution\ndistribution = preprocessor.analyze_class_distribution(data[\'plays\'])\n\n# Balance classes if needed\nX_resampled, y_resampled = preprocessor.balance_classes(\n    data_encoded, \n    data[\'plays\'],\n    method=\'smote\'\n)\n\n# Inspect TF-IDF features\nvocab, features = preprocessor.inspect_tfidf_features(\'music\')\n'

In [20]:
# Initialize the preprocessor
preprocessor = DataPreprocessor(
    test_size=0.2,
    random_state=42,
    max_artist_features=3995,
    max_genre_features=21,
    max_music_features=5784,
)

# Location of the dataset, relative to the notebook's working directory
filepath = "../../data/engineered_data.csv"

# Report which file could not be loaded, then re-raise: swallowing the error
# would leave `data` undefined and only surface later as an unrelated NameError.
try:
    data = preprocessor.load_data(filepath)
except Exception as e:
    print(f"Error loading data from {filepath}: {e}")
    raise

In [21]:
# Read the CSV at `filepath` again and display the first rows as the cell output.
data = preprocessor.load_data(filepath)
data.head()

Unnamed: 0,age,gender,music,artist_name,featured_artists,genre,duration,music_id,artist_id,acousticness,...,instrumentalness,liveness,tempo,time_signature,explicit,release_year,music_age,plays_log,energy_loudness,dance_valence
0,16,F,Bank Account,21 Savage,"Birdy, Zoé",Dark Trap,3.67,2fQrGHiQOvpL9UgPvtYy6G,a0b0b79c90af400d012c20ff4e5190d46ea6da7d00a9f4...,-0.955407,...,-0.42859,-0.934219,-2.537022,4.0,True,2017,7,2.484907,-0.132277,-0.97283
1,16,F,Little Talks,Of Monsters and Men,"Ninho, Snoop Dogg, Russ, Paramore",Other,4.44,2ihCaVdNZmnHZWt0fvAM7B,669122254ed9bfcd862183da62571b3ece680da165f4fb...,-0.385361,...,-0.428619,1.274342,-1.03643,4.0,False,2013,11,6.532334,-0.109632,-0.047337
2,17,M,Wherever I Go,OneRepublic,"Keith Urban, DJ Khaled, NIKI, MF DOOM",Other,2.83,46jLy47W8rkf8rEX04gMKB,8a1176f697531f10b9f2678fec4c1c85194a3165148d20...,0.242743,...,-0.250512,1.147169,0.022241,4.0,False,2009,15,4.919981,0.077155,-0.039519
3,44,M,No New Friends,DJ Khaled,"The xx, LIT killah",Pop,5.14,5oVlbbiKGdGeZkWCFy0mqk,cb25617212d96d7e96e5aadec386fc662d7252337f076c...,1.317553,...,-0.428619,-0.827931,-0.623654,4.0,False,2017,7,5.442418,-0.267584,-0.607147
4,44,M,Dreams,Campsite Dream,,Country,3.2,1SNoSoQ3JZldOhzBY9gw0n,09207797345aaafddfca55ff08c4be65c0392d8411b175...,0.22895,...,0.537232,0.10913,0.002027,4.0,False,1974,50,5.971262,0.117367,0.081196


### misc

In [13]:
# # aggregate plays by song for each user
# import pandas as pd
# from io import StringIO

# # Reading data into a DataFrame
# df = data.copy()

# # Aggregating plays by song
# aggregated_plays = df.groupby("music")["plays"].sum().reset_index()

# # Output the result
# print(aggregated_plays)

In [14]:
# preprocessor = DataPreprocessor()
# filepath = "../data/cleaned_modv2.csv"
# data = preprocessor.load_data(filepath)
# data_encoded = preprocessor.encode_features_train(data)
# top_features = preprocessor.analyze_feature_importance(
#     data_encoded, data["plays"], k=20
# )
# print(top_features)

In [15]:
# preprocessor = DataPreprocessor()
# data = preprocessor.load_data(filepath)

# # Encode features
# data_encoded = preprocessor.encode_features_train(data)

# # Check class distribution
# distribution = preprocessor.analyze_class_distribution(data["plays"])

# # Balance classes if needed
# X_resampled, y_resampled = preprocessor.balance_classes(
#     data_encoded, data["plays"], method="smote"
# )

# # Inspect TF-IDF features
# vocab, features = preprocessor.inspect_tfidf_features("music")

In [16]:
# distribution = preprocessor.analyze_class_distribution(data["plays"], n_bins=10)

In [17]:
# vocab, features = preprocessor.inspect_tfidf_features("music")

In [18]:
# # inspect all tfidf features
# feature_names = ["genre", "music", "artist"]
# vocab_features = [
#     preprocessor.inspect_tfidf_features(feature_name) for feature_name in feature_names
# ]
# vocab, features = zip(*vocab_features)

### model.py

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class EnhancedListNetLoss(nn.Module):
    """ListNet ranking loss plus an Intra-List Similarity (ILS) regulariser.

    The ILS term penalises pairs of *distinct* items that are both similar in
    feature space and close to each other in predicted score.
    """

    def __init__(self, k=10, ils_weight=0.1, temperature=1.0):
        """
        Enhanced ListNet Loss with Intra-List Similarity regularization

        Args:
            k (int): Top-k items to consider
            ils_weight (float): Weight for the ILS regularization term
            temperature (float): Temperature for similarity scaling (must be > 0)

        Raises:
            ValueError: If ``temperature`` is not strictly positive.
        """
        super().__init__()
        if temperature <= 0:
            raise ValueError("temperature must be strictly positive")
        self.k = k
        self.ils_weight = ils_weight
        self.temperature = temperature

    def compute_similarity_matrix(self, features):
        """
        Compute pairwise similarities between items in the batch

        Args:
            features (torch.Tensor): Combined feature representation [batch_size, feature_dim]

        Returns:
            torch.Tensor: Cosine similarities divided by the temperature,
            shape [batch_size, batch_size].
        """
        normalized_features = F.normalize(features, p=2, dim=1)
        similarity_matrix = torch.mm(normalized_features, normalized_features.t())
        return similarity_matrix / self.temperature

    def compute_ils_penalty(self, similarity_matrix, rankings):
        """
        Compute ILS penalty based on item similarities and their positions in ranking

        Only pairs of distinct items contribute: the diagonal (every item
        compared with itself) is excluded, which matches the
        ``batch_size * (batch_size - 1)`` normaliser.

        Args:
            similarity_matrix (torch.Tensor): Pairwise similarity matrix [batch_size, batch_size]
            rankings (torch.Tensor): Predicted scores, [batch_size, 1] or [batch_size]

        Returns:
            torch.Tensor: Scalar penalty; zero when the batch has fewer than
            two items (there are no pairs to compare).
        """
        # Accept both column vectors and flat vectors; a flat vector would
        # otherwise make ``rankings - rankings.t()`` collapse to all zeros.
        rankings = rankings.reshape(-1, 1)
        batch_size = rankings.size(0)
        if batch_size < 2:
            return similarity_matrix.new_zeros(())

        # Closer predicted scores -> weight nearer to 1 -> larger penalty.
        position_diff = (rankings - rankings.t()).abs()
        position_weights = torch.exp(-position_diff)

        weighted = similarity_matrix * position_weights
        pair_sum = weighted.sum() - weighted.diagonal().sum()
        return pair_sum / (batch_size * (batch_size - 1))

    def combine_features(self, genre_features, artist_features, music_features):
        """
        Combine different feature types with appropriate weighting

        Each feature block is L2-normalised per row, scaled by a fixed weight
        (genre 0.4, artist 0.3, music 0.3) and concatenated along dim 1.
        """
        genre_norm = F.normalize(genre_features, p=2, dim=1)
        artist_norm = F.normalize(artist_features, p=2, dim=1)
        music_norm = F.normalize(music_features, p=2, dim=1)

        # You can adjust these weights based on importance
        combined = torch.cat(
            [
                genre_norm * 0.4,  # Higher weight for genre diversity
                artist_norm * 0.3,
                music_norm * 0.3,
            ],
            dim=1,
        )

        return combined

    def forward(self, y_pred, y_true, genre_features, artist_features, music_features):
        """
        Forward pass computing both ListNet loss and ILS regularization

        Args:
            y_pred (torch.Tensor): Predicted scores [batch_size, 1] (or [batch_size])
            y_true (torch.Tensor): True scores [batch_size, 1] (or [batch_size])
            genre_features (torch.Tensor): Genre TF-IDF features
            artist_features (torch.Tensor): Artist TF-IDF features
            music_features (torch.Tensor): Music TF-IDF features

        Returns:
            tuple: ``(total_loss, parts)`` where ``parts`` holds the Python
            floats ``listnet_loss``, ``ils_penalty`` and ``total_loss``.
        """
        # Flatten both score tensors so that a [batch, 1] / [batch] mix cannot
        # broadcast into a [batch, batch] product.
        pred_scores = y_pred.reshape(-1)
        true_scores = y_true.reshape(-1)

        # Original ListNet loss: cross-entropy between the two top-1
        # probability distributions over the batch.
        P_y_pred = F.softmax(pred_scores, dim=0)
        P_y_true = F.softmax(true_scores, dim=0)
        listnet_loss = -torch.sum(
            P_y_true * torch.log(P_y_pred + 1e-10)
        ) / pred_scores.size(0)

        combined_features = self.combine_features(
            genre_features, artist_features, music_features
        )
        similarity_matrix = self.compute_similarity_matrix(combined_features)
        ils_penalty = self.compute_ils_penalty(similarity_matrix, pred_scores)

        total_loss = listnet_loss + self.ils_weight * ils_penalty

        return total_loss, {
            "listnet_loss": listnet_loss.item(),
            "ils_penalty": ils_penalty.item(),
            "total_loss": total_loss.item(),
        }

    def get_diversity_metric(
        self, genre_features, artist_features, music_features, y_pred, k=10
    ):
        """
        Compute diversity metric for top-k recommendations

        The metric is ``1 - mean similarity`` over all pairs of distinct items
        among the ``min(k, batch_size)`` highest scored ones, using the
        temperature-scaled similarities from ``compute_similarity_matrix``.

        Returns:
            float: Diversity score (higher is more diverse); ``1.0`` when
            fewer than two items are selected, since no pair exists.
        """
        scores = y_pred.reshape(-1)
        top_k = min(k, scores.size(0))
        if top_k < 2:
            return 1.0

        combined_features = self.combine_features(
            genre_features, artist_features, music_features
        )
        similarity_matrix = self.compute_similarity_matrix(combined_features)

        _, top_k_indices = torch.topk(scores, k=top_k)

        # Average similarity between distinct top-k items (lower is more diverse)
        top_k_similarities = similarity_matrix[top_k_indices][:, top_k_indices]
        pair_sum = top_k_similarities.sum() - top_k_similarities.diagonal().sum()
        diversity_score = 1.0 - pair_sum / (top_k * (top_k - 1))

        return diversity_score.item()


class HybridRecommender(nn.Module):
    """Feed-forward recommender over embedded categorical and dense features.

    Gender and release year go through embedding tables; the music, artist
    and genre feature vectors go through one linear + batch-norm + ReLU block
    each. Everything is concatenated with the user-id vector and the numerical
    features, batch-normalised and passed through four hidden layers with
    skip projections, ending in a sigmoid output.
    """

    def __init__(
        self,
        num_genders,
        num_music_items,
        num_genres,
        num_artist_features,
        num_numerical_features,
        num_release_years,
        embedding_dim,
        hidden_dims=(256, 128, 64, 32),
        dropout_prob=0.3,
        user_id_dim=32,
    ):
        """
        Args:
            num_genders (int): Number of distinct gender ids.
            num_music_items (int): Width of the music feature vector.
            num_genres (int): Width of the genre feature vector.
            num_artist_features (int): Width of the artist feature vector.
            num_numerical_features (int): Number of numerical features.
            num_release_years (int): Number of distinct release-year ids.
            embedding_dim (int): Size of every embedding / projected block.
            hidden_dims (sequence of int): Sizes of the four hidden layers;
                only the first four entries are used.
            dropout_prob (float): Dropout of the first hidden layer; later
                layers use 0.8x, 0.6x and 0.4x of it.
            user_id_dim (int): Width of the ``user_id_hashed`` input.

        Raises:
            ValueError: If fewer than four hidden sizes are given.
        """
        super().__init__()

        if len(hidden_dims) < 4:
            raise ValueError("hidden_dims must provide at least four layer sizes")
        self.user_id_dim = user_id_dim

        # Embeddings
        self.gender_embedding = nn.Embedding(num_genders, embedding_dim)
        self.release_year_embedding = nn.Embedding(num_release_years, embedding_dim)

        # Feature transformations with batch norm
        self.music_fc = nn.Sequential(
            nn.Linear(num_music_items, embedding_dim),
            nn.BatchNorm1d(embedding_dim),
            nn.ReLU(),
        )
        self.genre_fc = nn.Sequential(
            nn.Linear(num_genres, embedding_dim),
            nn.BatchNorm1d(embedding_dim),
            nn.ReLU(),
        )
        self.artist_fc = nn.Sequential(
            nn.Linear(num_artist_features, embedding_dim),
            nn.BatchNorm1d(embedding_dim),
            nn.ReLU(),
        )

        # Width of the concatenated input: user-id vector, five blocks of
        # ``embedding_dim`` (gender, release year, music, artist, genre) and
        # the numerical features.
        self.expected_dim = user_id_dim + embedding_dim * 5 + num_numerical_features

        # Input normalization
        self.input_bn = nn.BatchNorm1d(self.expected_dim)

        # Main network layers
        self.layer1 = nn.Sequential(
            nn.Linear(self.expected_dim, hidden_dims[0]),
            nn.BatchNorm1d(hidden_dims[0]),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
        )

        self.layer2 = nn.Sequential(
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.BatchNorm1d(hidden_dims[1]),
            nn.ReLU(),
            nn.Dropout(dropout_prob * 0.8),
        )

        self.layer3 = nn.Sequential(
            nn.Linear(hidden_dims[1], hidden_dims[2]),
            nn.BatchNorm1d(hidden_dims[2]),
            nn.ReLU(),
            nn.Dropout(dropout_prob * 0.6),
        )

        self.layer4 = nn.Sequential(
            nn.Linear(hidden_dims[2], hidden_dims[3]),
            nn.BatchNorm1d(hidden_dims[3]),
            nn.ReLU(),
            nn.Dropout(dropout_prob * 0.4),
        )

        # Skip projections: each maps the input of layer i to the output
        # width of layer i+1 so the two can be added.
        self.res1 = nn.Linear(self.expected_dim, hidden_dims[1])
        self.res2 = nn.Linear(hidden_dims[0], hidden_dims[2])
        self.res3 = nn.Linear(hidden_dims[1], hidden_dims[3])

        # Output layer
        self.output = nn.Sequential(nn.Linear(hidden_dims[3], 1), nn.Sigmoid())

    def forward(
        self,
        user_id_hashed,
        artist_features,
        gender_id,
        music_features,
        genre_features,
        numerical_features,
        release_year,
    ):
        """
        Score one batch.

        1-D ``user_id_hashed``, ``gender_id``, ``release_year`` and
        ``numerical_features`` inputs are treated as column vectors.

        Returns:
            torch.Tensor: Sigmoid scores of shape [batch_size, 1].
        """
        # Ensure all inputs are 2D
        if user_id_hashed.dim() == 1:
            user_id_hashed = user_id_hashed.unsqueeze(1)
        if gender_id.dim() == 1:
            gender_id = gender_id.unsqueeze(1)
        if release_year.dim() == 1:
            release_year = release_year.unsqueeze(1)
        if numerical_features.dim() == 1:
            numerical_features = numerical_features.unsqueeze(1)

        # Embedding lookups need flat integer indices
        gender_embedded = self.gender_embedding(gender_id.long().squeeze(-1))
        release_year_embedded = self.release_year_embedding(
            release_year.long().squeeze(-1)
        )

        # Process features through FC layers
        music_embedded = self.music_fc(music_features.float())
        artist_embedded = self.artist_fc(artist_features.float())
        genre_embedded = self.genre_fc(genre_features.float())

        parts = [
            user_id_hashed.float(),
            gender_embedded,
            release_year_embedded,
            music_embedded,
            artist_embedded,
            genre_embedded,
            numerical_features.float(),
        ]

        # Checked before concatenating so a mismatch is reported with this
        # message rather than as a shape error from torch.cat.
        expected_batch_size = user_id_hashed.size(0)
        assert all(
            part.size(0) == expected_batch_size for part in parts
        ), "Batch size mismatch in features"

        concat_features = torch.cat(parts, dim=1)

        assert concat_features.shape[1] == self.expected_dim, \
            f"Expected {self.expected_dim} features but got {concat_features.shape[1]}"

        x = self.input_bn(concat_features)
        x1 = self.layer1(x)
        x2 = self.layer2(x1) + self.res1(x)
        x3 = self.layer3(x2) + self.res2(x1)
        x4 = self.layer4(x3) + self.res3(x2)

        return self.output(x4)


### train.py

In [25]:
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from preprocessing import DataPreprocessor
from model import HybridRecommender, EnhancedListNetLoss
import numpy as np
import torch

# ---------------------------------------------------------------------------
# Training configuration
# ---------------------------------------------------------------------------

# Mini-batch sizes handed to the training / validation DataLoaders.
TRAIN_BATCH_SIZE, VAL_BATCH_SIZE = 256, 128

# Maximum number of passes over the training data.
NUM_EPOCHS = 20
EMBEDDING_DIM = 64

# AdamW optimizer settings.
LEARNING_RATE = 0.01
WEIGHT_DECAY = 0.01

# EarlyStopping settings.
EARLY_STOPPING_PATIENCE = 5
EARLY_STOPPING_MIN_DELTA = 0.001

# ReduceLROnPlateau settings.
SCHEDULER_PATIENCE = 2
SCHEDULER_FACTOR = 0.5
SCHEDULER_MIN_LR = 1e-6

L2_LAMBDA = 1e-4

# Numerical feature columns fed to the model.
# "danceability", "energy", "loudness" and "valence" are intentionally NOT
# part of this list (they are disabled here).
NUMERICAL_COLS = [
    "age",
    "duration",
    "acousticness",
    "key",
    "mode",
    "speechiness",
    "instrumentalness",
    "liveness",
    "tempo",
    "time_signature",
    "explicit",
    "dance_valence",
    "energy_loudness",
]


class EarlyStopping:
    """Stop training when the validation loss stops improving.

    Each call compares the new validation loss with the best one seen so far.
    An improvement of more than ``min_delta`` checkpoints the model and resets
    the patience counter; anything else increments the counter, and once it
    reaches ``patience`` the ``early_stop`` flag is set.

    Args:
        patience: number of consecutive non-improving calls tolerated.
        min_delta: minimum decrease in loss that counts as an improvement.
        save_path: file the best ``state_dict`` is written to.
    """

    def __init__(self, patience=5, min_delta=0.001, save_path="models/best_model.pth"):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float("inf")  # any finite first loss is an improvement
        self.early_stop = False
        self.save_path = save_path

    def __call__(self, val_loss, model):
        """Record ``val_loss`` and update ``counter`` / ``early_stop``."""
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.save_checkpoint(model)
            self.counter = 0
        else:
            self.counter += 1
            print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, model):
        """Save model when validation loss decreases."""
        import os  # local import: ``os`` is not imported at the top of this cell

        # torch.save does not create missing parent directories, so make sure
        # the target folder exists before the first checkpoint is written.
        checkpoint_dir = os.path.dirname(self.save_path)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)
        torch.save(model.state_dict(), self.save_path)


def prepare_tensor_data(features, target, device, numerical_cols=None):
    """Convert a feature frame and its target into tensors on ``device``.

    Args:
        features: DataFrame holding ``user_id``, ``gender_encoded``,
            ``release_year_encoded``, the numerical columns and the
            ``genre_tfidf_*`` / ``artist_tfidf_*`` / ``music_tfidf_*`` columns.
        target: pandas object whose ``.values`` are the training targets.
        device: torch device (or device string) the tensors are moved to.
        numerical_cols: optional iterable of numerical column names. Defaults
            to the module-level ``NUMERICAL_COLS`` so this function cannot
            drift away from the shared column list.

    Returns:
        dict with keys ``user_id``, ``gender_ids``, ``genre_features``,
        ``artist_features``, ``music_features``, ``numerical_features``,
        ``release_years`` and ``target`` (the latter with a trailing unit axis).

    Raises:
        KeyError: if any requested numerical column is absent from ``features``.
    """
    if numerical_cols is None:
        numerical_cols = NUMERICAL_COLS
    numerical_cols = list(numerical_cols)

    # Fail early with an explicit message instead of a pandas indexing error
    # when the column list and the engineered features disagree.
    missing = [col for col in numerical_cols if col not in features.columns]
    if missing:
        raise KeyError(f"Numerical columns missing from features: {missing}")

    def _prefixed_block(prefix):
        # Gather every column starting with ``prefix`` into one float tensor.
        cols = [col for col in features.columns if col.startswith(prefix)]
        return torch.tensor(features[cols].values, dtype=torch.float).to(device)

    return {
        # Each user_id cell holds a vector; stack them into a 2-D array first.
        "user_id": torch.tensor(
            np.stack(features["user_id"].tolist()), dtype=torch.float
        ).to(device),
        "gender_ids": torch.tensor(
            features["gender_encoded"].values, dtype=torch.long
        ).to(device),
        "genre_features": _prefixed_block("genre_tfidf_"),
        "artist_features": _prefixed_block("artist_tfidf_"),
        "music_features": _prefixed_block("music_tfidf_"),
        "numerical_features": torch.tensor(
            features[numerical_cols].values, dtype=torch.float
        ).to(device),
        "release_years": torch.tensor(
            features["release_year_encoded"].values, dtype=torch.long
        ).to(device),
        "target": torch.tensor(target.values, dtype=torch.float)
        .unsqueeze(1)
        .to(device),
    }


def create_dataloader(tensor_data, batch_size=128, shuffle=True):
    """Wrap the tensors of ``tensor_data`` in a ``DataLoader``.

    The dataset columns are, in order: user_id, artist_features, gender_ids,
    music_features, genre_features, numerical_features, release_years, target.
    On any failure the shape of every tensor is printed before the error is
    re-raised.
    """
    field_order = (
        "user_id",
        "artist_features",
        "gender_ids",
        "music_features",
        "genre_features",
        "numerical_features",
        "release_years",
        "target",
    )
    try:
        columns = [tensor_data[field] for field in field_order]
        loader = DataLoader(
            TensorDataset(*columns), batch_size=batch_size, shuffle=shuffle
        )
    except Exception as e:
        # Dump diagnostics, then let the caller see the original exception.
        print(f"Error creating DataLoader: {e}")
        print("Tensor shapes:")
        for key, tensor in tensor_data.items():
            print(f"{key}: {tensor.shape}")
        raise
    return loader


def validate_model(model, dataloader, criterion, device):
    """Return the mean per-batch loss of ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled. Each batch
    must unpack into the eight tensors produced by ``create_dataloader``;
    ``criterion`` must return a ``(loss, details)`` pair.
    """
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for (
            user_vec,
            artist_vec,
            gender_idx,
            music_vec,
            genre_vec,
            numeric_vec,
            year_idx,
            labels,
        ) in dataloader:
            scores = model(
                user_vec,
                artist_vec,
                gender_idx,
                music_vec,
                genre_vec,
                numeric_vec,
                year_idx,
            )
            # Only the scalar loss is needed here; the detail dict is ignored.
            loss_value, _ = criterion(scores, labels, genre_vec, artist_vec, music_vec)
            running_loss += loss_value.item()
    return running_loss / len(dataloader)


def train(model, dataloader, criterion, optimizer, device):
    """Run one optimization pass over ``dataloader``.

    Puts the model in training mode, performs a forward/backward/step cycle
    per batch, prints the loss breakdown of every batch and returns the mean
    per-batch loss.
    """
    model.train()
    running_loss = 0.0
    for (
        user_vec,
        artist_vec,
        gender_idx,
        music_vec,
        genre_vec,
        numeric_vec,
        year_idx,
        labels,
    ) in dataloader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        scores = model(
            user_vec,
            artist_vec,
            gender_idx,
            music_vec,
            genre_vec,
            numeric_vec,
            year_idx,
        )

        loss, loss_details = criterion(scores, labels, genre_vec, artist_vec, music_vec)

        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # Per-batch breakdown of the loss components.
        print(
            f"ListNet Loss: {loss_details['listnet_loss']}, ILS Penalty: {loss_details['ils_penalty']}, Total Loss: {loss_details['total_loss']}"
        )

    return running_loss / len(dataloader)


def main():
    """Run the end-to-end training pipeline.

    Steps: load and encode the engineered dataset, split it, build tensors and
    data loaders, train a ``HybridRecommender`` with early stopping and a
    plateau LR scheduler, reload the best checkpoint, report per-user ranking
    metrics on the test split and save the final weights under ``models/``.

    Raises:
        ValueError: if the loaded data is empty or the ``plays`` target column
            is missing or lost during preprocessing.
        Exception: any other failure is logged and re-raised.
    """
    import os  # local import: ``os`` is not imported at the top of this cell

    # Bound before the try block so the KeyboardInterrupt handler can tell
    # whether a model already exists (an interrupt may arrive during loading).
    model = None

    try:
        # Define device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {device}")

        # Checkpoints and final weights are written here; torch.save does not
        # create missing directories.
        os.makedirs("models", exist_ok=True)

        # Initialize the preprocessor
        preprocessor = DataPreprocessor(
            test_size=0.2,
            random_state=42,
            max_artist_features=3995,
            max_genre_features=21,
            max_music_features=5784,
        )

        # Load and preprocess data
        filepath = "../../data/engineered_data.csv"
        try:
            data = preprocessor.load_data(filepath)
        except Exception as e:
            print(f"Error loading data from {filepath}: {e}")
            raise

        if data.empty:
            raise ValueError("Loaded data is empty")

        # Ensure 'plays' column exists
        if "plays" not in data.columns:
            raise ValueError("Target column 'plays' not found in input data")

        data_encoded = preprocessor.encode_features_train(data)

        # Add plays column to encoded data if not present
        if "plays" not in data_encoded.columns:
            data_encoded["plays"] = data["plays"]

        features = preprocessor.feature_engineering(data_encoded)

        # Verify plays column exists before splitting
        if "plays" not in features.columns:
            raise ValueError("Target column 'plays' lost during preprocessing")

        (
            train_features,
            val_features,
            test_features,
            train_target,
            val_target,
            test_target,
        ) = preprocessor.split_data(features)
        # Save preprocessors
        preprocessor.save_preprocessors(directory="models/")

        # Convert features to tensors
        train_tensors = prepare_tensor_data(train_features, train_target, device)
        val_tensors = prepare_tensor_data(val_features, val_target, device)
        test_tensors = prepare_tensor_data(test_features, test_target, device)

        # Create dataloaders
        train_dataloader = create_dataloader(
            train_tensors, batch_size=TRAIN_BATCH_SIZE, shuffle=True
        )
        val_dataloader = create_dataloader(
            val_tensors, batch_size=VAL_BATCH_SIZE, shuffle=False
        )

        def _prefixed_columns(prefix):
            # Names of the training columns that start with ``prefix``.
            return [col for col in train_features.columns if col.startswith(prefix)]

        music_feature_cols = _prefixed_columns("music_tfidf_")
        num_genres = len(_prefixed_columns("genre_tfidf_"))
        num_artist_features = len(_prefixed_columns("artist_tfidf_"))
        num_genders = len(preprocessor.gender_encoder.classes_)
        num_release_years = len(preprocessor.release_year_encoder.categories_[0])
        # Derived from the tensor that is actually fed to the model so the
        # declared width cannot drift from the numerical column list.
        num_numerical_features = train_tensors["numerical_features"].shape[1]

        # Debug output: feature dimensions used to size the model
        print("Feature dimensions:")
        print(f"Gender classes: {num_genders}")
        print(f"Music TF-IDF features: {len(music_feature_cols)}")
        print(f"Genre TF-IDF features: {num_genres}")
        print(f"Artist TF-IDF features: {num_artist_features}")
        print(f"Release year classes: {num_release_years}")
        print(f"Actual music features count: {len(music_feature_cols)}")
        print(f"First few music feature names: {music_feature_cols[:5]}")

        model = HybridRecommender(
            num_genders=num_genders,
            num_music_items=len(music_feature_cols),
            num_genres=num_genres,
            num_artist_features=num_artist_features,
            num_numerical_features=num_numerical_features,
            num_release_years=num_release_years,
            embedding_dim=EMBEDDING_DIM,
        ).to(device)

        # After model initialization
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Total parameters: {total_params:,}")
        print(f"Trainable parameters: {trainable_params:,}")
        print(f"Non-trainable parameters: {total_params - trainable_params:,}")

        # Define optimizer and scheduler
        optimizer = torch.optim.AdamW(
            model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY
        )
        scheduler = ReduceLROnPlateau(
            optimizer,
            mode="min",
            factor=SCHEDULER_FACTOR,
            patience=SCHEDULER_PATIENCE,
            min_lr=SCHEDULER_MIN_LR,
        )

        # Define loss function
        criterion = EnhancedListNetLoss(k=10)

        early_stopping = EarlyStopping(
            patience=EARLY_STOPPING_PATIENCE,
            min_delta=EARLY_STOPPING_MIN_DELTA,
            save_path="models/best_model.pth",
        )

        # Training loop
        for epoch in range(NUM_EPOCHS):
            # train() and validate_model() already return per-batch means,
            # so the values are logged as-is (no second division).
            epoch_loss = train(model, train_dataloader, criterion, optimizer, device)
            val_loss = validate_model(model, val_dataloader, criterion, device)

            # Log before the early-stopping check so the final epoch is reported too
            print(
                f"Epoch [{epoch+1}/{NUM_EPOCHS}], Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}"
            )

            # Early Stopping check
            early_stopping(val_loss, model)
            if early_stopping.early_stop:
                print("Early stopping triggered")
                break

            scheduler.step(val_loss)

        # Load the best model before evaluation
        try:
            model.load_state_dict(
                torch.load("models/best_model.pth", map_location=device)
            )
            model = model.to(device)
        except Exception as e:
            print(f"Warning: Could not load best model: {e}")
            print("Using last model state instead")

        # Evaluation on the full test split in a single forward pass
        model.eval()
        with torch.no_grad():
            predictions = model(
                test_tensors["user_id"],
                test_tensors["artist_features"],
                test_tensors["gender_ids"],
                test_tensors["music_features"],
                test_tensors["genre_features"],
                test_tensors["numerical_features"],
                test_tensors["release_years"],
            )

        # Convert tensors to numpy arrays
        predictions_np = predictions.cpu().numpy().reshape(-1)
        test_target_np = test_tensors["target"].cpu().numpy().reshape(-1)
        test_user_ids = test_features["user_id"].values

        # Evaluate the model per user
        evaluate_model_per_user(predictions_np, test_target_np, test_user_ids, k=10)

        # Save the trained model
        torch.save(model.state_dict(), "models/model.pth")
        print("Model saved to 'models/model.pth'")
        print("Training complete!")

        # Run summary
        print(f"Data loaded successfully. Shape: {data.shape}")
        print(f"Training samples: {len(train_dataloader.dataset)}")
        print(f"Validation samples: {len(val_dataloader.dataset)}")
        print(f"Model parameters: {sum(p.numel() for p in model.parameters())}")

    except KeyboardInterrupt:
        print("\nTraining interrupted by user")
        if model is not None:
            # Save the model in its current state
            torch.save(model.state_dict(), "models/interrupted_model.pth")
            print("Model saved to 'models/interrupted_model.pth'")
        else:
            print("Interrupted before the model was created; nothing to save")
    except Exception as e:
        print(f"Error during training: {e}")
        raise
    finally:
        # Cleanup
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


def ndcg_at_k(y_true, y_pred, k):
    """Normalized discounted cumulative gain over the top ``k`` predictions.

    Both inputs are flattened. The raw ``y_true`` values are used as gains and
    each rank ``r`` (0-based) is discounted by ``log2(r + 2)``. Returns 0.0
    when the ideal DCG is not positive.
    """
    gains = y_true.reshape(-1)
    scores = y_pred.reshape(-1)

    def _dcg(ranking):
        # Discounted sum of the gains of the items in ``ranking`` order.
        return np.sum(
            [gains[item] / np.log2(rank + 2) for rank, item in enumerate(ranking)]
        )

    # DCG of the model's ranking (highest predicted score first).
    achieved = _dcg(np.argsort(scores)[::-1][:k])
    # DCG of the best possible ranking (highest true value first).
    ideal = _dcg(np.argsort(gains)[::-1][:k])

    if ideal > 0:
        return achieved / ideal
    return 0.0


def precision_at_k(y_true, y_pred, k):
    """Overlap-based Precision@k.

    Counts how many of the ``k`` highest-scored items (by ``y_pred``) are also
    among the ``k`` highest items by ``y_true`` and divides that count by ``k``.
    Both inputs are flattened first.
    """
    truth = y_true.reshape(-1)
    scores = y_pred.reshape(-1)

    predicted_top = np.argsort(scores)[::-1][:k]
    ideal_top = np.argsort(truth)[::-1][:k]

    # Both index arrays contain unique positions, so the size of their
    # intersection is the number of hits.
    hits = np.intersect1d(predicted_top, ideal_top).size
    return hits / k

def recall_at_k(y_true, y_pred, k):
    """Overlap-based Recall@k.

    Counts how many of the ``k`` highest-scored items (by ``y_pred``) are also
    among the ``k`` highest items by ``y_true`` and divides that count by the
    number of items in the true top-k set. Both inputs are flattened first.
    """
    truth = y_true.reshape(-1)
    scores = y_pred.reshape(-1)

    predicted_top = np.argsort(scores)[::-1][:k]
    ideal_top = np.argsort(truth)[::-1][:k]

    # Both index arrays contain unique positions, so the size of their
    # intersection is the number of hits.
    hits = np.intersect1d(predicted_top, ideal_top).size
    return hits / len(ideal_top)

def f1_score_at_k(y_true, y_pred, k):
    """Harmonic mean of ``precision_at_k`` and ``recall_at_k``.

    Returns 0.0 when both precision and recall are zero.
    """
    precision = precision_at_k(y_true, y_pred, k)
    recall = recall_at_k(y_true, y_pred, k)

    denominator = precision + recall
    return 0.0 if denominator == 0 else 2 * (precision * recall) / denominator

def evaluate_model_per_user(predictions, test_targets, user_ids, k=10):
    """Evaluate model performance per user using ranking metrics.

    Rows are grouped by user id. For every user, NDCG@k is computed on the raw
    targets, while Precision/Recall/F1@k are computed on targets binarized at
    that user's 80th percentile. The per-user averages are printed and returned.

    Args:
        predictions: 1-D array-like of predicted scores, one per row.
        test_targets: 1-D array-like of true target values, one per row.
        user_ids: sequence with one id per row. Each id may be an iterable
            (e.g. a hashed-id vector) or a plain scalar.
        k: cut-off used for every ranking metric.

    Returns:
        dict with the mean ``ndcg``, ``precision``, ``recall`` and ``f1``
        (0 for each metric when there is nothing to evaluate).

    Raises:
        Exception: any failure is logged with input diagnostics and re-raised.
    """
    ndcg_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    def _user_key(uid):
        # Arrays/lists are not hashable; turn each id into a tuple. Scalar ids
        # are not iterable, so wrap them in a 1-tuple instead of failing.
        try:
            return tuple(uid)
        except TypeError:
            return (uid,)

    try:
        predictions = np.asarray(predictions)
        test_targets = np.asarray(test_targets)

        # Single pass: collect the row positions of every user. This replaces
        # a full rescan of all rows for each unique user.
        rows_by_user = {}
        for row, uid in enumerate(user_ids):
            rows_by_user.setdefault(_user_key(uid), []).append(row)

        for rows in rows_by_user.values():
            user_pred = predictions[rows]
            user_true = test_targets[rows]

            # Items at or above the user's 80th percentile count as relevant.
            relevance_threshold = np.percentile(user_true, 80)
            user_true_binary = (user_true >= relevance_threshold).astype(int)

            ndcg_scores.append(ndcg_at_k(user_true, user_pred, k))
            precision_scores.append(precision_at_k(user_true_binary, user_pred, k))
            recall_scores.append(recall_at_k(user_true_binary, user_pred, k))
            f1_scores.append(f1_score_at_k(user_true_binary, user_pred, k))

        # Calculate and print averages
        if ndcg_scores:
            print(f"Average NDCG@{k}: {np.mean(ndcg_scores):.4f}")
            print(f"Average Precision@{k}: {np.mean(precision_scores):.4f}")
            print(f"Average Recall@{k}: {np.mean(recall_scores):.4f}")
            print(f"Average F1-score@{k}: {np.mean(f1_scores):.4f}")
        else:
            print("No valid predictions found for evaluation")

    except Exception as e:
        # getattr keeps the diagnostics themselves from raising when an input
        # is a plain list without shape/dtype attributes.
        print(f"Error during evaluation: {str(e)}")
        print(
            f"user_ids shape: {getattr(user_ids, 'shape', None)}, dtype: {getattr(user_ids, 'dtype', None)}"
        )
        print(
            f"predictions shape: {getattr(predictions, 'shape', None)}, dtype: {getattr(predictions, 'dtype', None)}"
        )
        print(
            f"test_targets shape: {getattr(test_targets, 'shape', None)}, dtype: {getattr(test_targets, 'dtype', None)}"
        )
        raise

    return {
        "ndcg": np.mean(ndcg_scores) if ndcg_scores else 0,
        "precision": np.mean(precision_scores) if precision_scores else 0,
        "recall": np.mean(recall_scores) if recall_scores else 0,
        "f1": np.mean(f1_scores) if f1_scores else 0,
    }

# Entry point: run the training pipeline only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


Using device: cuda
Error during training: "['danceability', 'energy', 'loudness', 'valence'] not in index"


KeyError: "['danceability', 'energy', 'loudness', 'valence'] not in index"