In [4]:
import pandas as pd
import numpy as np
from typing import List, Dict, Optional, Tuple
from scipy.spatial.distance import cdist
from metrics import intra_list_diversity, genre_coverage, feature_variance



In [23]:
def prepare_features_for_clustering(
    df: pd.DataFrame,
    feature_cols: Optional[List[str]] = None
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split a song table into its complete-case rows and their feature matrix.

    Any row holding a NaN in one of the chosen feature columns is dropped
    from both returned frames, so the two results share the same index.

    Parameters:
    -----------
    df : pandas DataFrame
        Song table; it must contain every column named in ``feature_cols``
    feature_cols : list of str, optional
        Columns to treat as features. When None or empty, a built-in list of
        14 audio/genre columns is used

    Returns:
    --------
    tuple
        (rows of ``df`` that passed the NaN filter, with all columns;
         the same rows restricted to the feature columns)
    """
    default_columns = [
        'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode',
        'speechiness', 'acousticness', 'instrumentalness', 'liveness',
        'valence', 'tempo', 'time_signature', 'encoded_genre',
    ]
    columns = feature_cols if feature_cols else default_columns

    # Selecting then dropping yields a fresh frame; the input is never modified.
    feature_frame = df[columns].dropna()

    # Re-select the surviving labels from the full table to keep both aligned.
    return df.loc[feature_frame.index], feature_frame

In [6]:
def select_starting_song(
    df: pd.DataFrame,
    starting_song_index: Optional[int] = None
) -> int:
    """
    Resolve the index label of the song that opens the playlist.

    Parameters:
    -----------
    df : pandas DataFrame
        DataFrame containing songs
    starting_song_index : int, optional
        Requested index label. It is honoured only when it is present in
        ``df.index``

    Returns:
    --------
    int
        The requested label when usable, otherwise a label drawn uniformly
        from ``df.index`` with ``np.random.choice``
    """
    requested_is_usable = (
        starting_song_index is not None and starting_song_index in df.index
    )
    if requested_is_usable:
        return starting_song_index

    # Missing or unknown request: fall back to a random song.
    return np.random.choice(df.index)

In [7]:
def get_cluster_assignments(
    model,
    df_features: pd.DataFrame
) -> np.ndarray:
    """
    Ask the clustering model which cluster each song belongs to.

    Parameters:
    -----------
    model : clustering model
        Any object exposing ``predict()``
    df_features : pandas DataFrame
        Feature rows, passed to ``model.predict`` unchanged

    Returns:
    --------
    numpy.ndarray
        Whatever ``model.predict(df_features)`` returns (presumably one
        cluster id per row; depends on the model)
    """
    assignments = model.predict(df_features)
    return assignments

In [8]:
def compute_cluster_transition_probabilities(
    model,
    current_cluster: int,
    temperature: float
) -> np.ndarray:
    """
    Compute probabilities for transitioning to each cluster.

    The probability of cluster ``j`` is proportional to
    ``exp(-d(current, j) / temperature)``, where ``d`` is the distance
    between centroids as computed by ``scipy.spatial.distance.cdist``.

    Parameters:
    -----------
    model : clustering model
        Model with a ``cluster_centers_`` attribute
    current_cluster : int
        Row of ``cluster_centers_`` to measure distances from
    temperature : float
        Controls randomness (lower = more predictable). ``0`` yields the
        deterministic limit: all probability mass goes to the cluster(s) at
        minimum distance, which includes the current one

    Returns:
    --------
    numpy.ndarray
        One probability per row of ``cluster_centers_``, summing to 1

    Raises:
    -------
    ValueError
        If ``temperature`` is negative
    """
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")

    centroids = model.cluster_centers_
    # Distances from the current centroid to every centroid (itself included).
    distances = cdist([centroids[current_cluster]], centroids)[0]

    if temperature == 0:
        # Dividing by zero would produce NaN/inf; return the limiting
        # distribution instead (uniform over the nearest centroids).
        nearest = distances == distances.min()
        return nearest / nearest.sum()

    # Smaller distance -> larger weight. The zero distance of the current
    # centroid to itself keeps the sum >= 1, so a tiny temperature cannot
    # underflow the normaliser to zero.
    probs = np.exp(-distances / temperature)
    return probs / probs.sum()

In [9]:
def select_next_cluster(
    model,
    current_cluster: int,
    unique_clusters: np.ndarray,
    temperature: float,
    explore: bool = False
) -> int:
    """
    Select the next cluster to sample from.

    Parameters:
    -----------
    model : clustering model
        Clustering model; ``cluster_centers_`` is used when present
    current_cluster : int
        Current cluster ID
    unique_clusters : numpy.ndarray
        Cluster IDs that actually occur in the data. With a centroid model
        these are used as row positions into ``cluster_centers_``
    temperature : float
        Controls randomness of the distance-based sampling
    explore : bool
        When False the current cluster is returned unchanged. When True a
        cluster is sampled: by centroid proximity if the model has
        centroids (the current cluster stays a candidate and is the most
        likely one), otherwise uniformly among the *other* clusters

    Returns:
    --------
    int
        Next cluster ID
    """
    # If not exploring, stay in current cluster
    if not explore:
        return current_cluster

    if hasattr(model, 'cluster_centers_'):
        probs = compute_cluster_transition_probabilities(
            model, current_cluster, temperature
        )
        # `probs` has one entry per centroid, but only clusters present in the
        # data are candidates. Pick their probabilities by cluster id rather
        # than assuming both arrays have the same length and order.
        candidates = np.asarray(unique_clusters)
        candidate_probs = np.asarray(probs, dtype=float)[candidates.astype(int)]
        total = candidate_probs.sum()
        if not np.isfinite(total) or total <= 0:
            # No usable distribution (e.g. NaN weights): stay where we are.
            return current_cluster
        return np.random.choice(candidates, p=candidate_probs / total)

    # No centroids available: pick uniformly among the other clusters.
    other_clusters = [c for c in unique_clusters if c != current_cluster]
    if other_clusters:
        return np.random.choice(other_clusters)
    return current_cluster

In [10]:
def get_available_songs(
    df: pd.DataFrame,
    all_clusters: np.ndarray,
    cluster: int,
    selected_indices: set
) -> List[int]:
    """
    List the songs of one cluster that are not in the playlist yet.

    Parameters:
    -----------
    df : pandas DataFrame
        DataFrame containing songs
    all_clusters : numpy.ndarray
        Cluster assignment per row of ``df``, in the same row order
    cluster : int
        Cluster ID to collect songs from
    selected_indices : set
        Index labels that have already been used

    Returns:
    --------
    list
        Index labels of the cluster's unused songs, in ``df`` row order
    """
    in_cluster = all_clusters == cluster

    remaining = []
    for song_label in df.index[in_cluster]:
        if song_label in selected_indices:
            continue
        remaining.append(song_label)
    return remaining

In [11]:
def find_cluster_with_available_songs(
    df: pd.DataFrame,
    all_clusters: np.ndarray,
    unique_clusters: np.ndarray,
    selected_indices: set
) -> Optional[int]:
    """
    Return the first cluster, in ``unique_clusters`` order, with an unused song.

    Parameters:
    -----------
    df : pandas DataFrame
        DataFrame containing songs
    all_clusters : numpy.ndarray
        Array of cluster assignments for all songs
    unique_clusters : numpy.ndarray
        Cluster IDs to scan, in the order they should be tried
    selected_indices : set
        Set of already selected song indices

    Returns:
    --------
    int or None
        ID of the first cluster for which ``get_available_songs`` returns a
        non-empty list, or None when every cluster is exhausted
    """
    clusters_with_songs = (
        cluster_id
        for cluster_id in unique_clusters
        if get_available_songs(df, all_clusters, cluster_id, selected_indices)
    )
    return next(clusters_with_songs, None)

In [12]:
def extract_human_readable_features(song: Dict) -> Dict:
    """
    Reduce a song record to its descriptive, human-readable fields.

    Parameters:
    -----------
    song : dict
        Dictionary containing song information

    Returns:
    --------
    dict
        The subset of ``artists``, ``album_name``, ``track_name`` and
        ``track_genre`` that is present in ``song`` and not NaN/None, in
        that order
    """
    wanted_fields = ('artists', 'album_name', 'track_name', 'track_genre')
    return {
        name: song[name]
        for name in wanted_fields
        if name in song and pd.notna(song[name])
    }

In [13]:
def generate_dynamic_playlist(
    df: pd.DataFrame,
    model,
    starting_song_index: Optional[int] = None,
    playlist_size: int = 20,
    temperature: float = 0.3,
    feature_cols: Optional[List[str]] = None,
    exploration_rate: float = 0.3  # 30% chance to explore
) -> List[Dict]:
    """
    Generate a playlist by stochastically sampling songs from clusters.

    Each step either stays in the current cluster or, with probability
    ``exploration_rate``, asks ``select_next_cluster`` for a new one. A
    not-yet-used song is then drawn uniformly from the chosen cluster. When
    that cluster is exhausted, the first cluster that still has songs is
    used instead.

    Parameters:
    -----------
    df : pandas DataFrame
        DataFrame containing encoded songs with features. It is not
        modified; rows with NaN features are ignored
    model : object
        Clustering model with predict() method
    starting_song_index : int, optional
        Index label of the starting song. If None or not present after the
        NaN filter, a random song is selected
    playlist_size : int
        Maximum number of songs in the playlist. Values below 1 give an
        empty playlist
    temperature : float
        Forwarded to ``select_next_cluster`` to control how strongly
        exploration favours nearby clusters
    feature_cols : list, optional
        Feature columns to use for prediction
    exploration_rate : float
        Probability of exploring other clusters at each step

    Returns:
    --------
    list
        List of dictionaries (one per song, all columns of ``df``). It is
        shorter than ``playlist_size`` when the data runs out of songs, and
        empty when no song has complete features
    """
    # Nothing requested: do not return the starting song either.
    if playlist_size < 1:
        return []

    songs, song_features = prepare_features_for_clustering(df, feature_cols)

    # Every row was dropped by the NaN filter: there is nothing to sample.
    if songs.empty:
        return []

    # One prediction pass over the whole table; reused for every lookup below.
    all_clusters = get_cluster_assignments(model, song_features)
    unique_clusters = np.unique(all_clusters)

    starting_song_index = select_starting_song(songs, starting_song_index)

    # Look the starting cluster up in the assignments already computed rather
    # than calling predict() again on a bare row array without column names.
    start_position = np.flatnonzero(songs.index == starting_song_index)[0]
    current_cluster = all_clusters[start_position]

    playlist = [songs.loc[starting_song_index].to_dict()]
    selected_indices = {starting_song_index}

    for _ in range(playlist_size - 1):
        # Decide whether to explore other clusters
        explore = np.random.random() < exploration_rate

        next_cluster = select_next_cluster(
            model, current_cluster, unique_clusters, temperature, explore
        )

        available_songs = get_available_songs(
            songs, all_clusters, next_cluster, selected_indices
        )

        # Chosen cluster is used up: fall back to any cluster with songs left.
        if not available_songs:
            next_cluster = find_cluster_with_available_songs(
                songs, all_clusters, unique_clusters, selected_indices
            )

            # Every song has been used; return the shorter playlist.
            if next_cluster is None:
                break

            available_songs = get_available_songs(
                songs, all_clusters, next_cluster, selected_indices
            )

        next_song_index = np.random.choice(available_songs)
        playlist.append(songs.loc[next_song_index].to_dict())
        selected_indices.add(next_song_index)

        # The song was drawn from `next_cluster`, so that is its cluster by
        # construction; no extra predict() call is needed.
        current_cluster = next_cluster

    return playlist

In [14]:
def save_playlist_to_csv(playlist: List[Dict], filepath: str) -> None:
    """
    Write the playlist to a CSV file, keeping only human-readable fields.

    Each song is first reduced with ``extract_human_readable_features``. The
    resulting rows are written without an index column, and a confirmation
    line is printed.

    Parameters:
    -----------
    playlist : list
        List of dictionaries containing song information
    filepath : str
        Path to save the CSV file
    """
    readable_rows = list(map(extract_human_readable_features, playlist))
    pd.DataFrame(readable_rows).to_csv(filepath, index=False)
    print(f"Playlist saved to {filepath}")

In [15]:
def display_playlist(playlist: List[Dict], show_features: bool = False) -> None:
    """
    Print the playlist as a numbered list framed by separator lines.

    Each entry shows title, artist and genre, with placeholder text for
    missing keys. The title is read from ``name`` then ``track_name``; the
    artist from ``artists`` then ``artist_name``.

    Parameters:
    -----------
    playlist : list
        List of dictionaries containing song information
    show_features : bool
        When True, songs that carry at least one of danceability, energy or
        valence also get a line with their audio features, formatted to two
        decimals in the song dictionary's own key order
    """
    trigger_keys = ('danceability', 'energy', 'valence')
    audio_keys = (
        'danceability', 'energy', 'valence', 'tempo',
        'acousticness', 'instrumentalness',
    )
    rule = '=' * 50

    print(f"\n{rule}")
    print(f"Generated Playlist ({len(playlist)} songs)")
    print(f"{rule}")

    for position, track in enumerate(playlist, start=1):
        title = track.get('name', track.get('track_name', 'Unknown Title'))
        artist = track.get('artists', track.get('artist_name', 'Unknown Artist'))
        genre = track.get('track_genre', 'Unknown Genre')
        print(f"{position}. {title} - {artist} - {genre}")

        # The feature line is optional and only for songs with core audio features.
        if not show_features:
            continue
        if not any(key in track for key in trigger_keys):
            continue

        shown = {key: value for key, value in track.items() if key in audio_keys}
        if shown:
            feature_str = ", ".join(
                [f"{key}: {value:.2f}" for key, value in shown.items()]
            )
            print(f"   Features: {feature_str}")

    print(f"{rule}")

In [16]:
# Example usage
# if __name__ == "__main__":
    # Assuming df and model are already loaded
    # df = pd.read_csv('music_data.csv')
    # model = load_compressed_model('pop', 'models')
    
    # Generate playlist
    # playlist = generate_dynamic_playlist(
    #     df, 
    #     model, 
    #     playlist_size=15, 
    #     temperature=0.4
    # )
    
    # Display the playlist
    # display_playlist(playlist, show_features=True)
    
    # Save to CSV (only human-readable features)
    # save_playlist_to_csv(playlist, 'my_dynamic_playlist.csv')

In [17]:
from clustering_functions import load_compressed_model

In [18]:
# Relative path of the processed Spotify sample CSV that the cells below load
# with pd.read_csv; it is resolved against the notebook's working directory.
DATA_FP = 'data_music_features/processed_spotify_sample.csv'

In [24]:
# The playlist generator draws from NumPy's global RNG (np.random.choice /
# np.random.random). Seed it so that re-running this cell on a fresh kernel
# produces the same playlist.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Load the song table and the pre-trained clustering model.
df = pd.read_csv(DATA_FP)
model = load_compressed_model('Baseline', 'models/25-03-07')

# A low temperature makes exploration strongly favour the nearest clusters.
playlist = generate_dynamic_playlist(
    df, 
    model, 
    playlist_size=15, 
    temperature=0.01
)

display_playlist(playlist, show_features=True)




Generated Playlist (15 songs)
1. Volta Pra Mim - Ao Vivo - Thiaguinho - pagode
   Features: danceability: 0.56, energy: 0.60, acousticness: 0.65, instrumentalness: 0.00, valence: 0.34, tempo: 0.61
2. Coincidência - Ao Vivo - Turma do Pagode - pagode
   Features: danceability: 0.57, energy: 0.77, acousticness: 0.60, instrumentalness: 0.00, valence: 0.65, tempo: 0.67
3. Echolot - Michael Wendler - party
   Features: danceability: 0.81, energy: 0.78, acousticness: 0.31, instrumentalness: 0.00, valence: 0.72, tempo: 0.54
4. Pardon - Rick Gallagher Project - piano
   Features: danceability: 0.43, energy: 0.03, acousticness: 0.99, instrumentalness: 0.95, valence: 0.06, tempo: 0.52
5. Aber dich gibt's nur einmal für mich - Die Draufgänger - party
   Features: danceability: 0.69, energy: 0.68, acousticness: 0.21, instrumentalness: 0.00, valence: 0.75, tempo: 0.40
6. Love Like This - Ben Rector - piano
   Features: danceability: 0.65, energy: 0.14, acousticness: 0.91, instrumentalness: 0.00, v



In [25]:
import pandas as pd

df = pd.read_csv(DATA_FP)
model = load_compressed_model('Baseline', 'models/25-03-07')

# Generate playlist (returns a list of dictionaries)
playlist = generate_dynamic_playlist(
    df, 
    model, 
    playlist_size=15, 
    temperature=0.01
)

# Convert playlist to a DataFrame
recommendations = pd.DataFrame(playlist)  

# Feature columns the diversity and variance metrics are computed over.
# Defined here so the cell does not rely on a variable left over in the
# kernel from an earlier, since-removed cell.
# NOTE(review): the saved output of this cell may have been produced with a
# different feature list - confirm the intended columns and re-run.
feature_cols = ['danceability', 'energy', 'valence', 'tempo']

# Evaluate playlist
ild_score = intra_list_diversity(recommendations, feature_cols)
gc_score = genre_coverage(recommendations, df)
fv_score = feature_variance(recommendations, feature_cols)

# Print results
print(f"Intra-List Diversity: {ild_score}")
print(f"Genre Coverage: {gc_score:.2%}")
print(f"Feature Variance Score: {fv_score}")


Intra-List Diversity: 0.5232647178324149
Genre Coverage: 4.76%
Feature Variance Score: 0.03812011902244186




In [26]:
# Diagnostic: compare the feature names stored on the fitted model with the
# columns of the playlist frame.
# NOTE(review): the second label says "dataset", but the value printed is
# `recommendations.columns` (the generated playlist), not `df.columns`.
print("Model trained with features:", model.feature_names_in_)  # Shows expected feature names
print("Current dataset features:", recommendations.columns.tolist())  # Shows actual columns


Model trained with features: ['explicit' 'danceability' 'energy' 'key' 'loudness' 'mode' 'speechiness'
 'acousticness' 'instrumentalness' 'liveness' 'valence' 'tempo'
 'time_signature' 'encoded_genre']
Current dataset features: ['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name', 'popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'track_genre', 'encoded_genre']
