# Decision Tree Recommendation System

In [2]:
import pandas as pd
import numpy as np

In [10]:
import warnings
warnings.simplefilter("ignore", UserWarning)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

class DecisionTreeRecommender: 
    """ A recommendation system that uses a tree. The tree's main goal is to classify the song title; 
    however, with the max depth parameter, the user can control the depth. 

    Attributes:
    -----------
    max_depth : int
        The maximum depth of the tree (check scikit learn docs)
    min_samples_leaf : int
        The minimum number of samples required to split an internal node (check scikit learn docs)
    numeric_preprocessor : StandardScaler
        preprocessor for the numerical standardization
    categorical_preprocessor : OneHotEncoder
        preprocessor for the numerical standardization
    trees : array of DecisionTreeRegressor
        trees that are used to split on the best feature
    titles : array of strings
        song titles of the data
    features : array of arrays
        all other features (other than title names)
    numeric_features : array of strings
        names of the columns of the numerical features to be scaled
    categorical_features : array of strings
        names of the columns of the categorical features to be scaled
    """
    def __init__(self, max_depth=10, min_samples_leaf=1):
        self.max_depth = max_depth 
        self.min_sample_leaf = min_samples_leaf 
        self.numeric_preprocessor = StandardScaler() 
        self.categorical_preprocessor = OneHotEncoder(drop='first', sparse_output=False) 
        self.trees = None 
        self.titles = None 
        self.features = None 
        self.numeric_features = None 
        self.categorical_features = None

    def _preprocess_features(self, X):
        """
        Preprocesses both the numerical and the categorical features of the data
        
        Parameters:
        -----------
        X : pd.Dataframe
            dataframe of attributes and their values for each row
        
        Returns:
        --------
        processed_df : pd.DataFrame
            processed dataframe of both of the types of attributes
        """

        # 1. process the numerical features
        if self.numeric_features:
            numeric_scaled = self.numeric_preprocessor.fit_transform(X[self.numeric_features])
            numeric_df = pd.DataFrame(numeric_scaled, columns=self.numeric_features)
        else:
            numeric_df = pd.DataFrame()

        # 2. process the categorical features
        if self.categorical_features:
            categorical_scaled = self.categorical_preprocessor.fit_transform(X[self.categorical_features])
            # Note: because this is one-hot encoding, we need to get the new and additional names of these columns
            categorical_column_names = self.categorical_preprocessor.get_feature_names_out(self.categorical_features)
            categorical_df = pd.DataFrame(categorical_scaled, columns=categorical_column_names)

        else:
            categorical_df = pd.DataFrame()

        # 3. combine the dfs into one to be all of the processed features
        processed_df = pd.concat([numeric_df, categorical_df], axis=1)

        return processed_df
    
    def _preprocess_single_sample(self, x):
        """
        Preprocesses a single sample. This includes both numerical and categorical features

        Parameters:
        ----------
        x : series
            attributes of a single song

        Returns:
        --------
        processed_df : pd.series 
            input song's preprocessed attributes
        """
        # note: x needs to be reshaped bc preprocessors use dataframes

        # 1. process the numerical features
        if self.numeric_features:
            numeric_scaled = self.numeric_preprocessor.transform(x[self.numeric_features].reshape(1, -1))
            numeric_df = pd.DataFrame(numeric_scaled, columns=self.numeric_features)
        else:
            numeric_df = pd.DataFrame()

        # 2. process the categorical features
        if self.categorical_features:
            categorical_scaled = self.categorical_preprocessor.transform(x[self.categorical_features].reshape(1, -1))
            categorical_column_names = self.categorical_preprocessor.get_feature_names_out(self.categorical_features)
            categorical_df = pd.DataFrame(categorical_scaled, columns=categorical_column_names)
        else:
            categorical_df = pd.DataFrame()

        # combine the dfs into one and then return x (scaled) as a series
        processed_df = pd.concat([numeric_df, categorical_df], axis=1)
        return processed_df.values[0]
    
    def fit(self, X, song_titles, numeric_features=None, categorical_features=None):
        """
        Fits the decision tree to the given song data

        Parameters:
        -----------
        X : pandas.DataFrame
            DataFrame containing both numeric and categorical features
        song_titles : array-like
            List of song titles
        numeric_features : list
            List of numeric feature column names
        categorical_features : list
            List of categorical feature column names
        """
        self.titles = song_titles
        self.features = X
        self.numeric_features = numeric_features or []
        self.categorical_features = categorical_features or []

        # 1. preprocess features
        scaled_features = self._preprocess_features(X)

        # 2. train trees
        

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

class DecisionTreeRecommender:
    def __init__(self, max_depth=10, min_sample_leaf=1):
        self.max_depth = max_depth # The maximum depth of the tree (check scikit learn docs)
        self.min_sample_leaf = min_sample_leaf # The minimum number of samples required to split an internal node (check scikit learn docs)
        self.numeric_preprocessor = StandardScaler() # this is to do the numerical standardization
        self.categorical_preprocessor = OneHotEncoder(drop='first', sparse_output=False) # this is for categorical
        self.trees = None # holds all the trees
        self.titles = None # song title names
        self.features = None # all other features (other than title names)
        self.numeric_features = None 
        self.categorical_features = None
        self.encoded_feature_names = None

    def _preprocess_features(self, X):
        """Preprocess both numeric and categorical features"""
        # Process numeric features
        if self.numeric_features:
            numeric_scaled = self.numeric_preprocessor.fit_transform(X[self.numeric_features])
            numeric_df = pd.DataFrame(
                numeric_scaled, 
                columns=self.numeric_features
            )
        else:
            numeric_df = pd.DataFrame()
            
        # Process categorical features
        if self.categorical_features:
            categorical_encoded = self.categorical_preprocessor.fit_transform(X[self.categorical_features])
            # Get feature names after one-hot encoding
            categorical_names = self.categorical_preprocessor.get_feature_names_out(self.categorical_features)
            categorical_df = pd.DataFrame(
                categorical_encoded,
                columns=categorical_names
            )
        else:
            categorical_df = pd.DataFrame()
            
        # Combine processed features
        processed_df = pd.concat([numeric_df, categorical_df], axis=1)
        # self.encoded_feature_names = processed_df.columns.tolist() # this isn't used lol
        
        return processed_df.values
    
    def _preprocess_single_sample(self, x):
        """Preprocess a single sample using fitted preprocessors"""
        # Handle numeric features
        if self.numeric_features:
            numeric_scaled = self.numeric_preprocessor.transform(x[self.numeric_features].reshape(1, -1))
            numeric_df = pd.DataFrame(
                numeric_scaled, 
                columns=self.numeric_features
            )
        else:
            numeric_df = pd.DataFrame()
            
        # Handle categorical features
        if self.categorical_features:
            categorical_encoded = self.categorical_preprocessor.transform(x[self.categorical_features].reshape(1, -1))
            categorical_names = self.categorical_preprocessor.get_feature_names_out(self.categorical_features)
            categorical_df = pd.DataFrame(
                categorical_encoded,
                columns=categorical_names
            )
        else:
            categorical_df = pd.DataFrame()
            
        # Combine processed features
        processed_df = pd.concat([numeric_df, categorical_df], axis=1)
        return processed_df.values[0]

    def fit(self, X, song_titles, numeric_features=None, categorical_features=None):
        """
        Fit the recommender system
        
        Parameters:
        -----------
        X : pandas.DataFrame
            DataFrame containing both numeric and categorical features
        song_titles : array-like
            List of song titles
        numeric_features : list
            List of numeric feature column names
        categorical_features : list
            List of categorical feature column names
        """
        self.titles = song_titles
        self.features = X
        self.numeric_features = numeric_features or []
        self.categorical_features = categorical_features or []
        
        # Preprocess features
        scaled_features = self._preprocess_features(X)
        
        # Train a tree for each processed feature
        n_features = scaled_features.shape[1]
        self.trees = []
        
        for i in range(n_features):
            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_leaf=self.min_sample_leaf
            )
            tree.fit(scaled_features, scaled_features[:, i])
            self.trees.append(tree)
        
        return self
    
    def predict_features(self, x):
        """Predict feature values using the decision trees"""
        # Process numeric features
        if self.numeric_features:
            numeric_data = x[self.numeric_features].values.reshape(1, -1)
            numeric_scaled = self.numeric_preprocessor.transform(numeric_data)
            numeric_df = pd.DataFrame(
                numeric_scaled, 
                columns=self.numeric_features
            )
        else:
            numeric_df = pd.DataFrame()
            
        # Process categorical features
        if self.categorical_features:
            categorical_data = x[self.categorical_features].to_frame().T
            categorical_encoded = self.categorical_preprocessor.transform(categorical_data)
            categorical_names = self.categorical_preprocessor.get_feature_names_out(self.categorical_features)
            categorical_df = pd.DataFrame(
                categorical_encoded,
                columns=categorical_names
            )
        else:
            categorical_df = pd.DataFrame()
            
        # Combine processed features
        processed_df = pd.concat([numeric_df, categorical_df], axis=1)
        scaled_x = processed_df.values[0]
        
        predictions = np.zeros_like(scaled_x)
        for i, tree in enumerate(self.trees):
            predictions[i] = tree.predict([scaled_x])[0]
        
        return predictions
    
    def recommend(self, song_title, top_k=5):
        if song_title not in self.titles:
            raise ValueError(f"Song '{song_title}' not found in the training data")
        
        song_idx = list(self.titles).index(song_title)
        x = self.features.iloc[song_idx]
        
        # Get predictions for all songs
        predictions = np.zeros((len(self.titles), len(self.trees)))
        for i in range(len(self.titles)):
            if i == song_idx:
                continue
            predictions[i] = self.predict_features(self.features.iloc[i])
        
        # Compute similarities 
        query_prediction = self.predict_features(x)
        similarities = cosine_similarity([query_prediction], predictions)[0]
        
        # Get top-k similar songs
        similar_indices = np.argsort(similarities)[::-1][:top_k]
        
        # Return the titles of similar songs with similarity scores
        recommendations = [
            (self.titles[idx], similarities[idx])
            for idx in similar_indices
        ]
        
        return recommendations

In [None]:
# Example usage:
def sklearn_recommendation_system(data_path, song_title, top_k=5, method='clustering'):
    # Load data
    df = pd.read_csv(data_path)

    from pandas.api.types import is_numeric_dtype
    numeric_features = [col for col in df.columns if is_numeric_dtype(df[col])]
    categorical_features = [col for col in df.columns if not is_numeric_dtype(df[col])]
    
    # Create and fit the recommender
    recommender = DecisionTreeRecommender(max_depth=6)
    recommender.fit(
        X=df[numeric_features + categorical_features],
        song_titles=df['track_name'].values,
        numeric_features=numeric_features,
        categorical_features=categorical_features
    )
    
    # Get recommendations for a song
    recommendations = recommender.recommend(song_title, top_k=4)
    
    # Print recommendations
    print(f"\nRecommendations for '{song_title}':")
    for title, score in recommendations:
        print(f"- {title} (similarity: {score:.3f})")
        
    return recommender

### created a subset of the data (100 rows)

In [20]:
sklearn_recommendation_system("subset.csv", "Call You Mine")


Recommendations for 'Call You Mine':
- Need U (similarity: 0.409)
- Let Me Love You (Until You Learn To Love Yourself) (similarity: 0.386)
- Anything Could Happen (similarity: 0.349)
- Roses (similarity: 0.349)


<__main__.DecisionTreeRecommender at 0x14e580193210>

In [16]:
df = pd.read_csv("subset.csv")
df[df["track_name"] == "Not Ok"]

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
8,6aoGtdWXBkYQ2O2wnyIz2x,Not Ok,Kygo,73,3GTuto6NDtZegL6idSk183,Not Ok,2019-05-23,Dance Pop,37i9dQZF1DWZQaaqNMbbXa,pop,...,11,-8.14,0,0.0372,0.0292,2e-06,0.325,0.408,105.0,210944


In [17]:
df[df["track_name"] == "Higher Love"]

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6oJ6le65B3SEqPwMRNXWjY,Higher Love,Kygo,87,4wquJImu8RtyEuDtIAsfcE,Higher Love,2019-06-28,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,8,-7.159,1,0.0324,0.0154,6e-06,0.101,0.404,103.952,228267


In [None]:
# sklearn_recommendation_system("cleaned_data/songs_cleaned.csv", "One Love")