# Decision Tree Recommendation System

In [1]:
import pandas as pd
import numpy as np

In [21]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

class DecisionTreeRecommender:
    def __init__(self, max_depth=10, min_sample_leaf=1):
        self.max_depth = max_depth # The maximum depth of the tree (check scikit learn docs)
        self.min_sample_leaf = min_sample_leaf # The minimum number of samples required to split an internal node (check scikit learn docs)
        # self.scaler = StandardScaler()
        self.numeric_preprocessor = StandardScaler()
        self.categorical_preprocessor = OneHotEncoder(drop='first', sparse=False)
        self.trees = None
        self.titles = None
        self.features = None
        self.numeric_features = None
        self.categorical_features = None
        self.encoded_feature_names = None

    def _preprocess_features(self, X):
        """Preprocess both numeric and categorical features"""
        # Process numeric features
        if self.numeric_features:
            numeric_scaled = self.numeric_preprocessor.fit_transform(X[self.numeric_features])
            numeric_df = pd.DataFrame(
                numeric_scaled, 
                columns=self.numeric_features
            )
        else:
            numeric_df = pd.DataFrame()
            
        # Process categorical features
        if self.categorical_features:
            categorical_encoded = self.categorical_preprocessor.fit_transform(X[self.categorical_features])
            # Get feature names after one-hot encoding
            categorical_names = self.categorical_preprocessor.get_feature_names_out(self.categorical_features)
            categorical_df = pd.DataFrame(
                categorical_encoded,
                columns=categorical_names
            )
        else:
            categorical_df = pd.DataFrame()
            
        # Combine processed features
        processed_df = pd.concat([numeric_df, categorical_df], axis=1)
        self.encoded_feature_names = processed_df.columns.tolist()
        
        return processed_df.values
    
    def _preprocess_single_sample(self, x):
        """Preprocess a single sample using fitted preprocessors"""
        # Handle numeric features
        if self.numeric_features:
            numeric_scaled = self.numeric_preprocessor.transform(x[self.numeric_features].reshape(1, -1))
            numeric_df = pd.DataFrame(
                numeric_scaled, 
                columns=self.numeric_features
            )
        else:
            numeric_df = pd.DataFrame()
            
        # Handle categorical features
        if self.categorical_features:
            categorical_encoded = self.categorical_preprocessor.transform(x[self.categorical_features].reshape(1, -1))
            categorical_names = self.categorical_preprocessor.get_feature_names_out(self.categorical_features)
            categorical_df = pd.DataFrame(
                categorical_encoded,
                columns=categorical_names
            )
        else:
            categorical_df = pd.DataFrame()
            
        # Combine processed features
        processed_df = pd.concat([numeric_df, categorical_df], axis=1)
        return processed_df.values[0]

    def fit(self, X, song_titles, numeric_features=None, categorical_features=None):
        """
        Fit the recommender system
        
        Parameters:
        -----------
        X : pandas.DataFrame
            DataFrame containing both numeric and categorical features
        song_titles : array-like
            List of song titles
        numeric_features : list
            List of numeric feature column names
        categorical_features : list
            List of categorical feature column names
        """
        self.titles = song_titles
        self.features = X
        self.numeric_features = numeric_features or []
        self.categorical_features = categorical_features or []
        
        # Preprocess features
        scaled_features = self._preprocess_features(X)
        
        # Train a tree for each processed feature
        n_features = scaled_features.shape[1]
        self.trees = []
        
        for i in range(n_features):
            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_sample_leaf=self.min_sample_leaf
            )
            tree.fit(scaled_features, scaled_features[:, i])
            self.trees.append(tree)
        
        return self
    
    def predict_features(self, x):
        """Predict feature values using the decision trees"""
        scaled_x = self._preprocess_single_sample(x)
        predictions = np.zeros_like(scaled_x)
        
        for i, tree in enumerate(self.trees):
            predictions[i] = tree.predict([scaled_x])[0]
        
        return predictions
    
    def recommend(self, song_title, top_k=5):
        """
        Recommend similar songs
        
        Parameters:
        -----------
        song_title : str
            Title of the song to find recommendations for
        top_k : int
            Number of recommendations to return
            
        Returns:
        --------
        list : List of recommended song titles
        """
        if song_title not in self.titles:
            raise ValueError(f"Song '{song_title}' not found in the training data")
        
        song_idx = list(self.titles).index(song_title)
        x = self.features.iloc[song_idx]
        
        # Get predictions for all songs
        scaled_features = self._preprocess_features(self.features)
        predictions = np.zeros_like(scaled_features)
        
        for i in range(len(self.titles)):
            if i == song_idx:
                continue
            predictions[i] = self.predict_features(self.features.iloc[i])
        
        # Compute similarities
        query_prediction = self.predict_features(x)
        similarities = cosine_similarity([query_prediction], predictions)[0]
        
        # Get top-k similar songs
        similar_indices = np.argsort(similarities)[::-1][:top_k]
        
        # Return the titles of similar songs with similarity scores
        recommendations = [
            (self.titles[idx], similarities[idx])
            for idx in similar_indices
        ]
        
        return recommendations

In [19]:
# Example usage:
def sklearn_recommendation_system(data_path, song_to_recommend, top_k=5, method='clustering'):
    # Load data
    df = pd.read_csv(data_path)

    from pandas.api.types import is_numeric_dtype
    numeric_features = [col for col in df.columns if is_numeric_dtype(df[col])]
    categorical_features = [col for col in df.columns if not is_numeric_dtype(df[col])]

    # print("Numerical columns:", numerical_cols)
    # print("Categorical columns:", categorical_cols)
    
    # Extract features and song titles
    song_titles = df['track_name'].values
    features = df.drop(['track_name'], axis=1).values
    
    # Create and fit the recommender
    recommender = DecisionTreeRecommender(max_depth=6)
    recommender.fit(
        X=df[numeric_features + categorical_features],
        song_titles=df['track_name'].values,
        numeric_features=numeric_features,
        categorical_features=categorical_features
    )
    
    # Get recommendations for a song
    song_title = "Example Song"
    recommendations = recommender.recommend(song_title, top_k=5)
    
    # Print recommendations
    print(f"\nRecommendations for '{song_title}':")
    for title, score in recommendations:
        print(f"- {title} (similarity: {score:.3f})")
        
    return recommender

In [22]:
sklearn_recommendation_system("cleaned_data/songs_cleaned.csv", "One Love")



MemoryError: Unable to allocate 10.6 GiB for an array with shape (69449, 20490) and data type float64