In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from scipy.sparse import csr_matrix

#### Load dataset

In [2]:
# Load the cleaned song table; the path is relative to the notebook's working directory.
# Later cells read its "track_key" and "track_name" columns.
songs_df = pd.read_csv("cleaned_data/songs_cleaned.csv")

In [3]:
# Shuffle every row reproducibly (fixed seed) and renumber the index from 0,
# so positional row i of X is also row i of the matrix produced from it later.
X = songs_df.sample(frac=1, random_state=42).reset_index(drop=True)

#### Determine which attributes are numerical and which are categorical (will be used for standardizing)

In [4]:
# Names of all numeric-dtype columns; these are the ones handed to the scaler.
numerical_cols = songs_df.select_dtypes(include=['number']).columns.tolist()

In [5]:
# Names of all non-numeric columns; these are the ones handed to the one-hot encoder.
# NOTE(review): this includes every string column, so identifiers such as "track_key"
# (queried with string keys further down) are one-hot encoded as features too —
# confirm that is intended.
categorical_cols = songs_df.select_dtypes(exclude=['number']).columns.tolist()

#### Using a ColumnTransformer, create a preprocessor that one-hot encodes the categorical attributes and standardizes the numerical attributes

In [6]:
# 'cat': one-hot encode the non-numeric columns (categories unseen at fit time are ignored at transform time).
# 'num': standardize the numeric columns with StandardScaler.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), categorical_cols),  
        ('num', StandardScaler(), numerical_cols)      
    ], sparse_threshold=0.3)  # Stacked output stays sparse only while its overall density is below 0.3

In [7]:
# Fit the preprocessor (not the KNN model) on the shuffled songs and transform them;
# row i of X_preprocessed is the encoded/scaled form of row i of X.
X_preprocessed = preprocessor.fit_transform(X)

In [8]:
# Get the full list of output feature names after the transformation.
# ColumnTransformer prefixes each name with its transformer's name ('cat' / 'num'),
# which is what the two prefix filters below rely on.
full_feature_names = preprocessor.get_feature_names_out()

# Column positions in X_preprocessed produced by the 'num' transformer (scaled numeric features)
num_indices = [i for i, name in enumerate(full_feature_names) if name.startswith('num')]

# Column positions in X_preprocessed produced by the 'cat' transformer (one-hot columns)
cat_indices = [i for i, name in enumerate(full_feature_names) if name.startswith('cat')]


In [12]:
# Grab the fitted sub-transformers so encoded rows can later be mapped back
# to their original values with inverse_transform.
onehot_encoder = preprocessor.named_transformers_['cat']
scaler = preprocessor.named_transformers_['num']

#### 1. KNN

In [13]:
# Euclidean nearest-neighbour index. n_neighbors=1 is only the default query size;
# knn_recommender passes its own n_neighbors on every query.
knn = NearestNeighbors(n_neighbors=1, metric='euclidean')

In [14]:
# Index every preprocessed song; neighbour indices returned by knn are row positions in X_preprocessed.
knn.fit(X_preprocessed)

In [15]:
# a) define knn recommender 
def knn_recommender(playlist_keys, r):
    """Recommend up to ``r`` songs closest to the average of a playlist.

    The playlist's songs are run through the fitted ``preprocessor``, their
    encoded/scaled vectors are averaged into a single query vector, and the
    fitted ``knn`` index is asked for that vector's nearest neighbours. Songs
    that are already part of the playlist are skipped.

    Relies on these notebook-level objects: ``songs_df``, ``preprocessor``,
    ``knn``, ``X_preprocessed``, ``onehot_encoder``, ``cat_indices`` and
    ``categorical_cols``.

    Parameters
    ----------
    playlist_keys : list or scalar
        One ``track_key`` or a list of them. A non-list value is wrapped in a list.
    r : int
        Number of recommendations wanted.

    Returns
    -------
    list of pandas.DataFrame or str
        One DataFrame per recommended song (the ``songs_df`` rows carrying that
        song's ``track_key``), nearest first. Fewer than ``r`` items are
        returned only when the index does not hold enough other songs.
        If any key is unknown, a message string naming the unknown keys is
        returned instead (kept for backward compatibility).

    Raises
    ------
    ValueError
        If the playlist is empty, if ``r`` is smaller than 1, or if
        ``"track_key"`` is not one of the one-hot encoded columns.

    Side effects
    ------------
    Prints a header line and the ``track_name`` of every recommended song.
    """

    # 1. make sure the playlist is a list (these are the track_keys)
    if not isinstance(playlist_keys, list):
        playlist_keys = [playlist_keys]
    if not playlist_keys:
        raise ValueError("playlist_keys must contain at least one track_key")

    # 2. make sure that the songs are in the song "database"
    # (build the lookup set once instead of scanning the column for every key)
    known_keys = set(songs_df["track_key"])
    missing_keys = [key for key in playlist_keys if key not in known_keys]
    if missing_keys:
        return f"These keys are not in the database: {missing_keys}"

    if r < 1:
        raise ValueError(f"r must be a positive number of recommendations, got {r}")

    # 3. process the given songs
    playlist_songs = songs_df[songs_df["track_key"].isin(playlist_keys)]
    processed_songs = preprocessor.transform(playlist_songs)

    # 4. aggregate the songs based on their scaled and encoded values
    # (np.asarray flattens the np.matrix that a sparse .mean() returns)
    playlist_vector = csr_matrix(np.asarray(processed_songs.mean(axis=0)).reshape(1, -1))

    # 5. get nearest neighbors
    # Ask for enough neighbours that r of them remain even if every playlist
    # song shows up among the nearest ones, never fewer than the former r*2,
    # and never more than the index actually contains.
    n_neighbors = min(max(r * 2, r + len(playlist_songs)), X_preprocessed.shape[0])
    _, indices = knn.kneighbors(playlist_vector, n_neighbors=n_neighbors)

    # 6. map nearest songs back to the original df and return recommendations
    # Position of track_key among the one-hot encoded columns, looked up by
    # name rather than hard-coded so a change in column order cannot silently
    # pick the wrong column.
    key_position = categorical_cols.index("track_key")

    recommendations = []

    print("\nRecommended Songs:")
    for song_idx in indices[0]:
        # a) densify the neighbour's preprocessed row
        song_array = X_preprocessed[song_idx].toarray()[0]

        # b) decode only the categorical part; the track_key is all that is needed
        decoded = onehot_encoder.inverse_transform(song_array[cat_indices].reshape(1, -1))
        track_key = decoded[0][key_position]

        # c) make sure it isn't one of the songs from the playlist
        if track_key in playlist_keys:
            continue

        # d) get the song from the original df based on key
        original_song = songs_df[songs_df["track_key"] == track_key]

        print(original_song["track_name"].iloc[0])
        recommendations.append(original_song)

        if len(recommendations) == r:
            break

    return recommendations

In [16]:
# Single-song playlist, three recommendations.
# NOTE(review): the first title printed below matches the seed track's title — presumably
# the same song stored under a different track_key; verify the catalogue is deduplicated.
test_1 = knn_recommender(["samurai - tiesto remix _ r3hab"], 3)


Recommended Songs:
Samurai - Tiësto Remix
Slammer - Original Mix
Everybody Is In The Place [Mix Cut] - I AM Hardwell Intro Edit


In [159]:
# Single-song playlist with a different seed track, three recommendations.
test_2 = knn_recommender(["clarity _ zedd"], 3)


Recommended Songs:
A Sky Full of Stars
Stay The Night - Featuring Hayley Williams Of Paramore
This Is What It Feels Like


In [161]:
# Single-song playlist with a third seed track, three recommendations.
test_3 = knn_recommender(["heroes (we could be) _ alesso"], 3)


Recommended Songs:
A Sky Full of Stars
Hurricane
Red Lights


In [164]:
# Three-song playlist built from the seeds used above; the query is the mean of their vectors.
test_4 = knn_recommender(["clarity _ zedd", "heroes (we could be) _ alesso", "samurai - tiesto remix _ r3hab"], 3)


Recommended Songs:
Dancing On My Own
This Is What It Feels Like
If I Lose Myself - Alesso vs OneRepublic
