In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
from sklearn.metrics.pairwise import cosine_similarity

# import import_ipynb  # extension that allows importing an .ipynb file as a module
from mf import *
from knn import *

In [31]:
# Getting KNN and MF song recommendations

# Test / example user input 
# playlist_knn = ["million years _ gareth emery", "fool's gold _ jill scott", "firefly _ breaking benjamin"]
# playlist_mf = ["firefly _ breaking benjamin", "greyhound _ swedish house mafia", "girlfriend _ bobby brown"]

# Seed songs, written in the "<title> _ <artist>" lowercase form used throughout this notebook.
user_songs = ['greyhound _ swedish house mafia', 'saturday night _ the herbaliser', 'time to pretend _ mgmt']

# Ask both models for recommendations from the same seed songs.
# The recommenders are presumably provided by the `mf` / `knn` star-imports, and the
# second argument is presumably the number of songs to return -- TODO confirm.
# NOTE(review): the saved output shows the MF list containing songs from `user_songs`
# (e.g. 'time to pretend _ mgmt') -- confirm whether seeds should be excluded.
playlist_mf = mf_recommender(user_songs, 20)
playlist_knn = knn_recommender(user_songs, 20)

# Show both result lists so they can be compared by eye.
print("MF Recommended Songs:", playlist_mf)
print("KNN Recommended Songs:", playlist_knn)



Recommended Songs:
Safe And Sound - Dzeko And Torres’ Digital Dreamin Remix
Stay
Kilometer (Aeroplane 'Italo 84' Remix)
Coffee and TV
Losing You
In the Air Tonight
Uprising
Delight - Original Mix
Odessa
Onye Mmanya
On Our Own - With Rap
One More Time
Dancing On My Own
Stupify
Narda
Roses
Return Of The Tres
GfC
Love Me (feat. Mase)
Dynamite
MF Recommended Songs: ['appetite for destruction _ n.w.a.', 'time to pretend _ mgmt', "don't stop your love _ keith sweat", 'feet on the ground _ nicky romero', 'the spark _ afrojack', 'vuelve _ ricky martin', 'tears dry on their own _ amy winehouse', 'bigfoot _ w&w', 'detroit vs. everybody _ eminem', 'radical - original mix _ dyro', 'mercy _ muse', 'million years _ gareth emery', 'girlfriend - single version _ bobby brown', '4am _ melanie fiona', 'better in time _ leona lewis', 'greyhound _ swedish house mafia', 'firefly _ breaking benjamin', 'love rain - (coffee shop mix) _ mos def', "fool's gold _ jill scott", 'frikitona _ plan b']
KNN Recommende

In [32]:
# Load dataset
df = pd.read_csv("cleaned_data/songs_cleaned.csv")

# Numerical features used for similarity comparison.
# NOTE(review): these columns are presumably on different scales (e.g. tempo vs.
# danceability); if so, unscaled distances are dominated by the wide-range
# columns -- confirm whether the features should be standardized first.
feature_columns = [
    "danceability", "energy", "key", "loudness", "mode", "speechiness",
    "acousticness", "instrumentalness", "liveness", "valence", "tempo"
]

# Map each 'track_key' to its float feature vector (ordered as feature_columns).
# The whole feature matrix is converted in one vectorized call instead of looping
# with iterrows(), which builds an object-dtype Series per row and is much slower.
# As before, if a track_key occurs more than once the last row wins.
# Each value is a row of one shared matrix, so treat the vectors as read-only.
song_feature_dict = dict(
    zip(df["track_key"], df[feature_columns].to_numpy(dtype=float))
)

print(f"Loaded {len(song_feature_dict)} songs into feature dictionary.")


Loaded 8516 songs into feature dictionary.


In [33]:
def get_average_vector(song_list, feature_dict):
    """
    Converts a list of song keys into an average feature vector

    Songs that are not keys of ``feature_dict`` are skipped, so the result is
    the mean over the matched songs only.

    Parameters:
        song_list (list): List of recommended songs (keys to look up)
        feature_dict (dict): Dictionary mapping song keys to feature vectors

    Returns:
        np.array: Element-wise mean of the matched feature vectors. If no song
        matches, a zero vector with the same length as the dictionary's
        vectors; if ``feature_dict`` is empty as well, an empty array.
    """
    vectors = [feature_dict[song] for song in song_list if song in feature_dict]

    if vectors:
        return np.mean(vectors, axis=0)

    # No matches: size the zero vector from any stored vector. The default of
    # None avoids a bare StopIteration when the dictionary itself is empty.
    sample = next(iter(feature_dict.values()), None)
    if sample is None:
        return np.zeros(0)
    return np.zeros(len(sample))

# Average feature vector of each recommended playlist (KNN first, then MF)
knn_vector, mf_vector = (
    get_average_vector(playlist, song_feature_dict)
    for playlist in (playlist_knn, playlist_mf)
)


In [None]:
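# Min-max normalization of the Euclidean distance
# Min-max scaling is used instead of plain max-distance scaling because it is based on the dataset-wide
# per-feature min/max; note that with the default min_dist=0 it reduces to distance / max distance.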


def normalize_euclidean_distance(songs_df, non_normal_euclidean, min_dist=0, columns=None):
    """Auxiliary function to normalize a given euclidean distance calculation using min-max normalization

    The upper bound is the Euclidean distance between the vector of per-column
    minima and the vector of per-column maxima of ``songs_df``, i.e. the
    largest distance two songs inside that bounding box could have.

    Parameters:
        songs_df (pd.DataFrame): Dataset containing the feature columns
        non_normal_euclidean (float): Raw Euclidean distance to rescale
        min_dist (float): Lower bound of the scale (0 if two identical songs exist)
        columns (list, optional): Feature columns used for the bounds. Defaults
            to the notebook-level ``feature_columns`` list.

    Returns:
        float: (distance - min_dist) / (max_dist - min_dist), or 0.0 when the
        bounds are degenerate (max_dist is not greater than min_dist).
    """
    if columns is None:
        # Backward-compatible default: the feature list defined in the notebook.
        columns = feature_columns

    features = songs_df[list(columns)]

    # Computing max possible Euclidean distance (worst case scenario)
    max_dist = euclidean(features.min().values, features.max().values)

    # Normalize distance; guard against a zero-width (or inverted) scale
    if max_dist > min_dist:
        return (non_normal_euclidean - min_dist) / (max_dist - min_dist)
    return 0.0


In [36]:
def compute_similarity(songs_df, vec1, vec2):
    """
    Compares two feature vectors with Euclidean distance and cosine similarity

    Parameters:
        songs_df (pd.DataFrame): Dataset passed on to normalize_euclidean_distance
        vec1 (np.array): First feature vector
        vec2 (np.array): Second feature vector

    Returns:
        tuple: (euclidean distance, normalized euclidean distance, cosine similarity)
    """
    raw_distance = euclidean(vec1, vec2)
    scaled_distance = normalize_euclidean_distance(songs_df, raw_distance)

    # Each vector is reshaped into a single-row matrix before the call; the
    # one pairwise score is then read from position [0][0] of the result.
    row_a = vec1.reshape(1, -1)
    row_b = vec2.reshape(1, -1)
    pairwise = cosine_similarity(row_a, row_b)

    return raw_distance, scaled_distance, pairwise[0][0]

# Compute similarity between KNN and MF recommendations
# (this compares the two averaged playlist vectors, not individual songs)
euclidean_dist, normalized_euclidean_dist, cosine_sim = compute_similarity(df, knn_vector, mf_vector)

# Report the raw distance, the normalized distance and the cosine similarity,
# each rounded to four decimal places.
print(f"Euclidean Distance: {euclidean_dist:.4f}")
print(f"Normalized Euclidean Distance: {normalized_euclidean_dist:.4f}")
print(f"Cosine Similarity: {cosine_sim:.4f}")


Euclidean Distance: 15.2409
Normalized Euclidean Distance: 0.0843
Cosine Similarity: 0.9998


In [37]:
# Jaccard similarity ranges from 0 to 1; 1 means that two sets are exactly the same, 
# 0 means that the two sets do not have any element in common

def jaccard_similarity(playlist_a, playlist_b):
    """
    Jaccard similarity of two playlists: |A & B| / |A | B|

    Duplicates inside a playlist are ignored because both inputs are turned
    into sets; entries are compared by exact equality.

    Parameters:
        playlist_a (iterable): Tracks of the first playlist
        playlist_b (iterable): Tracks of the second playlist

    Returns:
        float: Share of unique tracks present in both playlists, or 0 when
        both playlists are empty.
    """
    tracks_a, tracks_b = set(playlist_a), set(playlist_b)

    all_tracks = tracks_a.union(tracks_b)  # Unique tracks in either playlist
    if not all_tracks:
        return 0  # Avoid division by zero

    shared_tracks = tracks_a.intersection(tracks_b)  # Tracks in both playlists
    return len(shared_tracks) / len(all_tracks)

# Test: overlap between the two recommended playlists.
# Entries are compared by exact equality, so both recommenders must return
# songs in the same key format for any overlap to be detected.
similarity = jaccard_similarity(playlist_knn, playlist_mf)
print(f"Jaccard Similarity: {similarity:.2f}")



Jaccard Similarity: 0.00
