In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
from sklearn.metrics.pairwise import cosine_similarity

# import import_ipynb # an extension that allows me to import a ipynb file 
from mf import *
from knn import *

In [None]:
# # Getting KNN and MF song recommendations

# # Test / example user input 
# playlist_knn = ["million years _ gareth emery", "fool's gold _ jill scott", "firefly _ breaking benjamin"]
# playlist_mf = ["firefly _ breaking benjamin", "greyhound _ swedish house mafia", "girlfriend _ bobby brown"]

# user_songs = ['greyhound _ swedish house mafia', 'saturday night _ the herbaliser', 'time to pretend _ mgmt']

# playlist_mf = mf_recommender(user_songs, 20)
# playlist_knn = knn_recommender(user_songs, 20)

# print("MF Recommended Songs:", playlist_mf)
# print("KNN Recommended Songs:", playlist_knn)


# songs attribute dataset

In [9]:
# Load dataset
df = pd.read_csv("cleaned_data/songs_cleaned.csv")

# Numerical features used for every similarity comparison in this notebook
feature_columns = [
    "danceability", "energy", "key", "loudness", "mode", "speechiness",
    "acousticness", "instrumentalness", "liveness", "valence", "tempo"
]

# Map each 'track_key' to its float feature vector.
# The feature columns are converted to a float matrix in one vectorized call
# instead of building a Series per row with df.iterrows(); zipping the keys
# with the matrix rows yields the same mapping (if a track_key occurs more
# than once, the last row wins, exactly as in a dict comprehension).
song_feature_dict = dict(
    zip(df["track_key"], df[feature_columns].to_numpy(dtype=float))
)

print(f"Loaded {len(song_feature_dict)} songs into feature dictionary.")


Loaded 8516 songs into feature dictionary.


# average the feature vectors of each recommended playlist

In [10]:
def get_average_vector(song_list, feature_dict):
    """
    Averages the feature vectors of the songs in a playlist

    Songs that are not present in feature_dict are skipped.

    Parameters:
        song_list (list): Song keys of the playlist
        feature_dict (dict): Dictionary mapping song keys to feature vectors

    Returns:
        np.array: Element-wise mean of the matched feature vectors, or a zero
                  vector of the same length when no song is matched
    """
    matched = []
    for song in song_list:
        if song in feature_dict:
            matched.append(feature_dict[song])

    if len(matched) == 0:
        # No song was found: fall back to zeros sized like any stored vector
        sample_vector = next(iter(feature_dict.values()))
        return np.zeros(len(sample_vector))

    return np.mean(matched, axis=0)

# Compute vectors for both recommended playlists 
# knn_vector = get_average_vector(playlist_knn, song_feature_dict)
# mf_vector = get_average_vector(playlist_mf, song_feature_dict)


# Min-Max Normalization of Euclidean Distance

In [11]:
# Min-Max Normalization of Euclidean Distance 
# Min-Max scaling is more accurate than Max-Distance Scaling because it considers dataset-wide min/max


def normalize_euclidean_distance(songs_df, non_normal_euclidean, min_dist=0, columns=None):
    """
    Auxiliary function to normalize a given euclidean distance calculation using min-max normalization

    Parameters:
        songs_df (pd.DataFrame): Dataset whose per-column min/max define the largest possible distance
        non_normal_euclidean (float): Raw euclidean distance to normalize
        min_dist (float): Smallest distance considered possible (0 if two identical songs exist)
        columns (list): Feature columns to use; defaults to the notebook-level
                        feature_columns when omitted

    Returns:
        float: (distance - min_dist) / (max_dist - min_dist), or 0 when max_dist
               is not greater than min_dist
    """
    if columns is None:
        # Keep the original behaviour for existing two/three-argument callers
        columns = feature_columns

    features = songs_df[list(columns)]

    # Worst case: distance between the per-feature minimum and maximum corners
    max_dist = euclidean(features.min().values, features.max().values)

    # Same guard as a plain "x if max_dist > min_dist else 0": avoids dividing by
    # zero, and a NaN max_dist also falls through to 0
    if max_dist > min_dist:
        return (non_normal_euclidean - min_dist) / (max_dist - min_dist)
    return 0


# similarity computation

In [12]:
def compute_similarity(songs_df, vec1, vec2):
    """
    Computes the raw and normalized euclidean distance and the cosine
    similarity between two feature vectors

    Parameters:
        songs_df (pd.DataFrame): Songs dataset forwarded to normalize_euclidean_distance
        vec1 (np.array): First feature vector
        vec2 (np.array): Second feature vector

    Returns:
        tuple: (euclidean distance, normalized euclidean distance, cosine similarity)
    """
    raw_distance = euclidean(vec1, vec2)
    scaled_distance = normalize_euclidean_distance(songs_df, raw_distance)

    # cosine_similarity is called on 2-D inputs, so each vector becomes a one-row matrix
    row_a = vec1.reshape(1, -1)
    row_b = vec2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)

    return raw_distance, scaled_distance, similarity_matrix[0][0]

# Compute similarity between KNN and MF recommendations
# euclidean_dist, normalized_euclidean_dist, cosine_sim = compute_similarity(df, knn_vector, mf_vector)

# print(f"Euclidean Distance: {euclidean_dist:.4f}")
# print(f"Normalized Euclidean Distance: {normalized_euclidean_dist:.4f}")
# print(f"Cosine Similarity: {cosine_sim:.4f}")


# Jaccard similarity

In [13]:
# Jaccard similarity ranges from 0 to 1; 1 means that two sets are exactly the same, 
# 0 means that the two sets do not have any element in common

def jaccard_similarity(playlist_a, playlist_b):
    """
    Computes the Jaccard similarity of two playlists, treating each as a set of tracks

    Parameters:
        playlist_a (list): First playlist
        playlist_b (list): Second playlist

    Returns:
        float: Number of shared tracks divided by the number of distinct tracks
               in either playlist; 0 when both playlists are empty
    """
    tracks_a, tracks_b = set(playlist_a), set(playlist_b)

    all_tracks = tracks_a.union(tracks_b)
    if not all_tracks:
        return 0  # Avoid division by zero

    shared_tracks = tracks_a.intersection(tracks_b)
    return len(shared_tracks) / len(all_tracks)

# # Test
# similarity = jaccard_similarity(playlist_knn, playlist_mf)
# print(f"Jaccard Similarity: {similarity:.2f}")



# Experiment

### Input: 1, 5, 10, ..., 50 songs as playlists
### Output: 1, 5, 10, ..., 50 songs as a playlist

## We want to compare these (euclidean, normalized euclidean, cosine, and jaccard)

In [14]:
# function to get the test playlists

def user_sample_songs(s, songs_df=None, random_state=None):
    """
    Draws a random test playlist of s songs

    Parameters:
        s (int): Number of songs to sample
        songs_df (pd.DataFrame): Dataset with a 'track_key' column to sample from;
                                 when omitted, cleaned_data/songs_cleaned.csv is read
        random_state (int | np.random.Generator | None): Passed to DataFrame.sample;
                                 set it to make the sampled playlist reproducible

    Returns:
        list: 'track_key' values of the sampled songs
    """
    if songs_df is None:
        # Original behaviour: load the dataset from disk on every call
        songs_df = pd.read_csv("cleaned_data/songs_cleaned.csv")

    # get a sample of s songs from the dataset (rows are drawn without replacement)
    playlist_df = songs_df.sample(n=s, random_state=random_state)

    return playlist_df["track_key"].tolist()


In [15]:
# Getting KNN and MF song recommendations
def knn_mf_recs(user_songs, r):
    """
    Gets r song recommendations from both recommenders for the same user playlist

    Parameters:
        user_songs (list): Songs in the user's playlist
        r (int): Number of recommendations requested from each recommender

    Returns:
        tuple: (KNN recommendations, MF recommendations)
    """
    # The MF recommender is called before the KNN recommender
    mf_result = mf_recommender(user_songs, r)
    knn_result = knn_recommender(user_songs, r)

    return knn_result, mf_result

In [19]:
# Column order of the results table; it matches the list returned by simulation()
df_cols = [
    "user playlist",
    "knn recs",
    "mf recs",
    "s",
    "r",
    "euclidean dist",
    "normalized euclidean dist",
    "cosine sim",
    "jaccard sim",
]

In [20]:
# Empty results table with the columns listed in df_cols; simulation results are appended to it row by row
experiment_df = pd.DataFrame(columns=df_cols)

In [24]:
def simulation(s, r):
    """
    Runs one experiment trial and returns its results as a single row

    Parameters:
        s (int): Number of songs sampled for the user's playlist
        r (int): Number of recommendations requested from each recommender

    Returns:
        list: [user playlist, knn recs, mf recs, s, r, euclidean dist,
               normalized euclidean dist, cosine sim, jaccard sim]
    """
    # Random user playlist with s songs, then r recommendations from each model
    user_playlist = user_sample_songs(s)
    knn_recs, mf_recs = knn_mf_recs(user_playlist, r)

    # One averaged feature vector per recommended playlist (KNN first, then MF)
    knn_averg_vector, mf_averg_vector = (
        get_average_vector(recs, song_feature_dict) for recs in (knn_recs, mf_recs)
    )

    # Feature-space metrics on the averaged vectors
    feature_metrics = compute_similarity(df, knn_averg_vector, mf_averg_vector)
    euclidean_dist, normalized_euclidean_dist, cosine_sim = feature_metrics

    # Assemble the row: inputs, feature-space metrics, then track-set overlap
    row = [user_playlist, knn_recs, mf_recs, s, r]
    row.extend([euclidean_dist, normalized_euclidean_dist, cosine_sim])
    row.append(jaccard_similarity(knn_recs, mf_recs))
    return row

#### example of how this is gonna work using s = 1 and r = 1

In [25]:
# Single trial run: a 1-song user playlist with 1 recommendation requested from each recommender
sim = simulation(1, 1)


Recommended Songs:
My Bag


In [27]:
# Append the result as a new row, labelled with the current row count
experiment_df.loc[len(experiment_df)] = sim

In [28]:
experiment_df

Unnamed: 0,user playlist,knn recs,mf recs,s,r,euclidean dist,normalized euclidean dist,cosine sim,jaccard sim
0,[a mirage _ the essence],[my bag _ lloyd cole and the commotions],[the addict _ bo saris],1,1,17.755155,0.09822,0.997409,0.0


#### now doing s = 5, 10, ..., 50 and r = 5, 10, ..., 50 (10 values of s × 10 values of r = 100 simulations)

In [None]:
# Run every (s, r) combination on the 5, 10, ..., 50 grid: 10 x 10 = 100 simulations
grid_values = range(5, 51, 5)
grid = [(s_value, r_value) for s_value in grid_values for r_value in grid_values]

for c, (i, j) in enumerate(grid, start=1):
    sim = simulation(i, j)
    experiment_df.loc[len(experiment_df)] = sim
    print(c)  # running count of completed simulations

In [31]:
# Save the results collected so far to similarities.csv, without the index column
experiment_df.to_csv("similarities.csv", index=False)

In [32]:
experiment_df.head()

Unnamed: 0,user playlist,knn recs,mf recs,s,r,euclidean dist,normalized euclidean dist,cosine sim,jaccard sim
0,[a mirage _ the essence],[my bag _ lloyd cole and the commotions],[the addict _ bo saris],1,1,17.755155,0.09822,0.997409,0.0
1,"[one in a million _ aaliyah, honey love _ r. k...","[piedra _ caifanes, twork it out _ usher, nigh...","[the kids (feat. janelle monáe) _ b.o.b, honey...",5,5,7.129031,0.039437,0.999333,0.0
2,[freeloader - spencer & hill radio edit _ dave...,"[flashlight _ r3hab, hair of the dog _ nazaret...","[close to me - remastered _ the cure, without ...",5,10,13.306499,0.07361,0.999937,0.0
3,"[let's straighten it out _ latimore, little ga...","[boom, boom, boom _ willie clayton, is this lo...","[la gozadera _ gente de zona, awa adounia _ li...",5,15,2.584997,0.0143,0.999828,0.0
4,"[eyes wide open _ sabrina carpenter, rock and ...","[make your mark _ drew ryan scott, liquid conf...","[eyes wide open _ sabrina carpenter, babylon (...",5,20,11.995161,0.066356,0.999383,0.0


In [None]:
# Single-song user playlist (s = 1) with r = 5, 10, ..., 50 recommendations: 10 more rows
for i in range(5, 51, 5):
    sim = simulation(1, i)
    experiment_df.loc[len(experiment_df)] = sim
    print(i)  # progress indicator: the r value just completed

In [35]:
# Re-save similarities.csv so it also contains the s = 1 rows (the file is rewritten in full)
experiment_df.to_csv("similarities.csv", index=False)

In [None]:
# Single recommendation (r = 1) for user playlists of s = 5, 10, ..., 50 songs: 10 more rows
for j in range(5, 51, 5):
    sim = simulation(j, 1)
    experiment_df.loc[len(experiment_df)] = sim
    print(j)  # progress indicator: the s value just completed

In [37]:
# Re-save similarities.csv so it also contains the r = 1 rows (the file is rewritten in full)
experiment_df.to_csv("similarities.csv", index=False)

In [38]:
# Rows appended above: 1 (s = r = 1) + 100 (10 x 10 grid) + 10 (s = 1) + 10 (r = 1) = 121
len(experiment_df)

121