In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the Spotify dataset from a CSV in the current working directory
# (relative path) and preview the first rows.
spot = pd.read_csv("ds4420_spotify.csv")
spot.head()

In [None]:
# Standardize the columns at positions 5 and 8 of the table
feature_positions = [5, 8]
num_spot = spot.iloc[:, feature_positions].to_numpy()

# fit the scaler on the raw values, then apply it to those same values
scaler = StandardScaler()
scaler.fit(num_spot)
scale_spot = scaler.transform(num_spot)

# preview the first five scaled rows
scale_spot[:5]

In [None]:
# Example x, y, z: rows 0, 1 and 10 of the scaled data, each as an (n, 1) column vector
x, y, z = (scale_spot[row][:, np.newaxis] for row in (0, 1, 10))
x

In [None]:
# Calculate distances
# Notice R and Python have slightly different norm functions due to rounding
#
# x, y and z are 2-D column vectors. For 2-D input np.linalg.norm interprets
# `ord` as a *matrix* norm (max column sum, largest singular value, max row
# sum), which only coincides with the vector L1 / L2 / Linf norms for this
# exact orientation -- for row vectors ord=1 and ord=inf would silently swap.
# Flattening each difference first makes NumPy use the vector norms outright.
norm_orders = {"L1": 1, "L2": 2, "Linf": np.inf}

# Keys come out in the order L1_x_z, L1_y_z, L2_x_z, L2_y_z, Linf_x_z, Linf_y_z.
distances = {
    f"{label}_{name}_z": np.linalg.norm((z - other).ravel(), ord=order)
    for label, order in norm_orders.items()
    for name, other in (("x", x), ("y", y))
}
distances

In [None]:
# Cosine similarity
# single pair: stack z and x as rows and read the off-diagonal entry
zx_rows = np.concatenate((z.T, x.T), axis=0)
zx_similarity = cosine_similarity(zx_rows)[0, 1]
print(f'S_C(z,x): {zx_similarity}')

# every pair among z, x and y (rows/columns in that order)
zxy_rows = np.concatenate((z.T, x.T, y.T), axis=0)
cosine_similarity(zxy_rows)

In [None]:
# Collaborative Filtering Example
# Five 1x4 row vectors: drg followed by st1 .. st4
drg = np.array([[-5 / 3, 0, 4 / 3, 1 / 3]])
st1 = np.array([[1, -1, 0, 0]])
st2 = np.array([[-2, 2, 0, 0]])
st3 = np.array([[0, 1 / 3, 4 / 3, -5 / 3]])
st4 = np.array([[-2 / 3, 1 / 3, 0, 1 / 3]])

# Stack them into a 5x4 array (row 0 is drg) and compare every row with every other row
stacked_rows = np.concatenate([drg, st1, st2, st3, st4], axis=0)
similarity_matrix = cosine_similarity(stacked_rows)
similarity_matrix

In [None]:
# Similarity-weighted average of the values 4 and 5, weighted by how similar
# row 0 (drg) is to rows 2 and 4 of the stacked array
weight_row2 = similarity_matrix[0, 2]
weight_row4 = similarity_matrix[0, 4]

predicted_rating = (weight_row2 * 4 + weight_row4 * 5) / (weight_row2 + weight_row4)
predicted_rating

In [None]:
# MinMax Scaling of each Score
# Blank out the diagonal (self-similarity) on a *copy*: np.fill_diagonal writes
# in place, and mutating similarity_matrix here would silently change the
# matrix that earlier cells computed and displayed.
sim_no_self = similarity_matrix.copy()
np.fill_diagonal(sim_no_self, np.nan)

# Column-wise min-max scaling. DataFrame.min()/max() skip NaN by default, so the
# blanked diagonal is ignored when finding each column's range and stays NaN
# in the result.
sim_frame = pd.DataFrame(sim_no_self)
col_min = sim_frame.min()
col_max = sim_frame.max()
sim_scores_scaled = (sim_frame - col_min) / (col_max - col_min)

sim_scores_scaled

In [None]:
# Weighted average of the values 4 and 1 using the hard-coded weights 0.510 and 1.
# NOTE(review): the weights look like min-max scaled similarities typed in by hand
# from the sim_scores_scaled output -- confirm which entries they are; indexing
# the frame directly would avoid the rounding and keep this in sync on re-run.
(.510*4 + 1*1)/(.510 + 1)

In [None]:
# content-based filtering
# cast the "explicit" column to int and append it to the scaled data as one extra column
explicit_flags = spot['explicit'].astype(int).to_numpy()
explicit = explicit_flags.reshape(-1, 1)

full_spot = np.concatenate((scale_spot, explicit), axis=1)
full_spot[:5]

In [None]:
# Song #11 corresponds to "Mr. Brightside"
# (row index 10, kept as a 1 x n_features row vector for cosine_similarity)
drg = full_spot[10].reshape(1, -1)

# Cosine similarity of every song relative to "Mr. Brightside".
# One call with (query, all songs) returns a 1 x n_songs matrix; flattening it
# gives one score per song without a Python-level loop over rows.
cosine_sim_to_drg = cosine_similarity(drg, full_spot).ravel()

# Pair each score with its title/artist and sort by similarity, descending.
# The query song is part of full_spot, so it is scored against itself too and
# will normally appear at the top of the ranking.
similarity_df = pd.DataFrame({
    'song': spot['song_title'],
    'artist': spot['artist_name'],
    'sim_scores': cosine_sim_to_drg
}).sort_values(by='sim_scores', ascending=False)

similarity_df.head()

In [None]:
# Item-Item Example
# 5 x 4 matrix; NaN marks a missing entry
item_mat = np.array(
    [
        [2, np.nan, 5, 4],
        [5, 3, np.nan, 4],
        [1, 5, 3, np.nan],
        [np.nan, 3, 4, 1],
        [3, 4, np.nan, 4],
    ]
)

# centre every column on the mean of its non-missing values (NaNs stay NaN)
column_means = np.nanmean(item_mat, axis=0)
item_mat_scaled = item_mat - column_means
item_mat_scaled

In [None]:
# Compute pairwise cosine similarities for items
# Each pair of columns is compared only on the rows where both have a value.
n_songs = item_mat_scaled.shape[1]
rated = ~np.isnan(item_mat_scaled)

# Upper-triangle column pairs in row-major order: (0,1), (0,2), ..., (n-2, n-1)
song_pairs = [
    (first, second)
    for first in range(n_songs - 1)
    for second in range(first + 1, n_songs)
]

sim_scores = []
for first, second in song_pairs:
    both_rated = rated[:, first] & rated[:, second]
    first_vec = item_mat_scaled[both_rated, first].reshape(1, -1)
    second_vec = item_mat_scaled[both_rated, second].reshape(1, -1)
    sim_scores.append(cosine_similarity(first_vec, second_vec)[0, 0])

sim_scores

In [None]:
# Predict ratings for Student 2 and Song 4
# Row 2 of item_mat has no value in column 3; estimate it as a
# similarity-weighted average of that row's values in columns 0, 1 and 2.
sim_scores = np.array(sim_scores)

# Entries 2, 4 and 5 of the flat pair list are the pairs (0,3), (1,3) and (2,3)
w_col0, w_col1, w_col2 = sim_scores[2], sim_scores[4], sim_scores[5]
r_col0, r_col1, r_col2 = item_mat[2, 0], item_mat[2, 1], item_mat[2, 2]

weighted_all = r_col0 * w_col0 + r_col1 * w_col1 + r_col2 * w_col2
Student2_Song4_a = weighted_all / (w_col0 + w_col1 + w_col2)
print("Predicted rating using all similar songs:", Student2_Song4_a)

# Same estimate, but leaving column 2 out of the average
weighted_two = r_col0 * w_col0 + r_col1 * w_col1
Student2_Song4_b = weighted_two / (w_col0 + w_col1)
print("Predicted rating using two most similar songs:", Student2_Song4_b)