# Recommender Systems

## Mounting the Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. ANNOYINDEX with Automated Playlist Generation

#### Installing required libraries 

In [2]:
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting annoy
  Downloading annoy-1.17.2.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.4/647.4 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.2-cp310-cp310-linux_x86_64.whl size=582731 sha256=a7a590e012d334741a88d0316b65b79f2f5e945b8aeb17b658f4f70c9f738d7b
  Stored in directory: /root/.cache/pip/wheels/7a/d9/59/473fa56df8e39430eeda369500b4e7127f5b243ba24c3c4297
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.2


In [3]:
import pandas as pd
import numpy as np
import random

from annoy import AnnoyIndex
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances


#### Data Preprocessing

In [4]:
#Loading the dataset and extracting the features
data = pd.read_csv('/content/drive/Shareddrives/CMPE 256 Project/preprocessed_track_features.csv')
features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
data_features = data[features].values

In [5]:
#Normalizing the features
scaler = StandardScaler()
scaled_data_features = scaler.fit_transform(data_features)

#### Creating ANNOY Index

In [6]:
#Building ANNOY index
n_trees = 10
n_dimensions = len(features)
index = AnnoyIndex(n_dimensions, 'angular')

for i, feature_vector in enumerate(scaled_data_features):
    index.add_item(i, feature_vector)

index.build(n_trees)

True

#### Finding nearest neighbours 

In [7]:
#Finding nearest neighbours
n_neighbors = 20
nearest_neighbors = {}
for i in range(len(scaled_data_features)):
    nearest_neighbors[i] = index.get_nns_by_item(i, n_neighbors)

#### Finding Recommendations 

In [8]:
track_id_to_index = {track_id: index for index, track_id in enumerate(data['id'])}
index_to_track_id = {index: track_id for track_id, index in track_id_to_index.items()}


In [150]:
seed_track_id = random.choice(data['id'].values) # Replace this with the id of your seed track
seed_track_id = 15
playlist_length = 50

seed_index = track_id_to_index[seed_track_id]
playlist_indices = [seed_index]

for i in range(playlist_length - 1):
    last_index = playlist_indices[-1]
    # Choose the first nearest neighbor that is not already in the playlist
    for neighbor in nearest_neighbors[last_index]:
        if neighbor not in playlist_indices:
            next_index = neighbor
            break
    playlist_indices.append(next_index)

# Convert the integer indices back to track IDs
playlist_track_ids = [index_to_track_id[index] for index in playlist_indices]
features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

print('Input Song:', seed_track_id)
print("Generated Playlist:")
for i, index in enumerate(playlist_indices, start=1):
    track_id = str(index_to_track_id[index])
    #track_id=15
    track_features_1 = data.loc[data['id'] == int(track_id), features]
    track_features_2 = data.loc[data['id'] == seed_track_id, features]
    #print(track_features_1.to_numpy())
    #print(track_features_2.to_numpy())
    track_similarity = cosine_similarity(track_features_1.to_numpy(), track_features_2.to_numpy())
    print(f"{i}. {track_id}   Cosine Similarity -> {track_similarity[0]}")

Input Song: 15
Generated Playlist:
1. 15   Cosine Similarity -> [1.]
2. 143830   Cosine Similarity -> [0.99989346]
3. 910443   Cosine Similarity -> [0.99978429]
4. 182371   Cosine Similarity -> [0.99970844]
5. 470026   Cosine Similarity -> [0.99977013]
6. 796608   Cosine Similarity -> [0.999618]
7. 296381   Cosine Similarity -> [0.99936321]
8. 866776   Cosine Similarity -> [0.99965792]
9. 130597   Cosine Similarity -> [0.99981725]
10. 994466   Cosine Similarity -> [0.99974773]
11. 236855   Cosine Similarity -> [0.99968997]
12. 1157868   Cosine Similarity -> [0.9996592]
13. 914740   Cosine Similarity -> [0.99960714]
14. 488149   Cosine Similarity -> [0.9996597]
15. 316065   Cosine Similarity -> [0.99988023]
16. 273503   Cosine Similarity -> [0.99967629]
17. 52040   Cosine Similarity -> [0.99985231]
18. 424367   Cosine Similarity -> [0.99973561]
19. 342742   Cosine Similarity -> [0.99977344]
20. 306421   Cosine Similarity -> [0.99974667]
21. 374375   Cosine Similarity -> [0.99940903]
22.

#### Transition Smoothness

In [48]:
def pairwise_cosine_similarity(playlist, data, data_features):
    similarities = []
    for i in range(len(playlist) - 1):
        track_1_id = playlist[i]
        track_2_id = playlist[i + 1]
        
        track_1_index = data[data['id'] == track_1_id].index[0]
        track_2_index = data[data['id'] == track_2_id].index[0]
        
        track_1_features = data_features[track_1_index]
        track_2_features = data_features[track_2_index]
        
        similarity = cosine_similarity([track_1_features], [track_2_features])[0][0]
        similarities.append(similarity)
    return similarities

In [49]:
similarities = pairwise_cosine_similarity(playlist_track_ids, data, scaled_data_features)

In [50]:
average_similarity = np.mean(similarities)
print("Average Cosine Similarity:", average_similarity)

Average Cosine Similarity: 0.9954234962814188


In [51]:
# Assuming playlist_track_ids contains the track IDs of the generated playlist
playlist_indices = [track_id_to_index[track_id] for track_id in playlist_track_ids]

#### Intra-list Diversity 

In [52]:
def intra_list_diversity(playlist, features):
    pairwise_distances = cosine_distances(features[playlist])
    avg_distance = np.mean(pairwise_distances)
    return avg_distance


In [53]:
# Calculate Intra-list diversity
diversity_score = intra_list_diversity(playlist_indices, scaled_data_features)
print(f"Intra-list Diversity: {diversity_score}")

Intra-list Diversity: 0.03551929424582136


## 2. Knowledge Based Graph Recommender Systems

## Modules Required for Knowledge Based Graph

##Creating the playlist dataframe with the help for playlist_features.csv

In [56]:
import pandas as pd
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ast
playlist_df = pd.read_csv("/content/drive/Shareddrives/CMPE 256 Project/playlist_features_500.csv")
track_df = pd.read_csv("/content/drive/Shareddrives/CMPE 256 Project/preprocessed_track_features.csv")

##Creating the Graph using the Playlist IDs as nodes

In [57]:
# Create a graph using the playlist IDs as nodes
G = nx.Graph()
G.add_nodes_from(list(playlist_df['playlist_id']))

## Cosine Similarity

In [58]:
# Calculate the cosine similarity between the playlist features
cosine_sim = cosine_similarity(playlist_df.iloc[:, 2:])

##Adding the edges on based on the cosine similarity to the edges

In [59]:
# Add edges to the graph based on the cosine similarity
for i in range(len(cosine_sim)):
    for j in range(i + 1, len(cosine_sim)):
        if cosine_sim[i][j] > 0.8:  # Add an edge if the cosine similarity is greater than 0.8
            G.add_edge(playlist_df.iloc[i]['playlist_id'], playlist_df.iloc[j]['playlist_id'])

##__Function for Recommending playlists based on the user's input of track features__

In [61]:
### Updated 1
# Define a function to recommend playlists based on the input track's features
def recommend_playlists(track_features, num_recommendations=5):
    # Calculate the cosine similarity between the input track's features and the playlist features
    track_sim = cosine_similarity([track_features], playlist_df.iloc[:, 2:])
    # Get the playlist IDs with the highest cosine similarity and their corresponding similarity scores
    top_playlists_indexes = track_sim[0].argsort()[::-1][:num_recommendations]
    top_playlists = list(playlist_df.iloc[top_playlists_indexes]['playlist_id'])
    cosine_similarity_score = list(track_sim[0][top_playlists_indexes])
    # Get the subgraph of the top playlists
    subgraph = G.subgraph(top_playlists)
    # Get the connected components of the subgraph
    components = list(nx.connected_components(subgraph))
    # Sort the connected components by size and get the largest one
    components.sort(key=len, reverse=True)
    largest_component = components[0]
    # Get the playlist IDs in the largest component
    playlist_ids = list(largest_component)
    # Get the playlist information from the playlist_df
    recommended_playlists = playlist_df[playlist_df['playlist_id'].isin(playlist_ids)]
    # Return the recommended playlists along with their cosine similarity scores
    return recommended_playlists, cosine_similarity_score

##__An Example for Recommending playlists based on the user's input of track features__

In [62]:
# Example usage:
# Define the track features for the input track
desired_columns = ['acousticness', 'danceability', 'key', 'loudness', 'mode', 'tempo', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence']
track_id=15
track_features = track_df.loc[track_df['id'] == track_id, desired_columns]
#track_features = [0.5, 0.7, 0.8, 0.2, 0.4, 0.6, 0.1]
#print(track_features.to_numpy()[0])
# Recommend playlists based on the input track's features
num_recommendations=5
recommended_playlists, similarity_score = recommend_playlists(track_features.to_numpy()[0], num_recommendations)
# Print the recommended playlists
#print(recommended_playlists, similarity_score)

In [63]:
# Print the recommended playlists
print(f"Top {num_recommendations} playlists based on the input track's features:\n")
k=1
for i, row in recommended_playlists.iterrows():
    print(f"Sr. No:{k}")
    similarity_s=similarity_score[k-1]
    print(f"Similarity_Score:{similarity_s}")
    print(f"Playlist ID: {row['playlist_id']}")
    print(f"Tracks: {row['track_ids']}")
    print(f"Acousticness: {row['acousticness']:.2f}")
    print(f"Danceability: {row['danceability']:.2f}")
    print(f"Energy: {row['energy']:.2f}")
    print(f"Instrumentalness: {row['instrumentalness']:.2f}")
    print(f"Liveness: {row['liveness']:.2f}")
    print(f"Speechiness: {row['speechiness']:.2f}")
    print(f"Valence: {row['valence']:.2f}")
    print()
    k+=1

Top 5 playlists based on the input track's features:

Sr. No:1
Similarity_Score:0.999938414345819
Playlist ID: 45
Tracks: [1161585, 933448, 520932, 715736, 1028764, 57345, 463756, 461019, 457157, 1079898, 49309, 835481, 828232, 834810, 802617]
Acousticness: 0.01
Danceability: 0.30
Energy: 0.80
Instrumentalness: 0.03
Liveness: 0.09
Speechiness: 0.05
Valence: 0.36

Sr. No:2
Similarity_Score:0.9999263149253689
Playlist ID: 217
Tracks: [463074, 850162, 419251, 75903, 524584, 616114, 811665, 1088645, 639491]
Acousticness: 0.03
Danceability: 0.66
Energy: 0.56
Instrumentalness: 0.86
Liveness: 0.13
Speechiness: 0.03
Valence: 0.30

Sr. No:3
Similarity_Score:0.9999207449185021
Playlist ID: 275
Tracks: [985471, 505846, 812004, 1000928, 441220, 101982, 1030109, 974532, 612487, 418824]
Acousticness: 0.36
Danceability: 0.48
Energy: 0.70
Instrumentalness: 0.00
Liveness: 0.67
Speechiness: 0.40
Valence: 0.72

Sr. No:4
Similarity_Score:0.9999085152862887
Playlist ID: 316
Tracks: [49265, 111656, 672125, 

##__Adding the playlist_features and track_features data into Pandas Dataframe__

In [64]:
def recommend_tracks(playlist_id, num_tracks):
    # Get the track ids for the given playlist
    playlist_track_ids = [int(tid) for tid in playlist_df[playlist_df['playlist_id']==playlist_id]['track_ids'].iloc[0][1:-1].split(", ")]
    
    # Remove tracks that are already in the playlist from consideration
    track_features = track_df[~track_df['id'].isin(playlist_track_ids)].iloc[:, 3:].values
    
    # Find the cosine similarity between the playlist and each track
    playlist_features = playlist_df[playlist_df['playlist_id']==playlist_id].iloc[:, 2:].values
    similarities = cosine_similarity(track_features, playlist_features)
    
    # Get the indices of the top num_tracks tracks based on cosine similarity
    top_indices = np.argsort(similarities.flatten())[::-1][:num_tracks]
    
    # Get the track ids and their corresponding cosine similarity score for the top recommended tracks
    top_tracks = track_df[~track_df['id'].isin(playlist_track_ids)].iloc[top_indices, [0]].copy()
    top_tracks['cosine_similarity_score'] = similarities.flatten()[top_indices]
    
    return top_tracks

In [65]:
recommend_tracks(playlist_id=1, num_tracks=5)

Unnamed: 0,id,cosine_similarity_score
293872,293872,0.999997
538908,538908,0.999996
663556,663556,0.999995
108323,108323,0.999994
304542,304542,0.999994


## 3. Deep Neural Network

### Importing the libraries used for Deep Neural Network

In [157]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras import backend as K
from keras import backend as K

### Loading the playlist features from the google drive

In [158]:
# Convert the track_ids column into a list of integers
playlist_df_1 = pd.read_csv("/content/drive/Shareddrives/CMPE 256 Project/playlist_features_500.csv")
playlist_df_1['playlist_id'] = playlist_df_1['playlist_id'].apply(lambda x: x - 1)

### Creatign the trainable features and output from the dataframe generated from the playlist features

In [159]:
#print(playlist_df['playlist_id'])
X = playlist_df_1.drop(['playlist_id', 'track_ids'], axis=1)
y = playlist_df_1['playlist_id']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [160]:
#Checking the shape of the X
y

0        0
1        1
2        2
3        3
4        4
      ... 
495    495
496    496
497    497
498    498
499    499
Name: playlist_id, Length: 500, dtype: int64

### Normalizing the Numerical Features which are present in the Training Dataset

In [161]:
# Normalize the numerical features in the training set
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Defining the metric - Cosine Similarity and Cosine Loss functions for the model training 

In [162]:
def cosine_similarity(y_true, y_pred):
    true_norm = K.sqrt(K.sum(K.square(y_true), axis=-1, keepdims=True))
    pred_norm = K.sqrt(K.sum(K.square(y_pred), axis=-1, keepdims=True))
    dot = K.sum(y_true * y_pred, axis=-1, keepdims=True)
    return dot / (true_norm * pred_norm)

def cosine_loss(y_true, y_pred):
    y_true = K.l2_normalize(tf.cast(y_true, dtype=tf.float32), axis=-1)
    y_pred = K.l2_normalize(y_pred, axis=-1)
    return K.mean(1 - K.sum((y_true * y_pred), axis=-1))


### Defining the hyperparameters for the Training through the Grid-Search and running with the best-hyperparameters 

In [163]:
# Define the hyperparameters to tune
param_grid = {
    'units': [32,64],
    'dropout_rate': [0.4,0.6],
    'l2_reg': [0.01,0.1,0.02],
    'lr': [0.01, 0.05],
    'batch_size': [32],
    'epochs': [10,20]
}

### Creating the Sequential Model

In [168]:
# Define the model function
def create_model(units=32, dropout_rate=0.2, l2_reg=0.001, lr=0.001, epochs=200, batch_size=32):
    model = Sequential()
    model.add(Dense(units=units, activation='relu', input_dim=X_train.shape[1], kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units=units, activation='relu', kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units=units, activation='relu', kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units=units, activation='relu', kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units=units, activation='relu', kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units=units, activation='relu', kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units=units, activation='relu', kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units=units, activation='relu', kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units=units, activation='relu', kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    model.add(Flatten())
    model.add(Dense(units=len(playlist_df['playlist_id'].unique()), activation='softmax'))
    optimizer = Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy')
    return model

# Create the model
model = KerasClassifier(build_fn=create_model, verbose=0)

  model = KerasClassifier(build_fn=create_model, verbose=0)


#Using the GridSearchCV for finding the best Hyperparameters for training the Deep Neural Network

In [169]:
# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and score
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Train the model with the best hyperparameters
best_model = create_model(**grid_search.best_params_)
best_model.fit(X_train, y_train, epochs=grid_search.best_params_['epochs'], batch_size=grid_search.best_params_['batch_size'], validation_split=0.2)

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 444, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/keras/wrappers/scikit_learn.py", line 335, in score
    raise ValueError(
ValueError: The model is not configured to compute accuracy. You should pass `metrics=["accuracy"]` to the `model.compile()` method.

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan]


Best hyperparameters: {'batch_size': 32, 'dropout_rate': 0.4, 'epochs': 10, 'l2_reg': 0.01, 'lr': 0.01, 'units': 32}
Best score: nan
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0aee41f850>

### Evaluating Trained Model on Testing Data

In [170]:
# Evaluate the model's performance on the testing set
test_loss= best_model.evaluate(X_test, y_test)
print('Test loss:', test_loss)
#print('Test accuracy:', test_accuracy)

Test loss: 8.663061141967773


### Running the single Inference for checking of the model

In [171]:
track_features = [0.95045,0.191935,7,-25.10925,1,88.6685,0.0434915,0.50725,0.090595,0.04388,0.08335]
track_features = scaler.transform(np.array([track_features]))
print(track_features)
playlist_id = np.argmax(best_model.predict(np.array([track_features][0])))
print(playlist_id)

[[ 1.30702537 -1.59535571  0.46897974 -1.92059541  0.68205679 -0.99089287
  -1.53897384  0.5667631  -0.59646223 -0.35800714 -1.27751995]]
83




In [172]:
best_model.summary()

Model: "sequential_729"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7290 (Dense)          (None, 32)                384       
                                                                 
 dropout_6561 (Dropout)      (None, 32)                0         
                                                                 
 dense_7291 (Dense)          (None, 32)                1056      
                                                                 
 dropout_6562 (Dropout)      (None, 32)                0         
                                                                 
 dense_7292 (Dense)          (None, 32)                1056      
                                                                 
 dropout_6563 (Dropout)      (None, 32)                0         
                                                                 
 dense_7293 (Dense)          (None, 32)             

### Example usage by providing the track-id

In [176]:
# Example usage:
# Define the track features for the input track
desired_columns = ['acousticness', 'danceability', 'key', 'loudness', 'mode', 'tempo', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence']
track_id=307183
track_features = track_df.loc[track_df['id'] == track_id, desired_columns]
recommended_playlist = np.argmax(best_model.predict(np.array([track_features][0])))
# Print the recommended playlists
#print(playlist_df_1.loc[playlist_df_1['playlist_id'] == recommended_playlist])
playlist_features = playlist_df_1.loc[playlist_df_1['playlist_id'] == recommended_playlist, desired_columns]
track_similariity = cosine_similarity(track_features_1.to_numpy(), playlist_features.to_numpy())
print(recommended_playlist)
print("The similarity between the songs 15 and the given playlist is:- ", track_similarity)

83
The similarity between the songs 15 and the given playlist is:-  [[0.01412899]]


## 4. PCA and DBSCAN

#### Importing all the necessary libraries 

In [110]:
#importing all the libraries 
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics


#### Data Preprocessing 

In [111]:
#Loading the dataset 
data = pd.read_csv('/content/drive/Shareddrives/CMPE 256 Project/preprocessed_track_features.csv')
#data = data.sample(frac=0.9)

In [112]:
#Extracting the features
features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
X = data[features]

In [113]:
#Taking care of missing values 
X_filled = X.fillna(X.mean())


In [114]:
#Normalizing the features 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#### Principal Component Analysis 

Reducing the dimentionality of the dataset 

In [115]:
#Principal Component Analysis
n_components = 7
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)


#### DBSCAN Clustering 
Fitting the clustering model on the feature data set

In [130]:
# Perform DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=10)
clusters = dbscan.fit_predict(X_pca)

In [131]:
data['cluster'] = clusters

#### Finding Recommendations 

Here, we are finding the recommendations on a randomly selected track_id and trying to evaluate and rank the recommended tracks on the basis of cosine similarity

In [154]:
# Select a random track_id from the dataset as input
input_song = data.sample(1)
input_track_id = input_song['id'].values[0]
input_track_id = 15

In [133]:
# Extract and preprocess the features of the input song
input_song_features = input_song[features]
input_song_features_filled = input_song_features.fillna(input_song_features.mean())
input_song_features_scaled = scaler.transform(input_song_features_filled)


In [134]:
#the cluster of the input song
input_song_cluster = input_song['cluster'].iloc[0]

In [135]:
# Get initial recommendations from the same cluster
recommendations = data[data['cluster'] == input_song_cluster].sample(200)


In [136]:
# Extract and preprocess the features of the recommended songs
recommended_ids = recommendations['id'].values
recommended_features = data[data['id'].isin(recommended_ids)][features]
recommended_features_filled = recommended_features.fillna(recommended_features.mean())
recommended_features_scaled = scaler.transform(recommended_features_filled)


In [137]:
# Apply PCA to the input song and recommended songs
input_song_features_pca = pca.transform(input_song_features_scaled)
recommended_features_pca = pca.transform(recommended_features_scaled)


In [138]:
# Calculate cosine similarity between the input song and recommended songs
cosine_similarities = cosine_similarity(input_song_features_pca, recommended_features_pca)


In [139]:
# Re-rank the recommendations based on their cosine similarity to the input song
recommendations['cosine_similarity'] = cosine_similarities.flatten()
recommendations_sorted = recommendations.sort_values(by='cosine_similarity', ascending=False)


In [140]:
# Filter the recommendations based on cosine similarity threshold
cosine_similarity_threshold = 0.7
filtered_recommendations = recommendations_sorted[recommendations_sorted['cosine_similarity'] > cosine_similarity_threshold]

In [155]:
print("Input song:")
print(input_song[['id', 'name', 'artists']])
print("\nRecommended songs:")
print(filtered_recommendations[['id', 'name', 'artists', 'cosine_similarity']])

Input song:
            id            name        artists
702910  702910  Atin Ang Mundo  ['The Juans']

Recommended songs:
              id                                               name  \
563423    563423                              Laurel Canyon Sunrise   
653626    653626  La rondine: Act III: Che volete da me? (Magda,...   
661962    661962            Suite in D Minor (I: 65-71): IV. Bouree   
342091    342091                            Blue Heron/White Bridge   
886474    886474                                      Seven Murders   
601662    601662                                 Forgive Me My Love   
551901    551901                                            Vertige   
519939    519939                               Esta Noite De Natale   
971100    971100                                    Nido de Espinas   
619966    619966                    Shake You Down - Single Version   
397884    397884                                                 If   
883405    883405        

In [142]:
# Exclude noise points
non_noise_indices = clusters != -1
X_pca_no_noise = X_pca[non_noise_indices]
clusters_no_noise = clusters[non_noise_indices]

# Compute metrics
#silhouette_score = metrics.silhouette_score(X_pca_no_noise, clusters_no_noise)
davies_bouldin_score = metrics.davies_bouldin_score(X_pca_no_noise, clusters_no_noise)
calinski_harabasz_score = metrics.calinski_harabasz_score(X_pca_no_noise, clusters_no_noise)

#print("Silhouette Score (excluding noise points):", silhouette_score)
print("Davies-Bouldin Index (excluding noise points):", davies_bouldin_score)
print("Calinski-Harabasz Index (excluding noise points):", calinski_harabasz_score)

Davies-Bouldin Index (excluding noise points): 1.0215942588174676
Calinski-Harabasz Index (excluding noise points): 573.4611955534361
