# CMPE: 256 Advanced Data Mining 
## Spotify Music Recommendation System
## Approach: ANNOY + nearest neighbours


#### Installing required libraries 

In [1]:
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting annoy
  Downloading annoy-1.17.2.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.4/647.4 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.2-cp310-cp310-linux_x86_64.whl size=582731 sha256=b440d31b3f1c243375f8feeb28bd60e629a6bba45d5a78e89c8439272be0b336
  Stored in directory: /root/.cache/pip/wheels/7a/d9/59/473fa56df8e39430eeda369500b4e7127f5b243ba24c3c4297
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.2


In [2]:
import pandas as pd
import numpy as np
import random

from annoy import AnnoyIndex
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances


#### Data Preprocessing

In [33]:
#Loading the dataset and extracting the features
data = pd.read_csv('tracks_features.csv')
#data['id']= data.index
#features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence']
features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
data_features = data[features].values

In [34]:
#Normalizing the features
scaler = StandardScaler()
scaled_data_features = scaler.fit_transform(data_features)

#### Creating ANNOY Index

In [35]:
#Building ANNOY index
n_trees = 10
n_dimensions = len(features)
index = AnnoyIndex(n_dimensions, 'angular')

for i, feature_vector in enumerate(scaled_data_features):
    index.add_item(i, feature_vector)

index.build(n_trees)

True

#### Finding nearest neighbours 

In [36]:
#Finding nearest neighbours
n_neighbors = 20
nearest_neighbors = {}
for i in range(len(scaled_data_features)):
    nearest_neighbors[i] = index.get_nns_by_item(i, n_neighbors)

#### Finding Recommendations 

In [37]:
track_id_to_index = {track_id: index for index, track_id in enumerate(data['id'])}
index_to_track_id = {index: track_id for track_id, index in track_id_to_index.items()}


In [52]:
#seed_track_id = random.choice(data['id'].values) # Replace this with the id of your seed track
seed_track_id = '03fxLdlPWH9FZN83S2Gbvi'
playlist_length = 50

seed_index = track_id_to_index[seed_track_id]
playlist_indices = [seed_index]

for i in range(playlist_length - 1):
    last_index = playlist_indices[-1]
    # Choose the first nearest neighbor that is not already in the playlist
    for neighbor in nearest_neighbors[last_index]:
        if neighbor not in playlist_indices:
            next_index = neighbor
            break
    playlist_indices.append(next_index)

# Convert the integer indices back to track IDs
playlist_track_ids = [index_to_track_id[index] for index in playlist_indices]

print('Input Song:', seed_track_id)
print("Generated Playlist:")
for i, index in enumerate(playlist_indices, start=1):
    track_id = str(index_to_track_id[index])
    print(f"{i}. {track_id}")


Input Song: 03fxLdlPWH9FZN83S2Gbvi
Generated Playlist:
1. 03fxLdlPWH9FZN83S2Gbvi
2. 6bX1k4WwoqRU5lXwuLH0yQ
3. 7rWKZq4RazueaP5Y6tT11B
4. 750TW4vyRLkfSNhTuGBh1E
5. 1mp6FtMycViM8TvyAu12tK
6. 7wvOT4t4r8vNPd5479zLJy
7. 3etA2lGCZPknmBZLRc81ZY
8. 3MNhMjSdRormcb5C5NGp95
9. 0O9OPv8X9O3zXLmckWcUtO
10. 0I3u8MoPGtvVVlZ9LxdpnA
11. 1xkJJvlcrzLtTjYqmTrpav
12. 51Rhk5lqjCzYamgdKpQvoX
13. 17ucydgJWNUmrR0kZm2uU7
14. 4QaRQPE9AylK30odm6AFEd
15. 76PSv3kkDr8V6go6m68b5n
16. 2UzbK9wKqQlh94Y43su3dg
17. 71LsnVsnUUEYVSKnAoe2SG
18. 5mr7g6XWPR8o9I5EXpjAYF
19. 3oCmErrtVxXjdkEFbr8jft
20. 4pDwMKVemP3IHA3yNXrac4
21. 4SqAUapiLcsd7t3FoQUKs9
22. 6qFoWfOKi5WlPl72uB9UZS
23. 0oeC1BJsodmxf2GZOxklWj
24. 43L66KWmj6uvzHciqMh5w6
25. 1nKxVhfBYYNF8H74Leoq2q
26. 5TPsVuaNP97gGntjxSssXA
27. 7xU0pTefiUAIdjawYmaqRd
28. 0G6Or0GsOWRVyelr4iSsSj
29. 04ThlEpxqanS9fc4qbxPyX
30. 1qEXcFgUTl5PxqjBBPGjav
31. 2NrRBN2vbWmXO9HBOOYacX
32. 7wmsoiGbDymMGZquRMQJZz
33. 3ydDoNv3kA6fIYQSv7UaMf
34. 1JSWjiwFcWLU1dBXGyLJAE
35. 2EJ6CfCp5kfOpMzZIjwSuU
36. 4qdkQ

#### Transition Smoothness

In [46]:
def pairwise_cosine_similarity(playlist, data, data_features):
    similarities = []
    for i in range(len(playlist) - 1):
        track_1_id = playlist[i]
        track_2_id = playlist[i + 1]
        
        track_1_index = data[data['id'] == track_1_id].index[0]
        track_2_index = data[data['id'] == track_2_id].index[0]
        
        track_1_features = data_features[track_1_index]
        track_2_features = data_features[track_2_index]
        
        similarity = cosine_similarity([track_1_features], [track_2_features])[0][0]
        similarities.append(similarity)
    return similarities

In [53]:
similarities = pairwise_cosine_similarity(playlist_track_ids, data, scaled_data_features)

In [54]:
average_similarity = np.mean(similarities)
print("Average Cosine Similarity:", average_similarity)

Average Cosine Similarity: 0.9944617958275137


In [21]:
# Assuming playlist_track_ids contains the track IDs of the generated playlist
playlist_indices = [track_id_to_index[track_id] for track_id in playlist_track_ids]

#### Intra-list Diversity 

In [22]:
def intra_list_diversity(playlist, features):
    pairwise_distances = cosine_distances(features[playlist])
    avg_distance = np.mean(pairwise_distances)
    return avg_distance


In [23]:
# Calculate Intra-list diversity
diversity_score = intra_list_diversity(playlist_indices, scaled_data_features)
print(f"Intra-list Diversity: {diversity_score}")

Intra-list Diversity: 0.08496492645945168


#### Artist diversity

In [24]:
def artist_diversity(playlist, track_to_artist):
    unique_artists = len(set([track_to_artist[track] for track in playlist]))
    artist_diversity_score = unique_artists / len(playlist)
    return artist_diversity_score

In [25]:
track_to_artist = dict(zip(data['id'], data['artist_ids']))
artist_diversity_score = artist_diversity(playlist_track_ids, track_to_artist)
print(f"Artist Diversity: {artist_diversity_score}")

Artist Diversity: 0.94
