In [None]:
          # Royalty   Gender      Brave       Strong      Hard working    Wealthy
# King      0.99        1         0.75        0.7         0.7             1
# Queen     0.95        0         0.75        0.5         0.6             0.8
# Soldier   0.6         1         0.8         0.8         0.95            0.1
# Prince    0.95        1         0.7         0.85        0.7             0.7
# Peasant   0.1         0.75      0.75        0.95        0.99            0.05

In [6]:
# Lets convert these to vectors

import numpy as np

# One vector per word; columns follow the table above:
#                   Royalty Gender Brave Strong Hard working Wealthy
King    = np.array([0.99,   1.0,   0.75, 0.70,  0.70,        1.00])
Queen   = np.array([0.95,   0.0,   0.75, 0.50,  0.60,        0.80])
Soldier = np.array([0.60,   1.0,   0.80, 0.80,  0.95,        0.10])
Prince  = np.array([0.95,   1.0,   0.70, 0.85,  0.70,        0.70])
Peasant = np.array([0.10,   0.75,  0.75, 0.95,  0.99,        0.05])
# Generic "Men" / "Women" vectors (not listed in the table) used for the analogy arithmetic
Men     = np.array([0.30,   1.0,   0.75, 0.70,  0.70,        0.30])
Women   = np.array([0.15,   0.0,   0.50, 0.65,  0.70,        0.15])

In [2]:
from numpy import dot
from numpy.linalg import norm

# cosine similarity
def cosine_similarity(v1, v2):
  """Return the cosine of the angle between two vectors.

  Computed as dot(v1, v2) / (norm(v1) * norm(v2)).

  Args:
    v1: 1-D array-like of numbers.
    v2: 1-D array-like of numbers with the same length as v1.

  Returns:
    The cosine similarity as a NumPy float. If either vector has zero
    length the denominator is 0 and the result is nan (NumPy emits a
    RuntimeWarning rather than raising).
  """
  # Use the `dot` / `norm` names imported in this cell so the function does
  # not depend on `np` having been imported by a different cell.
  numer = dot(v1, v2)
  deno = norm(v1) * norm(v2)
  return numer / deno
# cosine_similarity = (A · B) / (|A| * |B|)  -> dot product of the two vectors divided by the product of their magnitudes (norms)

In [3]:
# Compare the King and Queen vectors with the cosine_similarity helper
cosine_similarity(King, Queen)

np.float64(0.876967611419989)

In [7]:
King - Men + Women # word-analogy arithmetic: the result is expected to land near the Queen vector

array([0.84, 0.  , 0.5 , 0.65, 0.7 , 0.85])

# What is Similarity Search?
- Similarity Search (also called nearest neighbor search) is the process of finding items in a dataset that are most similar to a given query vector.
- It uses distance metrics like:
  - Cosine Similarity (used in NLP often)
  - Euclidean Distance
  - Dot Product

Example
- You embed the query "How do I Train a neural network?" and want to find the top 5 most similar questions/articles in your DB.

Types of VectorDB:

- Purpose-built vector DBs (dedicated to vector storage and similarity search) -> Pinecone, Weaviate, Milvus, etc.
- Traditional DBs with vector support (additional packages supporting vector indexing) -> PostgreSQL, Elasticsearch, Redis with vector support
- Libraries / DB packages for local vector search -> FAISS, ScaNN, etc.

----

The Goal of This Project

We consider song audio features such as danceability, energy, intensity, tempo, etc.

We shall also embed the song ID so that the vector captures some semantics of the song as well.

In [8]:
# Load and clean the data
# Select useful audio features -
# like danceability, energy, acousticness, etc. (these tell how upbeat, mellow, ballad-like, etc. the song is)
# Convert these into embeddings
# We shall also convert the song IDs (unique identity) to embeddings
# Basically we are going to represent each song as a vector embedding by combining its audio features and song ID embedding
# Store the embeddings in the FAISS vector database
# Leverage this database to find songs similar to the one we want

In [1]:
! pip install -q faiss-cpu sentence-transformers spotify

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m680.5 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m790.2 kB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import numpy as np
import pandas as pd
import faiss # Facebook AI Similarity Search
from sentence_transformers import SentenceTransformer
# import spotify
# import spotify.oauth import SpotifyClientCredentials

In [None]:
# client_id = 'asdh'
# sp = spotify.Spotify(auth_manager= SpotifyClientCredentials(client_id= client_id,client_secret= client_secret))

In [6]:
# Load the raw track table and clean it in a single chain.
# The previous version called dropna()/drop_duplicates()/reset_index(inplace=True)
# on a temporary copy and discarded the result, so `df` was never cleaned.
# Assigning the chained result keeps only complete rows with unique song IDs and
# gives `df` a contiguous 0..n-1 index, which the FAISS row positions rely on.
df = (
    pd.read_csv('Spotify_data.csv')
    .dropna()
    .drop_duplicates(subset=['id'])
    .reset_index(drop=True)
)

In [27]:
# Preview the first five rows to sanity-check the loaded columns
df.head(5)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


# Create Feature vectors

In [10]:
# List the available columns before choosing which audio features to use
df.columns

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo'],
      dtype='object')

In [11]:
# Numeric audio columns that describe how each track sounds
features = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'tempo',
]

In [13]:
# Sentence-transformer model used to embed the song IDs (lightweight and fast)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every song ID: each ID string goes through the transformer and comes
# back as a 384-dimensional vector.
id_embeddings = model.encode(df['id'].tolist(), show_progress_bar=True)

# Audio features as a float32 matrix, one row per song
audio_features = df[features].to_numpy().astype('float32')

# Place the ID embedding and the audio features side by side for each song
Combined_features = np.concatenate((id_embeddings, audio_features), axis=1)

# 384 embedding dimensions + 7 audio features = 391 columns per song
# NOTE(review): the audio columns are unscaled (tempo is ~60-110, loudness is
# negative double digits) while the embedding values are ~1e-2, so L2 distance
# is presumably dominated by tempo/loudness — confirm whether scaling is wanted.

Batches:   0%|          | 0/5333 [00:00<?, ?it/s]

In [14]:
# Inspect the float32 audio-feature matrix (one column per entry in `features`)
audio_features

array([[ 2.79000e-01,  2.11000e-01, -2.00960e+01, ...,  9.82000e-01,
         8.78000e-01,  8.09540e+01],
       [ 8.19000e-01,  3.41000e-01, -1.24410e+01, ...,  7.32000e-01,
         0.00000e+00,  6.09360e+01],
       [ 3.28000e-01,  1.66000e-01, -1.48500e+01, ...,  9.61000e-01,
         9.13000e-01,  1.10339e+02],
       ...,
       [ 6.34000e-01,  8.58000e-01, -2.22600e+00, ...,  1.01000e-01,
         8.56000e-06,  9.16880e+01],
       [ 6.71000e-01,  6.23000e-01, -7.16100e+00, ...,  9.98000e-03,
         7.55000e-06,  7.50550e+01],
       [ 8.56000e-01,  7.21000e-01, -4.92800e+00, ...,  1.32000e-01,
         4.71000e-03,  9.49910e+01]], dtype=float32)

In [15]:
# Inspect the combined matrix: ID-embedding columns followed by the audio-feature columns
Combined_features

array([[-3.92721668e-02,  5.36861196e-02, -2.32730992e-02, ...,
         9.81999993e-01,  8.78000021e-01,  8.09540024e+01],
       [ 8.40503629e-03,  3.48948240e-02,  6.08965242e-03, ...,
         7.31999993e-01,  0.00000000e+00,  6.09360008e+01],
       [-9.07071531e-02,  1.56073850e-02, -2.33153310e-02, ...,
         9.61000025e-01,  9.12999988e-01,  1.10338997e+02],
       ...,
       [ 3.53870913e-02,  2.69716512e-02, -3.10079921e-02, ...,
         1.01000004e-01,  8.56000042e-06,  9.16880035e+01],
       [-1.19956195e-01,  7.31386477e-03,  2.16148645e-02, ...,
         9.97999962e-03,  7.55000019e-06,  7.50550003e+01],
       [ 2.24715099e-03,  1.52808884e-02, -7.00545590e-03, ...,
         1.31999999e-01,  4.71000001e-03,  9.49909973e+01]], dtype=float32)

Let's build the FAISS index

In [16]:
# Check (number of songs, vector width) before sizing the FAISS index
Combined_features.shape

(170653, 391)

In [17]:
# Build a flat (exhaustive-search) L2-distance index whose width matches the
# combined feature vectors, then add every song vector to it.
# Vectors are added in row order, so the positions returned by index.search
# are later used as row positions into `df`.
dimension = Combined_features.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(Combined_features)

In [22]:
# Recommendation Function

def recommend_similar_tracks(song_id,top_k=5):
  """Return up to `top_k` tracks from `df` that are closest to `song_id`.

  The query vector is built the same way as the indexed vectors: the
  song-ID embedding from `model` followed by the song's audio `features`.
  Relies on the notebook globals `df`, `features`, `model` and `index`, and
  on `index` rows being in the same order as the rows of `df`.

  Args:
    song_id: Value of the 'id' column identifying the query song.
    top_k: Maximum number of recommendations to return.

  Returns:
    A DataFrame slice of `df` with the nearest neighbours, excluding the
    query song itself, closest first.

  Raises:
    ValueError: If `song_id` is not present in `df['id']`.
  """
  # Look the song up first so an unknown ID fails with a clear message
  # instead of a shape mismatch inside np.hstack.
  matches = df[df['id'] == song_id]
  if matches.empty:
    raise ValueError(f"song_id {song_id!r} not found in df['id']")

  # Audio features of the song; keep a single row even if the ID is duplicated
  input_audio = matches[features].values[:1].astype('float32')

  # Get ID embeddings
  input_id_embedding = model.encode([song_id])

  # Combine for vector search
  input_vector = np.hstack((input_id_embedding, input_audio)).astype('float32')

  # Ask for one extra neighbour because the query song itself is normally
  # the closest hit and is removed below.
  # index.search returns two arrays:
  #   distance : distances of the nearest neighbours from the query
  #   indices  : row positions of these neighbours
  distance, indices = index.search(input_vector,top_k + 1)

  # FAISS pads with -1 when it finds fewer neighbours than requested;
  # df.iloc[-1] would silently return the last row, so drop those slots.
  neighbour_rows = [row for row in indices[0] if row >= 0]

  result = df.iloc[neighbour_rows] # results for the first (and only) query

  return result[result['id'] != song_id].head(top_k)

Suppose we had 5 songs stored in FAISS.<br>
You search for Song C (row index = 2) <br>
and want to see the top 2 recommendations. <br>
| DataFrame row index | Song ID | Song Name |
| ------------------- | ------- | --------- |
| 0                   | A1      | Song A    |
| 1                   | B2      | Song B    |
| 2                   | C3      | Song C    |
| 3                   | D4      | Song D    |
| 4                   | E5      | Song E    |

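indices = [[2, 4, 1]]
distances = [[0.0, 1.42, 2.87]]

The first hit (row 2, distance 0.0) is the query song itself, which is why we search for `top_k + 1` neighbours and then drop the query. The recommendations are rows 4 and 1 (Song E and Song B).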

In [26]:
# Generate a recommendation for the song stored at row position 34
sample_song_id = df['id'].iloc[34] # any valid song ID works here
recommendation = recommend_similar_tracks(song_id=sample_song_id, top_k=5)
print(recommendation)

        valence  year  acousticness                           artists  \
119691    0.489  1994         0.598  ['Russ Freeman', 'David Benoit']   
65431     0.250  1971         0.424                  ['Karen Dalton']   
68818     0.907  1988         0.300                 ['Prefab Sprout']   
155461    0.684  2020         0.430                    ['Kali Uchis']   
132160    0.646  1976         0.600                      ['Lou Reed']   

        danceability  duration_ms  energy  explicit                      id  \
119691         0.648       258360   0.518         0  40K6J6EgGL4VyA51ocacSu   
65431          0.517       280813   0.335         0  4H6tgkolJjxPHVo1qHae7R   
68818          0.821       262133   0.666         0  13t9pPIr9xb1GrpSXEj16h   
155461         0.779       222384   0.380         0  1Ap99oPYvI37kcFMBBTCUx   
132160         0.528       225827   0.698         0  5Xp2Jd8D0MYKhdanZyiYcO   

        instrumentalness  key  liveness  loudness  mode  \
119691          0.000001   

In [29]:
# Generate a recommendation for a song ID given literally instead of looked up
# by row position ('3ftBPsC5vPBKxYSee08FDH' is "Danny Boy" in df.head() above)
recommendation = recommend_similar_tracks(song_id='3ftBPsC5vPBKxYSee08FDH', top_k=4)
print(recommendation)

        valence  year  acousticness                artists  danceability  \
63048     0.647  1960         0.788          ['Sam Cooke']         0.516   
128756    0.725  1959         0.759  ['The Kingston Trio']         0.637   
111936    0.893  1955         0.990           ['P. Leela']         0.438   
35005     0.383  2002         0.853        ['Johnny Cash']         0.536   

        duration_ms  energy  explicit                      id  \
63048        155320   0.450         0  1BIGqqQxGV9c45heFmUxLL   
128756       172853   0.515         0  2XVkxB5mFJvLuej0V6aVqb   
111936       166413   0.332         0  34F5G3WbjqkIqmRIDmDenA   
35005        233240   0.394         0  3iBemYZi4lw53UYDlxqMlw   

        instrumentalness  key  liveness  loudness  mode                  name  \
63048           0.000002    0     0.197    -9.318     1  That's Where It's At   
128756          0.000003    2     0.232    -9.431     1         A Worried Man   
111936          0.000004   11     0.159    -9.352 