## Music Recommendation with Embeddings and Vector Databases

### Libraries, modules and functions

In [1]:
import os
import json
import pandas as pd
from glob import glob
import numpy as np
import torch
from IPython.display import Audio as player
from datasets import load_dataset, Audio, concatenate_datasets, Dataset
from panns_inference import AudioTagging
from qdrant_client import QdrantClient
from qdrant_client.http import models
from utils.audio import generate_random_dataset, download_kaggle_dataset

  from .autonotebook import tqdm as notebook_tqdm


Download Dataset

In [4]:
download_kaggle_dataset("jorgeruizdev/ludwig-music-dataset-moods-and-subgenres")

Dataset URL: https://www.kaggle.com/datasets/jorgeruizdev/ludwig-music-dataset-moods-and-subgenres
License(s): CC-BY-NC-SA-4.0
Downloading ludwig-music-dataset-moods-and-subgenres.zip to /home/josealcocer27/pycon-colombia-2024/notebooks


100%|█████████▉| 11.3G/11.3G [08:27<00:00, 30.0MB/s]  




100%|██████████| 11.3G/11.3G [08:27<00:00, 23.9MB/s]


### Random samples from genres

In [2]:
genres = ["latin", "pop", "reggae", "rock"]
ds = generate_random_dataset(genres, n_samples = 130)

Loading latin...
Loading pop...
Loading reggae...
Loading rock...


Extract songs id

In [3]:
ids = [
    (
      ds[i]['audio']['path'].split("/")[-1].replace(".mp3", '')
    )
    for i in range(len(ds))
]
index = [num for num in range(len(ds))]
ids[:4]

['3zsUrcSjHX6d8DeIYyj1bE',
 '4NzcnIx2X1zq2DGVPdP8IY',
 '5jZQJVYHgH01c5C3LMkxZ1',
 '6kvMpEyqrGBNhZ4Ztb8TPo']

In [4]:
ds = ds.add_column("index", index)
ds = ds.add_column("ids", ids)
ds[-1]

{'audio': {'path': '/home/josealcocer27/pycon-colombia-2024/data/unstructured/ludwig-music-dataset-moods-and-subgenres/mp3/mp3/latin/6K5UtSAXChS6Rpm2NZJnrF.mp3',
  'array': array([ 0.00000000e+00, -4.89616436e-09,  3.03773473e-09, ...,
         -3.03649813e-01, -3.14786136e-01, -2.29230165e-01]),
  'sampling_rate': 44100},
 'index': 519,
 'ids': '6K5UtSAXChS6Rpm2NZJnrF'}

Read labels

In [5]:
label_path = "../data/unstructured/ludwig-music-dataset-moods-and-subgenres/labels.json"
labels = pd.read_json(label_path)
labels.head()

Unnamed: 0,tracks
35ecMLCJ1x2giJuvHLrI1t,{'otherSubgenres': {'L': [{'S': 'electronic---...
3p0EUhkUeCNrBIZwkjmeYe,"{'otherSubgenres': {'L': []}, 'artist': {'S': ..."
0rb6HvdvWJRkyhxsfFf1ep,"{'otherSubgenres': {'L': [{'S': 'rock'}, {'S':..."
4ssD5IkaicvM3L2Ff8FPWQ,"{'otherSubgenres': {'L': []}, 'artist': {'S': ..."
586ncAs8cYRTBlrxMDfmSP,{'otherSubgenres': {'L': [{'S': 'electronic---...


Extract metadata

In [6]:
def get_metadata(x: dict) -> pd.Series:
    """Extracts metadata from a dict with info about the song."""
    cols = ['artist', 'genre', 'name', 'subgenres']
    list_of_cols = []
    for col in cols:
        try:
            mdata = list(x[col].values())[0]
        except:
            mdata = "Unknown"
        list_of_cols.append(mdata)
    return pd.Series(list_of_cols, index=cols)

In [10]:
labels = labels['tracks'].apply(get_metadata).reset_index()
labels.head()

Unnamed: 0,index,artist,genre,name,subgenres
0,35ecMLCJ1x2giJuvHLrI1t,Riovolt,electronic,It Ain't Over 'till It's Over,"[{'S': 'electronic---ambient'}, {'S': 'electro..."
1,3p0EUhkUeCNrBIZwkjmeYe,R.L. Burnside,blues,Fireman Ring the Bell,[{'S': 'blues---country blues'}]
2,0rb6HvdvWJRkyhxsfFf1ep,Chapterhouse,rock,Falling Down,[{'S': 'rock---shoegaze'}]
3,4ssD5IkaicvM3L2Ff8FPWQ,Lowell Fulsom,funk / soul,Tramp,[{'S': 'funk / soul---rhythm & blues'}]
4,586ncAs8cYRTBlrxMDfmSP,Paul Ellis,electronic,Dissolve,[{'S': 'electronic---ambient'}]


In [11]:
def get_vals(genres: list) -> list:
    """Extracts subgenres from the dict of song's metadata."""
    genre_list = []
    for dicts in genres:
        if type(dicts) != str:
            for _, val in dicts.items():
                genre_list.append(val)
    return genre_list

labels['subgenres'] = labels.subgenres.apply(get_vals)
labels['subgenres'].head()

0    [electronic---ambient, electronic---downtempo,...
1                              [blues---country blues]
2                                    [rock---shoegaze]
3                       [funk / soul---rhythm & blues]
4                               [electronic---ambient]
Name: subgenres, dtype: object

Select URLs/paths of songs

In [12]:
files = []
for genre in genres:
  file_path = os.path.join("../data/unstructured/ludwig-music-dataset-moods-and-subgenres", "mp3","mp3", genre, "*.mp3")
  genre_files = glob(file_path)
  files.extend(genre_files)
ids = [i.split('/')[-1].replace(".mp3", '') for i in files]
music_paths = pd.DataFrame(zip(ids, files), columns=["ids", 'urls'])
music_paths.head()

Unnamed: 0,ids,urls
0,6ZgIoAuzxOcb7mfpFoGV58,../data/unstructured/ludwig-music-dataset-mood...
1,03tbpnBQ9kiAL8GX0ouZUG,../data/unstructured/ludwig-music-dataset-mood...
2,17W4YBY4SDiotsptTLghvO,../data/unstructured/ludwig-music-dataset-mood...
3,0RqeKSVKmyjmuG6Ky3Nu11,../data/unstructured/ludwig-music-dataset-mood...
4,16RKMsCuOBWMu9YnbPyegG,../data/unstructured/ludwig-music-dataset-mood...


Merge metadata: Genre, Subgenres, Song Name, URLs

In [13]:
metadata = ds.select_columns(['index', 'ids']) \
                     .to_pandas() \
                     .merge(right=labels, how="left", left_on='ids', right_on='index') \
                     .merge(right=music_paths, how="left", left_on='ids', right_on='ids') \
                     .drop("index_y", axis=1) \
                     .rename({"index_x": "index"}, axis=1)

metadata.head()

Unnamed: 0,index,ids,artist,genre,name,subgenres,urls
0,0,3zsUrcSjHX6d8DeIYyj1bE,The Jesus and Mary Chain,rock,Far Gone and Out,[rock---shoegaze],../data/unstructured/ludwig-music-dataset-mood...
1,1,4NzcnIx2X1zq2DGVPdP8IY,Virus,rock,Shutout,[rock---art rock],../data/unstructured/ludwig-music-dataset-mood...
2,2,5jZQJVYHgH01c5C3LMkxZ1,Melvins,rock,Missing,"[rock---alternative rock, rock---punk]",../data/unstructured/ludwig-music-dataset-mood...
3,3,6kvMpEyqrGBNhZ4Ztb8TPo,DIIV,rock,Take Your Time,[rock---shoegaze],../data/unstructured/ludwig-music-dataset-mood...
4,4,1a47wd7JsslfmklBtIU6QS,Hope of the States,rock,66 Sleepers to Summer,[rock---post rock],../data/unstructured/ludwig-music-dataset-mood...


In [14]:
payload = metadata.drop(['index', 'ids'], axis=1).to_dict(orient="records")
payload[:2]

[{'artist': 'The Jesus and Mary Chain',
  'genre': 'rock',
  'name': 'Far Gone and Out',
  'subgenres': ['rock---shoegaze'],
  'urls': '../data/unstructured/ludwig-music-dataset-moods-and-subgenres/mp3/mp3/rock/3zsUrcSjHX6d8DeIYyj1bE.mp3'},
 {'artist': 'Virus',
  'genre': 'rock',
  'name': 'Shutout',
  'subgenres': ['rock---art rock'],
  'urls': '../data/unstructured/ludwig-music-dataset-moods-and-subgenres/mp3/mp3/rock/4NzcnIx2X1zq2DGVPdP8IY.mp3'}]

### Embeddings

In [28]:
device = "cuda" if torch.cuda.is_available() else "cpu"
at = AudioTagging(checkpoint_path=None, device=device)

Checkpoint path: /home/josealcocer27/panns_data/Cnn14_mAP=0.431.pth
Using CPU.


In [6]:
def get_embeddings(batch: dict):
    """Generates embeddings from Dataset's batch with PANNs."""
    arrays = [torch.tensor(val['array'], dtype=torch.float64) for val in batch['audio']]
    inputs = torch.nn.utils.rnn.pad_sequence(arrays, batch_first=True, padding_value=0).type(torch.cuda.FloatTensor)
    with torch.no_grad():
        _, embedding = at.inference(inputs)
    batch['panns_embeddings'] = embedding
    return batch

In [20]:
with torch.inference_mode():
    ds = ds.map(get_embeddings, batched=True, batch_size=8)

Map: 100%|██████████| 520/520 [24:08<00:00,  2.79s/ examples]


### Qdrant Vector Database

Create client

In [7]:
client = QdrantClient(host="localhost", port=6333)

Create collection

In [22]:
my_collection = "music_collection3"
client.recreate_collection(
    collection_name=my_collection,
    vectors_config=models.VectorParams(size=2048, distance=models.Distance.COSINE)
)

True

Upsert records

In [23]:
ds_pandas = ds.to_pandas()

client.upsert(
    collection_name=my_collection,
    points=models.Batch(
        ids=ds_pandas['index'],
        vectors=ds_pandas['panns_embeddings'],
        payloads=payload
    )
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [9]:
result = client.retrieve(
    collection_name=my_collection,
    ids=[100],
    with_vectors=True # we can turn this on and off depending on our needs
)
result

[Record(id=100, payload={'artist': 'Russian Circles', 'genre': 'rock', 'name': 'Ethel', 'subgenres': ['rock---post rock'], 'urls': '../data/unstructured/ludwig-music-dataset-moods-and-subgenres/mp3/mp3/rock/6ThSoSKCXgvd7qtqRd2d1o.mp3'}, vector=[0.0, 0.0, 0.0, 0.0, 0.0, 0.005795607, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01083843, 0.0, 0.0, 0.06413969, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004939821, 0.0011205913, 0.076729886, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.017911065, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.060938634, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03592922, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.073928244, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.027402062, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [10]:
metadata.query("artist == 'Eddie Santiago'")

NameError: name 'metadata' is not defined

In [27]:
metadata[metadata["genre"]=="latin"]["artist"].unique()

array(['Os Paralamas do Sucesso', 'Los Fabulosos Cadillacs', 'Anis',
       'Anelis', 'Les Babacools', 'Skank', 'Harry Belafonte',
       'Culcha Candela', 'Citizen Cope', 'King Changó', 'Gondwana',
       'Big Sugar', 'Les Colocs', 'Los Cafres', 'The Cat Empire',
       'O Rappa', 'Byron Lee & The Dragonaires', 'Ska Cubano',
       'Taj Mahal', 'Abuela Coca', 'Joxe Ripiau', "EGO-WRAPPIN'",
       'Blundetto', 'Pablopavo', 'Orchestra Baobab', 'Peps Blodsband',
       'No te va gustar', 'Ayo', 'See Spot', 'Dry Kill Logic',
       'Black Gandhi', 'Laurel Aitken', 'Raggabund', 'Onda Vaga',
       'Darko Rundek', 'Intoxicados', 'Carlos Puebla', 'Baden Powell',
       'Cal Tjader', 'Ibrahim Ferrer', 'Adriana Calcanhotto',
       'Gipsy Kings', 'Jimmy Buffett', 'Sin Bandera', '¡Cubanismo!',
       'Cesária Évora', 'Cartola', 'Joselito', 'Bebeto', 'Ojos de Brujo',
       'Ry Cooder', 'Armik', 'Los Delinqüentes', 'Jorge Pardo',
       'Carmen Linares', 'Buika', 'Niña Pastori', 'Tito Puente',
 

Search similar songs

In [34]:
client.search(
    collection_name=my_collection,
    query_vector=ds_pandas.iloc[510]['panns_embeddings'],
    limit=10
)

[ScoredPoint(id=510, version=0, score=0.9999999, payload={'artist': 'Eddie Santiago', 'genre': 'latin', 'name': 'Hasta aquí te fui fiel', 'subgenres': ['latin---salsa'], 'urls': '../data/unstructured/ludwig-music-dataset-moods-and-subgenres/mp3/mp3/latin/1K26lXMWRZh5dwQugTUmc5.mp3'}, vector=None, shard_key=None),
 ScoredPoint(id=432, version=0, score=0.8996564, payload={'artist': 'Ibrahim Ferrer', 'genre': 'latin', 'name': 'Mamí me gustó', 'subgenres': ['latin---cubano'], 'urls': '../data/unstructured/ludwig-music-dataset-moods-and-subgenres/mp3/mp3/latin/6SPljwgWMsRt6u2J8VQMmR.mp3'}, vector=None, shard_key=None),
 ScoredPoint(id=317, version=0, score=0.8807746, payload={'artist': 'Dion', 'genre': 'pop', 'name': 'All I Want to Do Is Live My Life', 'subgenres': ['pop---ballad'], 'urls': '../data/unstructured/ludwig-music-dataset-moods-and-subgenres/mp3/mp3/pop/2TApdD56ohXxrpPL6tMKeh.mp3'}, vector=None, shard_key=None),
 ScoredPoint(id=468, version=0, score=0.88056505, payload={'artist':