## Music Recommendation with Embeddings and Vector Databases

### Libraries, modules and functions

In [None]:
import os
import json
import pandas as pd
from glob import glob
import numpy as np
import torch
from transformers import AutoFeatureExtractor, AutoModel
from IPython.display import Audio as player
from datasets import load_dataset, Audio, concatenate_datasets, Dataset
from panns_inference import AudioTagging
from qdrant_client import QdrantClient
from qdrant_client.http import models
from utils import generate_random_dataset, download_kaggle_dataset

Download Dataset

In [1]:
# Fetch the Ludwig music dataset (moods and subgenres) from Kaggle using the
# helper imported from `utils`.
# NOTE(review): the recorded output below shows an archive of about 11.3 GB;
# whether the helper skips the download when the data is already on disk is
# not visible from this notebook -- confirm before re-running.
download_kaggle_dataset("jorgeruizdev/ludwig-music-dataset-moods-and-subgenres")

Dataset URL: https://www.kaggle.com/datasets/jorgeruizdev/ludwig-music-dataset-moods-and-subgenres
License(s): CC-BY-NC-SA-4.0
Downloading ludwig-music-dataset-moods-and-subgenres.zip to /mnt/c/Users/ASUS/Documents/Projects/ml_projects/pycon-colombia-2024/notebooks


 25%|██▍       | 2.80G/11.3G [48:09<2:52:47, 880kB/s] 

### Random samples from genres

In [None]:
# Genres to sample from; the same list is reused further down to locate the
# mp3 folders of each genre.
genres = ["latin", "pop", "reggae", "rock"]
# Build the working dataset with the `utils` helper.
# NOTE(review): whether n_samples is per genre or in total, and whether the
# sampling is seeded, depends on generate_random_dataset -- confirm there.
ds = generate_random_dataset(genres, n_samples = 130)

Extract song IDs

In [None]:
# Derive each song's id from its audio file name: the last path component
# with the ".mp3" text removed.
# os.path.basename replaces the former split on "/" so the id is still the
# bare file name when paths use the platform's native separator; on POSIX
# paths the result is unchanged.
ids = [
    os.path.basename(ds[row]['audio']['path']).replace(".mp3", '')
    for row in range(len(ds))
]
# Positional index (0 .. len(ds) - 1) of every sample in the dataset.
index = list(range(len(ds)))
ids[:4]

In [None]:
# Attach the positional index and the song ids as two extra columns. Each
# add_column call yields the dataset that `ds` is rebound to, so the two
# calls are chained into a single assignment.
# NOTE(review): re-running this cell without rebuilding `ds` presumably fails
# because the columns already exist -- confirm.
ds = ds.add_column("index", index).add_column("ids", ids)
ds[-1]

Read labels

In [None]:
# Location of the labels file, relative to the notebook's working directory.
label_path = "../data/unstructured/ludwig-music-dataset-moods-and-subgenres/labels.json"
# Parse the JSON file into a DataFrame; the cells below read its 'tracks' column.
labels = pd.read_json(label_path)
labels.head()

Extract metadata

In [None]:
def get_metadata(x: dict) -> pd.Series:
    """Extract the artist, genre, name and subgenres of one song.

    For each of the four fields, the first value of the mapping stored under
    that key in ``x`` is taken. When the field cannot be read (key missing,
    entry is not a mapping, mapping is empty, or ``x`` is not subscriptable)
    the placeholder ``"Unknown"`` is used instead.

    Args:
        x: Per-song metadata; each field of interest is expected to hold a
            mapping whose first value is the actual datum.

    Returns:
        A Series indexed by ``['artist', 'genre', 'name', 'subgenres']``.
    """
    cols = ['artist', 'genre', 'name', 'subgenres']
    list_of_cols = []
    for col in cols:
        try:
            # The default covers an empty mapping without raising.
            mdata = next(iter(x[col].values()), "Unknown")
        except (LookupError, TypeError, AttributeError):
            # Only "field is missing or malformed" errors are treated as
            # unknown metadata; anything else (e.g. KeyboardInterrupt) must
            # propagate instead of being silently swallowed.
            mdata = "Unknown"
        list_of_cols.append(mdata)
    return pd.Series(list_of_cols, index=cols)

In [None]:
# Expand every entry of the 'tracks' column into one column per metadata
# field, then turn the existing index into a regular column.
# Note: `labels` is rebound to the expanded frame, which no longer has a
# 'tracks' column, so this cell cannot be re-run without re-reading the file.
labels = (
    labels['tracks']
    .apply(get_metadata)
    .reset_index()
)
labels.head()

In [None]:
def get_vals(genres: list) -> list:
    """Flatten a sequence of subgenre dicts into a single list of their values.

    String items are skipped. As a consequence, passing a plain string (for
    example a placeholder used for missing metadata) returns an empty list,
    because iterating a string yields only strings.

    Args:
        genres: Iterable whose non-string items are dicts.

    Returns:
        The values of every dict, in encounter order.
    """
    genre_list = []
    for entry in genres:
        # isinstance also covers str subclasses, which an exact type
        # comparison would let through to the dict-only code below.
        if isinstance(entry, str):
            continue
        genre_list.extend(entry.values())
    return genre_list

# Replace each row's list of subgenre dicts with the flat list of their values.
# NOTE(review): the column is overwritten in place. If the extracted values
# are strings, running this cell a second time would turn every row into an
# empty list, because get_vals skips string items -- confirm and avoid re-runs.
labels['subgenres'] = labels['subgenres'].apply(get_vals)
labels['subgenres'].head()

Select URLs/paths of songs

In [None]:
# Collect the mp3 files of every selected genre.
files = []
for genre in genres:
    file_path = os.path.join("../data/unstructured/ludwig-music-dataset-moods-and-subgenres", "mp3","mp3", genre, "*.mp3")
    # glob returns matches in arbitrary order; sorting keeps the row order of
    # `music_paths` stable across runs and machines.
    genre_files = sorted(glob(file_path))
    files.extend(genre_files)
# Song id = file name without the ".mp3" text. os.path.basename is used rather
# than splitting on "/" so paths with the platform's native separator work too.
# Note: this rebinds `ids`, which from here on covers every file found on disk
# rather than only the sampled songs.
ids = [os.path.basename(i).replace(".mp3", '') for i in files]
music_paths = pd.DataFrame(zip(ids, files), columns=["ids", 'urls'])
music_paths.head()