# Music Recognition (Shazam-style) with Concrete ML

## Importing Libraries

In [17]:
import numpy as np
import librosa
import os
from numpy.linalg import norm
from scipy.ndimage import maximum_filter
from scipy.ndimage import generate_binary_structure, binary_erosion
from concrete import fhe
from sklearn.linear_model import LogisticRegression
from concrete.ml.sklearn import DecisionTreeClassifier as ConcreteDTC
from concrete.ml.sklearn import LogisticRegression as ConcreteLR
from sklearn.model_selection import train_test_split
import time
import pandas as pd

## CONSTANTS

In [18]:
# Root directory that is walked to build the training feature database.
MUSIC_DB_PATH = '../dataset_download/dataset2/fma_small'
# Directory of test clips passed to evaluate().
TEST_SAMPLE_PATH = '../dataset_download/test_samples'
# Sampling rate (Hz) requested from librosa when loading audio and computing MFCCs.
SAMPLE_RATE = 22050
# Feature vectors per second of audio; the window length is SAMPLE_RATE // MFCC_SAMPLES_PER_SEC samples.
MFCC_SAMPLES_PER_SEC = 2
# Upper bound on the number of distinct songs registered in the database.
NUM_SONGS = 400

## Necessary functions for model

### Compute features function.
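This function returns a list of feature vectors. The audio is split into consecutive, non-overlapping 500 ms chunks; the ith entry of `feats` is the MFCC matrix of the ith chunk summed over time, i.e. one value per MFCC coefficient. Trailing audio shorter than 500 ms is dropped.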

In [19]:
def compute_features(y):
    """Return one summed-MFCC vector per fixed-length window of `y`.

    The signal is cut into consecutive, non-overlapping windows of
    ``SAMPLE_RATE // MFCC_SAMPLES_PER_SEC`` samples. For each window the MFCC
    matrix is computed with librosa and summed along axis 1, giving one value
    per MFCC coefficient. Samples left over after the last full window are
    ignored.

    Parameters
    ----------
    y : 1-D audio signal, as returned by ``librosa.load``.

    Returns
    -------
    list
        One feature vector per full window; empty if `y` is shorter than a
        single window.
    """
    window = SAMPLE_RATE // MFCC_SAMPLES_PER_SEC
    n_windows = len(y) // window
    return [
        np.sum(
            librosa.feature.mfcc(y=y[window * k:window * (k + 1)], sr=SAMPLE_RATE),
            axis=1,
        )
        for k in range(n_windows)
    ]


### Add features function.
This function computes the features of a given song and appends each feature vector, together with the song's id, to the global `feature_db` list.

In [20]:
# Global feature database: a flat list of (feature_vector, song_id) pairs.
feature_db = []
def add_features(y, id):
    """Compute the features of signal `y` and store each one in `feature_db` tagged with `id`.

    All features are computed before anything is appended, so if
    `compute_features` raises, `feature_db` is left unchanged.
    """
    feature_db.extend((feat, id) for feat in compute_features(y))

Load up to 3 minutes of audio from each song in the specified directory (at most `NUM_SONGS` songs) and build the feature database.

In [21]:
# idx[i] is the file name of the song whose class id is i.
idx = []
# Start from an empty database so that re-running this cell does not append
# a second copy of every feature while `idx` is rebuilt from scratch.
feature_db.clear()

for dirpath, dnames, fnames in os.walk(MUSIC_DB_PATH):
    if len(idx) >= NUM_SONGS:
        break
    for f in fnames:
        song_name = f
        song_path = os.path.join(dirpath, f)
        try:
            y, sr = librosa.load(song_path, sr = SAMPLE_RATE, duration = 180)
            # A file name seen before keeps its id; a new one gets the next free id.
            is_new_song = song_name not in idx
            song_id = len(idx) if is_new_song else idx.index(song_name)
            add_features(y, song_id)
        except Exception as exc:
            # Best effort: skip files that cannot be decoded or processed
            # (the walk may hit non-audio files), but report them instead of
            # hiding the failure. KeyboardInterrupt is no longer swallowed.
            print(f"skipping {song_path}: {exc!r}")
            continue
        # Register the song only after its features were stored, so a failure
        # above never leaves an id without any features.
        if is_new_song:
            idx.append(song_name)
            print(f"id: {song_id}, song_name: {song_name}")
        if len(idx) >= NUM_SONGS:
            break
print(f"Total number of features to train on: {len(feature_db)}")

id: 0, song_name: 135054.mp3
id: 1, song_name: 135336.mp3
id: 2, song_name: 135337.mp3
id: 3, song_name: 135043.mp3
id: 4, song_name: 135091.mp3
id: 5, song_name: 135092.mp3
id: 6, song_name: 135044.mp3
id: 7, song_name: 135989.mp3
id: 8, song_name: 135221.mp3
id: 9, song_name: 135369.mp3
id: 10, song_name: 135341.mp3
id: 11, song_name: 135340.mp3
id: 12, song_name: 135368.mp3
id: 13, song_name: 135220.mp3
id: 14, song_name: 135222.mp3
id: 15, song_name: 135342.mp3
id: 16, song_name: 135223.mp3
id: 17, song_name: 135227.mp3
id: 18, song_name: 135226.mp3
id: 19, song_name: 135224.mp3
id: 20, song_name: 135219.mp3
id: 21, song_name: 135225.mp3
id: 22, song_name: 135228.mp3
id: 23, song_name: 135374.mp3
id: 24, song_name: 135375.mp3
id: 25, song_name: 135229.mp3
id: 26, song_name: 135363.mp3
id: 27, song_name: 135028.mp3
id: 28, song_name: 135010.mp3
id: 29, song_name: 135986.mp3
id: 30, song_name: 135372.mp3
id: 31, song_name: 135373.mp3
id: 32, song_name: 135039.mp3
id: 33, song_name: 1



id: 247, song_name: 133916.mp3
id: 248, song_name: 133731.mp3
id: 249, song_name: 133453.mp3
id: 250, song_name: 133447.mp3
id: 251, song_name: 133479.mp3
id: 252, song_name: 133451.mp3
id: 253, song_name: 133445.mp3
id: 254, song_name: 133294.mp3
id: 255, song_name: 133444.mp3
id: 256, song_name: 133450.mp3
id: 257, song_name: 133332.mp3
id: 258, song_name: 133454.mp3
id: 259, song_name: 133440.mp3
id: 260, song_name: 133681.mp3
id: 261, song_name: 133535.mp3
id: 262, song_name: 133441.mp3
id: 263, song_name: 133333.mp3
id: 264, song_name: 133455.mp3
id: 265, song_name: 133443.mp3
id: 266, song_name: 133457.mp3
id: 267, song_name: 133537.mp3
id: 268, song_name: 133293.mp3
id: 269, song_name: 133456.mp3
id: 270, song_name: 133442.mp3
id: 271, song_name: 133275.mp3
id: 272, song_name: 133274.mp3
id: 273, song_name: 133102.mp3
id: 274, song_name: 133100.mp3
id: 275, song_name: 133538.mp3
id: 276, song_name: 133276.mp3
id: 277, song_name: 133459.mp3
id: 278, song_name: 133449.mp3
id: 279,

In [22]:
# Split the (feature_vector, song_id) pairs into a feature matrix and a label vector.
features = np.array([feat for feat, _ in feature_db])
ids = np.array([song_id for _, song_id in feature_db])
# Number of feature vectors available per class id.
print("initial bincount of features with respect to class ids:", np.bincount(ids))

initial bincount of features with respect to class ids: [60 59 60 60 59 59 59 60 59 59 59 60 59 59 59 59 59 60 60 60 59 59 60 60
 60 59 60 59 59 59 60 59 59 60 59 60 59 59 59 59 60 59 59 59 60 60 60 59
 59 59 59 59 59 59 59 60 60 59 59 60 60 59 59 59 59 59 59 59 59 59 59 59
 60 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
 59 60 59 59 59 60 60 60 59 59 60 60 60 59 59 60 59 59 60 59 60 60 59 59
 59 59 60 59 60 60 60 60 59 59 59 60 59 59 59 59 59 60 60 59 59 59 60 60
 59 60 60 59 59 60 60 59 60 60 59 59 59 60 59 60 59 59 59 60 59 59 60 59
 59 60 60 60 60 59 59 60 60 60 60 60 59 60 59 60 59 60 59 59 59 60 60 59
 60 59 59 59 59 59 59 60 59 60 60 60 59 59 60 59 59 60 60 60 60 59 60 59
 60 60 60 60 59 59 60 59 60 60 59 59 59 59 60 59 59 59 59 59 59 60 60 60
 59 60 60 59 60 59 60 60 59 60 59 59 60 60 60 60 60 59 59 60 60 60 59 60
 59 60 59 59 59 60 59 59 59 60 59 59 60 59 60 60 60 59 59 59 59 60 59 59
 60 60 59 60 60 59 60 59 59 59 60 60 59 59 60 60 59 59 60 59 60 60 6

## Evaluation function
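- `model`: the fitted (and, for FHE execution, compiled) model to be evaluated
- `model_config`: dictionary of model configuration parameters such as `n_bits` or `max_depth`; it is copied into the returned results
- `truth_label`: a dictionary which maps each test sample file name to its true id
- `samples_path`: the directory that is searched recursively for the sample files
- `predict_in_fhe`: if true, each sample is additionally predicted with real FHE execution, and that run's time and accuracy are reported separately from the clear prediction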

In [58]:
def evaluate(model, model_config, truth_label, samples_path, predict_in_fhe = False, min_votes = 5):
    """Measure song-level accuracy and mean inference time of `model`.

    Every file under `samples_path` (searched recursively) whose name is a key
    of `truth_label` is loaded (first 15 seconds only), converted to feature
    vectors with `compute_features`, and classified window by window. The song
    prediction is the majority vote over the windows; if the winning class has
    fewer than `min_votes` votes the song is labelled -1 (unknown).

    Parameters
    ----------
    model : object exposing ``predict(X)``; when `predict_in_fhe` is true it
        must also accept ``predict(X, fhe="execute")``.
    model_config : dict copied into the result (e.g. ``{"n_bits": 8}``).
    truth_label : dict mapping sample file name -> true class id.
    samples_path : directory to walk for sample files.
    predict_in_fhe : if true, every sample is additionally predicted with
        ``fhe="execute"`` and reported under the "... (FHE)" keys.
    min_votes : minimum number of windows that must agree on the winning
        class for the prediction to be accepted.

    Returns
    -------
    dict
        `model_config` plus "Inference Time" and "Accuracy" (and their
        "(FHE)" counterparts), averaged over the samples that were actually
        evaluated. Files listed in `truth_label` but not found on disk are
        not counted; if no sample was evaluated the metrics stay at 0.
    """
    evaluation = {**model_config}
    evaluation["Inference Time"] = 0
    evaluation["Accuracy"] = 0

    if predict_in_fhe:
        evaluation["Inference Time (FHE)"] = 0
        evaluation["Accuracy (FHE)"] = 0

    n_evaluated = 0
    for dirpath, _dnames, fnames in os.walk(samples_path):
        for f in fnames:
            if f not in truth_label:
                continue

            print(f"processing {f}")
            song_path = os.path.join(dirpath, f)
            y, _sr = librosa.load(song_path, sr = SAMPLE_RATE, duration = 15)
            sample_features = compute_features(y)
            if len(sample_features) == 0:
                # Shorter than one feature window: nothing to classify.
                print(f"skipping {f}: too short to extract any feature")
                continue
            n_evaluated += 1

            # perf_counter is monotonic and high-resolution, which matters for
            # the sub-millisecond clear predictions.
            start = time.perf_counter()
            preds = model.predict(sample_features)
            evaluation["Inference Time"] += time.perf_counter() - start
            evaluation["Accuracy"] += int(_majority_vote(preds, min_votes) == truth_label[f])

            if predict_in_fhe:
                start = time.perf_counter()
                preds = model.predict(sample_features, fhe="execute")
                evaluation["Inference Time (FHE)"] += time.perf_counter() - start
                evaluation["Accuracy (FHE)"] += int(_majority_vote(preds, min_votes) == truth_label[f])

    if n_evaluated == 0:
        # Avoid dividing by zero; the metrics keep their initial value of 0.
        print(f"no sample listed in truth_label was evaluated under {samples_path}")
        return evaluation
    if n_evaluated != len(truth_label):
        print(f"evaluated {n_evaluated} sample(s) for {len(truth_label)} label(s)")

    # Average over the samples that were really evaluated, not over the number
    # of labels: missing or duplicated files would otherwise skew the result.
    metric_keys = ["Accuracy", "Inference Time"]
    if predict_in_fhe:
        metric_keys += ["Inference Time (FHE)", "Accuracy (FHE)"]
    for key in metric_keys:
        evaluation[key] /= n_evaluated
    return evaluation


def _majority_vote(preds, min_votes):
    """Return the most frequent class id in `preds`, or -1 if it got fewer than `min_votes` votes."""
    votes = np.bincount(preds)
    winner = int(np.argmax(votes))
    return winner if votes[winner] >= min_votes else -1

## Models

In [44]:
# One configuration dict per Concrete ML model; each dict is passed as keyword
# arguments to the model constructor.
concrete_configs = [{'n_bits': 8}]
concrete_models = [ConcreteLR(**config) for config in concrete_configs]

In [45]:
# Train every Concrete ML model on the full feature matrix, then compile it.
# The training features are passed to compile() as well
# (presumably as the representative input set — see the Concrete ML docs).
for model in concrete_models:
    model.fit(features, ids)
    model.compile(features)

First we evaluate the models on a small hand-curated set of recorded samples. These were created by playing the music on a phone and recording it through the laptop's microphone.

In [48]:
# Test sample file name -> class id of the matching database song.
# NOTE(review): the saved output of this cell also lists
# '024366_recorded.m4a': 397, which this source no longer produces — confirm
# whether the recorded sample should be part of the test set.
test_sample_names = ("133916.mp3", "135054.mp3", "024366.mp3")
truth_label = {name: idx.index(name) for name in test_sample_names}

print(truth_label)

{'133916.mp3': 247, '135054.mp3': 0, '024366.mp3': 397, '024366_recorded.m4a': 397}


In [51]:
# Evaluate every Concrete ML model on the test samples, in clear and in FHE.
model_names = [ "ConcreteLR"]
evaluations = {
    model_names[pos]: evaluate(
        candidate, concrete_configs[pos], truth_label, TEST_SAMPLE_PATH, predict_in_fhe=True
    )
    for pos, candidate in enumerate(concrete_models)
}

sample 135054.mp3: prediction 0
sample 135054.mp3: prediction 0
sample 024366.mp3: prediction 397
sample 024366.mp3: prediction 397
sample 133916.mp3: prediction 247
sample 133916.mp3: prediction 247
sample 024366_recorded.m4a: prediction 293
sample 024366_recorded.m4a: prediction 293


In [52]:
# pandas is already imported as `pd` in the first cell; no re-import needed.
# One column per evaluated model, one row per metric.
df = pd.DataFrame(evaluations)
df

Unnamed: 0,ConcreteLR
Accuracy,0.75
Accuracy (FHE),0.75
Inference Time,0.008017
Inference Time (FHE),0.304193
n_bits,8.0


In [56]:
# Map every database song's file name to its class id (its position in idx).
# A comprehension is used so no loop variable leaks into the notebook
# namespace; the previous loop rebound the builtin `id` globally.
truth_label_train = {song_name: song_id for song_id, song_name in enumerate(idx)}

In [59]:
# Evaluate every Concrete ML model on the songs of the training database
# itself (the first 15 seconds of each clip), in clear and in FHE.
model_names = [ "ConcreteLR"]
evaluations = {
    model_names[pos]: evaluate(
        candidate, concrete_configs[pos], truth_label_train, MUSIC_DB_PATH, predict_in_fhe=True
    )
    for pos, candidate in enumerate(concrete_models)
}

processing 135054.mp3
processing 135336.mp3
processing 135337.mp3
processing 135043.mp3
processing 135091.mp3
processing 135092.mp3
processing 135044.mp3
processing 135989.mp3
processing 135221.mp3
processing 135369.mp3
processing 135341.mp3
processing 135340.mp3
processing 135368.mp3
processing 135220.mp3
processing 135222.mp3
processing 135342.mp3
processing 135223.mp3
processing 135227.mp3
processing 135226.mp3
processing 135224.mp3
processing 135219.mp3
processing 135225.mp3
processing 135228.mp3
processing 135374.mp3
processing 135375.mp3
processing 135229.mp3
processing 135363.mp3
processing 135028.mp3
processing 135010.mp3
processing 135986.mp3
processing 135372.mp3
processing 135373.mp3
processing 135039.mp3
processing 135365.mp3
processing 135371.mp3
processing 135370.mp3
processing 135364.mp3
processing 135990.mp3
processing 135089.mp3
processing 135339.mp3
processing 135338.mp3
processing 132117.mp3
processing 132675.mp3
processing 132310.mp3
processing 132272.mp3
processing

In [60]:
# pandas is already imported as `pd` in the first cell; no re-import needed.
# One column per evaluated model, one row per metric.
df = pd.DataFrame(evaluations)
df

Unnamed: 0,ConcreteLR
Accuracy,0.9725
Accuracy (FHE),0.9725
Inference Time,0.000585
Inference Time (FHE),0.32024
n_bits,8.0


## Comparison with the scikit-learn LR model

In [61]:
# NOTE(review): LogisticRegression is already imported in the first cell;
# this repeated import is redundant and could be dropped.
from sklearn.linear_model import LogisticRegression

# Plain (non-FHE) scikit-learn baseline with default hyper-parameters,
# trained on the same features and labels as the Concrete ML model.
sklearn_lr = LogisticRegression().fit(features, ids)

In [62]:
# Baseline on the same training-database songs; the scikit-learn model has no
# FHE mode, so only the clear metrics are collected.
evaluations["sklearn_lr"] = evaluate(
    model=sklearn_lr,
    model_config={},
    truth_label=truth_label_train,
    samples_path=MUSIC_DB_PATH,
    predict_in_fhe=False,
)

processing 135054.mp3
processing 135336.mp3
processing 135337.mp3
processing 135043.mp3
processing 135091.mp3
processing 135092.mp3
processing 135044.mp3
processing 135989.mp3
processing 135221.mp3
processing 135369.mp3
processing 135341.mp3
processing 135340.mp3
processing 135368.mp3
processing 135220.mp3
processing 135222.mp3
processing 135342.mp3
processing 135223.mp3
processing 135227.mp3
processing 135226.mp3
processing 135224.mp3
processing 135219.mp3
processing 135225.mp3
processing 135228.mp3
processing 135374.mp3
processing 135375.mp3
processing 135229.mp3
processing 135363.mp3
processing 135028.mp3
processing 135010.mp3
processing 135986.mp3
processing 135372.mp3
processing 135373.mp3
processing 135039.mp3
processing 135365.mp3
processing 135371.mp3
processing 135370.mp3
processing 135364.mp3
processing 135990.mp3
processing 135089.mp3
processing 135339.mp3
processing 135338.mp3
processing 132117.mp3
processing 132675.mp3
processing 132310.mp3
processing 132272.mp3
processing

In [63]:
# Side-by-side table: one column per model, one row per metric. Metrics a
# model does not report (e.g. the FHE rows for sklearn_lr) show up as NaN.
df = pd.DataFrame(evaluations)
df

Unnamed: 0,ConcreteLR,sklearn_lr
n_bits,8.0,
Inference Time,0.000585,0.000214
Accuracy,0.9725,0.9775
Inference Time (FHE),0.32024,
Accuracy (FHE),0.9725,
