In [1]:
%load_ext autoreload
%autoreload 2

# Classifying Music Note sounds using Few Shot Deep Learning

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
import os
import math

# Load various imports 
import librosa
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import preprocessing

from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

from tensorflow.keras import backend as K
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, Lambda, Conv2D, UpSampling2D, MaxPooling2D
from tensorflow.keras.losses import binary_crossentropy

#### utils

In [4]:
def fft(f):
    Ni = len(f)
    Mi = int(Ni / 2)
    if Mi <= 2:
        return [f[0] + f[1] + f[2] + f[3], 
               f[0] - 1j*f[1] - f[2] + 1j*f[3],
               f[0] - f[1] + f[2] - f[3],
               f[0] + 1j*f[1] - f[2] - 1j*f[3]]
    
    wn = math.cos(2*math.pi/Ni) - 1j*math.sin(2*math.pi/Ni)
    fe = [f[i] for i in range(Ni) if i % 2 == 0]
    fo = [f[i] for i in range(Ni) if i % 2 == 1]
    Fe = fft(fe)
    Fo = fft(fo)
    return [np.around(Fe[i] + (wn**i)*Fo[i], decimals=10) for i in range(Mi)] + [np.around(Fe[i] - (wn**i)*Fo[i], decimals=10) for i in range(Mi)]

def get_audio_data(filename):
    fs = 2**12 # sample rate
    tp = 2 # sampling duration
    N = n = fs*tp # number of samples
    
    # Extract data and sampling rate from file
    recording, fs = librosa.load(filename, sr=fs, duration=tp, mono=True)

    n = len(recording)        
    tp = int(n / fs)

    if tp < 2:
        pad_width = N - recording.shape[0]
        recording = np.pad(recording, pad_width=((0, pad_width),), mode='constant')

        n = len(recording)
        tp = int(n / fs)

    N = fs*tp # number of samples
    x = [np.round(float(recording[i]), 10) for i in range(n)] # input sequence
    return x, tp, n

def get_frequency_amplitude(x, tp, N):
    _X = fft(x) # discrete Fourier transform
    X = [np.round(Xi/N, 10) for Xi in _X] # frequency spectrum
    X_amp = [np.absolute(Xi) for Xi in X] # amplitude spectrum

    M = int(N/2)
    ti = [i*tp/N for i in range(N)]
    fi = [i/tp for i in range(M)]
    X_amp = np.array(X_amp[:M])*2
    
    return ti, fi, X_amp

def extract_features(filepath):
    # try:
    audio_features = get_audio_data(filepath)
    if not audio_features:
        return

    x, tp, N = audio_features
    ti, fi, X_amp = get_frequency_amplitude(x, tp, N)
    return X_amp
#     return fi, X_amp
    
    # except Exception as e:
    #     print("Error encountered while parsing file: ", file_name, e)
    #     return None 
    
def extract_features(file_name):
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast', duration=3) 
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    
    pad_width = 256 - mfccs.shape[1]
    mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')     
    
#     mfccs = mfccs.mean(1)
    return mfccs

def sampling(args):
    z_mean, z_log_sigma = args
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim),
                              mean=0., stddev=0.1)
    return z_mean + K.exp(z_log_sigma) * epsilon

#### Load Preprocessed data 

In [5]:
# Set the path to the full UrbanSound dataset 
DATA_DIR = os.path.join("data", "guitar_sample")
# DATA_DIR = os.path.join("/content/drive/My Drive/Colab Notebooks/data", "guitar_sample")

# feature list
features = []

labels = os.listdir(DATA_DIR)

# Iterate through each sound file and extract the features 
for folder in tqdm(labels):
    for file in os.listdir(os.path.join(DATA_DIR, folder)):
        class_label = folder
        file_name = os.path.join(os.path.join(DATA_DIR, folder, file))
        
        data = extract_features(file_name)
        if data is None:
            continue
        
        data = np.array(data)
        data = np.expand_dims(data, axis=-1)
        features.append([data, class_label])

# Convert into a Panda dataframe 
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])

print('Finished feature extraction from ', len(featuresdf), ' files') 

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))


Finished feature extraction from  47  files


In [40]:
# featuresdf.feature = featuresdf.feature.apply(lambda xx: xx.reshape((4096, 2)))
featuresdf.feature.iloc[0].shape

(40, 256, 1)

In [41]:
featuresdf.head()

Unnamed: 0,feature,class_label
0,"[[[-403.97342], [-427.74152], [-478.33362], [-...",1A
1,"[[[-449.60074], [-470.32797], [-528.5974], [-5...",1A
2,"[[[-374.77917], [-398.41254], [-462.8929], [-4...",1A
3,"[[[-400.76926], [-423.18558], [-481.08386], [-...",1A
4,"[[[-391.5319], [-416.02634], [-477.748], [-511...",1A


In [42]:
labels

['0A', '0B', '0D', '0EH', '0EL', '0G', '1A', '1B', '1D', '1EH', '1EL', '1G']

In [43]:
# Convert features and corresponding classification labels into numpy arrays
input_data = np.array(featuresdf.feature.tolist())
output_label = np.array(featuresdf.class_label.tolist())

# split train and test data
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=100)
for train_index, test_index in sss.split(input_data, output_label):
    x_train, x_test = input_data[train_index], input_data[test_index]
    y_train_label, y_test_label = output_label[train_index], output_label[test_index]
    
le = preprocessing.LabelEncoder()

y_train = le.fit_transform(y_train_label)
y_test = le.transform(y_test_label)

In [44]:
x_train.shape, y_train.shape

((35, 40, 256, 1), (35,))

In [45]:
x_test.shape, y_test.shape

((12, 40, 256, 1), (12,))

In [46]:
for i in range(3):
    print(f"class = {y_train[i]:>3d}, label = {y_train_label[i]:3s}")

class =   0, label = 0A 
class =   0, label = 0A 
class =   0, label = 0A 


### VAE model architecture 

In [47]:
# x_train = x_train.astype('float32') / x_train.max()
# x_test = x_test.astype('float32') / x_train.max()

In [48]:
input_img = Input(shape=x_train.shape[1:])
x = Conv2D(16, (3, 3), activation='relu', padding='same')(input_img)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
encoded = MaxPooling2D((2, 2), padding='same')(x)
encoder = Model(input_img, encoded, name='encoder')

# at this point the representation is (4, 4, 8) i.e. 128-dimensional

# Create decoder
latent_dim = tuple(encoded.shape[1:])
latent_inputs = Input(shape=latent_dim, name='latent_space')
x = Conv2D(8, (3, 3), activation='relu', padding='same')(latent_inputs)
x = UpSampling2D((2, 2))(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
x = Conv2D(16, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)
decoder = Model(latent_inputs, decoded, name='decoder')

autoencoder = Model(input_img, decoder(encoder(input_img)))
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

In [49]:
encoder.summary()

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 40, 256, 1)]      0         
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 40, 256, 16)       160       
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 20, 128, 16)       0         
_________________________________________________________________
conv2d_15 (Conv2D)           (None, 20, 128, 8)        1160      
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 10, 64, 8)         0         
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 10, 64, 8)         584       
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 5, 32, 8)          0   

In [50]:
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
latent_space (InputLayer)    [(None, 5, 32, 8)]        0         
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 5, 32, 8)          584       
_________________________________________________________________
up_sampling2d_6 (UpSampling2 (None, 10, 64, 8)         0         
_________________________________________________________________
conv2d_18 (Conv2D)           (None, 10, 64, 8)         584       
_________________________________________________________________
up_sampling2d_7 (UpSampling2 (None, 20, 128, 8)        0         
_________________________________________________________________
conv2d_19 (Conv2D)           (None, 20, 128, 16)       1168      
_________________________________________________________________
up_sampling2d_8 (UpSampling2 (None, 40, 256, 16)       0   

In [51]:
autoencoder.summary()

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 40, 256, 1)]      0         
_________________________________________________________________
encoder (Functional)         (None, 5, 32, 8)          1904      
_________________________________________________________________
decoder (Functional)         (None, 40, 256, 1)        2481      
Total params: 4,385
Trainable params: 4,385
Non-trainable params: 0
_________________________________________________________________


### Training 

Here we will train the model. As training a CNN can take a sigificant amount of time, we will start with a low number of epochs and a low batch size. If we can see from the output that the model is converging, we will increase both numbers.  

In [52]:
from tensorflow.keras.callbacks import TensorBoard

autoencoder.fit(
    x_train, 
    x_train,
    epochs=50,
    batch_size=128,
    shuffle=True,
    validation_data=(x_test, x_test),
    callbacks=[TensorBoard(log_dir='./tmp/autoencoder')]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x15920a569a0>

#### Evaluation

In [53]:
x_train_encoded = np.array(encoder.predict(x_train, batch_size=1000))
x_test_encoded = np.array(encoder.predict(x_test, batch_size=1000))

In [55]:
# plt.figure(figsize=(6, 6))
# plt.scatter(x_train_encoded[0, :, 0], x_train_encoded[0, :, 1], c=y_train)
# plt.colorbar()
# plt.show()

### Cluster close points

In [56]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
clf = make_pipeline(SVC(kernel="poly", gamma='auto'))

xx_train = x_train_encoded.reshape(len(x_train_encoded), -1)
xx_test = x_test_encoded.reshape(len(x_test_encoded), -1)

clf.fit(xx_train, y_train)
clf.score(xx_train, y_train)

1.0

In [57]:
clf.score(xx_test, y_test)

0.8333333333333334

In [58]:
from sklearn.metrics import confusion_matrix

y_pred = clf.predict(xx_test)
confusion_matrix(y_test, y_pred)

array([[5, 2],
       [0, 5]], dtype=int64)

In [59]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.71      0.83         7
           1       0.71      1.00      0.83         5

    accuracy                           0.83        12
   macro avg       0.86      0.86      0.83        12
weighted avg       0.88      0.83      0.83        12



In [60]:
sample_label = "A"
sample_filepath = os.path.join("data", "sampleA.wav")
sample_vector = extract_features(sample_filepath)
sample_vector = np.expand_dims(np.expand_dims(sample_vector, axis=-1), axis=0)
sample_embedded = encoder.predict(sample_vector)
sample_embedded = sample_embedded.reshape(len(sample_embedded), -1)
sample_embedded.shape

(1, 1280)

In [61]:
le.inverse_transform(clf.predict(sample_embedded))

array(['0A'], dtype='<U2')

In [64]:
# clf.predict_proba(sample_embedded)

In [63]:
x_test_sample = []
y_test_sample = []

sample_dirs = ["1EH", "1D", "1A", "1B", "1G", "1EL"]

for label in tqdm(sample_dirs):
    print(label, end=", ")
    labeldir= os.path.join(DATA_DIR, label)

    for filename in (os.listdir(labeldir)):
        anchor_filepath = os.path.join(DATA_DIR, label, filename)
        
        anchor_file_vector = extract_features(anchor_filepath)
        anchor_file_vector = np.expand_dims(anchor_file_vector, axis=-1)
        x_test_sample.append(anchor_file_vector)
        y_test_sample.append("1A" if label == "1A" else "0A")

print()
x_test_sample = encoder.predict(np.array(x_test_sample))
y_test_sample = np.array(y_test_sample)

x_test_sample = x_test_sample.reshape(len(x_test_sample), -1)
y_test_sample = le.transform(y_test_sample)


print(x_test_sample.shape)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))

1EH, 1D, 1A, 1B, 1G, 1EL, 

(118, 1280)


In [65]:
clf.score(x_test_sample, y_test_sample)

0.4745762711864407

In [66]:
from sklearn.metrics import confusion_matrix

y_pred = clf.predict(x_test_sample)
confusion_matrix(y_test_sample, y_pred)

array([[36, 62],
       [ 0, 20]], dtype=int64)

In [67]:
from sklearn.metrics import classification_report

print(classification_report(y_test_sample, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.37      0.54        98
           1       0.24      1.00      0.39        20

    accuracy                           0.47       118
   macro avg       0.62      0.68      0.46       118
weighted avg       0.87      0.47      0.51       118

