In [None]:
import import_ipynb
import cnn_genre_classifier_spectrograms as mgr
import IPython.display as ipd
import librosa
import numpy as np
import soundfile as sf
import torch
from captum.attr import IntegratedGradients
from pytorch_model_summary import summary

# Data preparation

In [None]:
# File holding the spectrograms and genre labels of every processed segment.
# DATA_PATH = "/data/shared/GTZAN-DATASET/spectrograms_10segments.npy"

# Option 1 (disabled): build fresh train/validation/test splits from DATA_PATH.
# X_train, X_validation, X_test, y_train, y_validation, y_test, scale_min, scale_max = mgr.load_data(DATA_PATH, test_size=0.20, validation_size=0.15, scale=True)

# Option 2 (active): restore previously saved splits together with the
# scale_min / scale_max values stored next to them.
# NOTE(review): torch.load unpickles the file -- only load files you trust.
dataset = torch.load("./data/shuffled_set_1_normalized.pth")
(X_train, X_validation, X_test,
 y_train, y_validation, y_test,
 scale_min, scale_max) = (
    dataset[key]
    for key in (
        'X_train', 'X_validation', 'X_test',
        'y_train', 'y_validation', 'y_test',
        'scale_min', 'scale_max',
    )
)

# Disabled: persist newly created splits under the same keys.
# torch.save({
#         'X_train': X_train,
#         'X_validation': X_validation,
#         'X_test': X_test,
#         'y_train': y_train,
#         'y_validation': y_validation,
#         'y_test': y_test,
#         'scale_min': scale_min,
#         'scale_max': scale_max,
#         }, "./data/spectro10_normalized.pth")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

# Genre names used as x-axis labels; tick i is labelled with data[i].
# NOTE(review): assumed to be listed in class-index order -- confirm against
# the label encoding used when the dataset was built.
data = [
    "Blues",
    "Classical",
    "Country",
    "Disco",
    "Hiphop",
    "Jazz",
    "Metal",
    "Pop",
    "Reggae",
    "Rock",
]

# One unit-wide bin centred on every class index. The edges are derived from
# the number of genres rather than from the largest label of a single split,
# so the plot stays correct even if a split happens to miss the last class.
bins = np.arange(len(data) + 1) - 0.5
fig, ax = plt.subplots(figsize=(15, 5))
counts, _, patches = ax.hist(
    (y_train.numpy(), y_validation.numpy(), y_test.numpy()),
    bins=bins,
    rwidth=0.75,
    label=('Train', 'Validation', 'Test'),
)

# Write the number of samples slightly above every bar of every split.
for split_counts, split_patches in zip(counts, patches):
    for count, patch in zip(split_counts, split_patches):
        ax.annotate(str(int(count)), xy=(patch.get_x(), patch.get_height() + 5))

ax.set_xticks(range(len(data)))
ax.set_xticklabels(data, rotation=45, fontsize=12)
plt.legend(loc=5, prop={'size': 12})

plt.show()

# Model methods

In [None]:
# Either start from a fresh model or restore a saved checkpoint.

# model = mgr.new_model()
model = mgr.load_model("./saved_models/model-spectro10-epoch25-batch512-norm.pth")

# Random single-sample, single-channel input with the same last two dimensions
# as X_train; it is only fed to `summary` to produce the layer overview.
n_rows, n_cols = X_train.shape[2], X_train.shape[3]
dummy_input = torch.rand(1, 1, n_rows, n_cols).cuda()
model_report = summary(model, dummy_input)
print(model_report, sep='\n')

In [None]:
# Train the model on the training split, passing the validation split to
# `fit`, and keep the returned history for the plotting cell.

validation_split = (X_validation, y_validation)
history = model.fit(
    X_train,
    y_train,
    validation_data=validation_split,
    epochs=15, batch_size=256, log=True, history=None)

In [None]:
# Plot the training curves, print the best value of every tracked metric and
# evaluate the model on the test split.

# history = torch.load("./saved_models/model-spectro10-epoch25-batch512-norm_history.pth")
mgr.plot_history(history)

# (label, accuracy key, loss key) for the training and validation curves
for split, acc_key, loss_key in (("train", "acc", "loss"), ("val", "val_acc", "val_loss")):
    print(f"Max_{split}_acc:", max(history[acc_key]), f"  Min_{split}_loss:", min(history[loss_key]))

t_acc, t_loss = model.test(X_test, y_test, out=True)

In [None]:
# rename model
# model.model_name is reused further down to build the file name of the saved
# training history; presumably mgr.save_model uses it as well -- TODO confirm.

model.model_name = 'model-spectro10-epoch25-batch512-norm'

In [None]:
# save model
# Hands the current model and the target directory to the project helper;
# the resulting file name is decided inside mgr.save_model (not visible here).

mgr.save_model(model, "./saved_models")

In [None]:
# Store the training history next to the model, named after the model.

history_path = "./saved_models/" + model.model_name + "_history.pth"
torch.save(history, history_path)

In [None]:
# create and plot confusion matrix
# Predictions are computed for the test split and passed together with the
# test labels (y_test) to the project's confusion-matrix helpers.

model_predictions = model.get_predictions(X_test)
matrix = mgr.create_confusion_matrix(model_predictions, y_test)
mgr.plot_confusion_matrix(matrix)

# Loading a music sample

In [None]:
# Load one segment of a specific track, plot its log-power spectrogram and
# finish with an audio player for that segment.

file_path = "/data/shared/GTZAN-DATASET/genres_dataset/rock/rock.00030.wav"
segment = 5

SEGMENT_SAMPLES = 66150  # samples per segment: 3 s at 22050 Hz
HOP_LENGTH = 512

# first and one-past-last sample index of the chosen segment
start, finish = (SEGMENT_SAMPLES * index for index in (segment, segment + 1))
segment_slice = slice(start, finish)

signal, sample_rate = librosa.load(file_path, sr=22050)
S_signal = librosa.stft(signal[segment_slice], n_fft=1024, hop_length=HOP_LENGTH)
Y_signal = np.abs(S_signal) ** 2  # squared STFT magnitude (power)
Y_log_signal = librosa.power_to_db(Y_signal)

mgr.plot_spectrogram(Y_log_signal, sample_rate, HOP_LENGTH, size=(20,10))

# must stay the last expression so the notebook renders the player
ipd.Audio(signal[segment_slice], rate=22050)

In [None]:
# Prepare the loaded sample for GBP and IG: scale it, run it through the model
# and remember the predicted class.

model = mgr.load_model("./saved_models/model-spectro10-epoch25-batch512-norm.pth").cpu().eval()

# Only the first element of scale_input's result is used (presumably the
# scaled tensor -- TODO confirm against mgr.scale_input).
scaled_outputs = mgr.scale_input(torch.Tensor(Y_log_signal), scale_min, scale_max)
# prepend two singleton axes in front of the 2-D spectrogram
sample_signal = scaled_outputs[0][None, None]
output = model(sample_signal)
sample_prediction = output.argmax()
print(output, sample_prediction)

In [None]:
# Reconstruct audio from the unmodified sample spectrogram (reference for the
# masked reconstructions).

# undo the input scaling to get back to the dB spectrogram
spectrogram_db = mgr.unscale_input(sample_signal[0, 0], scale_min, scale_max)
# dB -> linear magnitude, as expected by Griffin-Lim
reconstruction = librosa.db_to_amplitude(spectrogram_db.numpy())
y_inv = librosa.griffinlim(reconstruction, hop_length=512, win_length=1024)
sf.write("./sample_original.wav", y_inv, sample_rate)

# Guided Gradients

In [None]:
# generate guided gradients and their pos/neg saliency maps

# Reload the saved checkpoint on the CPU in eval mode, independent of whatever
# state earlier cells left `model` in.
model = mgr.load_model("./saved_models/model-spectro10-epoch25-batch512-norm.pth").cpu().eval()
gbp = mgr.GuidedBackprop(model)
# sample_prediction is the class index predicted for sample_signal; presumably
# the gradients are computed for that class -- TODO confirm in GuidedBackprop.
guided_grads = gbp.generate_gradients(sample_signal, sample_prediction)
pos_sal_gbp, neg_sal_gbp = mgr.get_positive_negative_saliency(guided_grads)

In [None]:
# Plot the positive guided-backprop saliency map. 22050 and 512 are the sample
# rate and hop length that were used when the sample spectrogram was computed.
mgr.plot_spectrogram(pos_sal_gbp, 22050, 512)

In [None]:
# Reconstruct audio from the sample spectrogram after weighting it with the
# positive guided-backprop saliency map (used as a spectral mask).

# undo the input scaling, then convert dB -> linear magnitude
spectrogram_db = mgr.unscale_input(sample_signal[0, 0], scale_min, scale_max).numpy()
reconstruction = librosa.db_to_amplitude(spectrogram_db)
# element-wise masking of the magnitude spectrogram
reconstruction_masked = reconstruction * pos_sal_gbp
y_inv = librosa.griffinlim(reconstruction_masked, hop_length=512, win_length=1024)
sf.write("./sample_masked_gbp.wav", y_inv, sample_rate)

# Integrated Gradients

In [None]:
# Generate integrated gradients for the sample and split them into
# positive/negative saliency maps.

# Reload the saved checkpoint on the CPU in eval mode, independent of whatever
# state earlier cells left `model` in.
model = mgr.load_model("./saved_models/model-spectro10-epoch25-batch512-norm.pth").cpu().eval()

# All-zero baseline with the same shape, dtype and device as the input, so the
# cell keeps working if the segment length or FFT size (and therefore the
# spectrogram shape) changes.
baseline = torch.zeros_like(sample_signal)

ig = IntegratedGradients(model)
# Attribute the predicted class; drop the singleton axes and convert to a
# NumPy array for the saliency/plot helpers.
integrated_grads = (
    ig.attribute(sample_signal, baselines=baseline, target=sample_prediction)
    .detach()
    .squeeze()
    .numpy()
)
pos_sal_ig, neg_sal_ig = mgr.get_positive_negative_saliency(integrated_grads)

In [None]:
# Plot the positive integrated-gradients saliency map. 22050 and 512 are the
# sample rate and hop length that were used when the sample spectrogram was
# computed.
mgr.plot_spectrogram(pos_sal_ig, 22050, 512)

In [None]:
# Reconstruct audio from the sample spectrogram after weighting it with the
# positive integrated-gradients saliency map (used as a spectral mask).

# undo the input scaling, then convert dB -> linear magnitude
spectrogram_db = mgr.unscale_input(sample_signal[0, 0], scale_min, scale_max).numpy()
reconstruction = librosa.db_to_amplitude(spectrogram_db)
# element-wise masking of the magnitude spectrogram
reconstruction_masked = reconstruction * pos_sal_ig
y_inv = librosa.griffinlim(reconstruction_masked, hop_length=512, win_length=1024)
sf.write("./sample_masked_ig.wav", y_inv, sample_rate)

# Saving images of attributions

In [None]:
from PIL import Image
import os

def save_gradient_images(gradient, file_name):
    """Min-max normalize a gradient map and save it as ./results/<file_name>.jpg.

    Args:
        gradient: array of attribution values; it is not modified.
        file_name: name of the output file without directory or extension.

    The ./results directory is created when it does not exist yet.
    """
    os.makedirs('./results', exist_ok=True)
    # Shift so the minimum becomes 0 (creates a new array, the caller's data
    # stays untouched).
    gradient = gradient - gradient.min()
    # Scale to 0-1. A constant map has a zero range: leave it at all zeros
    # instead of dividing 0 by 0 and writing NaNs. Out-of-place division also
    # keeps integer inputs working.
    peak = gradient.max()
    if peak > 0:
        gradient = gradient / peak
    # save image
    path_to_file = os.path.join('./results', file_name + '.jpg')
    save_image(gradient, path_to_file)

    
def save_image(im, path):
    """Write `im` to `path`.

    NumPy inputs are first passed through format_np_output and wrapped in a
    PIL image; any other object is expected to provide its own `save(path)`.
    """
    is_numpy = isinstance(im, (np.ndarray, np.generic))
    image = Image.fromarray(format_np_output(im)) if is_numpy else im
    image.save(path)
    

def format_np_output(np_arr):
    """Bring a NumPy array into a layout/dtype that PIL can save.

    * 3-D arrays with a single leading channel (1xWxH) are expanded to 3xWxH.
    * 3-D channel-first arrays (3xWxH) are transposed to WxHx3.
    * Arrays whose maximum is <= 1 are scaled by 255 and cast to uint8.

    Arrays of any other rank skip the channel handling, so a 2-D map whose
    first dimension happens to be 1 or 3 is no longer pushed through a 3-axis
    transpose (which raised a ValueError).

    Returns:
        The converted array; the input is not modified in place.
    """
    if np_arr.ndim == 3:
        # repeat first channel and convert 1xWxH to 3xWxH
        if np_arr.shape[0] == 1:
            np_arr = np.repeat(np_arr, 3, axis=0)
        # convert to WxHx3 in order to make it saveable by PIL
        if np_arr.shape[0] == 3:
            np_arr = np_arr.transpose(1, 2, 0)
    # multiply with 255 and change type to make it saveable by PIL
    if np.max(np_arr) <= 1:
        np_arr = (np_arr*255).astype(np.uint8)
    return np_arr

In [None]:
# Save the raw attributions and their positive/negative saliency maps as JPEGs
# under ./results. The integrated-gradients attribution is stored in
# `integrated_grads` (already squeezed to the spectrogram's two axes); the
# previously referenced name `attributions` is not defined in this notebook.
attribution_images = {
    'sample_ig': integrated_grads,
    'sample_pos_sal_ig': pos_sal_ig,
    'sample_neg_sal_ig': neg_sal_ig,
    'sample_gbp': guided_grads,
    'sample_pos_sal_gbp': pos_sal_gbp,
    'sample_neg_sal_gbp': neg_sal_gbp,
}

for file_name, gradient in attribution_images.items():
    save_gradient_images(gradient, file_name)