# Notebook for generating attribution maps and masked spectrograms, either from songs in the 'data/songs' directory or from examples in the extracted feature sets

In [None]:
import import_ipynb
import cnn_genre_classifier as mgr
import IPython.display as ipd
import librosa
import numpy as np
import torch
from captum.attr import IntegratedGradients

# Loading a music sample from the test set (requires the extracted feature file from the dataset)

In [None]:
# Load the extracted feature set and the trained model, then plot the
# confusion matrix so that an interesting example can be picked below.
feature_set_path = "./data/feature_sets/spectro10_normalized.pth"
model_path = "./data/saved_models/model-spectro10-epoch15-batch64_512-norm.pth"

dataset = torch.load(feature_set_path)
X_test, y_test = dataset['X_test'], dataset['y_test']
# Scaling bounds stored with the feature set; needed later to unscale inputs.
scale_min, scale_max = dataset['scale_min'], dataset['scale_max']

model = mgr.load_model(model_path)
model_predictions = model.get_predictions(X_test)
matrix = mgr.create_confusion_matrix(model_predictions, y_test)
mgr.plot_confusion_matrix(matrix)

In [None]:
# Print the index of every test example whose true class and predicted class
# match the pair selected below (pick the pair from the confusion matrix).

# 0-Blues, 1-Classical, 2-Country, 3-Disco, 4-HipHop, 5-Jazz, 6-Metal, 7-Pop, 8-Reggae, 9-Rock
true_class = 0
predicted_class = 5

# Convert once up front instead of inside the loop header.
predicted_labels = model_predictions.argmax(dim=1).type(torch.LongTensor)
true_labels = y_test.type(torch.LongTensor)

for index, (predicted, actual) in enumerate(zip(predicted_labels, true_labels)):
    # Skip every example that is not part of the selected pair.
    if actual != true_class or predicted != predicted_class:
        continue
    print(index)

In [None]:
# Choose which test-set example the attribution methods should be applied to.
index = 1177
sample_rate = 22050

# Model input for the chosen example, with one leading dimension added.
X = X_test[index].unsqueeze(0)

# Undo the feature scaling, convert the dB values to amplitudes and estimate
# a waveform from the resulting spectrogram with Griffin-Lim.
X_db = mgr.unscale_input(X.squeeze(), scale_min, scale_max).numpy()
X_amplitude = librosa.db_to_amplitude(X_db)
reconstruction = librosa.griffinlim(X_amplitude, hop_length=512, win_length=1024)

# Class the model predicted for this example; used as the attribution target.
predicted_labels = model_predictions.argmax(dim=1).type(torch.LongTensor)
prediction = predicted_labels[index]

# Play back the reconstructed segment.
ipd.Audio(reconstruction, rate=sample_rate)

# Loading a music sample from the 'songs' folder in the data directory

In [None]:
# Load one fixed-length segment of a song, classify it, and play it back.
file_path = "./data/songs/rock.00030.wav"
segment = 1  # 1-based number of the segment to process

# Extract the spectrogram of the selected segment using the short-time Fourier
# transform. This is the same process that was used to extract features from
# the GTZAN dataset.
samples_per_segment = 66150  # 3 seconds at the 22050 Hz rate used for loading
start = samples_per_segment * (segment - 1)
finish = samples_per_segment * segment
signal, sample_rate = librosa.load(file_path, sr=22050)
stft = librosa.stft(signal[start:finish], n_fft=1024, hop_length=512)
X_power = np.abs(stft) ** 2
X_db = librosa.power_to_db(X_power)
# dB values of a power spectrogram map back to magnitudes via db_to_amplitude;
# this magnitude spectrogram is what the saliency masks are applied to later.
X_amplitude = librosa.db_to_amplitude(X_db)

# Load the model and feed it the normalized example to get the predicted class.
model = mgr.load_model("./data/saved_models/model-spectro10-epoch15-batch64_512-norm.pth").cpu().eval()
normalization_spectrograms = torch.load("./data/songs/accessory/normalization_spectrograms.pth")
scale_min = normalization_spectrograms['scale_min']
scale_max = normalization_spectrograms['scale_max']
# The two unsqueeze calls add two leading dimensions to the scaled spectrogram
# (presumably batch and channel -- confirm against the model definition).
X = mgr.scale_input(torch.Tensor(X_db), scale_min, scale_max)[0].unsqueeze(0).unsqueeze(0)
output = model(X)
# Kept as a tensor because the attribution cells use it as the target class.
prediction = torch.argmax(output)
classes = torch.load("./data/songs/accessory/classes.pth")['classes']
# .item() prints the plain class index instead of "tensor(n)".
print("Predicted: {} ({})".format(classes[prediction], prediction.item()))

# Play back the selected segment.
ipd.Audio(signal[start:finish], rate=sample_rate)

# Guided Gradients

In [None]:
# Compute guided-backprop gradients with respect to the predicted class and
# split them into positive/negative saliency. The positive saliency map is
# then used as a spectral mask on the amplitude spectrogram, and a signal is
# reconstructed from the masked spectrogram.

model_path = "./data/saved_models/model-spectro10-epoch15-batch64_512-norm.pth"
model = mgr.load_model(model_path).cpu().eval()
gbp = mgr.GuidedBackprop(model)
guided_grads = gbp.generate_gradients(X, prediction)
pos_sal_gbp, _ = mgr.get_positive_negative_saliency(guided_grads)

# Keep the masked amplitudes under their own name: Griffin-Lim needs
# amplitudes, while X_masked_gbp ends up holding the dB version for plotting.
masked_amplitude_gbp = X_amplitude * pos_sal_gbp
reconstruction_gbp = librosa.griffinlim(masked_amplitude_gbp, hop_length=512, win_length=1024)
X_masked_gbp = librosa.amplitude_to_db(masked_amplitude_gbp)

In [None]:
# Plot the unmasked dB spectrogram as a reference for the plots that follow.
# NOTE(review): the third argument (512) is presumably the hop length, matching
# hop_length=512 used elsewhere in this notebook -- confirm in plot_spectrogram.
mgr.plot_spectrogram(X_db, sample_rate, 512, title="Original spectrum")

In [None]:
# Plot the positive guided-backprop saliency map that serves as the spectral mask.
mgr.plot_spectrogram(pos_sal_gbp, sample_rate, 512, title="GBP attribution map")

In [None]:
# Plot the GBP-masked spectrogram; X_masked_gbp holds the amplitude_to_db
# output (dB values) at this point.
mgr.plot_spectrogram(X_masked_gbp, sample_rate, 512, title="GBP masked spectrum")

In [None]:
# Play back the Griffin-Lim reconstruction of the GBP-masked spectrogram.
ipd.Audio(reconstruction_gbp, rate=sample_rate)

# Integrated Gradients

In [None]:
# Compute integrated gradients with respect to the predicted class and split
# them into positive/negative saliency. The positive saliency map is then used
# as a spectral mask on the amplitude spectrogram, and a signal is
# reconstructed from the masked spectrogram.

model = mgr.load_model("./data/saved_models/model-spectro10-epoch15-batch64_512-norm.pth").cpu().eval()
# All-zero baseline that follows the shape and dtype of X instead of a
# hard-coded (1, 1, 513, 130), so other segment lengths or STFT settings work.
baseline = torch.zeros_like(X)
ig = IntegratedGradients(model)
# detach() guards against attributions that carry autograd history (for
# example when X requires grad), in which case .numpy() would raise.
integrated_grads = ig.attribute(X, baseline, prediction).squeeze().detach().numpy()
pos_sal_ig, _ = mgr.get_positive_negative_saliency(integrated_grads)
# X_masked_ig first holds masked amplitudes (the input Griffin-Lim needs) and
# is then converted to dB for plotting.
X_masked_ig = X_amplitude * pos_sal_ig
reconstruction_ig = librosa.griffinlim(X_masked_ig, hop_length=512, win_length=1024)
X_masked_ig = librosa.amplitude_to_db(X_masked_ig)

In [None]:
# Plot the unmasked dB spectrogram as a reference for the plots that follow.
# NOTE(review): the third argument (512) is presumably the hop length, matching
# hop_length=512 used elsewhere in this notebook -- confirm in plot_spectrogram.
mgr.plot_spectrogram(X_db, sample_rate, 512, title="Original spectrum")

In [None]:
# Plot the positive integrated-gradients saliency map that serves as the spectral mask.
mgr.plot_spectrogram(pos_sal_ig, sample_rate, 512, title="IG attribution map")

In [None]:
# Plot the IG-masked spectrogram; X_masked_ig holds the amplitude_to_db
# output (dB values) at this point.
mgr.plot_spectrogram(X_masked_ig, sample_rate, 512, title="IG masked spectrum")

In [None]:
# Play back the Griffin-Lim reconstruction of the IG-masked spectrogram.
ipd.Audio(reconstruction_ig, rate=sample_rate)