# In [None]:
import os
import librosa
import librosa.display
import math
import IPython.display as ipd
import numpy as np
import matplotlib.pyplot as plt

# When True, the spectrogram file is regenerated at the bottom of this script.
REBUILD_DATA = True

# Input dataset root and the destination handed to save_spectrograms().
DATASET_PATH = "/data/shared/GTZAN-DATASET/genres_dataset"
OUTPUT_PATH = "/data/shared/GTZAN-DATASET/spectrograms_3segments.json"

# Audio geometry: every file is loaded at SAMPLE_RATE (samples per second)
# and is expected to last TRACK_DURATION seconds.
SAMPLE_RATE = 22050
TRACK_DURATION = 30
SAMPLES_PER_TRACK = TRACK_DURATION * SAMPLE_RATE


def save_spectrograms(dataset_path, json_path, n_fft=2048, hop_length=512, num_segments=5):
    """Extracts log-power spectrograms from a music dataset and saves them along with genre labels.

    Every directory found below ``dataset_path`` is treated as one genre: its
    name is appended to ``mapping`` and each of its files is loaded with
    librosa at ``SAMPLE_RATE``, cut into ``num_segments`` equal-length
    segments, and each segment is converted to a dB-scaled power spectrogram.
    Segments whose spectrogram does not have the expected shape (for example
    when a track holds fewer than ``SAMPLES_PER_TRACK`` samples) are reported
    on stdout and skipped.

    The saved object is a dict with the keys ``"mapping"`` (genre names),
    ``"labels"`` (index into ``mapping``, one per stored spectrogram) and
    ``"spectrograms"`` (2-D arrays of shape
    ``(n_fft // 2 + 1, 1 + samples_per_segment // hop_length)``).

    NOTE: the data is written with ``np.save``, i.e. as a pickled object in
    NumPy's binary format, not as JSON. ``np.save`` appends ``.npy`` to the
    path when it does not already end with that suffix.

        :param dataset_path (str): Path to dataset
        :param json_path (str): Path to output file used to save spectrograms
        :param n_fft (int): Interval we consider to apply FFT. Measured in # of samples
        :param hop_length (int): Sliding window for FFT. Measured in # of samples
        :param num_segments (int): Number of segments we want to divide sample tracks into
        :return: None
        """

    # dictionary to store mapping, labels, and spectrograms
    data = {
        "mapping": [],
        "labels": [],
        "spectrograms": [],
    }

    samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    # librosa.stft pads the signal by default (center=True), which yields
    # 1 + n // hop_length frames. ceil(n / hop_length) is one frame short
    # whenever n is an exact multiple of hop_length.
    temporal_bins = 1 + samples_per_segment // hop_length
    frequency_bins = n_fft // 2 + 1
    expected_shape = (frequency_bins, temporal_bins)

    # os.walk yields plain path strings; normalise the root the same way so
    # it can be recognised by value rather than by object identity.
    root = os.fspath(dataset_path)

    # loop through all genre sub-folders
    for dirpath, dirnames, filenames in os.walk(root):

        # sorting in place makes os.walk descend in a fixed order, so the
        # label index of each genre is reproducible across runs and machines
        dirnames.sort()

        # the dataset root itself is not a genre
        if dirpath == root:
            continue

        # save genre label (i.e., sub-folder name) in the mapping;
        # basename handles the platform's own path separator
        semantic_label = os.path.basename(dirpath)
        data["mapping"].append(semantic_label)
        label_index = len(data["mapping"]) - 1
        print("\nProcessing: {}".format(semantic_label))

        # process all audio files in genre sub-dir
        for f in sorted(filenames):

            # load audio file (resampled to SAMPLE_RATE by librosa)
            file_path = os.path.join(dirpath, f)
            signal, _ = librosa.load(file_path, sr=SAMPLE_RATE)

            # process all segments of audio file
            for d in range(num_segments):

                # calculate start and finish sample for current segment
                start = samples_per_segment * d
                finish = start + samples_per_segment

                # short-time Fourier transform -> power -> log-amplitude (dB)
                S_signal = librosa.stft(signal[start:finish], n_fft=n_fft, hop_length=hop_length)
                Y_signal = np.abs(S_signal) ** 2
                Y_log_signal = librosa.power_to_db(Y_signal)

                # only keep segments of the full expected size so that all
                # stored spectrograms share one shape
                if np.shape(Y_log_signal) == expected_shape:
                    data["spectrograms"].append(Y_log_signal)
                    data["labels"].append(label_index)
                    print("{}, segment:{}".format(file_path, d+1))
                else:
                    print("ERROR: expected shape of ({}, {}), but got {} instead".format(frequency_bins, temporal_bins, np.shape(Y_log_signal)))
                    print("ERROR: {}, segment:{}".format(file_path, d+1))

    # save data to the file requested by the caller
    np.save(json_path, data)
        
        
# Regenerate the spectrogram file only when explicitly requested.
if REBUILD_DATA:
    save_spectrograms(
        dataset_path=DATASET_PATH,
        json_path=OUTPUT_PATH,
        num_segments=3,
    )
