# All Dataset Preparation

## Installations

Install the following before running this notebook:
1. `muspy` & `pypianoroll` via `pip install` in the environment folder
2. `fluidsynth` via `conda install -c conda-forge fluidsynth`

In [None]:
from IPython.display import clear_output, Audio, display
from ipywidgets import interact, IntSlider

import os
import os.path
import random
import datetime
import json

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from tqdm import tqdm # valuebar for iterations

In [None]:
import muspy
import pypianoroll
import pretty_midi

## Load all datasets

In [None]:
# Preparation parameters used by the cells below.
n_pitches = 6 * 12  # number of pitches (six groups of twelve semitones)
lowest_pitch = 2 * 12  # MIDI note number of the lowest pitch
beat_resolution = 4  # timesteps per beat (24 in data, 12 for MusiGAN)
bars_per_instance = 12  # number of bars per instance in prepared data
bar_resolution = beat_resolution * 4  # timesteps per bar, four beats to a bar
sample_size = bars_per_instance * 4  # beats per instance created by track-cropping (4 bars for MusiGAN)

# genre <-> numeric label: the label of a genre is its index in this list (lpd5)
genre_list = [
    "Rap", "Latin", "International", "Electronic",
    "Country", "Folk", "Blues", "Reggae", "Jazz",
    "Vocal", "New-Age", "RnB", "Pop_Rock", "Classical", "Game",
]

In [None]:
# Load the prepared arrays of every source dataset.
# np.load returns an NpzFile that keeps the archive open until it is closed,
# so each archive is opened in a `with` block; the arrays are read into memory
# on access and remain usable after the archive is closed.
loaded_file_directory = "./prepared_data/"

with np.load(loaded_file_directory + "lpd5_12bars/prepared_arrays.npz") as lpd5:
    lpd5_data_array, lpd5_label_array = lpd5["data"], lpd5["labels"]

with np.load(loaded_file_directory + "maestro_12bars/prepared_arrays.npz") as maestro:
    maestro_data_array, maestro_label_array = maestro["data"], maestro["labels"]
# convert into numbers: every maestro instance gets the "Classical" label,
# looked up in genre_list instead of hard-coding its index
maestro_label_array = np.full(len(maestro_label_array), genre_list.index("Classical"))

with np.load(loaded_file_directory + "nsamdb_12bars/prepared_arrays.npz") as nsamdb_train:
    nsamdb_data_array_1, nsamdb_label_array_1 = nsamdb_train["data"], nsamdb_train["labels"]

with np.load(loaded_file_directory + "nsamdb_12bars/prepared_arrays_2.npz") as nsamdb_test:
    nsamdb_data_array_2, nsamdb_label_array_2 = nsamdb_test["data"], nsamdb_test["labels"]

with np.load(loaded_file_directory + "nsamdb_12bars/prepared_arrays_3.npz") as nsamdb_valid:
    nsamdb_data_array_3, nsamdb_label_array_3 = nsamdb_valid["data"], nsamdb_valid["labels"]

# convert into numbers: one "Game" label per nsamdb instance across all three parts
n_nsamdb_instances = len(nsamdb_label_array_1) + len(nsamdb_label_array_2) + len(nsamdb_label_array_3)
nsamdb_label_array = np.full(n_nsamdb_instances, genre_list.index("Game"))

## Combine & evaluate

In [None]:
# Stack all datasets along the first axis. Data and labels are listed in the
# same dataset order (lpd5, maestro, nsamdb) so that rows stay aligned.
data_parts = (
    lpd5_data_array,
    maestro_data_array,
    nsamdb_data_array_1,
    nsamdb_data_array_2,
    nsamdb_data_array_3,
)
label_parts = (lpd5_label_array, maestro_label_array, nsamdb_label_array)

loaded_data_array = np.concatenate(data_parts, axis=0)
loaded_label_array = np.concatenate(label_parts, axis=0)

In [None]:
# check no empty ones

# An instance is empty when none of its entries is non-zero; count those.
n = sum(1 for instance in loaded_data_array if not np.any(instance))

print("No empty ones?", n == 0)

In [None]:
# Report the number of instances in the combined dataset and in each source.
size_report = [
    ("Dataset Size:", loaded_label_array),
    ("Lpd5:", lpd5_label_array),
    ("Maestro:", maestro_label_array),
    ("NSA MDB:", nsamdb_label_array),
]
for caption, label_array in size_report:
    print(caption, len(label_array))

In [None]:
# plot total pitch range

# Sum over axis 1 of every instance and then over all instances, leaving one
# total per pitch (assumes the data is laid out as (instance, time, pitch) -
# TODO confirm).
n_pitches_array = loaded_data_array.sum(axis=1).sum(axis=0)
# Take the x positions from the data instead of a hard-coded 72, so the plot
# keeps working when the prepared pitch range changes.
pitch_positions = np.arange(len(n_pitches_array))
plt.plot(pitch_positions, n_pitches_array, "ro")
plt.ylabel("# of pitches")
plt.xlabel("pitches")
# plt.yscale("log")

In [None]:
# number of data for each genre?

# Histogram of the numeric genre labels of the combined dataset.
plt.hist(x=loaded_label_array, bins=15)
plt.xlabel("genre label")
plt.ylabel("# of instances")

# a numeric label on the x axis is an index into this list
print(genre_list)

## Extract biggest Genres

In [None]:
# Keep only the instances whose genre is one of the selected genres.
extracted_genres = ['Latin', 'Electronic', 'Country', 'RnB', 'Pop_Rock', 'Classical', 'Game']

# numeric labels (indices into genre_list) of the genres to keep
extracted_label_ids = [genre_list.index(genre) for genre in extracted_genres]
# True for every instance whose label is one of the kept labels
genre_mask = np.isin(loaded_label_array, extracted_label_ids)

extracted_data_array = loaded_data_array[genre_mask]
extracted_label_array = loaded_label_array[genre_mask]

In [None]:
# Histogram of the labels that remain after the genre extraction
# (labels are still indices into genre_list at this point).
plt.hist(x=extracted_label_array, bins=15)
plt.xlabel("genre label")
plt.ylabel("# of instances")

print(extracted_genres)

## Rename labels & save

In [None]:
# Map every label from its index in genre_list to its index in extracted_genres.
# label_lookup[old_label] is the new label, or -1 when that genre was not extracted.
label_lookup = np.array([
    extracted_genres.index(genre) if genre in extracted_genres else -1
    for genre in genre_list
])
relabelled = label_lookup[extracted_label_array]

# Validate before touching the array: a label without a counterpart in
# extracted_genres (e.g. because this cell was already run once) must not
# leave the labels half-converted.
if np.any(relabelled < 0):
    raise ValueError(
        "extracted_label_array contains labels of genres that are not in extracted_genres"
    )

# write in place so the array keeps its dtype
extracted_label_array[:] = relabelled

plt.hist(extracted_label_array, bins = 15)
plt.ylabel("# of instances")
plt.xlabel("genre label")

print(extracted_genres)

In [None]:
# save prepared data

## create unique file directory to save data
timestamp = datetime.datetime.now()
# str(timestamp) contains a space and colons, which are not valid in file names
# on every platform (e.g. Windows); format it with safe characters only.
# Microseconds are kept so that two runs do not end up in the same directory.
timestamp_tag = timestamp.strftime("%Y-%m-%d_%H-%M-%S-%f")
file_directory = f"./prepared_data/datacombi_{timestamp_tag}"
os.makedirs(file_directory)
os.makedirs(file_directory + "/audio_examples") # for later..

## save preparation parameters as json file
prep_pars_dict = {
    "n_pitches": n_pitches,
    "lowest_pitch": lowest_pitch,
    "beat_resolution": beat_resolution,
    "beats_per_instance": sample_size,
    "genres": extracted_genres,  # list position = numeric label in the saved arrays
}
with open(file_directory + "/preparation_params.json", "w") as file:
    json.dump(prep_pars_dict, file, indent = 6)

## save data as compressed npz files
np.savez_compressed(file_directory + "/prepared_arrays.npz", data=extracted_data_array, labels=extracted_label_array)

## Evaluate Prepared Data

In [None]:
# load data

# Read the arrays back from the directory written by the save cell.
loaded_file_directory = file_directory
# np.load returns an NpzFile that keeps the archive open until it is closed;
# the arrays are read into memory on access, so they stay usable after the
# `with` block has closed the file.
with np.load(loaded_file_directory + "/prepared_arrays.npz") as loaded_data:
    loaded_data_array, loaded_label_array = loaded_data["data"], loaded_data["labels"]

In [None]:
# convert random instances of loaded data to wave (audio) file & display them

n = 20 # number of random examples
# indices are drawn with replacement, so an instance can be picked more than once
rand_idxs = np.random.randint(0, len(loaded_label_array), n)

for i in tqdm(rand_idxs):
    X, y = loaded_data_array[i, :, :], loaded_label_array[i]

    genre_of_X = extracted_genres[y]

    # complete pitch range: zero-pad axis 1 with lowest_pitch entries in front
    # and the remainder behind (128 entries in total when X has n_pitches
    # columns - assumed, TODO confirm)
    X_padded = np.pad(X, ((0, 0), (lowest_pitch, 128 - lowest_pitch - n_pitches)))
    # convert to muspy.music_object; only on/off information is used
    X_music = muspy.from_pianoroll_representation(X_padded > 0,
                resolution = beat_resolution, encode_velocity = False)

    X_timestamp = datetime.datetime.now()
    # Build the path once and format the timestamp without spaces or colons,
    # which are not valid in file names on every platform (e.g. Windows).
    audio_path = file_directory + f"/audio_examples/{genre_of_X}_{X_timestamp:%Y-%m-%d_%H-%M-%S-%f}.wav"
    muspy.write_audio(path = audio_path, music = X_music)

    # display audio & show pianoroll
    print(genre_of_X + ":")
    display(Audio(filename = audio_path))
    muspy.visualization.show_pianoroll(X_music)
    plt.show()