# Synthesize paired '.wav' audio files

In [32]:
# Sample rate (Hz) used for the written .wav files and for feature extraction.
DEFAULT_SAMPLING_RATE = 44100
# Sample rate (Hz) at which the note samples are loaded and sequences rendered.
NSYNTH_SAMPLE_RATE = 16000
# Velocity levels that MIDI velocities are quantized to before sample lookup.
NSYNTH_VELOCITIES = [25, 50, 100, 127]
# Constant-Q transform settings read by forward_cqt.
hop_length = 512
bins_per_octave = 32
# Amplitude floor used when normalizing the dB-scaled magnitudes.
amin = 2.0 ** -16

In [3]:
# Paths and rendering options for the synthesis step.
args = {
    'nsynth_path': './/nsynth-train//audio',
    'midi_path': './/archive',
    'audios_path': './/data//audios',
    'playback_speed': 1,
    'duration_rate': 4,
    'transpose': 0,
}

# Instruments to render; each entry selects one family of note samples.
instruments = [
    {'name': 'keyboard', 'source_type': 'acoustic', 'preset': 0},
    {'name': 'string', 'source_type': 'acoustic', 'preset': 0},
]

# Collect the MIDI files stored one level below midi_path
# (midi_path/<music type>/<file>). Stray files next to the type folders and
# non-MIDI files inside them are skipped so later steps do not crash on them;
# sorting makes the processing order reproducible.
midifiles = []
music_types = sorted(
    entry for entry in os.listdir(args['midi_path'])
    if os.path.isdir(os.path.join(args['midi_path'], entry))
)
for music_type in music_types:
    type_dir = os.path.join(args['midi_path'], music_type)
    for music in sorted(os.listdir(type_dir)):
        if music.lower().endswith(('.mid', '.midi')):
            midifiles.append(os.path.join(type_dir, music))

print()
print("Instruments: \t", len(instruments), [instrument['name'] for instrument in instruments])
print("MIDI files: \t", len(midifiles))


Instruments: 	 2 ['keyboard', 'string']
MIDI files: 	 2


In [4]:
class NoteSynthesizer():
    """Render MIDI files to audio by summing pre-recorded single-note samples.

    Samples are read from ``dataset_path`` using file names of the form
    ``<instrument>_<source_type>_<preset>-<pitch>-<velocity>.wav`` where every
    numeric field is zero-padded to three digits.
    """

    def __init__(self, dataset_path, sr=44100, transpose=0, leg_stac=.9, velocities=None, preset=0, preload=True):
        """Store the synthesis settings.

        Args:
            dataset_path: Directory containing the note sample files.
            sr: Sample rate used when loading samples and for the rendered audio.
            transpose: Stored on the instance; not used by any method here.
            leg_stac: Fraction of a note's duration played at full level before
                the exponential decay starts.
            velocities: Allowed velocity values that MIDI velocities are
                quantized to. ``None`` means every value from 0 to 127.
            preset: Preset number used when a method is not given one.
            preload: If true, samples are taken from the cache filled by
                ``preload_notes``; otherwise each note is loaded from disk on demand.
        """
        self.dataset_path = dataset_path
        self.sr = sr
        self.transpose = transpose
        self.leg_stac = leg_stac
        self.velocities = np.arange(0, 128) if velocities is None else velocities
        self.preset = preset

        self.preload = preload
        # Sample cache (file name -> audio array or None); filled by preload_notes.
        self.notes = {}

    def _quantize(self, value, quantized_values):
        """Return the element of `quantized_values` closest to `value`."""
        diff = np.abs(np.asarray(quantized_values) - value)
        return quantized_values[int(diff.argmin())]

    def _get_note_name(self, note, velocity, instrument, source_type, preset=None):
        """Build the sample file name for a pitch/velocity pair.

        Falls back to ``self.preset`` when `preset` is None.
        """
        if preset is None:
            preset = self.preset
        return '{}_{}_{}-{}-{}.wav'.format(
            instrument,
            source_type,
            str(preset).zfill(3),
            str(note).zfill(3),
            str(velocity).zfill(3),
        )

    def preload_notes(self, instrument, source_type, preset=None):
        """Load every sample of one instrument into the cache.

        Pitches 22..107 are tried for each configured velocity. Samples that
        cannot be loaded are stored as None so later lookups report them as
        missing. Any previously cached samples are discarded.
        """
        preset = preset if(preset is not None) else self.preset
        print("Preloading notes for " + instrument + "_" + source_type + "_" + str(preset).zfill(3))
        self.notes = {}
        for n in range(22, 108):
            for v in self.velocities:
                note_name = self._get_note_name(n, v, instrument, source_type, preset)
                try:
                    audio, _ = librosa.load(os.path.join(self.dataset_path, note_name), sr=self.sr)
                except Exception:
                    # Not every pitch/velocity combination has a sample file.
                    # Catching Exception (not a bare except) keeps Ctrl-C working.
                    audio = None
                self.notes[note_name] = audio
        print("Notes loaded")

    def _load_note(self, note_filename):
        """Return the sample array for `note_filename`, or None if unavailable."""
        if self.preload:
            return self.notes.get(note_filename)
        try:
            # Resolve against the dataset directory and load at the synthesizer's
            # sample rate so on-demand loading matches preloaded samples.
            audio, _ = librosa.load(os.path.join(self.dataset_path, note_filename), sr=self.sr)
        except Exception:
            return None
        return audio

    def _render_note(self, note_filename, duration, velocity):
        """Return up to `duration` samples of a note with a decaying tail.

        The first ``leg_stac * duration`` samples are left untouched; the rest
        is multiplied by ``exp(-k / 3000)`` with ``k`` counted from the start of
        the decay. A missing sample yields `duration` zeros. The result can be
        shorter than `duration` when the sample itself is shorter. `velocity`
        is accepted for interface compatibility and is not used.
        """
        duration = max(int(duration), 0)
        source = self._load_note(note_filename)
        if source is None:
            print('Note not fonund', note_filename)
            return np.zeros(duration)

        # Work on a copy: the cached sample is reused for every occurrence of
        # this note, so applying the envelope in place would attenuate it a
        # little more on each use.
        note = np.array(source[:duration])
        decay_ind = int(self.leg_stac * duration)
        tail = len(note) - decay_ind
        if tail > 0:
            envelope = np.exp(-np.arange(tail) / 3000.)
            note[decay_ind:] = note[decay_ind:] * envelope
        return note

    def render_sequence(self, sequence, instrument='guitar', source_type='acoustic', preset=0, playback_speed=1, duration_scale=1, transpose=0):
        """Render a MIDI file to a peak-normalized mono signal.

        Args:
            sequence: Path of the MIDI file, passed to ``pretty_midi.PrettyMIDI``.
            instrument, source_type, preset: Select the sample family to use.
            playback_speed: Divides the output length (2 renders in half the time).
            duration_scale: Multiplies each note's length; notes keep their
                start position and may overlap following notes.
            transpose: Accepted for interface compatibility; not used.

        Returns:
            ``(data, sr)``: the rendered samples and the synthesizer's sample rate.
            The data is all zeros (possibly empty) when nothing could be rendered.
        """
        midi_data = pretty_midi.PrettyMIDI(sequence)
        end_time = midi_data.get_end_time()
        total_length = int(end_time * self.sr / playback_speed)
        data = np.zeros(total_length)
        if end_time <= 0 or total_length <= 0:
            return data, self.sr

        # Note times are kept relative to the file length so that they can be
        # mapped onto the (speed-adjusted) output buffer.
        seq = []
        for inst in midi_data.instruments:
            for note in inst.notes:
                if note.start < end_time:
                    velocity = self._quantize(note.velocity, self.velocities)
                    seq.append((note.pitch, velocity, note.start / end_time, note.end / end_time))

        for pitch, velocity, note_start, note_end in seq:
            start_sample = int(note_start * total_length)
            end_sample = int(note_end * total_length)
            duration = int((end_sample - start_sample) * duration_scale)
            if duration <= 0:
                continue

            note_filename = self._get_note_name(pitch, velocity, instrument, source_type, preset)
            note = self._render_note(note_filename, duration, velocity)

            # Truncate notes that run past the end of the buffer instead of
            # dropping them entirely.
            usable = min(len(note), total_length - start_sample)
            if usable > 0:
                data[start_sample:start_sample + usable] += note[:usable]

        # Peak-normalize; a silent render is left as zeros rather than divided by 0.
        peak = np.max(np.abs(data))
        if peak > 0:
            data /= peak
        return data, self.sr

In [5]:
# Render every MIDI file once per instrument and store the result as a 16-bit
# WAV file under audios_path/<name>_<source_type>/<midi basename>.wav.
for instrument in instruments:
    synth = NoteSynthesizer(
        dataset_path=args['nsynth_path'],
        sr=NSYNTH_SAMPLE_RATE,
        velocities=NSYNTH_VELOCITIES,
        transpose=float(args['transpose']),
        preset=instrument['preset'],
    )
    # Preload the same preset that render_sequence is asked for below;
    # otherwise every lookup misses the cache for non-default presets.
    synth.preload_notes(
        instrument=instrument['name'],
        source_type=instrument['source_type'],
        preset=instrument['preset'],
    )

    instrument_folder = instrument['name']+'_'+instrument['source_type']
    os.makedirs(os.path.join(args['audios_path'], instrument_folder), exist_ok=True)

    for mid in midifiles:
        _, seq_name = os.path.split(mid)
        output_name = os.path.join(args['audios_path'], instrument_folder, os.path.splitext(seq_name)[0]+'.wav')

        print("Instrument: \t", instrument_folder)
        print("Sequence: \t", mid)
        print("Output: \t", output_name, '\n')

        audio, _ = synth.render_sequence(
            sequence=str(mid),
            instrument=instrument['name'],
            source_type=instrument['source_type'],
            preset=instrument['preset'],
            playback_speed=float(args['playback_speed']),
            duration_scale=float(args['duration_rate']),
        )

        if DEFAULT_SAMPLING_RATE != NSYNTH_SAMPLE_RATE:
            # Keyword arguments work on both old and new librosa releases;
            # newer ones no longer accept the sample rates positionally.
            audio = librosa.resample(audio, orig_sr=NSYNTH_SAMPLE_RATE, target_sr=DEFAULT_SAMPLING_RATE)

        # Scale to 16-bit PCM. NaNs (from a silent render) become zeros and
        # resampling overshoot is clipped so the integer cast cannot wrap.
        pcm = np.clip(32000. * np.nan_to_num(audio), -32768, 32767).astype(np.short)
        write_wav(output_name, DEFAULT_SAMPLING_RATE, pcm)

Preloading notes for keyboard_acoustic_000
Notes loaded
Instrument: 	 keyboard_acoustic
Sequence: 	 D://4550//TEST//archive//albeniz//alb_esp1.mid
Output: 	 D://4550//TEST//output\keyboard_acoustic\alb_esp1.wav 

Instrument: 	 keyboard_acoustic
Sequence: 	 D://4550//TEST//archive//albeniz//alb_esp2.mid
Output: 	 D://4550//TEST//output\keyboard_acoustic\alb_esp2.wav 

Preloading notes for string_acoustic_000




Notes loaded
Instrument: 	 string_acoustic
Sequence: 	 D://4550//TEST//archive//albeniz//alb_esp1.mid
Output: 	 D://4550//TEST//output\string_acoustic\alb_esp1.wav 

Note not fonund string_acoustic_000-097-050.wav
Note not fonund string_acoustic_000-105-025.wav
Note not fonund string_acoustic_000-100-025.wav
Instrument: 	 string_acoustic
Sequence: 	 D://4550//TEST//archive//albeniz//alb_esp2.mid
Output: 	 D://4550//TEST//output\string_acoustic\alb_esp2.wav 



# Generate CQT or STFT features

In [19]:
def forward_transform(audio, nfft=1024, normalize=True, crop_hf=True):
    """Compute the STFT of `audio` and split it into magnitude and phase.

    A Hann window of `nfft` samples is used with a hop of half the window.
    With `crop_hf` the magnitude is passed through ``remove_hf``; with
    `normalize` it is scaled by ``2 / sum(window)``. The phase is returned
    exactly as produced by ``librosa.magphase``.
    """
    hann = np.hanning(nfft)
    spectrum = librosa.stft(
        audio,
        n_fft=nfft,
        hop_length=int(nfft / 2),
        window=hann,
    )
    magnitude, phase = librosa.magphase(spectrum)

    if crop_hf:
        magnitude = remove_hf(magnitude)
    if normalize:
        magnitude = 2 * magnitude / hann.sum()

    return magnitude, phase

def forward_cqt(audio,sr):
    """Compute the constant-Q transform of `audio` as (magnitude, phase).

    Uses the module-level `hop_length` and `bins_per_octave` settings and
    requests eight octaves' worth of bins.
    """
    n_octaves = 8
    transform = librosa.cqt(
        audio,
        sr=sr,
        hop_length=hop_length,
        n_bins=n_octaves * bins_per_octave,
        bins_per_octave=bins_per_octave,
    )
    magnitude, phase = librosa.magphase(transform)
    return magnitude, phase

def slice_magnitude(mag, slice_size):
    """Append a trailing axis to `mag` and cut it into `slice_size`-wide pieces.

    The slicing itself is delegated to ``slice_first_dim``.
    """
    with_channel = np.expand_dims(mag, axis=2)
    return slice_first_dim(with_channel, slice_size)

def remove_hf(mag):
    """Keep only the first half of the rows of `mag` (rounded down)."""
    half = mag.shape[0] // 2
    return mag[:half, :]

def slice_first_dim(array, slice_size):
    """Cut a 3-D array into consecutive pieces of `slice_size` along axis 1.

    The pieces are stacked on a new leading axis, giving an array of shape
    ``(n_pieces, array.shape[0], slice_size, array.shape[2])``. A trailing
    remainder shorter than `slice_size` is zero-padded and appended as the
    last piece. If the input is shorter than one slice, the result is that
    single padded piece.
    """
    n_frames = array.shape[1]
    n_full = int(np.floor(n_frames / slice_size))
    covered = n_full * slice_size
    leftover = n_frames - covered

    # Zero-padded container for whatever does not fill a whole slice.
    tail = np.zeros(shape=(1, array.shape[0], slice_size, array.shape[2]))
    tail[:, :, :leftover, :] = array[:, n_full * int(slice_size):, :]

    if n_full <= 0:
        return tail

    pieces = [
        array[np.newaxis, :, k * slice_size:(k + 1) * slice_size, :]
        for k in range(n_full)
    ]
    sliced = np.concatenate(pieces, axis=0)
    if covered < n_frames:
        sliced = np.concatenate([sliced, tail], axis=0)
    return sliced

In [34]:
# Paths and options for the feature-extraction step.
args = {
    'audios_path': './/data//audios',
    'stft_features_path': './/data//stft_features',
    'cqt_features_path': './/data//cqt_features',
    'stft': False,
}

# STFT and CQT features are written to separate directory trees.
features_root = args['stft_features_path'] if args['stft'] else args['cqt_features_path']

for instrument in sorted(os.listdir(args['audios_path'])):
    print(instrument)
    audios_dir = os.path.join(args['audios_path'], instrument)
    features_dir = os.path.join(features_root, instrument)
    # np.save does not create missing directories.
    os.makedirs(features_dir, exist_ok=True)

    for file in sorted(os.listdir(audios_dir)):
        # splitext copes with file names that contain extra dots (or none).
        name = os.path.splitext(file)[0]
        audio, sr = librosa.load(os.path.join(audios_dir, file), sr=DEFAULT_SAMPLING_RATE)

        if args['stft']:
            mag, _ = forward_transform(audio)
        else:
            mag, _ = forward_cqt(audio, sr)

        # Convert to dB relative to the smallest magnitude, then scale down.
        # NOTE(review): the divisor uses a natural-log expression while
        # amplitude_to_db is log10-based; confirm the intended value range.
        mag = librosa.amplitude_to_db(mag, ref=np.min, amin=amin)
        mag /= 20 * np.log1p(1 / amin)

        # Square slices: as many frames per slice as there are frequency bins.
        mag_sliced = slice_magnitude(mag, mag.shape[0])
        print(name, mag_sliced.shape)

        for i in range(mag_sliced.shape[0]):
            out_name = os.path.join(features_dir, name + '_' + str(i).zfill(3) + '.npy')
            # Existing feature files are kept, so an interrupted run can resume.
            if not os.path.isfile(out_name):
                np.save(out_name, mag_sliced[i, :, :, :])

keyboard_acoustic
alb_esp1 (33, 256, 256, 1)
alb_esp2 (43, 256, 256, 1)


KeyboardInterrupt: 