/
create_dataset.py
386 lines (332 loc) · 17 KB
/
create_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
from mido import Message, MidiTrack, MidiFile
import time
from collections import defaultdict
from cqt import *
from encode_midi_segments import *
from decode_midi import *
from collections import namedtuple
import pickle
import random
random.seed(21)
import ntpath
# data types
AudioDecoded = namedtuple('AudioDecoded', ['time_series', 'sr'])
def find_audio_files(directory_str):
    """Return the audio file paths found directly inside <directory_str>audio.

    recurse=False keeps subfolders out of the search; case_sensitive=True
    ultimately keeps songs from being listed twice.
    """
    audio_dir = directory_str + "audio"
    return librosa.util.find_files(audio_dir, recurse=False, case_sensitive=True)
def load_audio(audio_file):
    """Decode `audio_file` at 88.2 kHz (4 * 22050) and return an AudioDecoded namedtuple."""
    series, rate = librosa.core.load(audio_file, sr=22050*4)
    return AudioDecoded(series, rate)
def audio_timestamps(audio_file, time_series_and_sr):
    """Compute the audio segment start times (in seconds) for one song.

    Returns (audio_start_times, audio_segment_length, midi_segment_length).
    `audio_file` is unused here; it is kept for interface compatibility.
    """
    time_series, sr = time_series_and_sr
    duration_song = librosa.core.get_duration(y=time_series, sr=sr)
    audio_segment_length = 1
    midi_segment_length = 0.5
    # The first segment starts at 0; every later segment starts half a MIDI
    # segment early, so each 1 s audio clip is centred on its 0.5 s MIDI clip.
    # For example, for a song of length 157 s the last needed MIDI clip is
    # 156.5-157, so the last audio start time should be 156.25 (here
    # duration_song minus midi_segment_length is 156.5).
    padding = midi_segment_length / 2
    later_starts = np.arange(
        padding, (duration_song - midi_segment_length), midi_segment_length)
    audio_start_times = np.concatenate((np.asarray([0]), later_starts), axis=0)
    return audio_start_times, audio_segment_length, midi_segment_length
def silent_audio(sr, padding):
    """Return `padding` seconds of silence at sample rate `sr` as an int16 array."""
    n_samples = int(sr * padding)
    # Scaling zeros by the int16 maximum is numerically a no-op, but it
    # mirrors the usual float->int16 PCM conversion and fixes the dtype.
    peak = np.iinfo(np.int16).max
    return (np.zeros(n_samples) * peak).astype(np.int16)
# padding_portion provides the option to set padding to zero, so that if a better temporally aligned dataset is used
# in the future, the code can segment the audio without padding
def load_segment_of_audio_and_save(audio_file, start, audio_segment_length, midi_segment_length, duration_song, sr,
                                   padding_portion=.5):
    """Load one audio segment beginning at `start` seconds.

    The first and last segments are zero-padded (at the front and back
    respectively) so that every returned segment has the same duration.
    Set padding_portion to 0 to disable padding entirely.
    """
    padding = midi_segment_length * padding_portion
    if start == 0:
        # First segment: there is no audio before time 0, so prepend silence.
        real_duration = (audio_segment_length - padding)
        tail, sr = librosa.core.load(audio_file, offset=0,
                                     duration=real_duration, sr=sr)
        audio_segment_time_series = np.concatenate((silent_audio(sr, padding), tail), axis=0)
    elif (duration_song - start) < audio_segment_length:
        # Final segment: append silence to cover the part past the song's end.
        real_duration = (duration_song - start)
        head, sr = librosa.core.load(audio_file, offset=start,
                                     duration=real_duration, sr=sr)
        audio_segment_time_series = np.concatenate(
            (head, silent_audio(sr, (audio_segment_length - real_duration))), axis=0)
    else:
        audio_segment_time_series, sr = librosa.core.load(audio_file, offset=start,
                                                          duration=audio_segment_length, sr=sr)
    filename_format = "C:/Users/Lilly/audio_and_midi/segments/audio/{0}_start_time_{1}.wav"
    filename = filename_format.format(ntpath.basename(audio_file)[:-4], str(start))
    # for testing by listening to audio (currently written for windows)
    # librosa.output.write_wav(filename, audio_segment_time_series, sr)
    return audio_segment_time_series
def load_midi(directory_str, audio_file):
    """Open the MIDI file in <directory_str>midi/ matching `audio_file`'s basename."""
    midi_path = directory_str + "midi/" + ntpath.basename(audio_file)[:-4] + ".mid"
    return MidiFile(midi_path)
def create_simplified_midi(midi_file):
    """Flatten `midi_file` into a list of [type, note, seconds] note events.

    Reads the tempo from track 0 and the note events from the last track,
    converts each event's delta-tick time to absolute seconds, and returns
    (simplified_midi, absolute_ticks_last_event, length_in_secs_full_song).

    Fixes vs. the original: a file with no set_tempo message no longer raises
    NameError (the MIDI default of 500000 microsecs/beat is assumed), and an
    empty track no longer raises ZeroDivisionError during the tick->seconds
    conversion.
    """
    simplified_midi = []
    ticks_since_start = 0
    # MIDI default tempo (120 BPM); overwritten by any set_tempo message below.
    tempo_in_microsecs_per_beat = 500000
    # debugging/for future use with a dataset other than Saarland Music Dataset:
    # check for note offs without note ons and check control changes in song
    print("midi file:", midi_file)
    count_ons = 0
    count_offs = 0
    control_changes = []
    for message in midi_file.tracks[0]:
        if message.type == "set_tempo":
            tempo_in_microsecs_per_beat = message.tempo
    for message in midi_file.tracks[-1]:
        # convert delta time (delta ticks) to TICKS since start
        ticks_since_start += message.time
        if message.type == "note_on" or message.type == "note_off":
            simplified_midi.append([message.type, message.note, ticks_since_start])
            # debugging/for future use with a dataset other than SMD
            if message.type == "note_on":
                count_ons += 1
            else:
                count_offs += 1
        # debugging/for future use with a dataset other than SMD:
        # check control changes in song; control 123 is All Notes Off
        if message.type == 'control_change':
            control_changes.append(message.control)
            if message.control == 123:
                print("CONTROL ALL NOTES OFF MESSAGE!")
    # debugging/for future use with a dataset other than SMD
    if count_ons != count_offs:
        print("INEQUAL NUM ONS AND OFFS (prior to simplified midi)")
        print("count ons:", count_ons)
        print("offs:", count_offs)
    # convert ticks since start to absolute seconds
    tempo_in_secs_per_beat = tempo_in_microsecs_per_beat / 1000000
    ticks_per_beat = midi_file.ticks_per_beat
    secs_per_tick = tempo_in_secs_per_beat / ticks_per_beat
    length_in_secs_full_song = ticks_since_start * secs_per_tick
    # ticks * secs_per_tick is mathematically identical to the old
    # ticks / ticks_since_start * length_in_secs_full_song, but cannot
    # divide by zero when the song contains no events.
    for message in simplified_midi:
        message[-1] = message[-1] * secs_per_tick
    # For comparison with mido's own length estimate:
    # print("midi_file.length:", midi_file.length)
    # print("length based on last note off:", length_in_secs_full_song)
    # debugging/for future use with a dataset other than SMD
    print(set(control_changes))
    return simplified_midi, ticks_since_start, length_in_secs_full_song
def chop_simplified_midi(midi_file, midi_segment_length, simplified_midi, absolute_ticks_last_note, midi_start_times):
    """Split `simplified_midi` into one message list per MIDI segment.

    First removes redundant note_on/note_off messages from `simplified_midi`
    IN PLACE, then buckets the remaining messages into consecutive half-open
    windows of `midi_segment_length` seconds. Returns (midi_segments,
    absolute_ticks_last_note), where each element of midi_segments is
    [segment_start_time, messages].

    Fix vs. the original: the window end time was only computed inside the
    per-message loop, so an empty `simplified_midi` raised NameError; it is
    now derived from the window start directly.
    """
    # debugging/for future use with a dataset other than SMD:
    # check that note ons and offs are equal in number
    count_note_on = sum(1 for m in simplified_midi if m[0] == 'note_on')
    count_note_off = sum(1 for m in simplified_midi if m[0] == 'note_off')
    if count_note_on != count_note_off:
        print("midi file:", midi_file)
        print("inequal num ons and offs")
    # erase a redundant note_on (pitch already sounding) or a redundant
    # note_off (pitch not sounding) of the same pitch
    pitches_on = []
    messages_to_erase = []
    for message in simplified_midi:
        message_type, pitch = message[0], message[1]
        if message_type == 'note_on':
            if pitch in pitches_on:
                messages_to_erase.append(message)
            else:
                pitches_on.append(pitch)
        elif message_type == 'note_off':
            if pitch in pitches_on:
                pitches_on.remove(pitch)
            else:
                print("redundant off message found")
                messages_to_erase.append(message)
    for message in messages_to_erase:
        simplified_midi.remove(message)
    # bucket messages into windows [window_start, window_start + length)
    midi_segments = []
    window_start = 0
    for midi_start_time in midi_start_times:
        window_end = window_start + midi_segment_length
        segment = [m for m in simplified_midi if window_start <= m[-1] < window_end]
        midi_segments.append([midi_start_time, segment])
        window_start = window_end
    return midi_segments, absolute_ticks_last_note
def add_note_onsets_to_beginning_when_needed(midi_segments, midi_segment_length):
    """Carry notes that cross a segment boundary into the following segment.

    A note held across a boundary (for example sounding from 0.25 s to
    1.25 s) would otherwise be invisible in the later segments: each such
    note gets a synthetic note_off at its segment's end and a synthetic
    note_on at the start of the next segment. Mutates and returns
    `midi_segments`.
    """
    carried_pitches = []
    for segment in midi_segments:
        start_time, messages = segment[0], segment[1]
        for pitch in carried_pitches:
            messages.insert(0, ["note_on", pitch, start_time])
        carried_pitches = []
        # tally a dict of dicts like {pitch: {"note_on": 4, "note_off": 3}}
        counts = defaultdict(lambda: defaultdict(int))
        for kind, pitch, _time in messages:
            counts[pitch][kind] += 1
        for pitch, kind_counts in counts.items():
            if kind_counts["note_on"] > kind_counts["note_off"]:
                carried_pitches.append(pitch)
                # the note_off lands exactly at the exclusive segment end
                # (i.e. at 0.5 rather than just under 0.5)
                messages.append(["note_off", pitch, start_time + midi_segment_length])
    return midi_segments
def find_lowest_and_highest_midi_note_numbers(simplified_midi):
    """Return (lowest, highest) MIDI note number seen in `simplified_midi`.

    With no messages the untouched extremes (127, 0) are returned, matching
    the original running-min/running-max behavior.
    """
    pitches = [message[1] for message in simplified_midi]
    return min(pitches, default=127), max(pitches, default=0)
def reconstruct_midi(midi_filename, midi_segments, absolute_ticks_last_note, length_in_secs_full_song):
    """Rebuild one mido MidiFile per segment from [type, pitch, time] messages.

    `midi_segments` is a list of [segment_start_time, messages] pairs; each
    message is [note_on_or_off, pitch, time]. Message times are offset by the
    segment start below, so they are presumably relative to the segment --
    TODO confirm against the decoder's output. The actual file save is
    commented out, so as written this only builds the mido objects and
    returns None (kept for listening tests).
    """
    # running absolute-tick position of the previous message, carried across
    # segments so every delta time is relative to the message before it
    time_so_far = 0
    for midi_segment in midi_segments:
        # time in seconds to absolute ticks
        absolute_ticks_midi_segment = []
        start_time = midi_segment[0]
        messages = midi_segment[1]
        for message in messages:
            note_on_or_off = message[0]
            pitch = message[1]
            scaled_time = message[-1]
            time = scaled_time + start_time
            # seconds -> ticks using the whole-song ratio (the inverse of the
            # conversion performed in create_simplified_midi)
            absolute_ticks = time * absolute_ticks_last_note / length_in_secs_full_song
            absolute_ticks_midi_segment.append([note_on_or_off, pitch, absolute_ticks])
        # time in absolute ticks to delta time (mido messages store delta ticks)
        delta_time_midi_segment = []
        for message in absolute_ticks_midi_segment:
            note_on_or_off = message[0]
            pitch = message[1]
            time = message[-1]
            # int() truncation here is the only rounding step; time_so_far
            # keeps the un-truncated float so the error does not accumulate
            delta_time = int(time - time_so_far)
            delta_time_midi_segment.append([note_on_or_off, pitch, delta_time])
            time_so_far = time
        # one MidiFile per segment
        mid = MidiFile()
        track = MidiTrack()
        mid.tracks.append(track)
        for message in delta_time_midi_segment:
            note_on_or_off = message[0]
            pitch = int(message[1])
            delta_ticks = message[-1]
            # debugging/for future use with a dataset other than SMD
            if type(delta_ticks) != int or delta_ticks < 0:
                print("time issue")
            track.append(Message(note_on_or_off, note=pitch, time=delta_ticks))
        # for testing by listening to midi (currently written for windows)
        # str_start_time = str(midi_segment[0])
        # filename_format = "C:/Users/Lilly/audio_and_midi/segments/midi/{0}_start_time_{1}.mid"
        # filename = filename_format.format(midi_filename, str_start_time)
        # mid.save(filename)
    return
def done_beep():
    """Play a short completion tone (G4, 392 Hz) on Windows or Linux."""
    import os
    if os.name == 'nt':
        # Windows: the built-in winsound module can beep directly
        import winsound
        duration = 1500  # millisecond
        freq = 392  # Hz
        winsound.Beep(freq, duration)
    if os.name == 'posix':
        # Linux: shell out to SoX's `play` utility
        duration = 1  # second
        freq = 392  # Hz
        os.system('play --no-show-progress --null --channels 1 synth %s sine %f' % (duration, freq))
def preprocess_audio_and_midi(directory_str):
    """Segment every audio/MIDI pair under `directory_str` and pickle the result.

    For each audio file in <directory_str>audio (with a same-named .mid in
    <directory_str>midi), chops the audio into 1 s clips and the MIDI into
    0.5 s clips, CQT-transforms each audio clip and encodes each MIDI clip,
    then pickles both lists into cqt_segments_midi_segments.pkl in the
    working directory (two sequential pickle.dump calls on one handle:
    cqt_segments first, then all_songs_encoded_midi_segments).
    """
    audio_files = find_audio_files(directory_str)
    cqt_segments = []
    all_songs_encoded_midi_segments = []
    # debugging counter: should end up equal to len(cqt_segments)
    midi_segments_count = 0
    for audio_file in audio_files:
        audio_decoded = load_audio(audio_file)
        time_series, sr = audio_decoded
        audio_start_times, audio_segment_length, midi_segment_length = audio_timestamps(
            audio_file, audio_decoded)
        midi_file = load_midi(directory_str, audio_file)
        # NOTE(review): positional args to get_duration -- newer librosa
        # versions require keywords (y=..., sr=...); confirm pinned version
        duration_song = librosa.core.get_duration(time_series, sr)
        # print time differences between the audio and MIDI renditions
        midi_len = midi_file.length
        if midi_len != duration_song:
            print(audio_file, "audio len - midi len:", duration_song-midi_len)
        simplified_midi, absolute_ticks_last_note, length_in_secs_full_song = create_simplified_midi(
            midi_file)
        lowest_midi_note, highest_midi_note = find_lowest_and_highest_midi_note_numbers(simplified_midi)
        midi_start_times = np.arange(0, length_in_secs_full_song, midi_segment_length)
        # audio and MIDI disagree slightly on song length; truncate the longer
        # start-time list so the two segment streams stay index-aligned
        if len(audio_start_times) != len(midi_start_times):
            if len(audio_start_times) > len(midi_start_times):
                num_start_times = len(midi_start_times)
                audio_start_times_shortened = audio_start_times[:num_start_times]
                audio_start_times = audio_start_times_shortened
            else:
                num_start_times = len(audio_start_times)
                midi_start_times_shortened = midi_start_times[:num_start_times]
                midi_start_times = midi_start_times_shortened
        for start_time in audio_start_times:
            padding_portion = .5  # padding on each side of audio is midi_segment_length * padding portion
            audio_segment_time_series = load_segment_of_audio_and_save(audio_file, start_time,
                                                                       audio_segment_length,
                                                                       midi_segment_length,
                                                                       duration_song, sr,
                                                                       padding_portion)
            cqt_of_segment = audio_segments_cqt(audio_segment_time_series, sr)
            cqt_segments.append(cqt_of_segment)
        midi_segments, absolute_ticks_last_note = chop_simplified_midi(midi_file, midi_segment_length, simplified_midi, absolute_ticks_last_note, midi_start_times)
        midi_start_times_and_segments_incl_onsets = \
            add_note_onsets_to_beginning_when_needed(midi_segments, midi_segment_length)
        for midi_segment in midi_start_times_and_segments_incl_onsets:
            midi_start_time = midi_segment[0]
            messages = midi_segment[1]
            encoded_segment, num_discrete_time_values, num_notes = encode_midi_segment(midi_start_time, messages, midi_segment_length, lowest_midi_note, highest_midi_note)
            all_songs_encoded_midi_segments.append(encoded_segment)
            # debugging equal num cqt and midi segment
            midi_segments_count += 1
        # # testing
        # # encoded_segment = all_songs_encoded_midi_segments[1]
        # decoded_segments = []
        # for encoded_segment in all_songs_encoded_midi_segments:
        #     decoded_midi = decode_midi_segment(encoded_segment, midi_segment_length, num_discrete_time_values, lowest_midi_note)
        #     decoded_segments.append(decoded_midi)
        # decoded_midi_start_times_and_segments = []
        # for i in range(len(midi_start_times)):
        #     decoded_midi_start_times_and_segments.append([midi_start_times[i], decoded_segments[i]])
        # code for midi reconstruction (for listening purposes)
        # midi_filename = midi_file.filename[35:-4]
        # reconstruct_midi(midi_filename, decoded_midi_start_times_and_segments, absolute_ticks_last_note, length_in_secs_full_song)
    # pickle time
    with open('cqt_segments_midi_segments.pkl', 'wb') as handle:
        pickle.dump(cqt_segments, handle)
        pickle.dump(all_songs_encoded_midi_segments, handle)
    return
def main():
    """Entry point: preprocess all audio/MIDI pairs under `directory_str`.

    You must set directory_str to be the filepath to your directory
    containing 2 directories: one named "audio" (containing the audio files)
    and one named "midi" (containing the MIDI files). For example,
    "C:/Users/Lilly/audio_and_midi/" or "/home/lilly/Downloads/audio_midi/".
    """
    directory_str = "/home/lilly/Downloads/audio_midi/"
    preprocess_audio_and_midi(directory_str)
if __name__ == '__main__':
    # time the full preprocessing run and report elapsed wall-clock seconds
    start_time = time.time()
    main()
    print("--- %s seconds ---" % (time.time() - start_time))