In [2]:
# import the required libraries
# music21 is the library for editing music files, incl. MIDI.
from music21 import * 
import os
import pandas as pd
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
# Loading midi files whith metadata
Songs = []
directory = 'F:\MusicNet\КИШ\табы тест без ритма'
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if os.path.isfile(f) and filename.endswith('.mid'):
        f = converter.parse(f)
        f.insert(0, metadata.Metadata())
        f.metadata.title = filename
        Songs.append(f)
print('Total songs loaded', len(Songs))

Total songs loaded 18


In [4]:
# transpose function 
# we need to transpose all songs to one key: C-major or parralel minor (A-minor)
def transpose(song):
  
    key = song.analyze('key')
    if key.mode == 'major':
        i = interval.Interval(key.tonic, pitch.Pitch('C'))
    elif key.mode == 'minor':
        i = interval.Interval(key.tonic, pitch.Pitch('A'))
    transposed_song = song.transpose(i)
    # print(key, transposed_song.analyze('key'))
    return transposed_song

In [5]:
# list of transposed songs
transposed_songs = []
for song in Songs:
    song = transpose(song)
    transposed_songs.append(song)
print('total song transposed', len(transposed_songs))

total song transposed 18


In [6]:
def collect_events(song):
    # create an empty list which will contain information about notes and rests
    notes_in_song = []
    
    # Iterate on each part in song
    for part in song.parts:
        # iterate over each event in the part
        # create an empty list to collect information about notes and rests in each part
        notes_in_part = []
        
        for event in part.flat:
            # if the event is a note - the label is equal to the note number in midi, if it's a rest, then the label is a rest
            # we will be based on the assumption that the Measure is divided into 16 bits. To do this, we will divide each event by 0.25(4/16)
            if isinstance(event, note.Note):
                cells = int(event.duration.quarterLength / 0.25)
                for cell in range(cells):
                    if cell == 0:
                        notes_in_part.append(str(event.pitch.midi))
                    else:
                        notes_in_part.append('_')
       
            elif isinstance(event, note.Rest):
                cells = int(event.duration.quarterLength / 0.25)
                for cell in range(cells):
                    if cell == 0:
                        notes_in_part.append('rest')
                    else:
                        notes_in_part.append('_')
            elif isinstance(event, chord.Chord):
                cells = int(event.duration.quarterLength / 0.25)
                for cell in range(cells):
                    if cell == 0:
                        notes_in_part.append(str(event.pitchedCommonName))
                    else:
                        notes_in_part.append('_')                
                
        # separator for splitting songs
        notes_in_part.append('SEP')

        notes_in_song.append(notes_in_part)
        # print(len(notes_in_part), len(notes_in_song))
        
    return notes_in_song       

In [7]:
# list of collected events
events = []
for song in transposed_songs:
    collected = collect_events(song)
    events.append(collected)
    # print(song.metadata.title)
events = pd.DataFrame(events)

  collected = collect_events(song)


In [8]:
# make a dictionary of unique events in our list (in all loaded songs)
set_of_unique_keys = {x for l in events.sum() for x in l}
vocabulary = dict(zip(set_of_unique_keys, range(len(set_of_unique_keys))))

In [9]:
# encode final dataset with vocabulary
final_events = []
for i in range(len(events.sum())): 
    temporary = []
    for j in events.sum()[i]:
        temporary.append(vocabulary[j])
    final_events.append(temporary)
final_events = np.array(final_events)

In [10]:
np.save('F:\\MusicNet\\КИШ\\vocabulary\\vocabulary.npy', vocabulary)

In [11]:
# create training dataset
def create_training_set(lenght, part):
    x_temp = []
    y_temp = []
    for i in range(len(part) - lenght):
        if part[i+lenght] != vocabulary['SEP']:
            x_temp.append(part[i:i+lenght])
            y_temp.append(part[i+lenght])            
        else:
            i += (lenght + 1)       
    return x_temp, y_temp # np.array(x_temp), np.array(y_temp)  

In [12]:
# create trainig sequences, with a given lenght - seq_len
seq_len = 128
x_train = []
y_train = []
for i in range(len(final_events)):    
    x_temp, y_temp = create_training_set(seq_len, final_events[i])

    x_train.append(x_temp)
    y_train.append(y_temp)
    
x_train, y_train = np.array(x_train), np.array(y_train)

In [13]:
print(x_train.shape, y_train.shape)

(2, 19304, 128) (2, 19304)


In [14]:
X = to_categorical(x_train, num_classes=len(vocabulary))
y = to_categorical(y_train, num_classes=len(vocabulary))

In [15]:
# create train sequences and targets for each input of our network
solo_train, bass_train = X[0], X[1]

In [16]:
solo_target, bass_target = y[0], y[1]

In [17]:
solo_inputs = keras.Input(shape=(seq_len, len(vocabulary)), name='solo')
bass_inputs = keras.Input(shape=(seq_len, len(vocabulary)), name='bass')
# adding LSTM layer
solo_lstm = keras.layers.LSTM(seq_len)(solo_inputs)
bass_lstm = keras.layers.LSTM(seq_len)(bass_inputs)
# concatenate inputs 
x = keras.layers.concatenate([solo_lstm, bass_lstm])
x = keras.layers.Dropout(0.2)(x)
# 2 output layers
solo_pred = keras.layers.Dense(len(vocabulary), activation='softmax', name='solo_pred')(x)
bass_pred = keras.layers.Dense(len(vocabulary), activation='softmax', name='bass_pred')(x)
model = keras.Model(inputs=[solo_inputs, bass_inputs], outputs=[solo_pred, bass_pred])

In [18]:
keras.utils.plot_model(model, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [19]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01), loss=keras.losses.categorical_crossentropy)

In [None]:
history = model.fit({'solo': solo_train, 'bass': bass_train},
          {'solo_pred': solo_target, 'bass_pred': bass_target},
          epochs=10, batch_size=32)

In [None]:
plt.figure(figsize=(12,6))
plt.plot(history.history['loss'])

In [20]:
# Loading test midi files whith metadata
Songs = []
directory = 'F:\MusicNet\КИШ\TEST'
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if os.path.isfile(f) and filename.endswith('.mid'):
        f = converter.parse(f)
        f.insert(0, metadata.Metadata())
        f.metadata.title = filename
        Songs.append(f)
        print(f.metadata.title, 'loaded')
print('Total songs loaded', len(Songs))

тест таб 124.mid loaded
Total songs loaded 1


In [23]:
# collecting events in test song
test_events = []
for song in Songs:
    collected = collect_events(song)
    test_events.append(collected)
test_events = pd.DataFrame(events) 

  collected = collect_events(song)


In [24]:
# encode test dataset with vocabulary
test_final_events = []
for i in range(len(test_events.sum())): 
    temporary = []
    for j in test_events.sum()[i]:
        temporary.append(vocabulary[j])
    test_final_events.append(temporary)
test_final_events = np.array(final_events)

In [25]:
# test to categorical and expand dimensions to network requirements
test = to_categorical(test_final_events, len(vocabulary))
test = np.expand_dims(test, axis=1)
test[1].shape

(1, 19450, 83)

In [None]:
# model predictions
solo_first, bass_first = test[0], test[1]
solo, bass = [], []
for i in range(64):
    solo_next, bass_next = model.predict({'solo': solo_first,'bass': bass_first})
    
    for k in solo_next, bass_next:
        k[np.where(k == np.random.choice(k[0], p=k[0]))] = 1
        k[np.where(k != np.float32(1))] = 0

    for j in solo_first, bass_first:
        j = np.delete(j, 0, 1)
        j = np.squeeze(j)
        for k in solo_next, bass_next:
            j = np.concatenate((j, k))
        j = np.expand_dims(j, axis=1)
        
    solo.append(solo_next), bass.append(bass_next)    
solo, bass = np.array(solo), np.array(bass)     

In [None]:
# decode list with vocabulary 
total = []
for i in solo, bass:
    index_list = np.where(i == 1)[2]
    temporary = []
    for j in index_list:
        temporary.append(list(k for k, v in vocabulary.items() if v == j))
    total.append(temporary)

In [None]:
# convert song to midi function
def convert_to_midi(list_of_events, step_duration):
    
    part = stream.Stream()
    first_symbol = None
    counter = 1

    for i, symbol in enumerate(list_of_events):

                # handle case in which we have a note/rest
                if symbol != "_" or i + 1 == len(list_of_events):

                    # ensure we're dealing with note/rest beyond the first one
                    if first_symbol is not None:

                        quarter_length_duration = step_duration * counter # 0.25 * 4 = 1

                        # handle rest
                        if first_symbol == "rest":
                            m21_event = note.Rest(quarterLength=quarter_length_duration)

                        # handle note
                        else:
                            m21_event = note.Note(int(first_symbol), quarterLength=quarter_length_duration)

                        part.append(m21_event)

                        # reset the step counter
                        counter = 1

                    first_symbol = symbol

                # handle case in which we have a prolongation sign "_"
                else:
                    counter += 1

    return part

In [None]:
S, B = np.squeeze(total[0]), np.squeeze(total[1])

In [None]:
# convert song with function
song = stream.Score()
for i in S, B:
    song.insert(convert_to_midi(i, 0.25))

In [None]:
# save midi
song.write('midi', 'F:\\MusicNet\\КИШ\\Generated\\generated128.mid')