In [1]:
# Harmony Team
# Nov. 10 - 2022

In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle 
import json
import os
import itertools
from pathlib import Path
from IPython.display import Image, Audio
from music21 import note , chord , stream , instrument , converter   
import mido
# from midi2audio import FluidSynth          # to convert midi to wav file

In [33]:
# Network output are the classes, so encode into one hot vector
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

## Data preparation and visualization 

In [2]:
mid = mido.MidiFile('dataset_2/midi_songs/0fithos.mid')

In [3]:
mid.length # in seconds 

258.2724710000016

In [4]:
# Reading MidiFile
# parse the encoded data in a file object to midi stream
midi = converter.parse('dataset_2/midi_songs/0fithos.mid')
type(midi)

music21.stream.base.Score

In [5]:
#midi.show('text')

In [6]:
# Flat all the elements - notes/chords
notes_to_parse = midi.flat.notes
print(len(notes_to_parse))

1169


In [7]:
for element in notes_to_parse[:15]:
    print(element , element.offset)   # Offset refers to where the note is located in the piece

<music21.chord.Chord E3 A3> 4.0
<music21.note.Note E> 4.0
<music21.chord.Chord A1 E2> 4.0
<music21.chord.Chord E3 A3> 5.0
<music21.chord.Chord A1 E2> 5.0
<music21.chord.Chord A1 E2> 5.5
<music21.chord.Chord A1 E2> 6.0
<music21.chord.Chord A1 E2> 7.0
<music21.chord.Chord A1 E2> 7.5
<music21.chord.Chord E3 B3> 8.0
<music21.chord.Chord A1 E2> 8.0
<music21.chord.Chord E3 B3> 9.0
<music21.chord.Chord A1 E2> 9.0
<music21.chord.Chord A1 E2> 9.5
<music21.chord.Chord A1 E2> 10.0


In [8]:
str(notes_to_parse[1].pitch)

'E2'

In [9]:
notes_to_parse[50].normalOrder

[4, 9]

In [10]:
# Pitch describes how high or low a note sounds (its frequency).
# It is written as a letter name A-G, optionally with an accidental, followed by an
# octave number (e.g. 'E2', 'G#5'); the letter alone does not say which note is higher.
notes_to_parse[1].pitch , str(notes_to_parse[1].pitch)

(<music21.pitch.Pitch E2>, 'E2')

In [11]:
# Encode every element of the demo song as a string token.
notes_demo = []

for item in notes_to_parse:

    if isinstance(item, chord.Chord):
        # A chord becomes its normalOrder values joined with '+', e.g. '4+9'
        chord_token = '+'.join(map(str, item.normalOrder))
        notes_demo.append(chord_token)

    elif isinstance(item, note.Note):
        # A single note becomes its pitch name, e.g. 'E2'
        notes_demo.append(str(item.pitch))

In [25]:
len(notes_demo)

1171

In [26]:
print(notes_demo[32:50])

['4+9', '4+9', '4+9', '4+9', 'E2', '4+9', '4+9', '4+9', '4+9', '4+9', 'E5', 'F5', 'G#5', 'A5', '4+9', '4+9', '5+11', '4+9']


In [30]:
# Listing midi files ordered by name.
# os.listdir() returns entries in arbitrary order, so sort explicitly to make the
# corpus order (and therefore the extracted note sequence) reproducible.
# Keeping only *.mid / *.midi entries also drops '.ipynb_checkpoints' and any other
# stray file that the MIDI parser could not read.
root_midi = "dataset_2/midi_songs/"
midi_file_dir = sorted(
    entry for entry in os.listdir(root_midi)
    if entry.lower().endswith((".mid", ".midi"))
)

In [32]:
len(midi_file_dir)

92

In [34]:
# Get all the notes and chords from the midi files in the ./midi_songs directory
# and encode them as string tokens: a note by its pitch name, a chord by its
# normalOrder values joined with '+'.
notes = []
total_midi = len(midi_file_dir)

# Count files from 1 so the progress counter reaches 100 % once the last file is
# parsed (a 0-based index stops one file short, e.g. at 98.91 % for 92 files).
for idx, file in enumerate(midi_file_dir, start = 1):
    # os.path.join builds a valid path whether or not root_midi ends with a separator
    midi = converter.parse(os.path.join(root_midi, file))

    elements_to_parse = midi.flat.notes

    for element in elements_to_parse:

        # if the element is a Note, then store its pitch
        if isinstance(element , note.Note):
            notes.append(str(element.pitch))

        # if the element is a Chord, then split each of the notes and join with +
        elif isinstance(element , chord.Chord):
            notes.append("+".join(str(n) for n in element.normalOrder))

    # '\r' rewrites the same output line instead of printing one line per file
    print('\r', 'Parcing: ', np.round((idx/total_midi)*100,2), '% complete', end='')

 Parcing:  98.91 % complete

In [35]:
len(notes)

60866

In [36]:
# Serialise the extracted token list to ./dataset_2/notes with pickle so it can be
# read back from disk (see the load cell that follows).
with open("./dataset_2/notes" , "wb") as file:
    pickle.dump(notes , file)

In [12]:
# Read the pickled token list back from ./dataset_2/notes into `notes`.
# NOTE(review): pickle.load can execute arbitrary code contained in the file, so only
# load a notes file that was produced by this notebook or another trusted source.
with open("./dataset_2/notes" , "rb") as file:
    notes = pickle.load(file)

In [13]:
print("Total notes: " , len(notes))
print("Unique notes: " , len(set(notes)))

Total notes:  60866
Unique notes:  359


In [14]:
n_vocab = len(set(notes))

In [15]:
n_vocab

359

## PREPARE SEQUENTIAL DATA FOR LSTM

In [16]:
# Sorted list of the distinct note/chord tokens: one class per token
pitchnames = sorted(set(notes))

# Forward lookup: token -> integer class id (its position in the sorted list)
note_to_int = {name: index for index, name in enumerate(pitchnames)}

# Inverse lookup: integer class id -> token
int_to_note = dict(enumerate(pitchnames))

# Every distinct token must have received exactly one id
assert len(note_to_int) == n_vocab

In [17]:
# Consistency checks for the note <-> integer mappings built in the previous cell.
# (This cell previously rebuilt pitchnames / note_to_int / int_to_note with code
# identical to the cell above; keeping one definition avoids the two drifting apart.)

# one integer id per distinct token
assert len(note_to_int) == n_vocab

# the ids are exactly 0 .. n_vocab-1, so they can index a one-hot / softmax output
assert sorted(int_to_note) == list(range(n_vocab))

# the two mappings are inverses of each other
assert all(int_to_note[idx] == name for name, idx in note_to_int.items())

In [21]:
def prepare_training_set(notes, sequence_len = 100, mapping = None):
    """Slice a token sequence into (input window, next token) training pairs.

    For every position i, the input is the integer-encoded window
    notes[i : i + sequence_len] and the target is the integer code of the token
    that immediately follows it, notes[i + sequence_len].

    Parameters
    ----------
    notes : sequence of str
        Note/chord tokens in playing order.
    sequence_len : int, default 100
        Number of tokens in each input window.
    mapping : dict, optional
        Token -> integer lookup. When omitted, the notebook-level `note_to_int`
        dictionary is used, which is what the original implementation always did.

    Returns
    -------
    (network_input, network_output) : tuple of lists
        network_input is a list of len(notes) - sequence_len windows (each a list of
        ints); network_output is the list of matching target ints. Both are empty
        when notes has no more than sequence_len tokens.

    Raises
    ------
    ValueError
        If sequence_len is negative.
    KeyError
        If a token that is needed for a window is missing from the mapping.
    """
    if sequence_len < 0:
        # a negative length would silently produce wrapped-around, meaningless windows
        raise ValueError(f"sequence_len must be non-negative, got {sequence_len}")

    if mapping is None:
        mapping = note_to_int

    n_windows = len(notes) - sequence_len
    if n_windows <= 0:
        # not enough tokens for even a single (window, target) pair
        return [], []

    # Encode each token once instead of once per window it appears in.
    encoded = [mapping[token] for token in notes]

    network_input = [encoded[i : i + sequence_len] for i in range(n_windows)]
    network_output = encoded[sequence_len:]      # token right after each window

    return network_input, network_output

In [22]:
network_input,network_output= prepare_training_set(notes, sequence_len = 100)
len(network_input) , len(network_output)

(60766, 60766)

In [27]:
# LSTM layers expect 3-D input, so append a trailing axis of length 1 to the
# 2-D array of windows: (input_samples, sequence_len) -> (input_samples, sequence_len, 1)
_network_input = np.asarray(network_input)[..., np.newaxis]
print(_network_input.shape)

(60766, 100, 1)


In [28]:
normalised_network_input = _network_input/float(n_vocab)

In [29]:
normalised_network_input[0][:10]

array([[0.97493036],
       [0.93593315],
       [0.79387187],
       [0.84679666],
       [0.84958217],
       [0.93871866],
       [0.97493036],
       [0.83008357],
       [0.97214485],
       [0.84958217]])

In [30]:
# One-hot encode the integer class targets for the categorical cross-entropy loss.
# * num_classes is pinned to n_vocab so the width always equals the size of the
#   softmax output layer, even if the highest class id never occurs as a target.
# * The ndim guard makes the cell safe to re-run: network_output is overwritten here,
#   and encoding an already one-hot matrix a second time would add a spurious axis.
if np.ndim(network_output) == 1:
    network_output = to_categorical(network_output, num_classes = n_vocab)

In [31]:
print(normalised_network_input.shape)
print(network_output.shape)

(60766, 100, 1)
(60766, 359)


In [32]:
print(normalised_network_input.shape)
print(network_output.shape)

(60766, 100, 1)
(60766, 359)


#### DEFINE MODEL ARCHITECTURE

In [37]:
def create_model(unit_layer1=64,unit_layer2=32,unit_layer3=512, dense_layer=128,drop_out=0.3,
                 input_shape=None, n_classes=None):
    """Build and compile the stacked-LSTM next-token classifier.

    Architecture: LSTM -> Dropout -> LSTM -> Dropout -> LSTM -> Dense -> Dropout
    -> Dense(softmax). The first two LSTM layers return full sequences so the next
    LSTM layer receives one vector per time step; the third returns only its last output.

    Parameters
    ----------
    unit_layer1, unit_layer2, unit_layer3 : int
        Number of units in the first, second and third LSTM layer.
    dense_layer : int
        Number of units in the hidden Dense layer.
    drop_out : float
        Dropout rate used by all three Dropout layers.
    input_shape : tuple, optional
        (sequence_len, n_features) of one input sample. Defaults to the shape of the
        notebook-level `normalised_network_input` array.
    n_classes : int, optional
        Size of the softmax output layer. Defaults to the notebook-level `n_vocab`.

    Returns
    -------
    A Sequential model compiled with categorical cross-entropy loss and the Adam
    optimizer.
    """
    # Fall back to the notebook globals so existing calls keep working unchanged.
    if input_shape is None:
        input_shape = (normalised_network_input.shape[1], normalised_network_input.shape[2])
    if n_classes is None:
        n_classes = n_vocab

    model = Sequential()
    model.add(LSTM(units = unit_layer1 , input_shape = input_shape , return_sequences = True))
    model.add(Dropout(drop_out))

    model.add(LSTM(units = unit_layer2 , return_sequences = True))
    model.add(Dropout(drop_out))

    model.add(LSTM(units = unit_layer3))
    model.add(Dense(dense_layer))
    model.add(Dropout(drop_out))

    # one output probability per token class
    model.add(Dense(n_classes , activation = 'softmax'))
    model.compile(loss = "categorical_crossentropy", optimizer = "adam")
    return model

In [39]:
model = create_model(unit_layer1=64,unit_layer2=32,unit_layer3=512, dense_layer=128,drop_out=0.3)

In [40]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, 100, 64)           16896     
                                                                 
 dropout_6 (Dropout)         (None, 100, 64)           0         
                                                                 
 lstm_7 (LSTM)               (None, 100, 32)           12416     
                                                                 
 dropout_7 (Dropout)         (None, 100, 32)           0         
                                                                 
 lstm_8 (LSTM)               (None, 512)               1116160   
                                                                 
 dense_4 (Dense)             (None, 128)               65664     
                                                                 
 dropout_8 (Dropout)         (None, 128)              

In [41]:
# Train the baseline model; the object returned by fit() is kept in `hist`.
# Checkpointing is currently switched off. To enable it, create the callback below
# and pass callbacks = [checkpoint] to fit():
#   checkpoint = ModelCheckpoint("weights.h5", monitor = 'loss', save_best_only=True, mode = 'min')
hist = model.fit(
    x = normalised_network_input,
    y = network_output,
    batch_size = 64,
    epochs = 100,
)

## Hyperparameter search using grid search.


Hyperparameter tuning means adjusting the parameters that define a machine learning model, rather than those it learns, in order to improve its performance. In the case of an LSTM (Long Short-Term Memory) model, these parameters may include the number of hidden units in each layer, the presence and size of dense layers, and the dropout rate. By experimenting with different combinations of these hyperparameters, it is possible to achieve better model accuracy and better generalization to new data.

In [59]:
# Candidate values for each hyperparameter explored by the grid search.
# The full grid has 3 * 3 * 3 * 2 * 3 = 162 combinations.
unit_layer1 = [128,64,32]   # units of the first LSTM layer
unit_layer2 = [128,64,32]   # units of the second LSTM layer
unit_layer3 = [128,64,32]   # units of the third LSTM layer
dense_layer = [128,64]      # units of the hidden Dense layer
drop_out = [0.5,0.3,0.2]    # dropout rate

In [60]:
possible_combinations = list(itertools.product(unit_layer1,unit_layer2,unit_layer3,dense_layer,drop_out))

In [66]:
# Grid search: train one model per hyperparameter combination and keep the object
# returned by fit() for each run. hist[i] belongs to possible_combinations[i].
from tensorflow.keras import backend as K

hist = []
for _conf in possible_combinations:
    # Unpack into loop-local names. Re-using unit_layer1, unit_layer2, ... here would
    # overwrite the candidate *lists* with single values, so re-running the cell that
    # builds possible_combinations would fail afterwards.
    _units1, _units2, _units3, _dense_units, _dropout = _conf

    # Many models are created in this loop; reset Keras' global state before each
    # one so that state left by earlier models does not accumulate.
    K.clear_session()

    model = create_model(_units1, _units2, _units3, _dense_units, _dropout)
    _conf_hist = model.fit(normalised_network_input, network_output, epochs = 100, batch_size = 64) #, callbacks = [checkpoint]
    hist.append(_conf_hist)

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_16 (LSTM)              (None, 100, 128)          66560     
                                                                 
 dropout_15 (Dropout)        (None, 100, 128)          0         
                                                                 
 lstm_17 (LSTM)              (None, 100, 128)          131584    
                                                                 
 dropout_16 (Dropout)        (None, 100, 128)          0         
                                                                 
 lstm_18 (LSTM)              (None, 128)               131584    
                                                                 
 dense_10 (Dense)            (None, 128)               16512     
                                                                 
 dropout_17 (Dropout)        (None, 128)              