import sys
!{sys.executable} -m pip install music21
!{sys.executable} -m pip install tqdm



## Methodology:

We write functions to:

1- Encode Each song.

2- Decode each song and save it as midi.

3- Obtain a dict that translates each event into a numerical value.

4- Prepare input sequences and outputs for the three LSTMs to be trained.

5- Train the LSTMs.

6- Use the three trained LSTMs to generate music.

In [1]:
import music21
import numpy as np
import glob
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
import pandas as pd
from keras.utils import np_utils
from collections import defaultdict

from keras.models import Sequential
from keras.layers import Activation, Dense, LSTM, Dropout, Flatten


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def encode_song(filename):
    """Encodes song in filename. See milestone1 report for encoding"""
    notesRaw=music21.converter.parse(filename)
    notesRaw=notesRaw.flat.notes
        
    pitches=[]
    offsets=[]
        
    for note in notesRaw:
        if isinstance(note,music21.note.Note):
            pitches.append(note.pitch.midi)
            offsets.append(note.offset)
        else:
            chordNotes=[int(b.midi) for b in note.pitches]
            pitches.extend(chordNotes)
            offsets.extend([note.offset]*len(chordNotes))

    pitches=np.array(pitches)
    offsets=np.array(offsets)
    uniqueSortedOffsets=np.sort(np.unique(offsets))
    
    encoding=[]
    
    for i in range(len(uniqueSortedOffsets)-1):
        time=uniqueSortedOffsets[i]
        pitchesHere=pitches[offsets==time]
        goforward='gf'+str(uniqueSortedOffsets[i+1]-time)
        
        encoding.extend(list(np.sort(pitchesHere)))
        encoding.append(goforward)
        
        
    
    return encoding   

In [3]:
def decode_song(song,filename):
    """Decodes song encoded by encode_song(). song is an encoded song of type list, and filename is
    the path where the new decoded midi file is to reside"""
    stream=[]
    offset=0
    for event in song:
        if 'gf' in str(event):
            offset+=float(event[2:])
        else:
            newNote=music21.note.Note(int(event))
            newNote.offset=offset
            newNote.storedInstrument=music21.instrument.Piano()
            stream.append(newNote)
    midi_stream=music21.stream.Stream(stream)
    midi_stream.write('midi', fp=filename)
    return 1
    
    
    

In [2]:
#we get the the directories of all the songs in our dataset. 

artists=glob.glob('Music/*') #files are sorted via artist
 
songs=[] #the song filename

for artist in artists:
    songs.extend(glob.glob(artist+'/*'))


In [26]:
#we encode each song, save them in a dataframe, and then pickle the dataframe.
information=defaultdict(list)

for song in tqdm(songs):
    encoded=encode_song(song)
    _,artist,songname=song.split('\\')
    information['SongName'].append(songname)
    information['Artist'].append(artist)
    information['Encoded'].append(encoded)
    



100%|████████████████████████████████████████| 266/266 [28:20<00:00,  2.48s/it]


In [28]:
DF_to_save=pd.DataFrame.from_dict(information)
DF_to_save.head()

with open('encodings/information.pickle','wb') as file:
    pickle.dump(DF_to_save,file)

In [30]:
DF=DF_to_save
DF_to_save=[]

In [41]:
#inWhat follows we create three dictionaries: one for notes and gf (which will be used in the input)
#another for just gfs (output of get_gf), and another for just notes (for get_note)

#we first get all the unique elements of our vocabulary:

vocab=[]
for i in tqdm(range(len(DF))):
    vocab.extend(DF.iloc[i,2])
    vocab=list(set(vocab))
    
notesOnly=[]
gfOnly=[]

for word in vocab:

    if 'gf' in str(word):
        gfOnly.append(word)
    else:
        notesOnly.append(word)

100%|██████████████████████████████████████| 266/266 [00:00<00:00, 3410.24it/s]


In [49]:
#we check to see what is the quarter lengths duration of the unique go forward events
numbers=[]
for gf in gfOnly:
    if '/' in gf:
        num,denom=gf.split('/')
        num=num[2:]
        numbers.append(float(num)/float(denom))
    else:
        numbers.append(float(gf[2:]))

In [52]:
print(sorted(numbers))

[0.08333333333325754, 0.08333333333331439, 0.0833333333333286, 0.08333333333333215, 0.08333333333333304, 0.08333333333333326, 0.08333333333333331, 0.08333333333333337, 0.08333333333333348, 0.08333333333333393, 0.0833333333333357, 0.08333333333334281, 0.08333333333337123, 0.16666666666662877, 0.1666666666666572, 0.1666666666666643, 0.16666666666666607, 0.16666666666666652, 0.16666666666666663, 0.16666666666666669, 0.16666666666666674, 0.16666666666666696, 0.16666666666666785, 0.1666666666666714, 0.16666666666668561, 0.16666666666674246, 0.25, 0.33333333333325754, 0.3333333333333144, 0.3333333333333286, 0.33333333333333215, 0.33333333333333304, 0.33333333333333326, 0.3333333333333333, 0.3333333333333333, 0.33333333333333337, 0.3333333333333335, 0.3333333333333339, 0.3333333333333357, 0.3333333333333428, 0.33333333333337123, 0.41666666666662877, 0.4166666666666572, 0.4166666666666643, 0.41666666666666663, 0.41666666666666785, 0.4166666666666714, 0.4166666666666856, 0.5, 0.6666666666666288

From above we see that alot of the unique offsets are essentially the same but differ slightly due to representation by the author of the midi file. We thus have to fix something in the representation of the gfs. This is something we could have fixed in the encode_song function. However to avoid having to open the files again, we ammend the issue in the dataframe.

In [55]:
np.round(1,4)

1

In [84]:
NewColumn=[]
for i in tqdm(range(len(DF))):
    encoding=DF.iloc[i,2]
    #now loop through every word of the encoding
    replacement=[]
    for i,word in enumerate(encoding):
        if 'gf' in str(word):
            #we have a gf, we isolate the word from the letters 'gf'
            keep=word[2:]
            #we now check to see if there is a division symbol:
            if '/' in keep:
                numerator,denominator=keep.split('/')
                keep=np.round(float(numerator)/float(denominator),4)
                replacement.append('gf'+str(keep))
            else:
                replacement.append('gf'+str(np.round(float(keep),4)))
                
        else:
            #its just a note.
            replacement.append(word)
    NewColumn.append(replacement)

100%|████████████████████████████████████████| 266/266 [00:03<00:00, 76.09it/s]


In [87]:
DF['ModedEncoded']=NewColumn


In [None]:
DF=DF.drop(columns='Encoded')

In [95]:
with open('encodings/information.pickle','wb') as file:
    pickle.dump(DF,file)

In [3]:
with open('encodings/information.pickle','rb') as file:
    DF=pickle.load(file)

In [3]:
DF.head()

Unnamed: 0,SongName,Artist,ModedEncoded
0,alb_esp1.mid,albeniz,"[57, 81, gf0.5, 64, 88, gf3.25, 62, 86, gf0.08..."
1,alb_esp2.mid,albeniz,"[38, 50, gf0.75, 57, gf0.25, 62, 66, 69, gf0.5..."
2,alb_esp3.mid,albeniz,"[59, 71, gf0.5, 63, 75, gf0.5, 66, 78, gf0.5, ..."
3,alb_esp4.mid,albeniz,"[79, gf0.5, 71, 74, gf0.5, 72, 75, gf0.5, 69, ..."
4,alb_esp5.mid,albeniz,"[51, 58, gf0.5, 58, gf1.0, 58, gf0.5, 51, 58, ..."


In [4]:
DF=DF[DF['Artist']=='chopin']

In [20]:
DF=DF.iloc[[0],:]

In [21]:
#we first get all the unique elements of our vocabulary:

vocab=[]
for i in tqdm(range(len(DF))):
    vocab.extend(DF.iloc[i,2])
    vocab=list(set(vocab))
    
notesOnly=[]
gfOnly=[]

for word in vocab:

    if 'gf' in str(word):
        gfOnly.append(word)
    else:
        notesOnly.append(word)

100%|██████████| 1/1 [00:00<00:00, 1719.68it/s]


In [22]:
len(vocab)

47

In [102]:
#we check to see what is the quarter lengths duration of the unique go forward events
numbers=[]
for gf in gfOnly:
    if '/' in gf:
        num,denom=gf.split('/')
        num=num[2:]
        numbers.append(float(num)/float(denom))
    else:
        numbers.append(float(gf[2:]))
sorted(numbers)

[0.0833,
 0.1667,
 0.25,
 0.3333,
 0.4167,
 0.5,
 0.6667,
 0.75,
 0.8333,
 0.9167,
 1.0,
 1.0833,
 1.1667,
 1.25,
 1.3333,
 1.4167,
 1.5,
 1.5833,
 1.6667,
 1.75,
 1.8333,
 2.0,
 2.0833,
 2.1667,
 2.25,
 2.3333,
 2.5,
 2.6667,
 2.75,
 2.8333,
 3.0,
 3.25,
 3.5,
 3.6667,
 3.75,
 3.8333,
 4.0,
 4.25,
 4.5,
 4.6667,
 4.75,
 5.0,
 5.3333,
 5.5,
 5.6667,
 5.8333,
 6.0,
 6.25,
 6.5,
 7.0,
 8.0]

In [None]:
#we now can start preparing inputs and outputs for the various neural networks we want to train.

In [23]:
#first step is getting a representation of the vocabulary:
vocab=[]
for i in tqdm(range(len(DF))):
    vocab.extend(DF.iloc[i,2])
    vocab=list(set(vocab))

100%|██████████| 1/1 [00:00<00:00, 2006.84it/s]


In [24]:
#we create a dict to translate each vocab element to a number and vice versa:
WordToNumber={}
NumberToWord={}

GfToNumber={}
NumberToGf={}

NoteToNumber={}
NumberToNote={}


gf_index=0
note_index=0

for i,word in enumerate(vocab):
    WordToNumber[word]=i
    NumberToWord[i]=word
    if 'gf' in str(word):
        GfToNumber[word]=gf_index
        NumberToGf[gf_index]=word
        gf_index+=1
    else:
        NoteToNumber[word]=note_index
        NumberToNote[note_index]=word
        note_index+=1




## LSTM Architecture:

We have 3 neural networks:

1- gf_or_note

2- get_gf

3- get_note



The inputs for all three of the nets are in the same space and have the same representation. The outputs differ. The output of gf_or_note is binary, the output of get_gf is limited to the different available gf's while the output of get_note is limited to the keys on the piano.

We begin by getting a string

In [25]:
def prepare_sequences_gf_or_note(DF, n_vocab,WordToNumber,sequence_length=100): 
    """Given a list of locations for all the midi files in the dataset, this function encodes each song"""
    # Extract the unique pitches in the list of notes.

    network_input = []
    network_output = []
    for i in tqdm(range(len(DF))):
        song=DF.iloc[i,2]
        # create input sequences and the corresponding outputs
        
        for i in range(0, len(song) - sequence_length, 1):
            #we only use the sequence if the last event is not a gf event:
            sequence_in = song[i: i + sequence_length]
            if 'gf' not in str(sequence_in[-1]):
                sequence_out = float('gf' in str(song[i + sequence_length]))
                network_input.append([WordToNumber[char] for char in sequence_in])
                network_output.append(sequence_out)
                

    n_patterns = len(network_input)
    
    # reshape the input into a format comatible with LSTM layers 
    network_input = np.reshape(network_input, (n_patterns, sequence_length, 1))
    
    # normalize input
    network_input = network_input / float(n_vocab)
    
    # one hot encode the output vectors
    network_output = np_utils.to_categorical(network_output)
    
    return (network_input, network_output)

In [26]:
inp,out=prepare_sequences_gf_or_note(DF,len(WordToNumber),WordToNumber)
print(inp.shape)
print(out.shape)

100%|██████████| 1/1 [00:00<00:00, 269.99it/s]

(264, 100, 1)
(264, 2)





In [27]:
def prepare_sequences_get_gf(DF, n_vocab,WordToNumber,GfToNumber,sequence_length=100): 
    """Given a list of locations for all the midi files in the dataset, this function encodes each song"""
    # Extract the unique pitches in the list of notes.

    network_input = []
    network_output = []
    for i in tqdm(range(len(DF))):
        song=DF.iloc[i,2]
        # create input sequences and the corresponding outputs
        
        for i in range(0, len(song) - sequence_length, 1):
            #we only use the sequence if the last event is not a gf event:
            sequence_in = song[i: i + sequence_length]
            sequence_out=song[i + sequence_length]
            if 'gf' in str(sequence_out):
                network_input.append([WordToNumber[char] for char in sequence_in])
                network_output.append(GfToNumber[sequence_out])
                

    n_patterns = len(network_input)
    
    # reshape the input into a format comatible with LSTM layers 
    network_input = np.reshape(network_input, (n_patterns, sequence_length, 1))
    
    # normalize input
    network_input = network_input / float(n_vocab)
    
    # one hot encode the output vectors
    network_output = np_utils.to_categorical(network_output)
    
    return (network_input, network_output)

In [28]:
inp,out=prepare_sequences_get_gf(DF,len(WordToNumber),WordToNumber,GfToNumber)
print(inp.shape)
print(out.shape)

100%|██████████| 1/1 [00:00<00:00, 199.59it/s]

(161, 100, 1)
(161, 3)





In [29]:
def prepare_sequences_get_note(DF, n_vocab,WordToNumber,NoteToNumber,sequence_length=100): 
    """Given a list of locations for all the midi files in the dataset, this function encodes each song"""
    # Extract the unique pitches in the list of notes.

    network_input = []
    network_output = []
    for i in tqdm(range(len(DF))):
        song=DF.iloc[i,2]
        # create input sequences and the corresponding outputs
        
        for i in range(0, len(song) - sequence_length, 1):
            #we only use the sequence if the last event is not a gf event:
            sequence_in = song[i: i + sequence_length]
            sequence_out=song[i + sequence_length]
            if 'gf' not in str(sequence_out):
                network_input.append([WordToNumber[char] for char in sequence_in])
                network_output.append(NoteToNumber[sequence_out])
                

    n_patterns = len(network_input)
    
    # reshape the input into a format comatible with LSTM layers 
    network_input = np.reshape(network_input, (n_patterns, sequence_length, 1))
    
    # normalize input
    network_input = network_input / float(n_vocab)
    
    # one hot encode the output vectors
    network_output = np_utils.to_categorical(network_output)
    
    return (network_input, network_output)

In [30]:
inp,out=prepare_sequences_get_note(DF,len(WordToNumber),WordToNumber,NoteToNumber)
print(inp.shape)
print(out.shape)

100%|██████████| 1/1 [00:00<00:00, 182.34it/s]

(263, 100, 1)
(263, 44)





In [31]:
def create_network_gf_or_note(network_in, n_vocab_out): 
    """Create the model architecture"""
    model = Sequential()
    model.add(LSTM(50, input_shape=network_in.shape[1:], return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(50,return_sequences=True))
    model.add(Flatten())
    model.add(Dense(50))
    model.add(Dropout(0.3))
    model.add(Dense(n_vocab_out))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

In [32]:
def create_network_get_gf(network_in, n_vocab_out): 
    """Create the model architecture"""
    model = Sequential()
    model.add(LSTM(50, input_shape=network_in.shape[1:], return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(50,return_sequences=True))
    model.add(Flatten())
    model.add(Dense(50))
    model.add(Dropout(0.3))
    model.add(Dense(n_vocab_out))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

In [33]:
def create_network_get_note(network_in, n_vocab_out): 
    """Create the model architecture"""
    model = Sequential()
    model.add(LSTM(100, input_shape=network_in.shape[1:], return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(100,return_sequences=True))
    model.add(Flatten())
    model.add(Dense(100))
    model.add(Dropout(0.3))
    model.add(Dense(n_vocab_out))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

    return model

In [34]:
from keras.callbacks import ModelCheckpoint
def train_gf_or_note(model, network_input, network_output, epochs): 
    """
    Train the neural network
    """
    # Create checkpoint to save the best model weights.
    filepath = 'SavedModels/weights.gf_or_note.hdf5'
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=0, save_best_only=True)
    
    model.fit(network_input, network_output, epochs=epochs, batch_size=10000, callbacks=[checkpoint])

In [35]:
def train_get_gf(model, network_input, network_output, epochs): 
    """
    Train the neural network
    """
    # Create checkpoint to save the best model weights.
    filepath = 'SavedModels/weights.get_gf.hdf5'
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=0, save_best_only=True)
    
    model.fit(network_input, network_output, epochs=epochs, batch_size=10000, callbacks=[checkpoint])

In [36]:
def train_get_note(model, network_input, network_output, epochs): 
    """
    Train the neural network
    """
    # Create checkpoint to save the best model weights.
    filepath = 'SavedModels/weights.get_note.hdf5'
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=0, save_best_only=True)
    
    model.fit(network_input, network_output, epochs=epochs, batch_size=10000, callbacks=[checkpoint])

In [38]:
epochs=1

gf_or_note_input,gf_or_note_output=prepare_sequences_gf_or_note(DF,len(WordToNumber),WordToNumber)
get_gf_input,get_gf_output=prepare_sequences_get_gf(DF,len(WordToNumber),WordToNumber,GfToNumber)
get_note_input,get_note_output=prepare_sequences_get_note(DF,len(WordToNumber),WordToNumber,NoteToNumber)

print('Sequences Prepared, creating models')

gf_or_note=create_network_gf_or_note(gf_or_note_input,gf_or_note_output.shape[1])
get_gf=create_network_get_gf(get_gf_input,get_gf_output.shape[1])
get_note=create_network_get_note(get_note_input,get_note_output.shape[1])

print('Models Created, training in progress')

train_gf_or_note(gf_or_note,gf_or_note_input,gf_or_note_output,epochs)
train_get_gf(get_gf,get_gf_input,get_gf_output,epochs)
train_get_note(get_note,get_note_input,get_note_output,epochs)

print('Training completed')

100%|██████████| 1/1 [00:00<00:00, 123.62it/s]
100%|██████████| 1/1 [00:00<00:00, 215.65it/s]
100%|██████████| 1/1 [00:00<00:00, 241.33it/s]


Sequences Prepared, creating models
Models Created, training in progress
Epoch 1/1
Epoch 1/1
Epoch 1/1
Training completed
