## Music Generation V1


This script builds a model to learn music sequences from MIDI files. The model is a stacked LSTM model which takes as input a set of temporal notes and predicts the next note in the sequence

In [21]:
import os, random
import numpy as np
import pandas as pd
import glob

In [162]:
# Libs for music processing 
import msgpack
import mido
from mido import MidiFile

In [163]:
# Using music 21

from music21 import midi
from music21 import converter, instrument, note, chord, stream

In [164]:
# For preprocessing and modeling
from sklearn.preprocessing import MinMaxScaler
from keras.utils import np_utils
from keras.layers import LSTM, Input, Dense, Embedding, Flatten, Reshape
from keras.activations import relu, tanh
from keras.models import Model
from keras.preprocessing.text import one_hot

## Fetching data from midi files

They are extracted as notes

### Appendix

### Alphabet simluation

Before predicting music, let's build a simple aphabet predictor and make sure it works.

Predict ['A'] --> ['B']

In [237]:
LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
c2i = {}
i2c = {}
for i in range(len(LETTERS)):
    c2i[LETTERS[i]] = i
    i2c[i] = LETTERS[i]

In [325]:
time_step_length = 3
train_X = []
train_Y = []
for i in range(len(LETTERS) - time_step_length):
    train_X.append(list(LETTERS[i: i+time_step_length]))
    train_Y.append(LETTERS[i+time_step_length])

train_Y = np.reshape(train_Y, (len(train_Y), 1))
print(train_X[0], train_Y[0])

['A', 'B', 'C'] ['D']


In [386]:
X = []
Y = []
max_val = np.max(list(c2i.values()))

# Encode and transform X
for e in train_X:
    X.append([c2i[se] for se in e])
X = np.reshape(np.array(X), (len(X), time_step_length, 1))
X = X/max_val

# Encode and transform Y
Y = [c2i[e] for e in train_Y.flatten()]
Y = np_utils.to_categorical(Y)

print(X.shape)
print(X[[1]])
print(Y[1])

(23, 3, 1)
[[[0.04]
  [0.08]
  [0.12]]]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]


In [387]:
Y.shape[0]

23

In [404]:
# Simple model to encode the categorical variable and pass it though a dense layer
model_input = Input(shape=(time_step_length, 1))
# emb_layer_1 = Embedding(input_dim=26, output_dim=8)(model_input)
# flat_1 = Flatten()(emb_layer_1)
# shape_1 = Reshape(target_shape=(time_step_length,8))(emb_layer_1)
lstm_1 = LSTM(1, return_sequences=False, return_state=False)(model_input)
dense_1 = Dense(Y.shape[1], activation='softmax')(lstm_1)
model = Model(model_input, dense_1)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_65 (InputLayer)        (None, 3, 1)              0         
_________________________________________________________________
lstm_42 (LSTM)               (None, 1)                 12        
_________________________________________________________________
dense_17 (Dense)             (None, 26)                52        
Total params: 64
Trainable params: 64
Non-trainable params: 0
_________________________________________________________________


In [405]:
# print(m-odel.predict(X).shape)
a = model.predict(X[[1]])
a

array([[0.03873375, 0.03813245, 0.03835814, 0.03849081, 0.03874741,
        0.03856536, 0.03857245, 0.03841167, 0.03821522, 0.03832662,
        0.03850126, 0.03878108, 0.03850385, 0.03874188, 0.03871085,
        0.03837019, 0.03820381, 0.03803897, 0.03817413, 0.03826901,
        0.03863152, 0.03810645, 0.03865402, 0.03855532, 0.03863788,
        0.03856586]], dtype=float32)

In [406]:
model.compile(loss='categorical_crossentropy', optimizer='sgd')
model.fit(X, Y, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1361faeb8>

In [407]:
print(Y[0])
model.predict(X[[0]])*max_val

[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]


array([[0.9616774 , 0.95320845, 0.95639545, 0.9624334 , 0.9660541 ,
        0.96348774, 0.9635884 , 0.96131665, 0.9585336 , 0.96011347,
        0.96258485, 0.9665317 , 0.9626223 , 0.9659805 , 0.96554375,
        0.9607327 , 0.9583746 , 0.956033  , 0.95795393, 0.95930016,
        0.9644268 , 0.9569931 , 0.9647444 , 0.96335167, 0.96451706,
        0.9635008 ]], dtype=float32)

In [408]:
model.predict(X[[1]])*max_val

array([[0.9645791 , 0.94969904, 0.9552858 , 0.96274006, 0.9691164 ,
        0.96459484, 0.964772  , 0.9607774 , 0.9558947 , 0.958665  ,
        0.9630064 , 0.96995896, 0.9630721 , 0.96898663, 0.96821636,
        0.95975184, 0.95561594, 0.9515176 , 0.95487905, 0.9572383 ,
        0.96624804, 0.953197  , 0.9668075 , 0.9643553 , 0.966407  ,
        0.9646176 ]], dtype=float32)