In [None]:
!pip install mido
!pip install pygame
!pip install music21
!pip install scikit-learn==1.3.0

In [1]:
# import some useful libraries
import glob, nltk, joblib
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from keras import metrics

from PIL import Image
from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import CountVectorizer

from music21 import midi
from plugins.midi2img import midi2img
from plugins.img2midi import img2midi
from IPython.display import clear_output

# Load In museGAN dataset for visualization purposes
It turns out that the museGAN authors rely on a MIDI -> image conversion. Each image is one bar of a multi-track piano roll. In the image below, the horizontal axis represents time and the vertical axis represents the instrument used. In this dataset the instruments are layered from bottom to top as piano, strings, guitar, drums, bass.


In [2]:
# download the tokenizer data that nltk.word_tokenize needs for the piece captions;
# newer NLTK releases look up 'punkt_tab' instead of the pickled 'punkt' models,
# so fetch both to keep the notebook working across NLTK versions
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ktrin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Construction of museGAN
## External Data Source
If we want to perform GAN modeling, we can convert the MIDI data into piano-roll images. Download the data from the piano repo linked in the README, then run the cells below. [Convert-MIDI-TO-NP-ARRAY](https://medium.com/analytics-vidhya/convert-midi-file-to-numpy-array-in-python-7d00531890c)

# Caption Processing

In [3]:
# caption / embedding hyper-parameters
MAX_VOCAB_SIZE = 20000  # cap on the number of vocabulary words kept by the vectorizer
EMBED_DIM = 100         # size of each caption token embedding
MAX_SEQ_LENGTH = 20     # 18 caption tokens + <start> + <end>

In [4]:
# load the labelled-piano metadata and build one caption per row:
# lower-cased piece description + ". " + lower-cased piece arousal
midi_meta = pd.read_csv('../data/piano-labelled/labelled_piano_midi_metadata.csv')
description_lower = midi_meta['piece_description'].str.lower()
arousal_lower = midi_meta['piece_arousal'].str.lower()
midi_meta['caption_list'] = description_lower + ". " + arousal_lower
midi_meta

Unnamed: 0,piece_id,piece_description,piece_arousal,piece_name,midi_file,caption_list
0,0,very upbeat,Delighted,Lurking In The Darkness,Final Fantasy_PS1_Final Fantasy VII_Lurking In...,very upbeat. delighted
1,1,I could tell the valence of the example was in...,Valence started out moderately negative and pr...,Lurking In The Darkness,Final Fantasy_PS1_Final Fantasy VII_Lurking In...,i could tell the valence of the example was in...
2,2,For a second I thought this piece was going to...,This piece seemed to have a positive valence t...,Lurking In The Darkness,Final Fantasy_PS1_Final Fantasy VII_Lurking In...,for a second i thought this piece was going to...
3,3,Bouncy and fun,Kind of sparatic,Lurking In The Darkness,Final Fantasy_PS1_Final Fantasy VII_Lurking In...,bouncy and fun. kind of sparatic
4,4,nice,nice,Lurking In The Darkness,Final Fantasy_PS1_Final Fantasy VII_Lurking In...,nice. nice
...,...,...,...,...,...,...
6100,6100,It started off slowly but happy and then built...,Seemed to remain consistent almost like it was...,One Winged Angel,Final Fantasy_PS1_Final Fantasy VII_One Winged...,it started off slowly but happy and then built...
6101,6101,This starts off a certain way then changes in ...,This is nostalgic because I recognize this and...,One Winged Angel,Final Fantasy_PS1_Final Fantasy VII_One Winged...,this starts off a certain way then changes in ...
6102,6102,The piece begins slow in tempo and then become...,"The beginning rhythm sounds suspenseful, makin...",One Winged Angel,Final Fantasy_PS1_Final Fantasy VII_One Winged...,the piece begins slow in tempo and then become...
6103,6103,started slow but picked up.,I feel it stayed the same,One Winged Angel,Final Fantasy_PS1_Final Fantasy VII_One Winged...,started slow but picked up. . i feel it stayed...


In [5]:
# tokenize every caption with NLTK, re-join the tokens with single spaces, then fit a
# CountVectorizer on the result to get a vocabulary of at most MAX_VOCAB_SIZE words
input_captions = []
max_caption_length = -1

for caption in tqdm(midi_meta['caption_list'].values):
    tokenized_caption = nltk.word_tokenize(caption, language='english')

    # remember the longest caption (in tokens) seen so far
    max_caption_length = max(max_caption_length, len(tokenized_caption))

    caption = ' '.join(tokenized_caption).lower()
    input_captions.append(caption)


vectorizer = CountVectorizer(max_features=MAX_VOCAB_SIZE).fit(input_captions)
vocab = vectorizer.get_feature_names_out()
# from here on MAX_VOCAB_SIZE holds the actual vocabulary size, not the requested cap
MAX_VOCAB_SIZE = len(vocab)

  0%|          | 0/6105 [00:00<?, ?it/s]

In [6]:
# build the two lookup tables (token id -> word, word -> token id) from the vocabulary,
# then append the special tokens <unk>, <start>, <end>, <pad> right after the last word id
id_vocab_dict = dict(enumerate(vocab))
vocab_id_dict = {svocab: sid for sid, svocab in id_vocab_dict.items()}

for offset, special_token in enumerate(("<unk>", "<start>", "<end>", "<pad>")):
    id_vocab_dict[MAX_VOCAB_SIZE + offset] = special_token
    vocab_id_dict[special_token] = MAX_VOCAB_SIZE + offset

In [7]:
# tokenization - take the input caption and tokenize it
# declare a max sequence length 
def convert_text_to_data(texts,
                         vocab_id_dict,
                         max_length=20,
                         type=None):
    """
        Convert whitespace-tokenized text into fixed-length sequences of token ids.

        Inputs:
            texts: iterable of strings; each string is split on whitespace
            vocab_id_dict: mapping word -> token id; must contain the special tokens
                           "<unk>", "<start>", "<end>" and "<pad>"
            max_length: length of every returned sequence; longer sequences are
                        truncated (which can drop <end>), shorter ones are padded with <pad>
            type: 'input_target'  -> <start>, tokens, <end>, padding
                  'output_target' -> tokens, <end>, padding
        Output:
            np.ndarray with one row of `max_length` token ids per text
        Raises:
            ValueError: if `type` is not one of the two supported values
            KeyError: if a special token is missing from `vocab_id_dict`
    """
    if type not in ('input_target', 'output_target'):
        # previously an unsupported type left `ids` unbound (or silently reused the
        # previous row's ids); fail loudly instead
        raise ValueError(
            "type must be 'input_target' or 'output_target', got %r" % (type,)
        )

    # look the special tokens up by name instead of deriving them from the dict size,
    # so the result does not depend on where they sit in the vocabulary
    unk_id = vocab_id_dict["<unk>"]
    start_id = vocab_id_dict["<start>"]
    end_id = vocab_id_dict["<end>"]
    pad_id = vocab_id_dict["<pad>"]

    prefix = [start_id] if type == 'input_target' else []
    padding = [pad_id] * max_length

    processed_data = []
    for text in texts:
        # words that are not in the vocabulary map to <unk>
        sentence_ids = [vocab_id_dict.get(token, unk_id) for token in text.split()]
        ids = (prefix + sentence_ids + [end_id] + padding)[:max_length]
        processed_data.append(ids)

    return np.array(processed_data)


# tokenize every caption as a <start> ... <end> sequence of MAX_SEQ_LENGTH ids
train_target_input_data = convert_text_to_data(
    input_captions,
    vocab_id_dict,
    max_length=MAX_SEQ_LENGTH,
    type='input_target',
)
len(train_target_input_data)

6105

In [8]:
# attach the token-id sequences to the metadata, one list per row
tokenized_rows = train_target_input_data.tolist()
midi_meta['tokenized_captions'] = tokenized_rows
midi_meta

Unnamed: 0,piece_id,piece_description,piece_arousal,piece_name,midi_file,caption_list,tokenized_captions
0,0,very upbeat,Delighted,Lurking In The Darkness,Final Fantasy_PS1_Final Fantasy VII_Lurking In...,very upbeat. delighted,"[3452, 3319, 3258, 3451, 762, 3453, 3454, 3454..."
1,1,I could tell the valence of the example was in...,Valence started out moderately negative and pr...,Lurking In The Darkness,Final Fantasy_PS1_Final Fantasy VII_Lurking In...,i could tell the valence of the example was in...,"[3452, 3451, 661, 3025, 3053, 3290, 2095, 3053..."
2,2,For a second I thought this piece was going to...,This piece seemed to have a positive valence t...,Lurking In The Darkness,Final Fantasy_PS1_Final Fantasy VII_Lurking In...,for a second i thought this piece was going to...,"[3452, 1220, 3451, 2627, 3451, 3078, 3074, 224..."
3,3,Bouncy and fun,Kind of sparatic,Lurking In The Darkness,Final Fantasy_PS1_Final Fantasy VII_Lurking In...,bouncy and fun. kind of sparatic,"[3452, 364, 138, 1262, 3451, 1686, 2095, 2820,..."
4,4,nice,nice,Lurking In The Darkness,Final Fantasy_PS1_Final Fantasy VII_Lurking In...,nice. nice,"[3452, 2044, 3451, 2044, 3453, 3454, 3454, 345..."
...,...,...,...,...,...,...,...
6100,6100,It started off slowly but happy and then built...,Seemed to remain consistent almost like it was...,One Winged Angel,Final Fantasy_PS1_Final Fantasy VII_One Winged...,it started off slowly but happy and then built...,"[3452, 1623, 2869, 2096, 2756, 413, 1374, 138,..."
6101,6101,This starts off a certain way then changes in ...,This is nostalgic because I recognize this and...,One Winged Angel,Final Fantasy_PS1_Final Fantasy VII_One Winged...,this starts off a certain way then changes in ...,"[3452, 3074, 2872, 2096, 3451, 468, 3366, 3061..."
6102,6102,The piece begins slow in tempo and then become...,"The beginning rhythm sounds suspenseful, makin...",One Winged Angel,Final Fantasy_PS1_Final Fantasy VII_One Winged...,the piece begins slow in tempo and then become...,"[3452, 3053, 2241, 304, 2751, 1510, 3031, 138,..."
6103,6103,started slow but picked up.,I feel it stayed the same,One Winged Angel,Final Fantasy_PS1_Final Fantasy VII_One Winged...,started slow but picked up. . i feel it stayed...,"[3452, 2869, 2751, 413, 2233, 3451, 3451, 3451..."


# GAN Definition

The GAN model consists of two parts:
1. Generator
2. Discriminator

In [9]:
# confirm TensorFlow can see a GPU: print the physical GPU devices it detected
gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [10]:
def caption_enhanced_generator(latent_dim=100,
                               caption_dim=MAX_SEQ_LENGTH,
                               vocab_size=len(vocab_id_dict.keys()),
                               embed_dim=EMBED_DIM):
    """Define the caption-conditioned generator model.

        The tokenized caption is embedded and fed through an LSTM whose initial hidden
        and cell states are both the latent vector; the LSTM's final cell state is then
        expanded into a single-channel 106x106 image with sigmoid pixel values.

        Note: the defaults of caption_dim, vocab_size and embed_dim are evaluated once,
        when this definition is executed; re-run it if those globals change.

        Inputs:
            latent_dim: dimension of the latent space; also used as the LSTM state size,
                        because the latent vector is passed in as the LSTM initial state
            caption_dim: number of token ids per caption
            vocab_size: number of distinct token ids (embedding input dimension)
            embed_dim: size of each token embedding
        Output:
            model: the generator model, taking [latent vector, caption token ids]
    """
    n_nodes = 128 * 53 * 53

    # latent noise input
    input_layer = keras.layers.Input(shape=(latent_dim,), name='input_layer')

    # tokenized caption input, mapped to word embeddings
    caption_input_layer = keras.layers.Input(shape=(caption_dim,), name='caption_input_layer')
    embedding_layer = keras.layers.Embedding(input_dim=vocab_size,
                                             output_dim=embed_dim,
                                             name='caption_embedding_layer')
    embed_caption = embedding_layer(caption_input_layer)

    # encode the caption with an LSTM seeded by the latent vector; the state size must
    # equal latent_dim (it was hard-coded to 100, which broke any other latent_dim)
    lstm_layer = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True, name="decoder_lstm_layer")
    _, _, decoder_state_c_output = lstm_layer(embed_caption, initial_state=[input_layer, input_layer])

    # Dense Layer 1: project the final cell state to a 53x53x128 feature map
    dense1 = keras.layers.Dense(n_nodes)(decoder_state_c_output)
    leaky_relu1 = keras.layers.LeakyReLU(alpha=0.35)(dense1)
    reshape_layer = keras.layers.Reshape((53, 53, 128))(leaky_relu1)

    # Dense Layer 2 (applied along the channel axis)
    dense2 = keras.layers.Dense(1024)(reshape_layer)

    # Conv2DTranspose Layer: stride 2 with 'same' padding upsamples 53x53 -> 106x106
    conv2d_transpose = keras.layers.Conv2DTranspose(1024, (4, 4), strides=(2, 2), padding='same')(dense2)

    # Dense Layer 3
    dense3 = keras.layers.Dense(1024)(conv2d_transpose)
    leaky_relu2 = keras.layers.LeakyReLU(alpha=0.35)(dense3)

    # Dense Layer 4
    dense4 = keras.layers.Dense(512)(leaky_relu2)

    # Conv2D Layer: collapse to one channel with sigmoid activation
    conv2d = keras.layers.Conv2D(1, (7, 7), padding='same', activation='sigmoid')(dense4)

    # Create the model
    model = keras.Model(inputs=[input_layer, caption_input_layer], outputs=conv2d, name='generator')
    return model


In [11]:
def caption_enhanced_discriminator(in_shape = (106,106,1)):
    """
        GAN discriminator model
        Inputs:
            in_shape: shape of the input image
        Output:
            model: compiled discriminator with a single sigmoid output, trained with
                   binary crossentropy to tell whether an image is real or fake
                   (accuracy is tracked as a metric)
    """
    # Input Layer
    input_layer = keras.layers.Input(shape=in_shape, name='input_layer')

    # 2D Convolution Layer 1
    conv1 = keras.layers.Conv2D(64, (3,3), strides=(2, 2), padding='same')(input_layer)
    leaky_relu1 = keras.layers.LeakyReLU(alpha=0.2)(conv1)
    dropout1 = keras.layers.Dropout(0.5)(leaky_relu1)

    # 2D Convolution Layer 2
    conv2 = keras.layers.Conv2D(64, (3,3), strides=(2, 2), padding='same')(dropout1)
    leaky_relu2 = keras.layers.LeakyReLU(alpha=0.2)(conv2)
    dropout2 = keras.layers.Dropout(0.5)(leaky_relu2)

    # Flatten Layer
    flatten_layer = keras.layers.Flatten()(dropout2)

    # Batch Normalization Layer
    batch_normalization = keras.layers.BatchNormalization()(flatten_layer)

    # Dense Output Discriminator Layer
    discriminate_layer = keras.layers.Dense(1, activation='sigmoid')(batch_normalization)

    # Create the model
    model = keras.Model(inputs=input_layer, outputs=discriminate_layer, name='discriminator_model')

    # model compile
    # `learning_rate` is the supported argument name; the old `lr` alias is deprecated
    # and is not reliably honoured by newer Keras optimizers
    opt = keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5, clipnorm=1.0)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    return model

In [12]:
def caption_enhanced_miniGAN(g_model, d_model,
                             g_model_input_shape=100,
                             g_model_caption_input_shape=MAX_SEQ_LENGTH):
    """
        GAN model architecture: generator followed by a frozen discriminator
        Inputs:
            g_model: generator model
            d_model: discriminator model
            g_model_input_shape: size (int) or shape (tuple) of the latent input to the generator
            g_model_caption_input_shape: size (int) or shape (tuple) of the caption input to the generator
        Output:
            model: compiled GAN model mapping [latent vector, caption] to the
                   discriminator's real/fake score
    """
    def _as_shape(value):
        # `(n)` is just the int n, not a tuple; build a proper shape tuple
        return tuple(value) if isinstance(value, (tuple, list)) else (value,)

    # Pause the training of the discriminator inside the combined model
    d_model.trainable = False

    # Define the input layers for the generator
    generator_input = keras.layers.Input(shape=_as_shape(g_model_input_shape))
    caption_input = keras.layers.Input(shape=_as_shape(g_model_caption_input_shape))

    # Define the output of the generator
    generator_output = g_model([generator_input, caption_input])

    # Define the output of the discriminator
    discriminator_output = d_model(generator_output)

    # Create the model
    model = keras.Model(inputs=[generator_input, caption_input], outputs=discriminator_output)

    # Compile the model
    # `learning_rate` is the supported argument name; the old `lr` alias is deprecated
    opt = keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5, beta_2=0.9, clipnorm=2.0)
    model.compile(loss='binary_crossentropy', optimizer=opt)
    return model

In [13]:
def generate_real_samples(dataset, n_samples):
    """Draw a random batch (with replacement) of real images and their captions.

    Inputs:
        dataset: indexable sequence of (image, caption) pairs
        n_samples: number of pairs to draw
    Output:
        X_img: array of the sampled images
        X_cap: array of the matching captions
        y: (n_samples, 1) array of ones, the 'real' class labels
    """
    # pick the row indices first and stack only those rows; converting the whole
    # dataset to arrays on every call is wasted work inside the training loop
    ix = np.random.randint(0, len(dataset), n_samples)
    X_img = np.array([dataset[i][0] for i in ix])
    X_cap = np.array([dataset[i][1] for i in ix])
    # generate 'real' class labels (1)
    y = np.ones((n_samples, 1))
    return X_img, X_cap, y
 
def generate_latent_points(latent_dim, n_samples):
    """Sample `n_samples` latent vectors of length `latent_dim` from a standard normal.

    Returns an array of shape (n_samples, latent_dim).
    """
    total_values = latent_dim * n_samples
    flat_noise = np.random.randn(total_values)
    return flat_noise.reshape(n_samples, latent_dim)

def generate_fake_samples(g_model, latent_dim, caption, n_samples):
    """Generate `n_samples` fake images from fresh latent points and the given captions.

    Returns the generator's predictions together with an (n_samples, 1) array of
    zeros, the 'fake' class labels.
    """
    latent_batch = generate_latent_points(latent_dim, n_samples)
    fake_images = g_model.predict([latent_batch, caption])
    fake_labels = np.zeros((n_samples, 1))
    return fake_images, fake_labels

In [14]:
latent_dim = 100
g_model = caption_enhanced_generator(latent_dim)
d_model = caption_enhanced_discriminator()
# pass latent_dim explicitly so the GAN's noise input always matches the generator
gan_model = caption_enhanced_miniGAN(g_model, d_model, g_model_input_shape=latent_dim)

# summary() prints its report and returns None; call one per statement so the cell
# does not display a stray (None, None, None)
g_model.summary()
d_model.summary()
gan_model.summary()



Model: "generator"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
caption_input_layer (InputLayer [(None, 20)]         0                                            
__________________________________________________________________________________________________
caption_embedding_layer (Embedd (None, 20, 100)      345500      caption_input_layer[0][0]        
__________________________________________________________________________________________________
input_layer (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
decoder_lstm_layer (LSTM)       [(None, 20, 100), (N 80400       caption_embedding_layer[0][0]    
                                                                 input_layer[0][0]        

(None, None, None)

In [15]:
# # plot the generated model
# keras.utils.plot_model(g_model, show_shapes=True, dpi=90)

In [16]:
# # plot the generated model
# keras.utils.plot_model(d_model, show_shapes=True, dpi=90)

In [17]:
# # plot the generated model
# keras.utils.plot_model(gan_model, show_shapes=True, dpi=90)

In [18]:
# # generate samples and save as a plot and save the model
# def summarize_performance(step, g_model, gan_model, latent_dim, x_cap, n_samples=100):
# 	# prepare fake examples
# 	X, _ = generate_fake_samples(g_model, latent_dim, x_cap, n_samples)
#     # save plot
# 	# scale from [-1,1] to [0,1]
# 	X = (X + 1) / 2.0
# 	# plot images
# 	for i in range(100):
# 		# define subplot
# 		plt.subplot(10, 10, 1 + i)
# 		# turn off axis
# 		plt.axis('off')
# 		# plot raw pixel data
# 		plt.imshow(X[i, :, :, 0], cmap='gray_r')
# 	# save plot to file
# 	filename1 = 'generated_plot_%04d.png' % (step+1)
# 	plt.savefig(filename1)
# 	plt.close()
# 	# save the generator model
# 	filename2 = 'model_%04d.h5' % (step+1)
# 	g_model.save(filename2)
# 	# save the gan model
# 	filename3 = 'gan_model_%04d.h5' % (step+1)
# 	gan_model.save(filename3)
# 	print('>Saved: %s, %s, and %s' % (filename1, filename2, filename3))

In [19]:
def train(g_model, d_model, gan_model, dataset, latent_dim, n_epochs=50, n_batch=16,
          log_pixel_values=True):
    """Alternate discriminator and generator updates over the dataset.

    Inputs:
        g_model: generator model
        d_model: compiled discriminator model
        gan_model: compiled generator + discriminator model
        dataset: sequence of (image, caption) pairs
        latent_dim: dimension of the latent space
        n_epochs: number of passes over the dataset
        n_batch: batch size for the generator step; the discriminator sees
                 n_batch // 2 real plus n_batch // 2 fake images per step
        log_pixel_values: when True (default, the previous behaviour) print the unique
                          pixel values of each real/fake half batch as a debugging aid
    Raises:
        ValueError: if n_batch < 2, which would make the half batch empty
    """
    if n_batch < 2:
        raise ValueError("n_batch must be at least 2, got %r" % (n_batch,))

    bat_per_epo = len(dataset) // n_batch
    half_batch = n_batch // 2

    for i in range(n_epochs):
        for j in range(bat_per_epo):
            # prepare a mini batch of real samples, and fake images generated from
            # the same real captions plus fresh latent points
            X_img_real, x_cap_real, y_real = generate_real_samples(dataset, half_batch)
            X_img_fake, y_fake = generate_fake_samples(g_model, latent_dim, x_cap_real, half_batch)

            if log_pixel_values:
                print(np.unique(X_img_real), np.unique(X_img_fake))

            # train the discriminator on the stacked real + fake half batches
            X, y = np.vstack((X_img_real, X_img_fake)), np.vstack((y_real, y_fake))
            d_loss, _ = d_model.train_on_batch(X, y)

            # prepare points in latent space and fetch a full batch of real captions
            _, x_cap_real_full, _ = generate_real_samples(dataset, n_batch)
            X_gan = generate_latent_points(latent_dim, n_batch)
            # label the generated images as 'real' so the generator is pushed to
            # fool the discriminator
            y_gan = np.ones((n_batch, 1))

            # perform full batch training on e2e GAN model
            # using the latent point and real caption as input
            g_loss = gan_model.train_on_batch([X_gan, x_cap_real_full], y_gan)
            print('>%d, %d/%d, d=%.3f, g=%.3f' % (i+1, j+1, bat_per_epo, d_loss, g_loss))

        # wipe the accumulated cell output every second epoch
        if (i+1) % 2 == 0:
            clear_output()


In [None]:
# read in the training data and keep a reproducible random subset of 2048 samples
# NOTE: joblib.load unpickles the file, so only load files from a trusted source
import random
random.seed(5634)

training_set = random.sample(
    joblib.load('../data/piano-labelled/training_set.joblib'),
    2048,
)

In [None]:
train(
    g_model, d_model, gan_model,
    training_set, latent_dim,
    n_epochs=3, n_batch=18,
)

In [None]:
# # save the model
# g_model.save('../models/mini-gan/caption_piano_generator_model.h5')
# d_model.save('../models/mini-gan/caption_piano_discriminator_model.h5')
# gan_model.save('../models/mini-gan/caption_piano_gan_model.h5')

In [20]:
# read in the training data and keep a reproducible random subset of 3072 samples
# NOTE: joblib.load unpickles the file, so only load files from a trusted source
import random
random.seed(6372)
training_set = random.sample(
    joblib.load('../data/piano-labelled/training_set.joblib'),
    3072,
)

In [21]:
# build fresh (untrained) models for this training run
latent_dim = 100
g_model = caption_enhanced_generator(latent_dim)
d_model = caption_enhanced_discriminator()
# pass latent_dim explicitly so the GAN's noise input always matches the generator
gan_model = caption_enhanced_miniGAN(g_model, d_model, g_model_input_shape=latent_dim)

In [22]:
train(
    g_model, d_model, gan_model,
    training_set, latent_dim,
    n_epochs=5, n_batch=18,
)

[0. 1.] [0.0000000e+00 1.3142993e-38 1.3747493e-38 ... 9.9999976e-01 9.9999988e-01
 1.0000000e+00]
>5, 1/170, d=0.000, g=0.002
[0. 1.] [0.0000000e+00 1.2324019e-38 1.2430062e-38 ... 9.9999976e-01 9.9999988e-01
 1.0000000e+00]
>5, 2/170, d=0.000, g=0.002
[0. 1.] [0.0000000e+00 1.2533002e-38 1.3262560e-38 ... 9.9999976e-01 9.9999988e-01
 1.0000000e+00]
>5, 3/170, d=0.001, g=0.008
[0. 1.] [0.0000000e+00 1.2419445e-38 1.3098148e-38 ... 9.9999976e-01 9.9999988e-01
 1.0000000e+00]
>5, 4/170, d=0.000, g=0.013
[0. 1.] [0.0000000e+00 1.2002767e-38 1.2125364e-38 ... 9.9999976e-01 9.9999988e-01
 1.0000000e+00]
>5, 5/170, d=0.000, g=0.009
[0. 1.] [0.0000000e+00 1.1866823e-38 1.2171058e-38 ... 9.9999976e-01 9.9999988e-01
 1.0000000e+00]
>5, 6/170, d=0.000, g=0.003
[0. 1.] [0.0000000e+00 1.2211984e-38 1.3887487e-38 ... 9.9999976e-01 9.9999988e-01
 1.0000000e+00]
>5, 7/170, d=0.000, g=0.003
[0. 1.] [0.0000000e+00 1.1997640e-38 1.2078093e-38 ... 9.9999976e-01 9.9999988e-01
 1.0000000e+00]
>5, 8/170, d

In [23]:
# save the generator, the discriminator and the combined GAN model
for trained_model, model_path in (
    (g_model, '../models/mini-gan/caption_piano_generator_model_g5.h5'),
    (d_model, '../models/mini-gan/caption_piano_discriminator_model_g5.h5'),
    (gan_model, '../models/mini-gan/caption_piano_gan_model_g5.h5'),
):
    trained_model.save(model_path)



In [None]:
# load the saved generator model from disk
generator_path = '../models/mini-gan/caption_piano_generator_model_g5.h5'
g_model = keras.models.load_model(generator_path)

In [24]:
# draw one metadata row (fixed random_state, so the same row is picked on every run)
row = midi_meta.sample(1, random_state=65)

# the caption text of that row, and its token ids as a batch holding one caption
NLP_caption = row['caption_list'].values
caption = np.array(row['tokenized_captions'].tolist())
NLP_caption, caption

(array(["started off a bit negative and then transitioned to a more upbeat timbre but maintained a tense mood. . there was a tautness to this piece and i would label it tense/excited as i didn't feel a large shift in the tone. "],
       dtype=object),
 array([[3452, 2869, 2096, 3451,  321, 2027,  138, 3061, 3164, 3122, 3451,
         1970, 3258, 3110,  413, 1837, 3451, 3038, 3451, 3451]]))

In [25]:
def generate_latent_points(latent_dim, n_samples):
    """Sample `n_samples` latent vectors of length `latent_dim` from a standard normal.

    Same helper as the one defined in the training section of this notebook.
    Returns an array of shape (n_samples, latent_dim).
    """
    total_values = latent_dim * n_samples
    flat_noise = np.random.randn(total_values)
    return flat_noise.reshape(n_samples, latent_dim)


# draw a single latent vector to feed the generator together with the caption
latent_dim = 100
latent_points = generate_latent_points(latent_dim=latent_dim, n_samples=1)
latent_points.shape, caption.shape

((1, 100), (1, 20))

In [26]:
model = g_model
# the generator ends in a sigmoid, so X holds floats in [0, 1]; casting straight to
# uint8 truncates every value below 1.0 to 0, so threshold at 0.5 instead to get
# the binary (0/1) image
X = g_model.predict([latent_points, caption])
array = (X.reshape(106, 106) >= 0.5).astype(np.uint8)
np.unique(array)

array([0, 1], dtype=uint8)

In [27]:
# scale the 0/1 image to 0/255 into a new array: multiplying `array` in place made
# this cell give a different result every time it was re-run
pixel_array = (array * 255).astype(np.uint8)
new_image = Image.fromarray(pixel_array, 'L')
# save() returns None, so keep `new_image` bound to the image itself
new_image.save('../data/midi_reconstruction/images/captioned_piece_test.png')

In [28]:
# convert the saved image back into a MIDI file with the img2midi plugin
output_path = "../data/midi_reconstruction"
image_path = "../data/midi_reconstruction/images/captioned_piece_test.png"

img2midi_obj = img2midi(
    image_path,
    output_path,
    resolution=0.25,
)
img2midi_obj.convert_to_midi()

In [29]:
# run this cell to play the MIDI file
# to stop the music, interrupt the cell and call sp.stop()
from music21 import midi, converter, instrument, note, chord

mf = midi.MidiFile()
mf.open(f"{output_path}/music.mid")
try:
    mf.read()
finally:
    # always release the file handle, even if parsing the file fails
    mf.close()
s = midi.translate.midiFileToStream(mf)
sp = midi.realtime.StreamPlayer(s)
sp.play()

pygame 2.5.1 (SDL 2.28.2, Python 3.9.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [None]:
# stop the playback; the bare attribute `sp.stop` only referenced the method without calling it
sp.stop()