In [1]:
!pip install mido
!pip install pygame
!pip install music21
!pip install scikit-learn==1.3.0



In [1]:
# import some useful libraries
import glob, nltk, joblib
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from keras import metrics

from PIL import Image
from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import CountVectorizer

from music21 import midi
from plugins.midi2img import midi2img
from plugins.img2midi import img2midi
from IPython.display import clear_output

# Load In museGAN dataset for visualization purposes
It turned out that the people at museGAN is leveraging midi -> image conversion. The image consisted of bar of a multi track piano roll. From the below image, the horizontal represent time and the vericle represent the instrument used. In this dataset the instrument are layered from bottom to top as piano, strings, guitar, drums, bass.


In [2]:
DATA_PATH = "../data/dataset/"

In [3]:
# download the punkt tokenizer from nltk to tokenize the piece caption
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ktrin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# load the training data
# read in the training data
training_set = joblib.load(f'{DATA_PATH}/training_set_desc.joblib')
training_set[0]

# conver tuple to DF
training_set = pd.DataFrame(training_set, columns=['id', 'image', 'caption_list'])
training_set.head()

Unnamed: 0,id,image,caption_list
0,commu00343,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...",whispers of lucid skies traversing ethereal re...
1,commu07280,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...",an enchanting symphony of soaring dreams where...
2,commu02735,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...",uncharted melody a harmonious journey through ...
3,commu06179,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...",whispers of shadows a melodys dance in the twi...
4,commu03042,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...",whispers of shadows a haunting symphony where ...


# Construction of museGAN
## External Data Source
For whatever reason, if we wanted to perform GAN modeling, we can leverage conversion of MIDI data to that of the piano roll. Download the data from piano repo in README and start performing the things below. [Convert-MIDI-TO-NP-ARRAY](https://medium.com/analytics-vidhya/convert-midi-file-to-numpy-array-in-python-7d00531890c)

# Caption Processing

In [5]:
MAX_SEQ_LENGTH = 20 # 18 + start, end
EMBED_DIM = 100 
MAX_VOCAB_SIZE = 20000

In [6]:
# # load in the metadata
# # create a list of captions that concatenate the piece description and arousal
# # lower case te caption list
# midi_meta = pd.read_csv('../data/piano-labelled/labelled_piano_midi_metadata.csv')
# midi_meta['caption_list'] = midi_meta['piece_description'].str.lower()+ ". " + midi_meta['piece_arousal'].str.lower()
# midi_meta

In [7]:
# build a vocabulary using sklearn count vectorizer to create a vocab from the most frequent words
input_captions = []
max_caption_length = -1 

for caption in tqdm(training_set['caption_list'].values):
    tokenized_caption = nltk.word_tokenize(caption, language='english')

    if len(tokenized_caption) > max_caption_length:
        max_caption_length = len(tokenized_caption)

    caption = (' '.join(tokenized_caption)).lower()
    input_captions.append(caption)


vectorizer = CountVectorizer(max_features=MAX_VOCAB_SIZE)
vectorizer.fit(input_captions)
vocab = vectorizer.get_feature_names_out()
MAX_VOCAB_SIZE = len(vocab)

  0%|          | 0/195 [00:00<?, ?it/s]

In [8]:
# turn vocab into a dictionary of words and token id
# replace some words with special tokens like start/end/unk
# if the caption is too short, pad it with <pad> token
id_vocab_dict = {}
vocab_id_dict = {}

for sid, svocab in enumerate(vocab):
    id_vocab_dict[sid] = svocab
    vocab_id_dict[svocab] = sid

id_vocab_dict[MAX_VOCAB_SIZE] = "<unk>"
id_vocab_dict[MAX_VOCAB_SIZE + 1] = "<start>"
id_vocab_dict[MAX_VOCAB_SIZE + 2] = "<end>"
id_vocab_dict[MAX_VOCAB_SIZE + 3] = "<pad>"

vocab_id_dict["<unk>"] = MAX_VOCAB_SIZE
vocab_id_dict["<start>"] = MAX_VOCAB_SIZE + 1
vocab_id_dict["<end>"] = MAX_VOCAB_SIZE + 2
vocab_id_dict["<pad>"] = MAX_VOCAB_SIZE + 3

In [9]:
# tokenization - take the input caption and tokenize it
# declare a max sequence length 
def convert_text_to_data(texts, 
                         vocab_id_dict, 
                         max_length=20, 
                         type=None):
    """
        Function to convert text based data into tokenized data with proper padding
    """

    processed_data = []
    for text_num, text in enumerate(texts):
        sentence_ids = []

        # split the sentence into token
        # use the vocab to turn the word token into number
        for token in text.split():
            if token in vocab_id_dict.keys():
                sentence_ids.append(vocab_id_dict[token])
            else:
                sentence_ids.append(vocab_id_dict["<unk>"])

        vocab_size = len(vocab_id_dict.keys())

        # for decoder cases:
        # input sentence: <start>, [tokenize words from vocab], <end>, padded with <unk>
        # ouput sentence has: [tokenize words from vocab], <end>, padded with <unk>
        if type == 'input_target':
            ids = ([vocab_size - 3] + sentence_ids + [vocab_size - 2] + [vocab_size - 1] * max_length)[:max_length]
        elif type == 'output_target':
            ids = (sentence_ids + [vocab_size - 2] + [vocab_size - 1] * max_length)[:max_length]
        processed_data.append(ids)

    return np.array(processed_data)


train_target_input_data = convert_text_to_data(input_captions,
                                                vocab_id_dict,
                                                type='input_target',
                                                max_length=MAX_SEQ_LENGTH)
len(train_target_input_data)

195

In [10]:
# added the tokenized caption to the metadata
training_set['tokenized_captions'] = train_target_input_data.tolist()
training_set

Unnamed: 0,id,image,caption_list,tokenized_captions
0,commu00343,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...",whispers of lucid skies traversing ethereal re...,"[1115, 1092, 669, 560, 822, 986, 353, 743, 111..."
1,commu07280,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...",an enchanting symphony of soaring dreams where...,"[1115, 61, 320, 924, 669, 833, 272, 1083, 438,..."
2,commu02735,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...",uncharted melody a harmonious journey through ...,"[1115, 995, 592, 1114, 439, 511, 952, 319, 743..."
3,commu06179,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...",whispers of shadows a melodys dance in the twi...,"[1115, 1092, 669, 810, 1114, 593, 226, 484, 94..."
4,commu03042,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...",whispers of shadows a haunting symphony where ...,"[1115, 1092, 669, 810, 1114, 446, 924, 1083, 5..."
...,...,...,...,...
190,commu05422,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...",melodic whispers under a starlit sky embracing...,"[1115, 590, 1092, 996, 1114, 874, 825, 307, 94..."
191,commu03260,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...",whispers in shadows an enchanting melody awake...,"[1115, 1092, 484, 810, 61, 320, 592, 82, 1035,..."
192,commu04717,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...",whispers of cinematic dreams unfold as a young...,"[1115, 1092, 669, 189, 272, 1002, 68, 1114, 11..."
193,commu02652,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...",lost in shadows the enigmatic serenade paintin...,"[1115, 556, 484, 810, 943, 327, 801, 694, 1059..."


In [11]:
training_set['image'].values[0].shape

(106, 106, 1)

In [12]:
# create image, tokenized pair

# create image-caption pairs
datasets = []
for i, row in training_set.iterrows():
    caption = np.array(row['tokenized_captions'])
    images = np.array(row['image'])
    try:
        datasets.append((images, caption))
    except:
        pass

In [13]:
# # select a random row from the metadata to get the caption
# row = midi_meta.sample(1, random_state=22)

# # get a random image tokenize caption and actual caption
# NLP_caption = row['caption_list'].values
# caption = [np.array(a) for a in row['tokenized_captions'].values]
# caption = np.array(caption)
# NLP_caption, caption

# GAN Definition

GAN model consists of two part:
1. Generator
2. Discriminator

In [25]:
# check to see if tensorflow mount to GPU properly
print(tf.config.list_physical_devices('GPU'))


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [26]:
def caption_enhanced_generator(latent_dim=100, 
                               caption_dim=MAX_SEQ_LENGTH, 
                               vocab_size=len(vocab_id_dict.keys()), 
                               embed_dim=EMBED_DIM):
    """Define the generator model
        Inputs:
            latent_dim: dimension of the latent space
        Output:
            model: the generator model
    """
    n_nodes = 128 * 53 * 53

    # vectorized input layers
    input_layer = keras.layers.Input(shape=(latent_dim,), name='input_layer')
    
    # # vectorized caption input layers
    # # apply word embedding to the caption
    caption_input_layer = keras.layers.Input(shape=(caption_dim,), name='caption_input_layer')
    embedding_layer  = keras.layers.Embedding(input_dim=vocab_size,
                                                output_dim=embed_dim,
                                                name='caption_embedding_layer')
    embed_caption = embedding_layer(caption_input_layer)

    # # source_image_encoding = keras.layers.GlobalAveragePooling2D()(dense4)
    # # using LSTM to encode the caption with the input layer
    lstm_layer = keras.layers.LSTM(100, return_sequences=True, return_state=True, name="decoder_lstm_layer")
    decoder_output, decoder_state_h_output, decoder_state_c_output = lstm_layer(embed_caption, initial_state=[input_layer, input_layer])

    # apply 1D Global Average Pooling to the output of the dense layer on the caption decoded
    # global_average_pooling1d_layer = keras.layers.GlobalAveragePooling1D()(decoder_output)

    # Dense Layer 1
    dense1 = keras.layers.Dense(n_nodes)(decoder_state_c_output)
    leaky_relu1 = keras.layers.LeakyReLU(alpha=0.35)(dense1)
    reshape_layer = keras.layers.Reshape((53, 53, 128))(leaky_relu1)

    # Dense Layer 2
    dense2 =  keras.layers.Dense(1024)(reshape_layer)

    # Conv2DTranspose Layer
    conv2d_transpose = keras.layers.Conv2DTranspose(1024, (4, 4), strides=(2, 2), padding='same')(dense2)

    # Dense Layer 3
    dense3 =  keras.layers.Dense(1024)(conv2d_transpose)
    leaky_relu2 = keras.layers.LeakyReLU(alpha=0.35)(dense3)

    # Dense Layer 4
    dense4 =  keras.layers.Dense(1024)(leaky_relu2)

    # Conv2D Layer
    conv2d = keras.layers.Conv2D(1, (7, 7), padding='same', activation='sigmoid')(dense4)

    # Create the model
    model = keras.Model(inputs=[input_layer,caption_input_layer], outputs=conv2d, name='generator_model')
    return model


In [27]:
def caption_enhanced_discriminator(in_shape = (106,106,1)):
    """
        GAN discriminator model
        Inputs:
            in_shape: shape of the input image
        Output:
            model: discriminator model with binary crossentropy loss to denotes if the image is real or fake
    """
    # Input Layer
    input_layer = keras.layers.Input(shape=in_shape, name='input_layer')
    
    # 2D Convlution Layer 1
    conv1 = keras.layers.Conv2D(64, (3,3), strides=(2, 2), padding='same')(input_layer)
    leaky_relu1 = keras.layers.LeakyReLU(alpha=0.2)(conv1)
    dropout1 = keras.layers.Dropout(0.5)(leaky_relu1)
 
    # 2D Convlution Layer 2
    conv2 = keras.layers.Conv2D(64, (3,3), strides=(2, 2), padding='same')(dropout1)
    leaky_relu2 = keras.layers.LeakyReLU(alpha=0.2)(conv2)
    dropout2 = keras.layers.Dropout(0.5)(leaky_relu2)

    # Flatten Layer
    flatten_layer = keras.layers.Flatten()(dropout2)

    # Batch Normalization Layer
    batch_normalization = keras.layers.BatchNormalization()(flatten_layer)

    # Dense Output Disminator Layer
    discriminate_layer = keras.layers.Dense(1, activation='sigmoid')(batch_normalization)

    # Create the model
    model = keras.Model(inputs=input_layer, outputs=discriminate_layer, name='discriminator_model')
    
    # model compile
    # opt = keras.optimizers.Adam(lr=0.0002, beta_1=0.5)
    # model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    return model

In [28]:
def caption_enhanced_miniGAN(g_model, d_model,
                             g_model_input_shape=100, 
                             g_model_caption_input_shape=MAX_SEQ_LENGTH):
    """
        GAN model architecture
        Inputs:
            g_model: generator model
            d_model: discriminator model
            g_model_input_shape: shape of the input to the generator model
            g_model_caption_input_shape: shape of the input caption to the generator model
        Output:
            model: GAN model
    """
    # Pause the training of the discriminator
    d_model.trainable = False

    # Define the input layer for the generator
    generator_input = keras.layers.Input(shape=(g_model_input_shape))  # Specify the shape of the generator's input
    caption_input = keras.layers.Input(shape=(g_model_caption_input_shape))  # Specify the shape of the generator's input

    # Define the output of the generator
    generator_output = g_model([generator_input, caption_input])

    # Define the output of the discriminator
    discriminator_output = d_model(generator_output)

    # Create the model
    model = keras.Model(inputs=[generator_input, caption_input], outputs=discriminator_output)

    # # Compile the model
    # opt = keras.optimizers.Adam(lr=0.0002, beta_1=0.5, beta_2=0.9, clipnorm=1.0)
    # model.compile(loss='binary_crossentropy', optimizer=opt)
    return model

In [29]:
class GAN(keras.Model):
    def __init__(
        self,
        discriminator,
        generator,
        latent_dim,
        discriminator_extra_steps=3,
        gp_weight=10.0,
    ):
        super().__init__()
        self.discriminator = discriminator
        self.generator = generator
        self.latent_dim = latent_dim
        self.d_steps = discriminator_extra_steps
        self.gp_weight = gp_weight
        self.d_gradient = []
        self.g_gradient = []

    def compile(self, d_optimizer, g_optimizer, d_loss_fn, g_loss_fn):
        super().compile()
        self.d_optimizer = d_optimizer
        self.g_optimizer = g_optimizer
        self.d_loss_fn = d_loss_fn
        self.g_loss_fn = g_loss_fn


    def train_step(self, dataset):
        # Unpack the data
        real_images, real_captions = dataset

        if isinstance(real_images, tuple):
            real_images = real_images[0]

        # Get the batch size
        batch_size = tf.shape(real_images)[0]

        # For each batch, we are going to perform the
        # following steps as laid out in the original paper:
        # 1. Train the generator and get the generator loss
        # 2. Train the discriminator and get the discriminator loss
        # 3. Calculate the gradient penalty
        # 4. Multiply this gradient penalty with a constant weight factor
        # 5. Add the gradient penalty to the discriminator loss
        # 6. Return the generator and discriminator losses as a loss dictionary

        # Train the discriminator first. The original paper recommends training
        # the discriminator for `x` more steps (typically 5) as compared to
        # one step of the generator. Here we will train it for 3 extra steps
        # as compared to 5 to reduce the training time.
        for i in range(self.d_steps):

            # Get the latent vector (random noise)
            random_latent_vectors = tf.random.normal(
                shape=(batch_size, self.latent_dim)
            )


            with tf.GradientTape() as tape:
                # Generate fake images from the latent vector
                fake_images = self.generator([random_latent_vectors, real_captions], training=True)

                # Get the logits for the fake images
                fake_logits = self.discriminator(fake_images, training=True)
                
                # Get the logits for the real images
                real_logits = self.discriminator(real_images, training=True)

                # Calculate the discriminator loss using the fake and real image logits
                d_loss = self.d_loss_fn(real_img=real_logits, fake_img=fake_logits)


            # Get the gradients w.r.t the discriminator loss
            d_gradient = tape.gradient(d_loss, self.discriminator.trainable_variables)
            self.d_gradient.append((i, d_gradient))

            # Update the weights of the discriminator using the discriminator optimizer
            self.d_optimizer.apply_gradients(
                zip(d_gradient, self.discriminator.trainable_variables)
            )

        # Train the generator
        # Get the latent vector
        random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim))
        with tf.GradientTape() as tape:
            # Generate fake images using the generator
            generated_images = self.generator([random_latent_vectors, real_captions], training=True)

            # Get the discriminator logits for fake images
            gen_img_logits = self.discriminator(generated_images, training=True)

            # Calculate the generator loss
            g_loss = self.g_loss_fn(gen_img_logits)

        # Get the gradients w.r.t the generator loss
        gen_gradient = tape.gradient(g_loss, self.generator.trainable_variables)
        # Update the weights of the generator using the generator optimizer
        self.g_optimizer.apply_gradients(
            zip(gen_gradient, self.generator.trainable_variables)
        )
        self.g_gradient.append(gen_gradient)
        return {"d_loss": d_loss, "g_loss": g_loss}

In [30]:
def generate_real_samples(dataset, n_samples):
    # slice the image and caption from the dataset
    # generate 'real' class labels (1)
    images, captions = zip(*dataset)
    images = np.array(images)
    captions = np.array(captions)
    ix = np.random.randint(0, images.shape[0], n_samples)
    X_img = images[ix]   
    X_cap = captions[ix]
    y = np.ones((n_samples, 1))
    return X_img, X_cap, y
 
def generate_latent_points(latent_dim, n_samples):
    x_input = np.random.randn(latent_dim * n_samples)
    x_input = x_input.reshape(n_samples, latent_dim)
    return x_input

def generate_fake_samples(g_model, latent_dim, caption, n_samples):
    x_input = generate_latent_points(latent_dim, n_samples)
    X = g_model.predict([x_input, caption])
    y = np.zeros((n_samples, 1))
    return X, y

In [31]:
latent_dim = 100
g_model = caption_enhanced_generator(latent_dim)
d_model = caption_enhanced_discriminator()
g_model.summary(), d_model.summary()

Model: "generator_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
caption_input_layer (InputLayer [(None, 10)]         0                                            
__________________________________________________________________________________________________
caption_embedding_layer (Embedd (None, 10, 100)      6000        caption_input_layer[0][0]        
__________________________________________________________________________________________________
input_layer (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
decoder_lstm_layer (LSTM)       [(None, 10, 100), (N 80400       caption_embedding_layer[0][0]    
                                                                 input_layer[0][0]  

(None, None)

In [32]:
# # plot the generated model
# keras.utils.plot_model(g_model, show_shapes=True, dpi=90)

In [33]:
# # plot the generated model
# keras.utils.plot_model(d_model, show_shapes=True, dpi=90)

In [34]:
# # plot the generated model
# keras.utils.plot_model(gan_model, show_shapes=True, dpi=90)

In [35]:
# read in the training data
import random
random.seed(5634)

# training_set = joblib.load('../data/piano-labelled/training_set.joblib')
training_dataset = random.sample(datasets, 4096) #5 is the lenth of the sample

In [36]:
# unzip the dataset
images, captions = zip(*training_dataset)
images = np.array(images)
captions = np.array(captions)
type(images), images.shape

(numpy.ndarray, (4096, 106, 106, 1))

In [37]:
# Set some training parameters
BATCH_SIZE = 8
noise_dim = 100
epochs = 3


# Instantiate the optimizer for both networks
# (learning_rate=0.0002, beta_1=0.5 are recommended)
generator_optimizer = keras.optimizers.Adam(
    learning_rate=0.0002, beta_1=0.5, beta_2=0.9,
    clipnorm=1.0
)
discriminator_optimizer = keras.optimizers.Adam(
    learning_rate=0.0002, beta_1=0.5, beta_2=0.9
)

cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# Define the loss functions for the discriminator,
# which should be (fake_loss - real_loss).
# We will add the gradient penalty later to this loss function.
def discriminator_loss(real_img, fake_img):
    # real_loss = tf.reduce_mean(real_img)
    # fake_loss = tf.reduce_mean(fake_img)

    real_loss = cross_entropy(tf.ones_like(real_img), real_img)
    fake_loss = cross_entropy(tf.zeros_like(fake_img), fake_img) 
    return fake_loss + real_loss


# Define the loss functions for the generator.
def generator_loss(fake_img):
    # return -tf.reduce_mean(fake_img)
    return cross_entropy(tf.ones_like(fake_img), fake_img)



# Instantiate the customer `GANMonitor` Keras callback.
# cbk = GANMonitor(num_img=3, latent_dim=noise_dim, filepath='../model/wgan-callbacks')

# Get the wgan model
gan = GAN(
    discriminator=d_model,
    generator=g_model,
    latent_dim=noise_dim,
    discriminator_extra_steps=3,
)

# Compile the wgan model
gan.compile(
    d_optimizer=discriminator_optimizer,
    g_optimizer=generator_optimizer,
    g_loss_fn=generator_loss,
    d_loss_fn=discriminator_loss,
)

# Start training
gan.fit(images, captions, batch_size=BATCH_SIZE, epochs=epochs)

Epoch 1/3




Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1e922910220>

In [39]:
# save the model
g_model.save('../models/mini-gan/comMU_generator_model_g3.h5')
d_model.save('../models/mini-gan/comMU_discriminator_model_g3.h5')



In [40]:
# Load model
g_model = keras.models.load_model('../models/mini-gan/comMU_generator_model_g3.h5')



In [54]:
# midi_meta = pd.read_csv(f'{DATA_PATH}/commu_meta_caption.csv')
# midi_meta

In [53]:
training_set

Unnamed: 0,id,image,caption_list,tokenized_captions
0,commu00001,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","a, minor, mid, main, melody, cinematic, string...","[57, 56, 56, 34, 56, 33, 56, 29, 56, 32]"
1,commu00002,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","c, major, mid, low, accompaniment, newage, aco...","[57, 56, 56, 30, 56, 33, 56, 28, 56, 5]"
2,commu00003,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","a, minor, mid, high, riff, cinematic, string, ...","[57, 56, 56, 34, 56, 33, 56, 26, 56, 42]"
3,commu00004,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","c, major, mid, pad, cinematic, choir","[57, 56, 56, 30, 56, 33, 56, 39, 56, 15]"
4,commu00005,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","a, minor, mid, low, pad, cinematic, acoustic, ...","[57, 56, 56, 34, 56, 33, 56, 28, 56, 39]"
...,...,...,...,...
10241,commu10377,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","c, major, low, bass, cinematic, electric, bass","[57, 56, 56, 30, 56, 28, 56, 8, 56, 15]"
10242,commu10378,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","a, minor, mid, pad, cinematic, string, ensembl...","[57, 56, 56, 34, 56, 33, 56, 39, 56, 15]"
10243,commu10379,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","a, minor, mid, pad, cinematic, string, ensemble","[57, 56, 56, 34, 56, 33, 56, 39, 56, 15]"
10244,commu10380,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","c, major, low, bass, cinematic, acoustic, piano","[57, 56, 56, 30, 56, 28, 56, 8, 56, 15]"


In [55]:
# select a random row from the metadata to get the caption
row = training_set.sample(1, random_state=756)

# get a random image tokenize caption and actual caption
NLP_caption = row['caption_list'].values
caption = [np.array(a) for a in row['tokenized_captions'].values]
caption = np.array(caption)
NLP_caption, caption

(array(['c, major, mid, high, main, melody, newage, acoustic, piano'],
       dtype=object),
 array([[57, 56, 56, 30, 56, 33, 56, 26, 56, 29]]))

In [56]:
def generate_latent_points(latent_dim, n_samples):
    x_input = np.random.randn(latent_dim * n_samples)
    x_input = x_input.reshape(n_samples, latent_dim)
    return x_input


# declare a latent space
latent_dim = 100
latent_points = generate_latent_points(latent_dim, 1)
latent_points.shape, caption.shape

((1, 100), (1, 10))

In [57]:
model = g_model
X = g_model.predict([latent_points, caption])#*400
array = np.array(X.reshape(106,106),dtype = np.uint8)
np.unique(array)

array([0], dtype=uint8)

In [36]:
array*=255
new_image = Image.fromarray(array,'L')
new_image = new_image.save(f'../data/midi_reconstruction/images/captioned_piece_test.png')

In [37]:
# reconvert MIDI images to MIDI files
image_path = "../data/midi_reconstruction/images/captioned_piece_test.png"
output_path = "../data/midi_reconstruction"

img2midi_obj = img2midi(image_path, output_path, resolution=0.25)
img2midi_obj.convert_to_midi()

In [38]:
# run to cell to play
# stop the cell and run sp.stop to stop the music
from music21 import midi, converter, instrument, note, chord

mf = midi.MidiFile()
mf.open(f"{output_path}/music.mid")
mf.read()
mf.close()
s = midi.translate.midiFileToStream(mf)
sp = midi.realtime.StreamPlayer(s)
sp.play()

pygame 2.5.1 (SDL 2.28.2, Python 3.9.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [1]:
sp.stop

NameError: name 'sp' is not defined

In [None]:
class GAN(keras.Model):
    def __init__(
        self,
        discriminator,
        generator,
        discriminator_extra_steps=3,
        gp_weight=10.0,
    ):
        super().__init__()
        self.discriminator = discriminator
        self.generator = generator
        self.d_steps = discriminator_extra_steps
        self.gp_weight = gp_weight
        self.d_gradient = []
        self.g_gradient = []

    def compile(self, d_optimizer, g_optimizer, d_loss_fn, g_loss_fn):
        """
            TODO: incorporate trainign metric
        """
        super().compile()
        self.d_optimizer = d_optimizer
        self.g_optimizer = g_optimizer
        self.d_loss_fn = d_loss_fn
        self.g_loss_fn = g_loss_fn
        self.g_metric = [tf.keras.metrics.AUC(), tf.keras.metrics.Accuracy()]


    def train_step(self, dataset):
        # Unpack the data
        characteristics_curves, econ_data, target_ranking = dataset

        if isinstance(characteristics_curves, tuple):
            characteristics_curves = characteristics_curves[0]

        # Get the batch size
        batch_size = tf.shape(characteristics_curves)[0]

        # For each batch, we are going to perform the
        # following steps as laid out in the original paper:
        # 1. Train the generator and get the generator loss
        # 2. Train the discriminator and get the discriminator loss
        # 3. Calculate the gradient penalty
        # 4. Multiply this gradient penalty with a constant weight factor
        # 5. Add the gradient penalty to the discriminator loss
        # 6. Return the generator and discriminator losses as a loss dictionary

        # Train the discriminator first. The original paper recommends training
        # the discriminator for `x` more steps (typically 5) as compared to
        # one step of the generator. Here we will train it for 3 extra steps
        # as compared to 5 to reduce the training time.
        for i in range(self.d_steps):
            with tf.GradientTape() as tape:
                # have the generator generate the risk probability and shortage ranking
                fake_risk_logits, fake_ranking = self.generator([characteristics_curves, econ_data], training=True)

                # Get the logits for the fake images
                fake_logits = self.discriminator(fake_ranking, training=True)
                
                # Get the logits for the real images
                real_logits = self.discriminator(target_ranking, training=True)

                # Calculate the discriminator loss using the fake and real image logits
                d_loss = self.d_loss_fn(real=real_logits, fake=fake_logits)


            # Get the gradients w.r.t the discriminator loss
            d_gradient = tape.gradient(d_loss, self.discriminator.trainable_variables)
            self.d_gradient.append((i, d_gradient))

            # Update the weights of the discriminator using the discriminator optimizer
            self.d_optimizer.apply_gradients(
                zip(d_gradient, self.discriminator.trainable_variables)
            )

        # Train the generator
        with tf.GradientTape() as tape:
            # Generate fake images using the generator
            generated_ranking, generated_ranking = self.generator([characteristics_curves, econ_data], training=True)


            # apply accuracy metric calculation on generated_ranking
            accuracy = tf.keras.metrics.Accuracy()
            accuracy.update_state(target_ranking, generated_ranking)

            # Get the discriminator logits for fake images
            gen_img_logits = self.discriminator(generated_ranking, training=True)

            # Calculate the generator loss
            g_loss = self.g_loss_fn(gen_img_logits)

        # Get the gradients w.r.t the generator loss
        gen_gradient = tape.gradient(g_loss, self.generator.trainable_variables)
        # Update the weights of the generator using the generator optimizer
        self.g_optimizer.apply_gradients(
            zip(gen_gradient, self.generator.trainable_variables)
        )
        self.g_gradient.append(gen_gradient)
        return {"d_loss": d_loss, "g_loss": g_loss}