# Model Pegasus

* Adjusted the relative weights of the MSE and similarity terms in the loss function
* Changed the way convolution layers are used in the decoder
* Changed the activation function from ReLU to LeakyReLU
* Reduced the number of network layers and added regularization

## Import Packages

In [None]:
import numpy as np
import librosa
from note_seq.protobuf import music_pb2
from note_seq.midi_synth import fluidsynth
from note_seq import sequences_lib
from note_seq import audio_io
from note_seq import midi_io
from pydub import AudioSegment
import tensorflow as tf
from note_seq import note_sequence_to_midi_file, NoteSequence, midi_to_note_sequence
from magenta.models.music_vae import TrainedModel, configs

import pandas as pd
import torch
import clip
from PIL import Image
import os
from sklearn.metrics.pairwise import cosine_similarity

import torch
import torch.nn as nn
import note_seq

from tensorflow.keras.layers import Input, Dense, Lambda, LSTM, RNN, LSTMCell, RepeatVector, TimeDistributed, Layer, Dropout, BatchNormalization, LeakyReLU, MultiHeadAttention, LayerNormalization, Reshape, Conv1DTranspose, Activation, Cropping1D, ReLU, Conv1D
from tensorflow.keras.regularizers import l2

from tensorflow.keras import Model
from tensorflow.keras import backend as K

from sklearn.model_selection import train_test_split

from tensorflow.keras.optimizers import Adam, SGD
from tqdm import tqdm

import h5py
import json

from tensorflow.keras.models import load_model
from scipy.io.wavfile import write
import fluidsynth

from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize
import random

## Load Dataset

In [None]:
# Load the processed art/music matching table. Rows whose audio clip contained
# no sound were already removed before this CSV was written.
base_data = pd.read_csv("processed_music_matched_data.csv")
# Bare last expression: rich (truncated) display of the frame.
base_data

Unnamed: 0,Artwork,Art_Utterance,Music_Name,Music_Comment,Similarity_Score
0,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,"She seems very happy in the picture, and you w...",ABVYSaLu_VM_10-20,Here we have a slow piano piece played in a ma...,0.791458
1,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,This woman has really knotty hands which makes...,vnwKpQeza3A_320-330,This is a recording of two didgeridoos. They a...,0.772168
2,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,"When looking at this woman, I am filled with c...",0VwX92X3iPc_30-40,This audio contains a female voice speaking in...,0.798202
3,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,"A woman looking at ease, peaceful, and satisfi...",kh6rmFg3U4k_480-490,The low quality recording features a resonatin...,0.792188
4,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,She looks like a lady from that past that migh...,-VI2IRq17rs_360-370,"In this clip, a large bell is rung and left to...",0.740201
...,...,...,...,...,...
422756,david-burliuk_landscape-1,The greenery landscape and flowery background ...,M0ygCD6WyXw_0-10,This clip consists of a blowing horn being pla...,0.758374
422757,gino-severini_a-dancer-1,the collection and collage of different colors...,oMZcsGUi8ZE_0-10,This clip features a synchronised playing of s...,0.799300
422758,ivan-aivazovsky_sea-at-night-1861,The peaceful reflections of the moonlight on t...,s1QeDT7jqHQ_30-40,The low quality recording features multiple la...,0.781008
422759,ivan-aivazovsky_sea-at-night-1861,I can imagine the sailors resting this peacefu...,ABVYSaLu_VM_10-20,Here we have a slow piano piece played in a ma...,0.733153


### Load Dataset with Extracted Features

In [None]:
# Read every pre-computed array out of the HDF5 store while the file is open.
with h5py.File('processed_data_with_melody2.h5', 'r') as hf:
    melody_data = hf['melody'][:]
    image_features = hf['image_features'][:]
    text_features = hf['text_features'][:]
    features_mean = hf['features_mean'][:]
    features_weighted = hf['features_weighted'][:]
    combined_features = hf['combined_features'][:]

# Each stored melody is a byte string holding a JSON document; decode it and
# hand the parsed value to note_seq.Melody to rebuild the object.
melodies = []
for raw_melody in melody_data:
    melodies.append(note_seq.Melody(json.loads(raw_melody.decode())))

# Attach everything to base_data, one Python object (array / Melody) per row.
# The dict preserves insertion order, so the column order is unchanged.
new_columns = {
    'image_features': image_features,
    'text_features': text_features,
    'features_mean': features_mean,
    'features_weighted': features_weighted,
    'melody': melodies,
    'combined_features': combined_features,
}
for column_name, column_values in new_columns.items():
    base_data[column_name] = list(column_values)

In [None]:
# Display the frame again, now including the feature and melody columns.
base_data

Unnamed: 0,Artwork,Art_Utterance,Music_Name,Music_Comment,Similarity_Score,image_features,text_features,features_mean,features_weighted,melody,combined_features
0,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,"She seems very happy in the picture, and you w...",ABVYSaLu_VM_10-20,Here we have a slow piano piece played in a ma...,0.791458,"[0.265, -0.1715, 0.1322, 0.013596, 0.4536, -0....","[-0.0315, 0.1757, -0.1968, 0.0465, -0.04526, -...","[0.1167, 0.002075, -0.0323, 0.03006, 0.2042, -...","[0.1464, -0.03265, 0.0006714, 0.02676, 0.2542,...","(50, -1, 50, 50, 50, 50, 50, 50, 50, 50, 50, 5...","[0.265, -0.1715, 0.1322, 0.013596, 0.4536, -0...."
1,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,This woman has really knotty hands which makes...,vnwKpQeza3A_320-330,This is a recording of two didgeridoos. They a...,0.772168,"[0.265, -0.1715, 0.1322, 0.013596, 0.4536, -0....","[-0.0724, 0.3037, -0.4678, -0.1588, 0.1721, 0....","[0.09625, 0.0661, -0.1677, -0.07263, 0.313, 0....","[0.13, 0.01855, -0.10767, -0.0554, 0.341, 0.04...","(-2, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 3...","[0.265, -0.1715, 0.1322, 0.013596, 0.4536, -0...."
2,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,"When looking at this woman, I am filled with c...",0VwX92X3iPc_30-40,This audio contains a female voice speaking in...,0.798202,"[0.265, -0.1715, 0.1322, 0.013596, 0.4536, -0....","[0.05783, -0.0409, -0.4458, -0.0473, -0.08594,...","[0.1614, -0.1062, -0.1567, -0.01685, 0.1838, -...","[0.1821, -0.11926, -0.0989, -0.010765, 0.2378,...","(-2, -2, -2, -2, -2, -2, 57, 56, 56, 55, 55, -...","[0.265, -0.1715, 0.1322, 0.013596, 0.4536, -0...."
3,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,"A woman looking at ease, peaceful, and satisfi...",kh6rmFg3U4k_480-490,The low quality recording features a resonatin...,0.792188,"[0.265, -0.1715, 0.1322, 0.013596, 0.4536, -0....","[0.1517, -0.2998, 0.1375, 0.3303, 0.3237, -0.5...","[0.2083, -0.2356, 0.1348, 0.172, 0.3887, -0.36...","[0.2196, -0.2228, 0.1343, 0.1403, 0.4019, -0.3...","(67, 67, 67, 67, 67, 67, 67, 67, 67, 66, 66, -...","[0.265, -0.1715, 0.1322, 0.013596, 0.4536, -0...."
4,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,She looks like a lady from that past that migh...,-VI2IRq17rs_360-370,"In this clip, a large bell is rung and left to...",0.740201,"[0.265, -0.1715, 0.1322, 0.013596, 0.4536, -0....","[-0.1611, -0.002035, -0.2603, 0.1305, 0.1753, ...","[0.05188, -0.0868, -0.064, 0.072, 0.3145, -0.2...","[0.0945, -0.1037, -0.02472, 0.06033, 0.3423, -...","(-2, -2, 60, 60, -1, -2, -2, -2, 60, 60, 60, -...","[0.265, -0.1715, 0.1322, 0.013596, 0.4536, -0...."
...,...,...,...,...,...,...,...,...,...,...,...
422756,david-burliuk_landscape-1,The greenery landscape and flowery background ...,M0ygCD6WyXw_0-10,This clip consists of a blowing horn being pla...,0.758374,"[0.2013, -0.2996, 0.1554, -0.04733, 0.1796, -0...","[0.06445, 0.1355, 0.3389, -0.1179, -0.04034, 0...","[0.1328, -0.08203, 0.2471, -0.08264, 0.0696, 0...","[0.1466, -0.1256, 0.2288, -0.07556, 0.0916, -0...","(-2, 66, 66, 59, 59, 59, 59, 59, 59, 59, 59, 5...","[0.2013, -0.2996, 0.1554, -0.04733, 0.1796, -0..."
422757,gino-severini_a-dancer-1,the collection and collage of different colors...,oMZcsGUi8ZE_0-10,This clip features a synchronised playing of s...,0.799300,"[0.282, 0.1711, 0.1952, -0.1887, 0.417, 0.0896...","[-0.05054, 0.3774, 0.2979, -0.06555, -0.04782,...","[0.1157, 0.2744, 0.2466, -0.1272, 0.1846, -0.0...","[0.1489, 0.2537, 0.2363, -0.1395, 0.2311, -0.0...","(-2, -2, 69, 69, -1, 64, -1, -2, -2, 71, 71, 7...","[0.282, 0.1711, 0.1952, -0.1887, 0.417, 0.0896..."
422758,ivan-aivazovsky_sea-at-night-1861,The peaceful reflections of the moonlight on t...,s1QeDT7jqHQ_30-40,The low quality recording features multiple la...,0.781008,"[0.1924, -0.2947, 0.07135, 0.392, 0.2793, -0.0...","[0.0625, -0.1382, 0.1168, 0.2683, 0.3545, -0.1...","[0.1274, -0.2164, 0.0941, 0.33, 0.317, -0.1034...","[0.1405, -0.2322, 0.08954, 0.3428, 0.3093, -0....","(63, 63, 60, 60, 63, 63, 63, 65, 65, 63, 62, 6...","[0.1924, -0.2947, 0.07135, 0.392, 0.2793, -0.0..."
422759,ivan-aivazovsky_sea-at-night-1861,I can imagine the sailors resting this peacefu...,ABVYSaLu_VM_10-20,Here we have a slow piano piece played in a ma...,0.733153,"[0.1924, -0.2947, 0.07135, 0.392, 0.2793, -0.0...","[-0.01753, -0.018, 0.11707, -0.0385, 0.2886, 0...","[0.0874, -0.1564, 0.09424, 0.1768, 0.284, 0.02...","[0.10846, -0.1841, 0.0896, 0.22, 0.283, 0.0121...","(50, -1, 50, 50, 50, 50, 50, 50, 50, 50, 50, 5...","[0.1924, -0.2947, 0.07135, 0.392, 0.2793, -0.0..."


In [None]:
# Report the dimensions of each pre-computed feature matrix.
for label, matrix in (
    ('combined features', combined_features),
    ('features mean', features_mean),
    ('features weighted', features_weighted),
):
    print(f'shape of {label}: ', matrix.shape)

# Number of events held by every Melody (reads the private _events list).
event_lengths = base_data['melody'].apply(lambda melody_obj: len(melody_obj._events))
# A single distinct length would mean no padding is needed later on.
unique_length_count = event_lengths.nunique()
if unique_length_count != 1:
    print(f"Exist different length of _events，there are {unique_length_count} kinds of lenegth")
else:
    print("All Melody obejcts have the same length of _events")

shape of combined features:  (422761, 1024)
shape of features mean:  (422761, 512)
shape of features weighted:  (422761, 512)
Exist different length of _events，there are 80 kinds of lenegth


In [None]:
# Longest event list across the dataset. The hard-coded padding length used by
# the later dataset cell (81) should match the value printed here.
melody_lengths = base_data['melody'].apply(lambda melody_obj: len(melody_obj._events))
max_length = melody_lengths.max()
print("The max length of events:", max_length)

The max length of events: 81


In [None]:
# Global extrema over every element of every 'features_mean' vector.
# These two bounds drive the min-max scaling of the features, both inside
# data_generator and when a single sample is prepared for prediction.
global_min_feature = base_data['features_mean'].apply(lambda x: np.min(x)).min()
global_max_feature = base_data['features_mean'].apply(lambda x: np.max(x)).max()

print('min: ', global_min_feature)
print('max: ', global_max_feature)

min:  -6.195
max:  5.023


## Create Data Generator

In [None]:
def normalize(data, min_val, max_val):
    """Linearly rescale ``data`` from ``[min_val, max_val]`` onto ``[-1, 1]``.

    Args:
        data: Scalar, array or tensor supporting arithmetic broadcasting.
        min_val: Value that maps to -1.
        max_val: Value that maps to +1 (must differ from ``min_val``).

    Returns:
        The rescaled data, same container type as ``data``.
    """
    value_range = max_val - min_val
    return 2 * (data - min_val) / value_range - 1

def denormalize(data, min_val, max_val):
    """Map ``data`` from ``[-1, 1]`` back onto ``[min_val, max_val]``.

    Args:
        data: Scalar, array or tensor of values in the normalised range.
        min_val: Value that -1 maps back to.
        max_val: Value that +1 maps back to.

    Returns:
        The values expressed in the original range.
    """
    value_range = max_val - min_val
    unit_scaled = (data + 1) / 2  # shift [-1, 1] onto [0, 1]
    return unit_scaled * value_range + min_val

In [None]:
def augment_data(features):
    """Return a copy of ``features`` with Gaussian noise added to every row.

    Noise is drawn independently per row from N(0, 0.1) using NumPy's global
    random state, so results are reproducible only if ``np.random.seed`` was set.

    Args:
        features: Iterable of NumPy arrays (for example a 2-D array of rows).

    Returns:
        ``np.ndarray`` holding ``row + noise`` for each input row.
    """
    noisy_rows = [row + np.random.normal(0, 0.1, row.shape) for row in features]
    return np.array(noisy_rows)

In [None]:
def data_generator(df, feature_col, batch_size, max_length):
    """Yield ``((features, melodies), melodies)`` batches built from ``df``.

    For every batch:
      * each melody's events are scaled from [-2, 127] to [-1, 1], cut to at
        most ``max_length`` steps and right-padded to exactly ``max_length``;
      * the feature vectors get Gaussian noise (``augment_data``) and are then
        min-max scaled with the module-level ``global_min_feature`` /
        ``global_max_feature`` bounds.

    Padding uses the normalised value of the melody no-event marker (-2),
    i.e. -1.0. Padding with 0 in normalised space would denormalise to pitch
    62.5 and teach the model to drift toward a phantom note at the tail.

    Args:
        df: DataFrame with a ``'melody'`` column (objects exposing ``_events``)
            and a ``feature_col`` column of equal-length numeric vectors.
        feature_col: Name of the feature column to feed to the encoder.
        batch_size: Number of rows per yielded batch (last batch may be smaller).
        max_length: Fixed number of time steps per melody in the output.

    Yields:
        ``((features, melodies), melodies)`` where ``features`` has shape
        ``(batch, feature_dim)`` and ``melodies`` has shape
        ``(batch, max_length, 1)`` and dtype float32.

    Note:
        Noise is added on every call, so batches later used for validation or
        testing are augmented as well.
    """
    # Normalised representation of the no-event marker (-2) -> -1.0.
    pad_value = float(normalize(-2, -2, 127))
    num_batches = (len(df) + batch_size - 1) // batch_size

    for i in range(num_batches):
        # iloc slicing clips at the end of the frame, so the last batch is short.
        batch = df.iloc[i * batch_size:(i + 1) * batch_size]

        # Deal with melody
        melody_rows = []
        for melody in batch['melody']:
            # Truncate defensively so an over-long melody cannot produce a ragged batch.
            events = np.asarray(melody._events, dtype=np.float32)[:max_length]
            scaled_events = normalize(events, -2, 127)

            padded_events = np.full(max_length, pad_value, dtype=np.float32)
            padded_events[:len(scaled_events)] = scaled_events
            melody_rows.append(padded_events[:, np.newaxis])  # (max_length, 1)

        # Stack once; the same array is both decoder target input and label.
        melody_batch = np.stack(melody_rows)

        features = np.array(batch[feature_col].tolist())
        augmented_features = augment_data(features)
        normalized_features = normalize(augmented_features, global_min_feature, global_max_feature)

        yield (normalized_features, melody_batch), melody_batch

In [None]:
def created_dataset(df, features_col, batch_size, max_length):
    """Wrap ``data_generator`` in a ``tf.data.Dataset`` of pre-batched elements.

    Args:
        df: Source DataFrame (see ``data_generator``). Must contain at least
            one row, because the feature width is read from the first row.
        features_col: Name of the column holding the feature vectors.
        batch_size: Rows per generated batch.
        max_length: Number of time steps per padded melody.

    Returns:
        A dataset whose elements are ``((features, melodies), melodies)`` with
        shapes ``(None, feature_dim)`` and ``(None, max_length, 1)``, float32.
    """
    # Read the vector width from the first row only; stacking the whole column
    # would copy the entire feature matrix just to obtain one integer.
    feature_dim = np.shape(df[features_col].iloc[0])[0]

    output_signature = (
        (
            tf.TensorSpec(shape=(None, feature_dim), dtype=tf.float32),
            tf.TensorSpec(shape=(None, max_length, 1), dtype=tf.float32),
        ),
        tf.TensorSpec(shape=(None, max_length, 1), dtype=tf.float32),
    )
    dataset = tf.data.Dataset.from_generator(
        lambda: data_generator(df, features_col, batch_size, max_length),
        output_signature=output_signature,
    )
    return dataset

## Separate the Dataset (Train data, Test data)

In [None]:
batch_size = 16
max_length = 81
SPLIT_SEED = 42  # fixes the shuffle order so the train/test split is reproducible

dataset = created_dataset(base_data, 'features_mean', batch_size, max_length)

# Shuffle (at batch granularity) and separate the dataset.
# reshuffle_each_iteration=False is essential: with the default (True) the
# order changes on every pass, so take()/skip() would select different,
# overlapping batches each epoch and test data would leak into training.
dataset = dataset.shuffle(buffer_size=1000, seed=SPLIT_SEED, reshuffle_each_iteration=False)

# The generator emits ceil(len / batch_size) batches, so count them
# arithmetically instead of iterating the whole dataset once just to count.
num_batches = (len(base_data) + batch_size - 1) // batch_size
train_size = int(0.8 * num_batches)
# Re-shuffle only the training split so batch order still varies per epoch.
train_dataset = dataset.take(train_size).shuffle(buffer_size=1000)
test_dataset = dataset.skip(train_size)

# Print out for verification (loop names chosen so the global `melodies`
# list loaded earlier is not overwritten).
for (batch_features, batch_melodies), _ in train_dataset.take(1):
    print("Train Melodies shape:", batch_melodies.shape)
    print("Train Features shape:", batch_features.shape)

for (batch_features, batch_melodies), _ in test_dataset.take(1):
    print("Test Melodies shape:", batch_melodies.shape)
    print("Test Features shape:", batch_features.shape)

Train Melodies shape: (16, 81, 1)
Train Features shape: (16, 512)
Test Melodies shape: (16, 81, 1)
Test Features shape: (16, 512)


In [None]:
# Print out for verification
for (features, melodies), true_melodies in train_dataset.take(1):
    print("Train Melodies shape:", melodies.numpy())
    print("Train Features shape:", features.numpy())

Train Melodies shape: [[[-1.        ]
  [-1.        ]
  [-1.        ]
  ...
  [ 0.        ]
  [ 0.        ]
  [ 0.        ]]

 [[-1.        ]
  [-1.        ]
  [-1.        ]
  ...
  [ 0.        ]
  [ 0.        ]
  [ 0.        ]]

 [[-0.19379842]
  [-0.9844961 ]
  [-0.19379842]
  ...
  [-0.13178295]
  [ 0.        ]
  [ 0.        ]]

 ...

 [[-1.        ]
  [ 0.05426359]
  [ 0.05426359]
  ...
  [-0.05426359]
  [-0.05426359]
  [-0.05426359]]

 [[-1.        ]
  [-1.        ]
  [ 0.13178289]
  ...
  [ 0.05426359]
  [ 0.05426359]
  [ 0.        ]]

 [[ 0.00775194]
  [ 0.00775194]
  [-0.03875971]
  ...
  [ 0.00775194]
  [ 0.00775194]
  [ 0.00775194]]]
Train Features shape: [[ 0.13183679  0.05203343  0.05273633 ...  0.0904149   0.08707825
   0.07008836]
 [ 0.10697624  0.05688362  0.08958344 ...  0.05651717  0.09759571
   0.11458752]
 [ 0.09806014  0.14734669  0.05346132 ... -0.00981832  0.07515043
   0.10741879]
 ...
 [ 0.1133503   0.09426291  0.0420874  ...  0.1459406   0.12790029
   0.0583687

## Generative Model (VAE)

The feature vectors extracted by the CLIP model are already at a high level of abstraction, so the encoder does not need to be very complex. A simpler encoder is enough to preserve this high-level information, while the decoder may need more tuning to produce a more detailed output (such as a music sequence).

### Sampling

The sampling function draws a latent vector 𝑧 from a normal distribution using the reparameterization trick. This avoids the gradient propagation problem caused by random nodes and allows the model to be trained with backpropagation.

In [None]:
# Define the sampling function
def sampling(args):
    """Reparameterisation trick: draw ``z = mean + exp(0.5 * log_var) * eps``.

    Args:
        args: Pair ``(z_mean, z_log_sigma)`` of tensors shaped
            ``(batch, latent_dim)``. Despite its name, ``z_log_sigma`` is used
            as a log-variance (it is halved before exponentiation).

    Returns:
        Tensor of sampled latent vectors with the same shape as ``z_mean``.
    """
    z_mean, z_log_sigma = args
    # Dynamic batch size, static latent width.
    noise_shape = (K.shape(z_mean)[0], K.int_shape(z_mean)[1])
    epsilon = K.random_normal(shape=noise_shape)
    std_dev = K.exp(0.5 * z_log_sigma)
    return z_mean + std_dev * epsilon

### Encoder

Since the input features are already highly abstract, the encoder can be kept simple and focused on accommodating such high-dimensional features.
The encoder maps the input data to the latent space, outputs the latent mean z_mean and the latent log-variance z_log_sigma, and uses the sampling function to generate the latent vector 𝑧.

In [None]:
def build_encoder(input_dim, latent_dim):
    """Build the encoder mapping a feature vector to latent statistics.

    Two Dense -> LeakyReLU -> BatchNormalization stages (256 then 128 units,
    with Dropout(0.3) after the first one only) feed two linear heads.

    Args:
        input_dim: Width of the input feature vector.
        latent_dim: Width of the latent space.

    Returns:
        Keras ``Model`` named ``'encoder'`` with outputs
        ``[z_mean, z_log_sigma, z]``. Its summary is printed as a side effect.
    """
    inputs = Input(shape=(input_dim,))

    # (units, dropout rate or None) for each hidden stage, in order.
    hidden = inputs
    for units, drop_rate in ((256, 0.3), (128, None)):
        hidden = Dense(units)(hidden)
        hidden = LeakyReLU(alpha=0.01)(hidden)
        hidden = BatchNormalization()(hidden)
        if drop_rate is not None:
            hidden = Dropout(drop_rate)(hidden)

    # Linear heads for the latent mean and log-variance.
    z_mean = Dense(latent_dim)(hidden)
    z_log_sigma = Dense(latent_dim)(hidden)

    z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_sigma])
    encoder = Model(inputs, [z_mean, z_log_sigma, z], name='encoder')
    encoder.summary()
    return encoder

### Decoder

The decoder learns how to generate concrete musical sequences from the abstract features in the latent space, predicting every note or event in the Melody.

In [None]:
def build_decoder(latent_dim, output_dim):
    """Build the decoder turning a latent vector into a melody sequence.

    The latent vector is projected, repeated ``output_dim`` times along a new
    time axis, passed through two LSTMs, two Conv1D and two Conv1DTranspose
    stages (all ``padding='same'``, each followed by BatchNormalization), and
    finally mapped to one tanh value per time step.

    Args:
        latent_dim: Width of the latent input vector.
        output_dim: Number of time steps to generate.

    Returns:
        Keras ``Model`` named ``'decoder'`` with output shape
        ``(batch, output_dim, 1)``. Its summary is printed as a side effect.
    """
    latent_inputs = Input(shape=(latent_dim,))

    seq = Dense(256, activation='relu')(latent_inputs)
    seq = BatchNormalization()(seq)
    seq = RepeatVector(output_dim)(seq)  # Expand the latent vector across the sequence

    for lstm_units in (256, 128):
        seq = LSTM(lstm_units, return_sequences=True)(seq)

    for conv_filters in (64, 32):
        seq = Conv1D(conv_filters, 3, padding='same', activation='relu')(seq)
        seq = BatchNormalization()(seq)

    for deconv_filters in (16, 8):
        seq = Conv1DTranspose(deconv_filters, 3, padding='same', activation='relu')(seq)
        seq = BatchNormalization()(seq)

    # tanh keeps every step inside the [-1, 1] range used for normalised events.
    outputs = TimeDistributed(Dense(1, activation='tanh'))(seq)  # Output at each time step

    decoder = Model(latent_inputs, outputs, name='decoder')
    decoder.summary()
    return decoder

### VAE Model

In [None]:
def reconstruction_loss(y_true, y_pred):
    """Per-sample reconstruction loss: ``0.1 * MSE + 0.4 * (1 - cosine)``.

    Both tensors are expected as ``(batch, time, channels)``; in this notebook
    ``channels`` is 1. The cosine similarity is taken between the true and
    predicted sequences along the time axis.

    The dot product must be summed over the time axis. Summing over the
    channel axis and then averaging over time yields ``1 - cosine / T``
    instead of ``1 - cosine``, which shrinks the cosine term by the sequence
    length.

    Args:
        y_true: Ground-truth normalised melody tensor.
        y_pred: Reconstructed melody tensor of the same shape.

    Returns:
        Tensor of shape ``(batch,)`` with the combined loss per sample.
    """
    mse_loss = K.mean(K.square(y_true - y_pred), axis=[1, 2])

    # Unit-normalise each sequence along time so the dot product is a cosine.
    y_true_norm = K.l2_normalize(y_true, axis=1)
    y_pred_norm = K.l2_normalize(y_pred, axis=1)
    # Dot product over time -> one cosine per channel, shape (batch, channels).
    cosine_sim = K.sum(y_true_norm * y_pred_norm, axis=1)

    # Average the per-channel cosine distance (a no-op for a single channel).
    cosine_loss = K.mean(1 - cosine_sim, axis=-1)

    combined_loss = 0.1 * mse_loss + 0.4 * cosine_loss
    return combined_loss

def kl_loss(z_mean, z_log_sigma):
    """KL divergence between N(z_mean, exp(z_log_sigma)) and N(0, I).

    ``z_log_sigma`` is treated as a log-variance.

    Returns:
        Tensor with the divergence summed over the last (latent) axis.
    """
    per_dimension = 1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma)
    return -0.5 * K.sum(per_dimension, axis=-1)

def vae_loss(true_melody, reconstructed_melody, z_mean, z_log_sigma, beta = 0.5):
    """Total VAE loss: reconstruction loss plus ``beta``-weighted KL term.

    Args:
        true_melody: Ground-truth melody tensor.
        reconstructed_melody: Decoder output of the same shape.
        z_mean: Latent mean from the encoder.
        z_log_sigma: Latent log-variance from the encoder.
        beta: Weight of the KL term.

    Returns:
        Per-sample total loss tensor.
    """
    weighted_kl = kl_loss(z_mean, z_log_sigma) * beta
    return reconstruction_loss(true_melody, reconstructed_melody) + weighted_kl

In [None]:
class VAE(Model):
    """Training wrapper that couples an encoder and a decoder.

    The loss is registered inside ``call`` via ``add_loss``, so the model is
    compiled without an external loss function.
    """

    def __init__(self, encoder, decoder, beta=0.1, **kwargs):
        """Store the sub-models and the KL weight ``beta``."""
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.beta = beta

    def call(self, inputs):
        """Run one forward pass and register loss and metrics.

        Args:
            inputs: Pair ``(features, true_melody)``.

        Returns:
            The reconstructed melody produced by the decoder.
        """
        features, true_melody = inputs
        z_mean, z_log_sigma, z = self.encoder(features)
        reconstructed_melody = self.decoder(z)

        recon_loss = reconstruction_loss(true_melody, reconstructed_melody)
        kl = kl_loss(z_mean, z_log_sigma)
        self.add_loss(recon_loss + self.beta * kl)

        # Track both components separately so they show up in the fit logs.
        self.add_metric(recon_loss, name='reconstruction_loss', aggregation='mean')
        self.add_metric(kl, name='kl_loss', aggregation='mean')
        return reconstructed_melody

In [None]:
def build_vae_models(input_dim, sequence_length, latent_dim, beta=0.1):
    """Build a training VAE and a prediction model that share their weights.

    Args:
        input_dim: Width of the input feature vector.
        sequence_length: Number of time steps the decoder generates.
        latent_dim: Width of the latent space.
        beta: Weight of the KL term in the training loss.

    Returns:
        ``(vae_train, vae_predict)``:
          * ``vae_train`` - compiled ``VAE`` taking ``(features, true_melody)``.
          * ``vae_predict`` - functional ``Model`` mapping features to a
            reconstructed melody through the same encoder and decoder.

    Note:
        ``vae_predict`` decodes the *sampled* ``z`` (not ``z_mean``), so its
        predictions are stochastic.
    """
    inputs = Input(shape=(input_dim,), name="input_features")

    encoder = build_encoder(input_dim, latent_dim)
    decoder = build_decoder(latent_dim, sequence_length)

    # Only the sampled latent vector is decoded; mean and log-variance are
    # consumed by the loss inside VAE.call, not here.
    _, _, z = encoder(inputs)
    reconstructed_melody = decoder(z)

    vae_train = VAE(encoder, decoder, beta)
    vae_train.compile(optimizer='adam')

    vae_predict = Model(inputs=inputs, outputs=reconstructed_melody, name='vae_predict')

    return vae_train, vae_predict

## Train the Model

In [None]:
# Hyper-parameters. input_dim matches the width of the 'features_mean' vectors
# and sequence_length matches the padded melody length used by the dataset.
input_dim = 512
sequence_length = 81
latent_dim = 64

epochs = 4

# Only the training model is needed here; the prediction model is discarded.
vae_train, _ = build_vae_models(input_dim, sequence_length, latent_dim)
vae_train.fit(train_dataset, epochs=epochs, validation_data=test_dataset)
# Evaluate the model performance on the held-out batches
total_loss = vae_train.evaluate(test_dataset)
print("Test VAE Loss:", total_loss)

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_8 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 dense_18 (Dense)               (None, 256)          131328      ['input_8[0][0]']                
                                                                                                  
 leaky_re_lu_6 (LeakyReLU)      (None, 256)          0           ['dense_18[0][0]']               
                                                                                                  
 batch_normalization_17 (BatchN  (None, 256)         1024        ['leaky_re_lu_6[0][0]']          
 ormalization)                                                                              

In [None]:
# save the model weights, creating the target directory first so a fresh
# checkout does not fail on the missing folder
weights_path = 'my models/my_vae_weights22.h5'
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
vae_train.save_weights(weights_path)

## Load the Trained Model

In [None]:
# load the model: rebuild the same architecture, keep only the prediction model
# (these hyper-parameters must match the ones used when the weights were saved)
input_dim = 512
sequence_length = 81
latent_dim = 64

_ , model = build_vae_models(input_dim, sequence_length, latent_dim)
# load weights saved from the training model into the prediction model
model.load_weights('my models/my_vae_weights22.h5')

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 dense (Dense)                  (None, 256)          131328      ['input_1[0][0]']                
                                                                                                  
 leaky_re_lu (LeakyReLU)        (None, 256)          0           ['dense[0][0]']                  
                                                                                                  
 batch_normalization (BatchNorm  (None, 256)         1024        ['leaky_re_lu[0][0]']            
 alization)                                                                                 

## Test the Model

### Load the Test Data

In [None]:
# Test sample: one artwork image plus one utterance describing it.
# NOTE(review): the image path is an absolute, machine-specific Windows path;
# point it at the local copy of the dataset before running.
# Alternative samples are kept commented out below.
# image_add = "E:\\Project and Dissertation in Data Science\\dataset\\dataset\\test\\ivan-aivazovsky_sea-at-night-1861.jpg"
image_add = "E:\\Project and Dissertation in Data Science\\dataset\\artemis_official_data\\art_images\\vincent-van-gogh_portrait-of-madame-ginoux-l-arlesienne-1890.jpg"
# text = "The steep mountains and the moonlight provide safety to the inhabitants of the isolated towns. "
# text = "I can imagine the sailors resting this peaceful night, dreaming of new adventures "
text = "She seems very happy in the picture, and you want to know what what is behind the smile. "

### Extract Text-Image Features (CLIP Model)

In [None]:
# load pre-trained CLIP model (ViT-B/32) on the GPU when one is available,
# otherwise on the CPU; `preprocess` is the matching image transform
device = "cuda" if torch.cuda.is_available() else "cpu"
extracted_features_model, preprocess = clip.load("ViT-B/32", device=device)

In [None]:
# Encode the test image and text with CLIP; gradients are not needed.
with torch.no_grad():
    # image feature: load as RGB, apply the CLIP transform, add a batch axis
    image = preprocess(Image.open(image_add).convert("RGB")).unsqueeze(0)
    image_embedding = extracted_features_model.encode_image(image.to(device))
    image_feature = image_embedding.cpu().numpy()

    # text feature
    text_tokens = clip.tokenize(text).to(device)
    text_embedding = extracted_features_model.encode_text(text_tokens)
    text_feature = text_embedding.cpu().numpy()

print('image features: ', image_feature.shape)
print('text features: ', text_feature.shape)

image features:  (1, 512)
text features:  (1, 512)


In [None]:
# Element-wise mean of the image and text embeddings, mirroring the
# 'features_mean' column the model was trained on.
feature_mean = (image_feature + text_feature) / 2
print(feature_mean.shape)
print(feature_mean)

# Same preprocessing as in data_generator: add Gaussian noise, then min-max
# scale with the global training bounds.
# NOTE(review): the noise makes the prediction non-deterministic, and
# `feature_mean` is overwritten, so re-running only the lines below stacks
# noise on noise - confirm this is intended.
feature_mean = augment_data(feature_mean)
augmented_features = normalize(feature_mean, global_min_feature, global_max_feature)

print(augmented_features.shape)
print(augmented_features)

(1, 512)
[[ 1.1670e-01  2.0752e-03 -3.2288e-02  3.0060e-02  2.0422e-01 -1.3196e-01
   1.0681e-02 -5.5029e-01 -3.5370e-02  3.3203e-01 -1.0632e-01  4.0131e-03
   2.7002e-01  1.8604e-01  3.2397e-01 -9.5215e-03  8.7830e-02 -2.3462e-01
   3.7445e-02 -9.9182e-02  6.4160e-01  1.7737e-01 -2.1960e-01  1.7773e-01
  -1.1566e-02 -6.7932e-02  1.9531e-01 -6.2378e-02 -8.5938e-02 -3.8965e-01
   1.7249e-01  2.4951e-01 -1.2817e-01  3.3875e-02  1.8884e-01  1.5076e-01
   1.5381e-01 -9.2407e-02  1.0217e-01  1.0339e-01  5.9875e-02  3.9429e-02
  -5.1941e-02  1.3184e-01  1.0071e-01  1.6919e-01 -2.0691e-01  4.3726e-01
   2.4609e-01 -8.9600e-02  1.8506e-01  6.9397e-02  1.6870e-01 -1.9824e-01
  -5.8075e-02 -2.5195e-01  1.6016e-01  2.1777e-01 -1.6504e-01  6.5369e-02
   4.9805e-01  1.0767e-01  1.8799e-01 -3.5767e-02  1.9214e-01  4.5532e-02
   2.4438e-01  2.5098e-01  1.7004e-01 -7.0557e-02  1.9080e-01  2.2644e-02
   1.8079e-01 -1.3354e-01 -8.8257e-02  8.4351e-02  4.1162e-01 -2.8648e-03
   1.1035e-01  3.1494e-01  1.

### Prediction

In [None]:
# generate the Melody with the prediction model; values are still in the
# normalised tanh range and are converted back to events in the next cells
predicted_melody = model.predict(augmented_features)

print(predicted_melody)
print(predicted_melody.shape)

[[[-0.6779262 ]
  [-0.601647  ]
  [-0.48843265]
  [-0.46314862]
  [-0.5252264 ]
  [-0.44606173]
  [-0.38318154]
  [-0.3603325 ]
  [-0.34737182]
  [-0.3360829 ]
  [-0.33172294]
  [-0.3293857 ]
  [-0.32600722]
  [-0.32349205]
  [-0.32186028]
  [-0.3203956 ]
  [-0.31945333]
  [-0.32036072]
  [-0.3218238 ]
  [-0.32336578]
  [-0.32434896]
  [-0.32185495]
  [-0.31933096]
  [-0.31821376]
  [-0.3186403 ]
  [-0.31446132]
  [-0.3098381 ]
  [-0.30711   ]
  [-0.30509505]
  [-0.3032743 ]
  [-0.30147457]
  [-0.29967964]
  [-0.29746976]
  [-0.29541138]
  [-0.29668158]
  [-0.29851446]
  [-0.29646006]
  [-0.29367995]
  [-0.29155347]
  [-0.28976366]
  [-0.28751332]
  [-0.28585556]
  [-0.28499094]
  [-0.2841255 ]
  [-0.283268  ]
  [-0.28238544]
  [-0.28147396]
  [-0.28062874]
  [-0.2798255 ]
  [-0.27898175]
  [-0.2782411 ]
  [-0.27757725]
  [-0.27652615]
  [-0.27532673]
  [-0.27454573]
  [-0.27378336]
  [-0.27303734]
  [-0.27230066]
  [-0.27157873]
  [-0.27088073]
  [-0.27020723]
  [-0.26955137]
  [-0.26

In [None]:
# Map the normalised prediction back to the melody event range [-2, 127].
# NOTE: this overwrites `predicted_melody`, so the cell is not idempotent -
# running it twice denormalises twice. Re-run the prediction cell first.
predicted_melody = denormalize(predicted_melody, -2, 127)

print(predicted_melody)
print(predicted_melody.shape)

[[[18.773762]
  [23.693768]
  [30.996094]
  [32.626915]
  [28.622896]
  [33.72902 ]
  [37.78479 ]
  [39.258553]
  [40.094517]
  [40.82265 ]
  [41.103867]
  [41.254623]
  [41.472534]
  [41.63476 ]
  [41.74001 ]
  [41.834484]
  [41.89526 ]
  [41.836735]
  [41.742363]
  [41.642906]
  [41.579494]
  [41.740356]
  [41.903156]
  [41.975212]
  [41.9477  ]
  [42.217243]
  [42.515446]
  [42.691402]
  [42.82137 ]
  [42.93881 ]
  [43.05489 ]
  [43.170662]
  [43.3132  ]
  [43.44597 ]
  [43.364037]
  [43.245815]
  [43.378326]
  [43.557644]
  [43.6948  ]
  [43.81024 ]
  [43.95539 ]
  [44.062317]
  [44.118088]
  [44.173904]
  [44.229218]
  [44.286137]
  [44.34493 ]
  [44.399445]
  [44.451256]
  [44.505676]
  [44.553448]
  [44.596264]
  [44.664062]
  [44.741425]
  [44.7918  ]
  [44.840977]
  [44.88909 ]
  [44.936607]
  [44.983173]
  [45.028194]
  [45.071632]
  [45.113934]
  [45.155487]
  [45.19535 ]
  [45.232998]
  [45.269615]
  [45.30444 ]
  [45.33777 ]
  [45.369915]
  [45.399494]
  [45.42763 ]
  [45.

In [None]:
# Melody events are integers, so round the continuous prediction to the
# nearest event value.
predicted_notes = np.round(predicted_melody).astype(int)
print(predicted_notes)

[[[19]
  [24]
  [31]
  [33]
  [29]
  [34]
  [38]
  [39]
  [40]
  [41]
  [41]
  [41]
  [41]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [46]
  [46]
  [46]
  [46]
  [49]
  [52]
  [56]
  [59]]]


In [None]:
# Flatten the (1, steps, 1) prediction into a plain list of ints and wrap it
# in a note_seq Melody.
melody = note_seq.Melody(predicted_notes.flatten().tolist())
melody

<note_seq.melodies_lib.Melody at 0x1afa471f790>

In [None]:
# Show the event list held by the Melody (private attribute, inspection only).
melody._events

[19,
 24,
 31,
 33,
 29,
 34,
 38,
 39,
 40,
 41,
 41,
 41,
 41,
 42,
 42,
 42,
 42,
 42,
 42,
 42,
 42,
 42,
 42,
 42,
 42,
 42,
 43,
 43,
 43,
 43,
 43,
 43,
 43,
 43,
 43,
 43,
 43,
 44,
 44,
 44,
 44,
 44,
 44,
 44,
 44,
 44,
 44,
 44,
 44,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 45,
 46,
 46,
 46,
 46,
 49,
 52,
 56,
 59]

## Sentiment Analysis of the Text: Calculating the QPM Range

In [None]:
# Polarity vocabularies from the NLTK opinion lexicon, stored as sets for
# constant-time membership tests.
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

# Hand-picked vocabularies for the arousal axis (calm vs. excited).
calm_words = {
    'peaceful', 'calm', 'relaxed', 'serene', 'tranquil', 'composed', 'quiet', 'soothing',
    'content', 'easygoing', 'gentle', 'harmonious', 'placid', 'mellow', 'restful',
    'untroubled', 'cool', 'collected'
}
excited_words = {
    'excited', 'energetic', 'lively', 'thrilled', 'exhilarated', 'animated', 'enthusiastic',
    'vivacious', 'vibrant', 'spirited', 'eager', 'dynamic', 'passionate', 'zealous',
    'high-spirited', 'raring', 'buoyant', 'stimulated'
}

def sentiment_analysis(text):
    """Score a text on a polarity axis and an excitement axis.

    The text is lower-cased and tokenised; tokens are counted against the
    module-level ``positive_words``, ``negative_words``, ``calm_words`` and
    ``excited_words`` sets. Tokens that are neither positive nor negative
    count as neutral and contribute 0.5 each to both scores.

    Args:
        text: The sentence(s) to analyse.

    Returns:
        ``(sentiment_score, excitement_score)`` as floats. A text with no
        lexicon hits scores 0.5 on both axes.
    """
    words = word_tokenize(text.lower())

    def count_in(vocabulary):
        # Number of tokens (with repeats) found in the given vocabulary.
        return sum(1 for word in words if word in vocabulary)

    num_positive_words = count_in(positive_words)
    num_negative_words = count_in(negative_words)
    num_neutral_words = len(words) - num_positive_words - num_negative_words
    num_calm_words = count_in(calm_words)
    num_excited_words = count_in(excited_words)

    # Polarity: signed lexicon hits plus half a point per neutral token.
    total_words = num_positive_words + num_negative_words + num_neutral_words
    polarity_points = (num_positive_words - num_negative_words) + num_neutral_words * 0.5
    sentiment_score = polarity_points / max(1, total_words)

    # Excitement: same construction on the calm/excited vocabularies.
    total_emotion_words = num_calm_words + num_excited_words + num_neutral_words
    arousal_points = (num_excited_words - num_calm_words) + num_neutral_words * 0.5
    excitement_score = arousal_points / max(1, total_emotion_words)

    return sentiment_score, excitement_score

def sentiment_to_qpm(sentiment, excitement):
    """Map sentiment and excitement scores to a ``(low, high)`` QPM range.

    Excitement of at least 0.5 selects the faster set of ranges. Within a set,
    sentiment picks the band: ``>= 0.5`` (highest), ``> -0.5`` (middle),
    otherwise lowest.

    Args:
        sentiment: Polarity score.
        excitement: Excitement score.

    Returns:
        Tuple ``(min_qpm, max_qpm)`` of ints.
    """
    # Band index from sentiment: 0 = positive, 1 = middle, 2 = negative.
    if sentiment >= 0.5:
        band = 0
    elif sentiment > -0.5:
        band = 1
    else:
        band = 2

    excited_ranges = ((60, 70), (40, 60), (30, 40))
    calm_ranges = ((20, 30), (10, 20), (5, 10))
    ranges = excited_ranges if excitement >= 0.5 else calm_ranges
    return ranges[band]

# Score the test utterance on the polarity and excitement axes.
sentiment_score, excitement_score = sentiment_analysis(text)
print('Sentiment Score:', sentiment_score)
print('Excitement Score:', excitement_score)

# Translate the two scores into a tempo range.
qpm_range = sentiment_to_qpm(sentiment_score, excitement_score)
print('QPM Range:', qpm_range)

# Pick one tempo uniformly from the range (both ends inclusive). No seed is
# set, so the value differs between runs.
random_qpm = random.randint(qpm_range[0], qpm_range[1])
print('Random QPM:', random_qpm)

Sentiment Score: 0.55
Excitement Score: 0.5
QPM Range: (60, 70)
Random QPM: 70


In [None]:
# Convert the Melody to a NoteSequence at the tempo drawn from the
# sentiment-derived QPM range. A hard-coded qpm here would silently ignore
# the `random_qpm` computed in the previous cell.
note_sequence = melody.to_sequence(velocity=100, instrument=0, program=0, sequence_start_time=0.0, qpm=random_qpm)

# Declare a 4/4 time signature starting at time 0.
time_signature = note_sequence.time_signatures.add()
time_signature.time = 0
time_signature.numerator = 4
time_signature.denominator = 4

print(note_sequence)

ticks_per_quarter: 220
time_signatures {
  numerator: 4
  denominator: 4
}
tempos {
  qpm: 67.0
}
notes {
  pitch: 19
  velocity: 100
  end_time: 0.22388059701492538
}
notes {
  pitch: 24
  velocity: 100
  start_time: 0.22388059701492538
  end_time: 0.44776119402985076
}
notes {
  pitch: 31
  velocity: 100
  start_time: 0.44776119402985076
  end_time: 0.6716417910447762
}
notes {
  pitch: 33
  velocity: 100
  start_time: 0.6716417910447762
  end_time: 0.8955223880597015
}
notes {
  pitch: 29
  velocity: 100
  start_time: 0.8955223880597015
  end_time: 1.119402985074627
}
notes {
  pitch: 34
  velocity: 100
  start_time: 1.119402985074627
  end_time: 1.3432835820895523
}
notes {
  pitch: 38
  velocity: 100
  start_time: 1.3432835820895523
  end_time: 1.5671641791044777
}
notes {
  pitch: 39
  velocity: 100
  start_time: 1.5671641791044777
  end_time: 1.791044776119403
}
notes {
  pitch: 40
  velocity: 100
  start_time: 1.791044776119403
  end_time: 2.0149253731343286
}
notes {
  pitch: 

In [None]:
# Use FluidSynth to generate audio data
# Need to specify the SoundFont file path. Other SoundFonts that were tried:
#   'GeneralUser GS v1.471.sf2', 'TimGM6mb.sf2'
SAMPLE_RATE = 48000  # single source of truth for synthesis and the WAV header
output_wav_path = 'models outputs/model2_3.wav'

audio_samples = note_seq.fluidsynth(note_sequence, sample_rate=SAMPLE_RATE, sf2_path="FluidR3Mono_GM.sf3")

# Save audio data as WAV file
# 'audio_samples' is a floating-point audio array generated by fluidsynth
# Normalize audio to full scale and convert to 16-bit PCM format. A silent or
# empty render has no peak to normalise by, so write zeros instead of dividing
# by zero (which would turn into NaN and then garbage integers).
peak = np.max(np.abs(audio_samples)) if audio_samples.size else 0.0
if peak > 0:
    pcm_samples = np.int16(audio_samples / peak * 32767)
else:
    pcm_samples = np.zeros_like(audio_samples, dtype=np.int16)

# Create the output folder first so a fresh checkout does not fail here.
os.makedirs(os.path.dirname(output_wav_path), exist_ok=True)
write(output_wav_path, SAMPLE_RATE, pcm_samples)