In [1]:
pip install tensorflow numpy librosa



In [2]:
import os
import shutil
import numpy as np
import librosa
from scipy.io import wavfile
import soundfile as sf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def choose_theme():
  chosen_theme=input("Enter the theme of song to be composed(blues,country,disco,hiphop,jazz,metal,pop)")
  corect_theme=['blues','hiphop','country','disco','jazz','metal','pop']
  if chosen_theme.lower() not in corect_theme:
    print("Invalid theme")
    choose_theme()
  else:
    return chosen_theme

In [3]:
theme=choose_theme()
dirpath = '/content/drive/MyDrive/genres_original/' + theme

# Change pdatapath to save within the current working directory
pdatapath = 'pdata'
if os.path.exists(pdatapath):
    # Remove the directory and all its contents
    shutil.rmtree(pdatapath)

# Create a new directory
os.makedirs(pdatapath)

hop_length = 128  # Smaller hop length for higher time resolution
n_mfcc = 13       # Number of MFCCs

for filename in os.listdir(dirpath):
    if filename.endswith('.wav'):
        filepath = os.path.join(dirpath, filename)

        y, sr = librosa.load(filepath, sr=None)

        # Core MFCC features
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)
        delta_mfccs = librosa.feature.delta(mfccs)
        delta2_mfccs = librosa.feature.delta(mfccs, order=2)

        # Additional features
        chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)
        spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr, hop_length=hop_length)
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)

        # Find maximum time dimension to pad arrays
        max_len = max(mfccs.shape[1], delta_mfccs.shape[1], delta2_mfccs.shape[1], chroma.shape[1], spectral_contrast.shape[1], tonnetz.shape[1])

        # Define a function to pad each feature array along the time axis
        def pad_features(feature, max_len):
            if feature.shape[1] < max_len:
                pad_width = max_len - feature.shape[1]
                return np.pad(feature, ((0, 0), (0, pad_width)), mode='constant')
            return feature

        # Pad features to ensure they all have the same time dimension
        mfccs = pad_features(mfccs, max_len)
        delta_mfccs = pad_features(delta_mfccs, max_len)
        delta2_mfccs = pad_features(delta2_mfccs, max_len)
        chroma = pad_features(chroma, max_len)
        spectral_contrast = pad_features(spectral_contrast, max_len)
        tonnetz = pad_features(tonnetz, max_len)

        # Stack features along the feature axis
        features = np.vstack([mfccs, delta_mfccs, delta2_mfccs, chroma, spectral_contrast, tonnetz])

        print(f"{filename} shape: {features.shape} (Features x Frames)")

        # Save the feature matrix to the 'pdata' directory
        np.save(os.path.join(pdatapath, filename.replace('.wav', '.npy')), features)

Enter the theme of song to be composed(blues,country,disco,hiphop,jazz,metal,pop)pop
pop.00000.wav shape: (64, 5169) (Features x Frames)
pop.00002.wav shape: (64, 5169) (Features x Frames)
pop.00005.wav shape: (64, 5169) (Features x Frames)
pop.00004.wav shape: (64, 5169) (Features x Frames)
pop.00001.wav shape: (64, 5169) (Features x Frames)
pop.00006.wav shape: (64, 5169) (Features x Frames)
pop.00003.wav shape: (64, 5169) (Features x Frames)
pop.00007.wav shape: (64, 5169) (Features x Frames)
pop.00019.wav shape: (64, 5169) (Features x Frames)
pop.00014.wav shape: (64, 5169) (Features x Frames)
pop.00008.wav shape: (64, 5169) (Features x Frames)
pop.00015.wav shape: (64, 5169) (Features x Frames)
pop.00011.wav shape: (64, 5169) (Features x Frames)
pop.00016.wav shape: (64, 5169) (Features x Frames)
pop.00009.wav shape: (64, 5169) (Features x Frames)
pop.00013.wav shape: (64, 5169) (Features x Frames)
pop.00018.wav shape: (64, 5169) (Features x Frames)
pop.00012.wav shape: (64, 5169)

In [4]:
# Path to the directory containing the .npy files
data_dir = '/content/pdata'


# List to hold the datasets from each file
all_datasets = []

# Loop through the directory and load each .npy file
for filename in os.listdir(data_dir):
    if filename.endswith('.npy'):
        file_path = os.path.join(data_dir, filename)
        try:
            # Load the .npy file
            data = np.load(file_path)
            all_datasets.append(data)  # Append loaded data to the list
            print(f"Loaded {filename} with shape: {data.shape}")
        except Exception as e:
            print(f"Error loading {filename}: {e}")

# Check if all datasets are compatible for concatenation
# This assumes you want to concatenate along the first axis (e.g., stacking more samples)
shapes = [dataset.shape for dataset in all_datasets]
print("Shapes of loaded datasets:", shapes)

# Ensure all datasets have the same number of dimensions and compatible shapes
if all(len(shape) == len(shapes[0]) for shape in shapes) and \
   all(shape[1:] == shapes[0][1:] for shape in shapes[1:]):  # Checking for shape compatibility
    combined_dataset = np.concatenate(all_datasets, axis=0)  # Concatenate along the first dimension
    print("Combined dataset shape:", combined_dataset.shape)

    # Save the combined dataset if needed
    np.save('/content/combined_audio_features.npy', combined_dataset)
else:
    print("Datasets are not compatible for concatenation.")


Loaded pop.00022.npy with shape: (64, 5169)
Loaded pop.00042.npy with shape: (64, 5169)
Loaded pop.00031.npy with shape: (64, 5169)
Loaded pop.00062.npy with shape: (64, 5169)
Loaded pop.00028.npy with shape: (64, 5169)
Loaded pop.00079.npy with shape: (64, 5169)
Loaded pop.00081.npy with shape: (64, 5169)
Loaded pop.00039.npy with shape: (64, 5169)
Loaded pop.00080.npy with shape: (64, 5169)
Loaded pop.00015.npy with shape: (64, 5169)
Loaded pop.00029.npy with shape: (64, 5169)
Loaded pop.00047.npy with shape: (64, 5169)
Loaded pop.00041.npy with shape: (64, 5169)
Loaded pop.00006.npy with shape: (64, 5169)
Loaded pop.00099.npy with shape: (64, 5169)
Loaded pop.00027.npy with shape: (64, 5169)
Loaded pop.00054.npy with shape: (64, 5169)
Loaded pop.00011.npy with shape: (64, 5169)
Loaded pop.00000.npy with shape: (64, 5169)
Loaded pop.00036.npy with shape: (64, 5169)
Loaded pop.00032.npy with shape: (64, 5169)
Loaded pop.00003.npy with shape: (64, 5169)
Loaded pop.00090.npy with shape:

In [5]:
dataset = np.load('/content/combined_audio_features.npy')  # Replace with your dataset path

num_songs = 100  # Number of songs to train on
if dataset.shape[0] < num_songs:
    raise ValueError("Not enough songs in the dataset.")

dataset = dataset[:num_songs]

# Set the dimensions for the VAE
input_shape = dataset.shape[1:]
latent_dim = 128  # Dimension of the latent space

encoder_input = layers.Input(shape=input_shape)
x = layers.Flatten()(encoder_input)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dense(256, activation='relu')(x)

z_mean = layers.Dense(latent_dim)(x)
z_log_var = layers.Dense(latent_dim)(x)

def sampling(args):
    z_mean, z_log_var = args
    batch = tf.shape(z_mean)[0]
    dim = tf.shape(z_mean)[1]
    epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

z = layers.Lambda(sampling)([z_mean, z_log_var])

decoder_input = layers.Input(shape=(latent_dim,))
x = layers.Dense(256, activation='relu')(decoder_input)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dense(np.prod(input_shape), activation='sigmoid')(x)
decoder_output = layers.Reshape(input_shape)(x)

# Build the models
encoder = keras.Model(encoder_input, [z_mean, z_log_var, z], name='encoder')
decoder = keras.Model(decoder_input, decoder_output, name='decoder')

class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def call(self, inputs):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        kl_loss = -0.5 * tf.reduce_mean(
            z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1
        )
        self.add_loss(kl_loss)  # Add KL loss to the model's losses
        return reconstructed

vae = VAE(encoder, decoder)

# Compile the VAE
vae.compile(optimizer='adam', loss='binary_crossentropy')

# Normalize the dataset
dataset = dataset.astype('float32') / np.max(dataset)

# Train the VAE
vae.fit(dataset, dataset, epochs=35, batch_size=25)

def generate_long_music(num_samples=1, duration_seconds=5, sample_rate=22050):
    # Calculate the number of frames needed
    num_frames = duration_seconds * (sample_rate // 256)  # Adjust based on your frame size

    generated_music_segments = []

    for _ in range(num_samples):
        # Sample random points from the latent space
        z_samples = np.random.normal(size=(num_frames, latent_dim))
        # Decode the latent space samples to get generated features
        generated_features = decoder.predict(z_samples)

        # Append the generated features
        generated_music_segments.append(generated_features)

    # Concatenate the generated segments into a single array
    return np.concatenate(generated_music_segments, axis=1)

# Generate a single long music sample
generated_music = generate_long_music(num_samples=1, duration_seconds=5)


Epoch 1/35
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 98ms/step - loss: 0.6801
Epoch 2/35
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step - loss: 0.4451
Epoch 3/35
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 148ms/step - loss: 0.1678
Epoch 4/35
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 141ms/step - loss: 0.0913
Epoch 5/35
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 147ms/step - loss: 0.2881
Epoch 6/35
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 141ms/step - loss: 0.1428
Epoch 7/35
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 135ms/step - loss: 0.1312
Epoch 8/35
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 209ms/step - loss: 0.0816
Epoch 9/35
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 213ms/step - loss: 0.0462
Epoch 10/35
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 215ms/step - loss: 0.0704
Epoch 11/3

In [6]:
import numpy as np
import scipy.signal as signal
import soundfile as sf

def save_generated_audio(generated_music, theme, cutoff_freq=5000, sample_rate=44100):
    print("Original shape:", generated_music.shape)

    # Example: Average over the first axis (segments) or adjust as necessary
    if generated_music.ndim == 3:
        # Reduce the shape to 2D by averaging over the first dimension
        generated_music = np.mean(generated_music, axis=0)  # Shape becomes (64, 430)

    if generated_music.ndim == 2:
        # Convert to 1D for mono audio by flattening
        generated_music = generated_music.flatten()  # Shape becomes (64 * 430,)

        # Design a low-pass Butterworth filter
        nyquist = 0.5 * sample_rate
        normal_cutoff = cutoff_freq / nyquist # Cutoff for filter
        b, a = signal.butter(4, normal_cutoff, btype='low', analog=False)

        # Apply the low-pass filter to the audio data
        filtered_music = signal.filtfilt(b, a, generated_music)

        # Optionally, rescale the audio data to the desired range
        filtered_music = np.clip(filtered_music, -1.0, 1.0)  # Clip values

        output_path = f'/content/generated_audio_{theme}.wav'  # Replace with your desired path
        sf.write(output_path, filtered_music, sample_rate)  # Assuming 44.1 kHz sample rate

        print(f"Generated audio saved to {output_path}")
    else:
        raise ValueError("Invalid shape after processing for audio output.")

# Example usage
save_generated_audio(generated_music, theme)


Original shape: (430, 5169)
Generated audio saved to /content/generated_audio_pop.wav
