In [None]:
# Mount Google Drive (Colab only) so the dataset paths under
# /content/gdrive used throughout this notebook resolve.
from google.colab import drive
drive.mount('/content/gdrive')

# Harmonizing with Failure
This notebook documents my journey through various models and methods that did not yield the expected results when attempting to generate violin music. It's a tale of trial and error, where each misstep is a note in the learning melody.

## Embracing Missteps
- **Learning Opportunities:** Think of each failed model as a complex chord, teaching the nuances of what harmonizes well and what creates dissonance.

- **Iterative Improvement:** The journey is a crescendo of iterative learning, where each attempt tunes its approach more finely.

- **Insight Sharing:** Sharing my out-of-tune attempts can be informative for others in the community.




## Imports

In [None]:
import requests
import csv
import os
import time
from pydub import AudioSegment
import torch
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import random
import shutil

## Baseline Model

### Standardize Spectrogram Sizes

In [None]:
import os
import numpy as np
from PIL import Image

def standardize_spectrogram_size(file_path, target_size=(128, 128)):
    """Load a spectrogram image and resize it to ``target_size``.

    The image is stretched/squeezed to the target size (no padding or
    cropping is performed), so the aspect ratio is not preserved.

    Args:
        file_path: Path to an image file readable by PIL.
        target_size: ``(width, height)`` tuple passed to ``Image.resize``.

    Returns:
        The resized image as a NumPy array.
    """
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    # Newer Pillow exposes it on Image.Resampling, older ones on Image itself.
    resample = getattr(Image, "Resampling", Image).LANCZOS

    # The context manager closes the underlying file handle, which matters
    # when this is called for hundreds of images in a loop.
    with Image.open(file_path) as image:
        resized = image.resize(target_size, resample)
        return np.array(resized)

folder_path = '/content/gdrive/MyDrive/Violin_Comp_Data/spectrograms'
standardized_spectrograms = []

# os.listdir returns entries in arbitrary order. Sorting keeps the dataset
# order stable between runs, so the seeded train/test split further down
# actually selects the same samples each time.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.png'):
        file_path = os.path.join(folder_path, filename)
        spec = standardize_spectrogram_size(file_path)
        standardized_spectrograms.append(spec)


### Scale the Spectrogram Values
Scale the values of the spectrograms to a consistent range of [0, 1]. This step is crucial for the model to process the input effectively, and will ensure no data leakage as no statistical calculations are being made on the dataset.

In [None]:
# Divide by 255.0 (the maximum of an 8-bit pixel) to map values into [0, 1].
# assumes the loaded images are 8-bit — TODO confirm
scaled_spectrograms = [spec / 255.0 for spec in standardized_spectrograms]


### Split Spectrograms into Sequences for Generation
I'll create input-output pairs where each input sequence is followed by its immediate next sequence, so the output of one pair is the input of the next.

In [None]:
sequence_length = 50

def create_sequences(spectrogram, sequence_length):
    """Cut a spectrogram into consecutive (input, next-chunk) pairs.

    The spectrogram is sliced along axis 1 in steps of ``sequence_length``.
    Each input is ``spectrogram[:, i:i + sequence_length]`` and its target is
    the chunk of the same width that immediately follows it.

    Args:
        spectrogram: Array with at least two dimensions; axis 1 is sliced.
        sequence_length: Width of every input and output chunk.

    Returns:
        ``(input_sequences, output_sequences)``: two lists of equal length.
        Both are empty when the spectrogram is narrower than
        ``2 * sequence_length``.
    """
    input_sequences = []
    output_sequences = []
    # The last start index that still leaves room for a full input AND a full
    # target. The "+ 1" below makes that index inclusive; without it a
    # spectrogram exactly 2 * sequence_length wide produced no pairs at all.
    last_start = spectrogram.shape[1] - 2 * sequence_length
    for i in range(0, last_start + 1, sequence_length):
        input_sequences.append(spectrogram[:, i:i + sequence_length])
        output_sequences.append(spectrogram[:, i + sequence_length:i + 2 * sequence_length])
    return input_sequences, output_sequences

# Collect the (input, target) chunks of every scaled spectrogram into two
# flat, index-aligned lists.
X = []
y = []
for scaled in scaled_spectrograms:
    chunk_inputs, chunk_targets = create_sequences(scaled, sequence_length)
    X += chunk_inputs
    y += chunk_targets


### Perform a Train-Test Split
Splitting sequences into training and testing sets.

In [None]:
from sklearn.model_selection import train_test_split

# X and y are index-aligned collections (lists or NumPy arrays); 20% of the
# pairs are held out, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)


### Convert the Split Data into Tensors for Training
Convert the data into tensors to prepare it for deep learning with TensorFlow.

- Flatten the last two dimensions. Since LSTMs don't process 2D data directly, I'll need to flatten the width and channels dimensions into a single feature dimension.

- Reshape the tensors.

In [None]:
import tensorflow as tf

# The split above yields Python lists of equally-shaped arrays. They have to
# be stacked and converted to tensors here: the *_tensors names were
# previously printed without ever being defined in this section, so the
# notebook failed on Restart & Run All.
# assumes each chunk is (height, width, channels), which is what the reshape
# cell below unpacks — TODO confirm against the PNG mode
X_train_tensors = tf.convert_to_tensor(np.array(X_train), dtype=tf.float32)
X_test_tensors = tf.convert_to_tensor(np.array(X_test), dtype=tf.float32)
y_train_tensors = tf.convert_to_tensor(np.array(y_train), dtype=tf.float32)
y_test_tensors = tf.convert_to_tensor(np.array(y_test), dtype=tf.float32)

print("X_train_tensors shape:", X_train_tensors.shape)
print("X_test_tensors shape:", X_test_tensors.shape)
print("y_train_tensors shape:", y_train_tensors.shape)
print("y_test_tensors shape:", y_test_tensors.shape)


In [None]:
import tensorflow as tf

def reshape_tensors(tensor):
    """Merge the last two axes of a 4-D tensor into one feature axis.

    Args:
        tensor: Tensor of shape ``(batch, height, width, channels)``, or an
            already-reshaped rank-3 tensor.

    Returns:
        A tensor of shape ``(batch, height, width * channels)``. A rank-3
        input is returned unchanged.
    """
    # The calling cell rebinds its inputs to the reshaped result, so running
    # it a second time passes rank-3 tensors back in. Returning them untouched
    # keeps the cell idempotent instead of failing on the 4-way unpack below.
    if len(tensor.shape) == 3:
        return tensor
    batch_size, height, width, channels = tensor.shape
    return tf.reshape(tensor, (batch_size, height, width * channels))

# Flatten (width, channels) into one feature axis for all four splits.
X_train_tensors, X_test_tensors, y_train_tensors, y_test_tensors = [
    reshape_tensors(split)
    for split in (X_train_tensors, X_test_tensors, y_train_tensors, y_test_tensors)
]


### Building the LSTM Model
Starting with a simple one-layer LSTM for baseline.

First, I'll need to reshape the data to be [batch_size, timesteps, features]. Here, timesteps could correspond to height (or width, depending on how the spectrograms are oriented), and features would be the flattened width * channels.

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Activation

# One LSTM layer that emits an output per timestep, followed by a per-timestep
# dense projection back to the target feature width and a ReLU.
model = Sequential([
    LSTM(
        256,
        input_shape=(X_train_tensors.shape[1], X_train_tensors.shape[2]),
        return_sequences=True,
    ),
    TimeDistributed(Dense(y_train_tensors.shape[2])),
    Activation('relu'),
])

model.compile(loss='mean_squared_error', optimizer='adam')


### Training the Model


In [None]:
# The held-out split doubles as the validation set during training.
history = model.fit(
    X_train_tensors,
    y_train_tensors,
    epochs=50,
    batch_size=64,
    validation_data=(X_test_tensors, y_test_tensors),
)


### Interpretation
The model shows promising signs of effective learning, as evidenced by the decreasing trend in both training and validation losses. The validation loss stays close to the training loss, which indicates a low risk of overfitting. A loss of around 0.075 at epoch 50 seems reasonable for scaled spectrogram data; however, the real measure of success lies in listening to the generated music.

### Generating Music

- Start with a seed sequence (part of the spectrogram).

- Predict the next part of the spectrogram.

- Append the prediction to the sequence and use it as the new seed.

- Repeat the process to generate subsequent parts of the spectrogram.

In [None]:
def generate_music(model, seed, length=1000):
    """Autoregressively generate slices by feeding each prediction back in.

    Args:
        model: Object with a ``predict`` method that maps a batch of one
            slice to a batch of one predicted slice.
        seed: 2-D slice used as the first model input.
        length: Number of prediction steps to run.

    Returns:
        NumPy array stacking the seed followed by the ``length`` predictions.
    """
    slices = [seed]
    for _ in range(length):
        previous = slices[-1]
        # Add a leading batch axis of size 1 for the model.
        batch_shape = (1, previous.shape[0], previous.shape[1])
        predicted_batch = model.predict(tf.reshape(previous, batch_shape))
        slices.append(predicted_batch[0])

    return np.array(slices)

# Use one held-out test slice (index 13, an arbitrary pick) as the seed and
# run 1000 autoregressive prediction steps from it.
seed = X_test_tensors[13]
generated_music = generate_music(model, seed, length=1000)


### Reconstruct the Continuous Spectrogram
Since the generated output is a sequence of overlapping spectrogram slices, I'll need to merge these slices back into a single, continuous spectrogram.

In [None]:
def reconstruct_spectrogram(generated_sequences):
    """Stitch generated slices into one continuous spectrogram.

    Assumes the first half of each slice overlaps the last half of the
    previous slice. Under that assumption only the second half of every slice
    after the first carries new columns.

    Args:
        generated_sequences: Array of shape ``(n_slices, rows, cols)``.

    Returns:
        2-D array made of the whole first slice followed by the second half
        of every later slice, joined along axis 1.
    """
    half = generated_sequences.shape[2] // 2

    # Keep the first slice in full. Starting from only its first half (as the
    # previous version did) lost its second half entirely, because the next
    # appended piece already begins one half-slice later.
    pieces = [generated_sequences[0]]
    pieces.extend(
        generated_sequences[i, :, half:]
        for i in range(1, generated_sequences.shape[0])
    )

    # One concatenate instead of an hstack per slice avoids re-copying the
    # growing result on every iteration.
    return np.concatenate(pieces, axis=1)

# Merge all generated slices into a single spectrogram.
continuous_spectrogram = reconstruct_spectrogram(generated_music)


### Convert Spectrogram to Audio
I'll attempt to use the Griffin-Lim algorithm to approximate the phase and invert the spectrogram back to a waveform.

In [None]:
!pip install soundfile

In [None]:
import os
import librosa
import soundfile as sf

output_folder = '/content/gdrive/MyDrive/Violin_Comp_Data/generated_music'

# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(output_folder, exist_ok=True)

# generated_music is iterated slice by slice, so one audio file is written
# per generated slice.
# NOTE(review): the values fed to db_to_amplitude come from image pixels
# scaled to [0, 1], not from a dB-scaled STFT — this is a likely cause of the
# noisy output; confirm before reusing this pipeline.
for i, generated_slice in enumerate(generated_music):
    # Convert to amplitude
    S = librosa.db_to_amplitude(generated_slice)

    # Use Griffin-Lim to approximate the phase. The result is stored under
    # its own name so the training targets `y` are not overwritten, and the
    # loop variable no longer shadows `continuous_spectrogram`.
    audio = librosa.griffinlim(S)

    # Construct the file path
    output_file = os.path.join(output_folder, f'generated_music_{i}.wav')

    # Save the audio file using soundfile
    sf.write(output_file, audio, samplerate=22050)


### Music Evaluation

Unfortunately, the generated audio was a bunch of white noise.

## First Tuned Model

### Use Different Types of Spectrograms
I'll experiment with different types of spectrograms such as Mel-spectrograms and CQT spectrograms.

### Adjust Spectrogram Generation and Preprocessing
**Change FFT Window Size**

Adjusting the FFT (Fast Fourier Transform) window size affects the resolution of the spectrogram.

In [None]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# Define the directory containing the audio files
audio_files_directory = '/content/gdrive/MyDrive/Violin_Comp_Data/converted_music_files_cleaned'

# Define the directory to save the spectrograms
spectrogram_output_directory = '/content/gdrive/MyDrive/Violin_Comp_Data/spectrograms_FFT_700'

# Create the spectrogram directory if it does not exist
os.makedirs(spectrogram_output_directory, exist_ok=True)

# Initialize a counter for the number of processed files
processed_files = 0

# os.listdir returns names in arbitrary order. Sorting makes the 700-file
# subset picked by the counter below the same on every run.
for filename in sorted(os.listdir(audio_files_directory)):
    if filename.lower().endswith('.wav'):
        audio_file_path = os.path.join(audio_files_directory, filename)

        # Load the audio file. `y` and `sr` of the LAST processed file are
        # reused by the Mel/CQT demo cells that follow.
        y, sr = librosa.load(audio_file_path)

        # Generate the STFT
        # NOTE(review): n_fft=2048 appears to be librosa's default, so this
        # does not actually change the window size — confirm intended value.
        D = librosa.stft(y, n_fft=2048)

        # Convert to dB scale for visualization
        D_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)

        # Plot the spectrogram
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(D_db, sr=sr, x_axis='time', y_axis='log')
        plt.colorbar(format='%+2.0f dB')
        plt.title(f'STFT with Modified FFT Window Size - {filename}')

        # Save the figure to the output folder. The saved PNG includes the
        # axes, colorbar and title, not just the spectrogram pixels.
        output_filename = os.path.join(spectrogram_output_directory, f'{os.path.splitext(filename)[0]}_spectrogram.png')
        plt.savefig(output_filename)
        plt.close()  # Close the plot to free up memory

        # Stop once 700 files have been rendered
        processed_files += 1
        if processed_files >= 700:
            break


Mel-Spectrogram:

In [None]:
import librosa

# Generate a Mel-spectrogram with 128 Mel bands.
# `y` and `sr` are whatever the spectrogram-rendering loop above loaded last,
# so that cell must have been run first.
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
# Convert power to dB relative to the maximum value for display.
S_db = librosa.power_to_db(S, ref=np.max)

librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel-Spectrogram')
plt.show()


Constant-Q Transform (CQT) Spectrogram:

In [None]:
# Generate a CQT spectrogram from the same `y` / `sr` leaked by the
# spectrogram-rendering loop above (last file it loaded).
C = librosa.cqt(y, sr=sr)
# The CQT is complex-valued; take the magnitude before converting to dB.
C_db = librosa.amplitude_to_db(np.abs(C), ref=np.max)

librosa.display.specshow(C_db, sr=sr, x_axis='time', y_axis='cqt_note')
plt.colorbar(format='%+2.0f dB')
plt.title('CQT Spectrogram')
plt.show()


### Next Steps

- Load the Generated Spectrograms

- Convert the Images to Arrays

- Split Each Spectrogram into Input-Output Pairs



In [None]:
import os
import librosa
import numpy as np
from PIL import Image

# Define the directory containing the spectrogram images
spectrogram_images_directory = '/content/gdrive/MyDrive/Violin_Comp_Data/spectrograms_FFT_700'

# Parameters for splitting into sequences
sequence_length = 50
overlap = 25

X = []  # Input sequences
y = []  # Output sequences (the input window shifted right by `overlap`)

# Sorted so the sample order (and therefore the seeded split) is repeatable.
for filename in sorted(os.listdir(spectrogram_images_directory)):
    if filename.endswith('.png'):  # Check for PNG files
        spectrogram_image_path = os.path.join(spectrogram_images_directory, filename)

        # Load the spectrogram image and convert it to a NumPy array
        image = Image.open(spectrogram_image_path).convert('L')  # Convert to grayscale
        spectrogram = np.array(image)

        # The target window ends at start_idx + sequence_length + overlap, so
        # the last usable start must leave room for that too. The previous
        # bound ignored `overlap`; for some image widths the final target was
        # cut short at the image edge, giving ragged windows that np.array()
        # cannot stack.
        last_start = spectrogram.shape[1] - sequence_length - overlap
        for start_idx in range(0, last_start + 1, sequence_length - overlap):
            end_idx = start_idx + sequence_length
            X.append(spectrogram[:, start_idx:end_idx])
            y.append(spectrogram[:, start_idx + overlap:end_idx + overlap])

# Convert lists to NumPy arrays
# NOTE(review): pixel values are left as 0-255 here (no /255 scaling as in
# the baseline section) — confirm that is intended.
X = np.array(X)
y = np.array(y)


In [None]:
import tensorflow as tf

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all.
# With no GPU present the loop body simply never runs.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Report the error instead of aborting the notebook.
    print(e)


### Convert to Tensors:
After splitting the spectrograms into sequences, convert X and y to TensorFlow tensors for training.

In [None]:
import tensorflow as tf

# Convert the stacked input/target windows to float32 tensors.
X_tensors = tf.convert_to_tensor(X, dtype=tf.float32)
y_tensors = tf.convert_to_tensor(y, dtype=tf.float32)

### Convert Tensors to NumPy Arrays


In [None]:
import numpy as np

# Back to NumPy (now float32) so scikit-learn can split the data.
X_numpy = X_tensors.numpy()
y_numpy = y_tensors.numpy()

### Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the windows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_numpy,
    y_numpy,
    test_size=0.2,
    random_state=13,
)


### Convert Back to Tensors

In [None]:
# Convert each of the four splits to a float32 tensor.
X_train_tensors, X_test_tensors, y_train_tensors, y_test_tensors = [
    tf.convert_to_tensor(split, dtype=tf.float32)
    for split in (X_train, X_test, y_train, y_test)
]


### Enhance Model Architecture
Increasing the number of LSTM units and adding convolutional layers to better capture the complexities in the spectrograms.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Conv1D, Flatten, Reshape, Activation

# Two 1-D convolutions, a dense bottleneck, then an LSTM with a per-timestep
# dense projection to the target width.
# NOTE(review): Flatten -> Dense(256) -> Reshape((-1, 256)) appears to leave a
# single timestep, so the output would be (batch, 1, features) while the
# targets have one row per spectrogram row — confirm with model.summary().
model = Sequential([
    Conv1D(32, 3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
    Conv1D(64, 3, activation='relu'),
    Flatten(),
    Dense(256, activation='relu'),
    Reshape((-1, 256)),
    LSTM(512, return_sequences=True),
    TimeDistributed(Dense(y_train.shape[2])),
    Activation('relu'),
])

model.compile(loss='mean_squared_error', optimizer='adam')


### Train Model

In [None]:
# Train on the NumPy splits; the held-out split serves as validation data.
history1 = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=16,
    validation_data=(X_test, y_test),
)

### Generating audio files of predicted spectrograms

In [None]:
# Run the trained model over every held-out input window.
predicted_spectrograms = model.predict(X_test_tensors)


In [None]:
# Sanity check: the resize step below depends on this shape.
print("Shape of predicted_spectrograms:", predicted_spectrograms.shape)


In [None]:
import numpy as np
from scipy.interpolate import RegularGridInterpolator

def resize_spectrogram(spectrogram, new_shape):
    """Resample a 2-D spectrogram to ``new_shape`` by interpolation.

    Both axes are mapped onto the unit interval and the values are
    interpolated with ``RegularGridInterpolator`` (its default method) at an
    evenly spaced grid of the requested size.

    Args:
        spectrogram: 2-D array-like, ``(rows, cols)``.
        new_shape: ``(new_rows, new_cols)`` of the result.

    Returns:
        Array of shape ``new_shape``.
    """
    spectrogram = np.asarray(spectrogram)

    # An axis with a single sample has no extent to interpolate across.
    # Duplicating it gives two identical grid points, so the result is simply
    # constant along that axis.
    # NOTE(review): how SciPy treats a length-1 grid axis seems to vary by
    # version; this sidesteps the question.
    for axis in range(2):
        if spectrogram.shape[axis] == 1:
            spectrogram = np.repeat(spectrogram, 2, axis=axis)

    # Original grid, normalised to [0, 1] on both axes
    row_coords = np.linspace(0, 1, spectrogram.shape[0])
    col_coords = np.linspace(0, 1, spectrogram.shape[1])
    interpolating_function = RegularGridInterpolator((row_coords, col_coords), spectrogram)

    # Target grid, also on [0, 1]; 'ij' indexing keeps (row, col) order
    new_rows, new_cols = np.meshgrid(
        np.linspace(0, 1, new_shape[0]),
        np.linspace(0, 1, new_shape[1]),
        indexing='ij',
    )
    query_points = np.stack([new_rows.ravel(), new_cols.ravel()], axis=-1)

    return interpolating_function(query_points).reshape(new_shape)

# Target shape: 1025 rows is n_fft // 2 + 1 for the n_fft = 2048 used by the
# Griffin-Lim cell below; 50 columns matches the sequence length.
new_shape = (1025, 50)

# Resize every predicted spectrogram to that shape.
resized_spectrograms = np.array([resize_spectrogram(spec, new_shape) for spec in predicted_spectrograms])


In [None]:
import librosa
import numpy as np

# Function to convert spectrogram to audio
def spectrogram_to_audio(spectrogram, n_fft, hop_length, num_iter=100):
    """Invert a spectrogram to a waveform with Griffin-Lim.

    Args:
        spectrogram: Spectrogram passed to ``librosa.db_to_amplitude``.
        n_fft: FFT size handed to Griffin-Lim.
        hop_length: Hop length handed to Griffin-Lim.
        num_iter: Number of Griffin-Lim iterations.

    Returns:
        The waveform returned by ``librosa.griffinlim``.
    """
    magnitude = librosa.db_to_amplitude(spectrogram)
    return librosa.griffinlim(
        magnitude,
        n_iter=num_iter,
        hop_length=hop_length,
        n_fft=n_fft,
    )

# Parameters for Griffin-Lim. n_fft = 2048 corresponds to the 1025 rows the
# predictions were resized to (n_fft // 2 + 1).
n_fft = 2048
hop_length = 512

# Generate one audio clip per resized predicted spectrogram.
generated_audios = [spectrogram_to_audio(spec, n_fft=n_fft, hop_length=hop_length) for spec in resized_spectrograms]


In [None]:
import soundfile as sf

audio_output_2 = '/content/gdrive/MyDrive/Violin_Comp_Data/audio_output_2'

# The directory is not created anywhere else, so create it here.
os.makedirs(audio_output_2, exist_ok=True)

for i, audio in enumerate(generated_audios):
    # os.path.join inserts the path separator. Plain string concatenation
    # produced ".../audio_output_2generated_audio_0.wav", i.e. files in the
    # parent folder with a mangled name.
    output_file_path = os.path.join(audio_output_2, f"generated_audio_{i}.wav")
    sf.write(output_file_path, audio, samplerate=22050)


### Music Evaluation
Again, unfortunately, the audio is a bunch of white noise.

## Experimenting with a WaveNet Model Architecture

### Preprocessing Audio Files & Sampling Dataset

Converting each audio file in the dataset to a raw audio waveform.

In [None]:
def select_random_files(source_directory, num_files_to_select, seed=None):
    """Pick a random sample of regular files from a directory.

    Args:
        source_directory: Directory to sample from (sub-directories are
            ignored).
        num_files_to_select: Number of file names to return.
        seed: Optional seed. When given, the same directory contents always
            yield the same selection; when ``None`` the global ``random``
            state is used, as before.

    Returns:
        List of file names (not full paths).

    Raises:
        ValueError: If more files are requested than the directory holds.
    """
    # Sorted because os.listdir order is arbitrary; without a stable order a
    # seed alone would not make the sample reproducible.
    all_files = sorted(
        entry
        for entry in os.listdir(source_directory)
        if os.path.isfile(os.path.join(source_directory, entry))
    )
    if num_files_to_select > len(all_files):
        raise ValueError(
            f"Requested {num_files_to_select} files but only "
            f"{len(all_files)} are available in {source_directory!r}"
        )
    sampler = random if seed is None else random.Random(seed)
    return sampler.sample(all_files, num_files_to_select)

def copy_files_to_directory(files, source_directory, destination_directory):
    """Copy the named files from one directory into another.

    The destination directory is created first when it does not exist. Files
    are copied with ``shutil.copy2`` under their original names.

    Args:
        files: Iterable of file names relative to ``source_directory``.
        source_directory: Directory the files are read from.
        destination_directory: Directory the files are written to.
    """
    destination_missing = not os.path.exists(destination_directory)
    if destination_missing:
        os.makedirs(destination_directory)

    for name in files:
        shutil.copy2(
            os.path.join(source_directory, name),
            os.path.join(destination_directory, name),
        )

# Source and destination directories
source_directory = '/content/gdrive/MyDrive/Violin_Comp_Data/converted_music_files_cleaned'
destination_directory = '/content/gdrive/MyDrive/Violin_Comp_Data/sample_music_files_150'

# Number of files to select
num_files_to_select = 150

# Get the random sample of files
# NOTE(review): the sample is drawn without a fixed seed, so each run picks a
# different subset; re-running this cell presumably adds further files to the
# same destination folder — verify before re-running.
random_sample_files = select_random_files(source_directory, num_files_to_select)

# Copy the files
copy_files_to_directory(random_sample_files, source_directory, destination_directory)


In [None]:
def load_audio_files(directory, sampling_rate=22050):
    """Load every WAV file in a directory as a waveform.

    Args:
        directory: Folder to scan (not recursive).
        sampling_rate: Rate passed to ``librosa.load`` as ``sr``.

    Returns:
        List of waveforms, one per WAV file, in sorted file-name order.
    """
    audio_waveforms = []
    # Sorted so the concatenated training waveform is identical between runs
    # (os.listdir order is arbitrary).
    for filename in sorted(os.listdir(directory)):
        # Case-insensitive match, consistent with the spectrogram-rendering
        # loop earlier in the notebook, so ".WAV" files are not skipped.
        if filename.lower().endswith('.wav'):
            file_path = os.path.join(directory, filename)
            # Only the samples are kept; the rate is the one requested.
            audio, _ = librosa.load(file_path, sr=sampling_rate)
            audio_waveforms.append(audio)
    return audio_waveforms

# Load and preprocess audio files
audio_directory = '/content/gdrive/MyDrive/Violin_Comp_Data/sample_music_files_150'
sampling_rate = 22050  # You can adjust this based on your data
audio_waveforms = load_audio_files(audio_directory, sampling_rate=sampling_rate)

# Flatten the list of waveforms to create one long waveform
combined_waveform = np.concatenate(audio_waveforms)

# Normalize the waveform between -1 and 1 by dividing by the peak magnitude
combined_waveform = combined_waveform / np.abs(combined_waveform).max()

# Reshape to (n_samples, 1, 1): every audio sample becomes its own row. The
# data generator below slices rows and reshapes each window to
# (sequence_length, 1).
combined_waveform = combined_waveform.reshape(-1, 1, 1)


### Creating Data Generator Class
Holding every training window in memory at once ran out of RAM. A data generator creates data in batches on-the-fly during training, which significantly reduces memory usage.

In [None]:
import numpy as np
import tensorflow as tf

class WaveNetDataGenerator(tf.keras.utils.Sequence):
    """Serve (window, next-sample) training batches from one long waveform.

    Batches are built on demand, so the full set of overlapping windows is
    never held in memory at once.

    Args:
        data: Array of audio samples indexed along axis 0.
        sequence_length: Number of samples in each input window.
        batch_size: Number of windows per batch (the last batch may be
            smaller).
    """

    def __init__(self, data, sequence_length, batch_size):
        # Newer Keras versions warn when a Sequence/PyDataset subclass skips
        # the base initializer; calling it is harmless on older versions.
        super().__init__()
        self.data = data
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        # A window starting at idx uses data[idx + sequence_length] as its
        # target, so the valid starts are 0 .. len(data) - sequence_length - 1,
        # which is len(data) - sequence_length windows. (The previous
        # "- 1" dropped the last valid window.)
        self.indices = np.arange(len(data) - sequence_length)

    def __len__(self):
        """Number of batches per epoch, counting a final partial batch."""
        return int(np.ceil(len(self.indices) / self.batch_size))

    def __getitem__(self, index):
        """Build batch ``index``.

        Returns:
            ``(X, y)`` where ``X`` is ``(batch, sequence_length, 1)`` float32
            and ``y`` is ``(batch, 1)`` float32 holding the sample that
            follows each window.
        """
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        X = np.zeros((len(batch_indices), self.sequence_length, 1), dtype=np.float32)
        y = np.zeros((len(batch_indices), 1), dtype=np.float32)

        for i, idx in enumerate(batch_indices):
            X[i] = self.data[idx:idx + self.sequence_length].reshape(-1, 1)
            y[i] = self.data[idx + self.sequence_length]

        return X, y


### Instantiate and Use the Data Generator

In [None]:
# Window length (in audio samples) and number of windows per batch.
sequence_length = 4000
batch_size = 32

# Instantiate the data generator
train_generator = WaveNetDataGenerator(combined_waveform, sequence_length, batch_size)


In [None]:
# Check the generator's batch creation: print the shapes of the first batch
# only, then stop iterating.
for X_batch, y_batch in train_generator:
    print("Batch X shape:", X_batch.shape)
    print("Batch y shape:", y_batch.shape)
    break


### WaveNet Model Structure:
WaveNet uses dilated causal convolutions, which allow the network to have a very large receptive field with fewer layers. It also uses residual and skip connections.

In [None]:
from tensorflow.keras.layers import Input, Conv1D, Add, Activation, Multiply
from tensorflow.keras.models import Model

def residual_block(x, i, n_filters, filter_width, dilation_rate):
    """One gated, dilated WaveNet residual block.

    Args:
        x: Input tensor of the block.
        i: Block index (unused; kept for interface compatibility).
        n_filters: Number of filters in every convolution of the block.
        filter_width: Kernel size of the dilated convolutions.
        dilation_rate: Dilation of the two gate convolutions.

    Returns:
        ``(res, skip)``: the residual output fed to the next block and the
        skip output collected by the caller.
    """
    # 'causal' padding pads only on the left, so the output at time t depends
    # on inputs at t and earlier. 'same' padding (used previously) lets the
    # convolution see future samples, which breaks next-sample prediction.
    tanh_out = Conv1D(n_filters, filter_width, dilation_rate=dilation_rate, padding='causal', activation='tanh')(x)
    sigm_out = Conv1D(n_filters, filter_width, dilation_rate=dilation_rate, padding='causal', activation='sigmoid')(x)
    # Gated activation unit: tanh "filter" modulated by a sigmoid "gate".
    z = Multiply()([tanh_out, sigm_out])
    # 1x1 convolution produces the skip output ...
    skip = Conv1D(n_filters, 1)(z)
    # ... which is also added back onto the block input (residual path).
    res = Add()([skip, x])
    return res, skip

def build_wavenet(input_shape, n_filters, filter_width, n_dilation_blocks):
    """Assemble and compile the WaveNet-style model.

    Stacks ``n_dilation_blocks`` residual blocks whose dilation doubles each
    time (1, 2, 4, ...), sums their skip outputs, and maps the sum to one
    output channel through two 1x1 convolutions.

    Args:
        input_shape: Shape passed to ``Input``.
        n_filters: Filters per convolution.
        filter_width: Kernel size of the dilated convolutions.
        n_dilation_blocks: Number of residual blocks.

    Returns:
        A model compiled with the Adam optimizer and mean-squared-error loss.
    """
    inputs = Input(shape=input_shape)

    hidden = inputs
    collected_skips = []
    for block_index in range(n_dilation_blocks):
        dilation = 2 ** block_index
        hidden, block_skip = residual_block(hidden, block_index, n_filters, filter_width, dilation)
        collected_skips.append(block_skip)

    # Sum all skip outputs, then ReLU -> 1x1 conv (ReLU) -> 1x1 conv
    summed = Add()(collected_skips)
    activated = Activation('relu')(summed)
    projected = Conv1D(n_filters, 1, activation='relu')(activated)
    prediction = Conv1D(1, 1)(projected)

    model = Model(inputs=inputs, outputs=prediction)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Model parameters
input_shape = (None, 1)  # variable-length sequences with one channel
n_filters = 64
filter_width = 2
n_dilation_blocks = 6  # dilations 1, 2, 4, 8, 16, 32

# Create the model
wavenet_model = build_wavenet(input_shape, n_filters, filter_width, n_dilation_blocks)
wavenet_model.summary()


In [None]:
# NOTE(review): this repeats the earlier "Instantiate and Use the Data
# Generator" cell with the same values.
sequence_length = 4000
batch_size = 32

train_generator = WaveNetDataGenerator(combined_waveform, sequence_length, batch_size)


### Training the Model

In [None]:
# Fit the model using the generator (a single epoch).
history3 = wavenet_model.fit(train_generator, epochs=1)


In [None]:
sequence_length = 4000  # Number of timesteps in each input sequence

# Hold out the final 20% of the waveform for validation and serve both parts
# through generators.
#
# The previous version materialised every overlapping 4000-sample window as
# one NumPy array (roughly 4000x the size of the waveform), which defeats the
# purpose of the data generator. Its windows were also 4-D, because
# combined_waveform is (n_samples, 1, 1). And they were drawn from the same
# audio the model trains on, so the validation loss measured nothing new.
split_index = int(len(combined_waveform) * 0.8)
train_generator = WaveNetDataGenerator(combined_waveform[:split_index], sequence_length, batch_size)
val_generator = WaveNetDataGenerator(combined_waveform[split_index:], sequence_length, batch_size)

# Fit the model using the generators
history3 = wavenet_model.fit(train_generator, epochs=5, validation_data=val_generator)


### Generate Audio from the Model
Adjusting the approach to generate one sample at a time, which avoids the complexity of batch predictions while still being more efficient than the original approach.

In [None]:
def generate_audio_batch(model, seed_sequence, generation_length, sampling_rate, batch_size):
    """Generate audio one sample at a time from a seed window.

    On every step the model predicts from the current window, the last
    timestep of the prediction is taken as the next sample, and the window
    slides forward by one sample.

    Args:
        model: Object whose ``predict`` maps a ``(1, T, 1)`` array to an
            array indexable as ``[0, -1, 0]``.
        seed_sequence: 1-D array used as the initial window.
        generation_length: Target total length (seed included) of the
            returned audio, in samples.
        sampling_rate: Unused; kept for interface compatibility.
        batch_size: Unused; kept for interface compatibility.

    Returns:
        1-D array that starts with the seed samples followed by the generated
        samples. Generation stops once the total length reaches
        ``generation_length``; at least one sample is generated whenever
        ``generation_length`` is positive.
    """
    seed_samples = np.array(seed_sequence).ravel()

    # Model input window, shape [1, sequence_length, 1]
    window = np.asarray(seed_sequence).reshape(1, -1, 1)

    # Collect new samples in a list and join once at the end. np.append
    # inside the loop re-copied the whole output on every step (quadratic).
    new_samples = []
    for _ in range(generation_length):
        next_sample = model.predict(window)[0, -1, 0]
        new_samples.append(next_sample)

        # Slide the window: drop the oldest sample, append the newest
        newest = np.array([[[next_sample]]])
        window = np.concatenate((window[:, 1:, :], newest), axis=1)

        # Break if enough samples are generated
        if seed_samples.size + len(new_samples) >= generation_length:
            break

    if not new_samples:
        return seed_samples
    return np.concatenate((seed_samples, np.asarray(new_samples).ravel()))

# Parameters
sequence_length = 4000  # Same window length the model was trained with
# The training audio was loaded at 22050 Hz. Writing the output at any other
# rate (16000 was used here before) changes its pitch and speed.
sampling_rate = 22050
generation_length = sampling_rate * 10  # 10 seconds

# Seed sequence to start generation: uniform noise in [-1, 1]. The seed is
# part of the returned audio.
seed_sequence = np.random.uniform(-1, 1, sequence_length)

# Generate audio
generated_audio = generate_audio_batch(wavenet_model, seed_sequence, generation_length, sampling_rate, 1)


### Save the Generated Audio:
You can save the generated audio to a file using librosa or soundfile.

In [None]:
import soundfile as sf

# Normalize the generated audio to between -1 and 1. An all-zero signal is
# left as is rather than divided by zero (which would fill it with NaNs).
peak = np.max(np.abs(generated_audio))
if peak > 0:
    generated_audio = generated_audio / peak

# Save to a file, creating the output folder first since nothing else in the
# notebook does.
output_path = '/content/gdrive/MyDrive/Violin_Comp_Data/WaveNet_generated_music/generated_audio.wav'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
sf.write(output_path, generated_audio, samplerate=sampling_rate)


### Music Evaluation
Once again, the generated audio is a bunch of white noise. I'll adjust tactics by using MIDI files for model generation instead of spectrograms.

# Proceed to 'Model_1_WaveNet.ipynb'