In [None]:
import sys
from pathlib import Path
import os
from google.colab import drive

# Mount Google Drive at /content/drive (a remount is forced on every run).
drive.mount('/content/drive', force_remount=True)

# Root folder of the project on Drive; every other path below is derived
# from it, so fail fast when it is missing.
PROJECT_PATH = Path('/content/drive/My Drive/vocal_ddsp')
if not PROJECT_PATH.exists():
    raise Exception(f'Project path {PROJECT_PATH} does not exist')

# The prepared training data must already be present.
TRAINING_DATASET_PATH = f"{str(PROJECT_PATH)}/training_data"
if not Path(TRAINING_DATASET_PATH).exists():
    raise Exception(f"Training dataset path not found at '{TRAINING_DATASET_PATH}'")

# The checkpoints folder is created on demand. `exist_ok=True` makes this
# idempotent and replaces the former exists()/mkdir()/assert sequence:
# mkdir already raises on failure, and asserts are stripped under `python -O`.
# If the path exists but is not a directory, mkdir raises FileExistsError.
CHECKPOINTS_PATH = f"{str(PROJECT_PATH)}/checkpoints"
Path(CHECKPOINTS_PATH).mkdir(exist_ok=True)

# The gin configuration folder must already be present.
GINS_PATH = f"{str(PROJECT_PATH)}/gins"
if not Path(GINS_PATH).exists():
    raise Exception(f"Gins path not found at '{GINS_PATH}'")

# Install DDSP and the extra packages this notebook installs alongside it
# into the Colab runtime.
# NOTE(review): none of the versions are pinned, so a fresh runtime may pull
# releases that differ from the ones this notebook was written against.
# Colab-specific magic; presumably selects TensorFlow 2.x — confirm it is
# still supported by the current Colab runtime.
%tensorflow_version 2.x
# -q: quiet output, -U: upgrade to the newest available release.
%pip install -qU ddsp
%pip install apache-beam
%pip install python-snappy

import warnings
import copy
import os
import time
import glob
import gin
import librosa
import pickle
import crepe
import ddsp
import ddsp.training

import matplotlib.pyplot as plt
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
import ipywidgets as widgets

from ddsp.colab import colab_utils
from ddsp.training import postprocessing
from google.colab import files
from ipywidgets import interact
from IPython.display import Javascript

%config InlineBackend.figure_format='retina'

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
import random


class TrainedModel:
  """Bundles a trained DDSP autoencoder with the dataset it was trained on.

  Construction resolves the dataset and checkpoint folders, builds a TFRecord
  data provider, parses the checkpoint's gin config, loads the optional
  dataset statistics pickle and restores the model from the checkpoint folder.

  Attributes:
    dataset_pattern: Glob pattern (`<dataset folder>/*`) given to the provider.
    model_path: Checkpoint folder of the model.
    data_provider: `ddsp.training.data.TFRecordProvider` over `dataset_pattern`.
    gin_file: Path of `operative_config-0.gin` inside `model_path`.
    DATASET_STATS: Unpickled content of `dataset_statistics.pkl`, or None when
      the file is absent or could not be loaded.
    model: `ddsp.training.models.Autoencoder` restored from `model_path`.
  """

  def __init__(self, dataset_relative_path, model_relative_path = None):
    """Loads the dataset provider, gin config, statistics and model.

    Args:
      dataset_relative_path: Dataset folder, relative to TRAINING_DATASET_PATH.
      model_relative_path: Checkpoint folder, relative to CHECKPOINTS_PATH.
        Defaults to `dataset_relative_path` when None.

    Raises:
      Exception: If the dataset folder or the checkpoint folder does not exist.
    """
    if model_relative_path is None:
      model_relative_path = dataset_relative_path

    # Check that the dataset folder exists.
    dataset_path = f"{TRAINING_DATASET_PATH}/{dataset_relative_path}"
    if not Path(dataset_path).exists():
      # Report the local path: `self.dataset_path` is never assigned, so
      # formatting it here raised AttributeError and hid the real problem.
      raise Exception(f"Failed to load model instance, dataset_relative_path {dataset_path} does not exist")

    self.dataset_pattern = f"{dataset_path}/*"

    # Check that the checkpoint folder exists.
    self.model_path = f"{CHECKPOINTS_PATH}/{model_relative_path}"
    if not Path(self.model_path).exists():
      raise Exception(f"Failed to load model instance, model_relative_path {self.model_path} does not exist")

    self.data_provider = ddsp.training.data.TFRecordProvider(self.dataset_pattern)

    # Parse the gin config stored next to the checkpoint. The config is
    # unlocked first so that parsing works even after an earlier parse.
    self.gin_file = os.path.join(self.model_path, 'operative_config-0.gin')
    with gin.unlock_config():
      gin.parse_config_file(self.gin_file, skip_unknown=True)

    # Dataset statistics are optional; any failure leaves DATASET_STATS as None.
    dataset_stats_file = os.path.join(self.model_path, 'dataset_statistics.pkl')

    try:
      if tf.io.gfile.exists(dataset_stats_file):
        with tf.io.gfile.GFile(dataset_stats_file, 'rb') as f:
          # NOTE(review): pickle.load executes arbitrary code from the file;
          # only point this at checkpoint folders you trust.
          self.DATASET_STATS = pickle.load(f)
      else:
        print('WARNING: pickle file not present')
        self.DATASET_STATS = None
    except Exception as err:
      # Deliberately best-effort: report the problem and carry on without stats.
      print('Loading dataset statistics from pickle failed: {}.'.format(err))
      self.DATASET_STATS = None

    # Build the autoencoder (configured through gin) and restore it.
    self.model = ddsp.training.models.Autoencoder()
    self.model.restore(self.model_path)

  @property
  def dataset_length(self):
    """Number of examples in the training dataset.

    Every access iterates the whole dataset to count it, so this can be slow.
    """
    return len(list(self.get_training_dataset()))

  def get_training_dataset(self, shuffle=False):
    """Returns the provider's dataset, optionally shuffled."""
    return self.data_provider.get_dataset(shuffle=shuffle)

  def generate_audio_from_batch(self, batch):
    """Runs the model on `batch` in inference mode and returns the audio."""
    outputs = self.model(batch, training=False)
    return self.model.get_audio_from_outputs(outputs)

  def get_frame(self, shuffle_batching=False):
    """Returns one batch of size 1 taken from a random position.

    The position is drawn uniformly from [0, dataset_length] and then halved,
    so it always falls within the first half of the dataset.

    Args:
      shuffle_batching: Forwarded to the provider's `get_batch` as `shuffle`.

    Returns:
      The batch at the chosen position, or None if iteration over the batches
      ends before that position is reached.
    """
    batch = self.data_provider.get_batch(batch_size=1, shuffle=shuffle_batching)
    target_length = int(random.randint(0, self.dataset_length) / 2)
    for i, frame in enumerate(iter(batch)):
      if i == target_length:
        return frame
    return None
    
  




In [None]:
# Load the "TaylorSwiftSelected" dataset together with the checkpoint stored
# under "TaylorSwiftSelected_h100_n60".
trained_model = TrainedModel(
    dataset_relative_path="TaylorSwiftSelected",
    model_relative_path="TaylorSwiftSelected_h100_n60",
)

In [None]:
#postprocessing.compute_dataset_statistics(trained_model.data_provider)

In [None]:
# Compare an original training example with the model's resynthesis of it.
frame = trained_model.get_frame(shuffle_batching=True)
audio = trained_model.generate_audio_from_batch(frame)

# Original example: spectrogram, then playback.
colab_utils.specplot(frame['audio'])
colab_utils.play(frame['audio'])

# Model output: spectrogram, then playback.
colab_utils.specplot(audio)
colab_utils.play(audio)
#audio_features = ddsp.training.metrics.compute_audio_features(audio)

In [None]:
from tensorflow.python.ops.numpy_ops import np_config
# Cell purpose: pick the input audio, display/play it, extract its features
# with DDSP and plot them. Defines `audio`, `audio_features`, `TRIM` and resets
# `audio_features_mod`, all of which later cells read.
# Presumably required so tensors accept NumPy-style calls (e.g. `.astype`,
# boolean indexing) used further down — TODO confirm.
np_config.enable_numpy_behavior()

#@title Record or Upload Audio
#@markdown * Either record audio from microphone or upload audio from file (.mp3 or .wav) 
#@markdown * Audio should be monophonic (single instrument / voice)
#@markdown * Extracts fundmanetal frequency (f0) and loudness features. 

record_or_upload = "Upload (.mp3 or .wav)"  #@param ["Record", "Upload (.mp3 or .wav)"]

record_seconds =     5#@param {type:"number", min:1, max:10, step:1}

if record_or_upload == "Record":
  audio = ddsp.colab.colab_utils.record(seconds=record_seconds)
else:
  # NOTE: the file upload is currently disabled (see the commented-out lines).
  # This branch reuses the audio of `frame`, which an earlier cell must have
  # defined; on a fresh kernel run that cell first.
  # Load audio sample here (.mp3 or .wav file)
  # Just use the first file.
  #filenames, audios = ddsp.colab.colab_utils.upload()
  #audio = audios[0]
  audio = frame['audio']
# Add a leading axis to 1-D audio so it is always 2-D from here on.
if len(audio.shape) == 1:
  audio = audio[np.newaxis, :]
print('\nExtracting audio features...')

# Plot.
ddsp.colab.colab_utils.specplot(audio)
ddsp.colab.colab_utils.play(audio)

# Setup the session.
ddsp.spectral_ops.reset_crepe()

# Compute features.
start_time = time.time()
audio_features = ddsp.training.metrics.compute_audio_features(audio)
audio_features['loudness_db'] = audio_features['loudness_db'].astype(np.float32)
# Reset any adjusted features left over from a previous run; the resynthesis
# cell falls back to `audio_features` while this is None.
audio_features_mod = None
print('Audio features took %.1f seconds' % (time.time() - start_time))


# Every plotted series is cut with `[:TRIM]`, i.e. the last 15 frames are
# not drawn. NOTE(review): the reason for 15 is not visible here.
TRIM = -15
# Plot Features.
fig, ax = plt.subplots(nrows=3, 
                       ncols=1, 
                       sharex=True,
                       figsize=(6, 8))
ax[0].plot(audio_features['loudness_db'][:TRIM])
ax[0].set_ylabel('loudness_db')

ax[1].plot(librosa.hz_to_midi(audio_features['f0_hz'][:TRIM]))
ax[1].set_ylabel('f0 [midi]')

ax[2].plot(audio_features['f0_confidence'][:TRIM])
ax[2].set_ylabel('f0 confidence')
_ = ax[2].set_xlabel('Time step [frame]')

# Debug aid: print the Python type of every feature value.
print((list(type(k) for k in audio_features.values())))


In [None]:
from ddsp.colab.colab_utils import auto_tune, get_tuning_factor
from ddsp.training.postprocessing import detect_notes, fit_quantile_transform
# Statistics loaded with the model (None when no statistics file was found);
# the automatic adjustment below is skipped when this is None.
DATASET_STATS = trained_model.DATASET_STATS
#@title Modify conditioning

#@markdown These models were not explicitly trained to perform timbre transfer, so they may sound unnatural if the incoming loudness and frequencies are very different then the training data (which will always be somewhat true). 


#@markdown ## Note Detection

#@markdown You can leave this at 1.0 for most cases
threshold = 1 #@param {type:"slider", min: 0.0, max:2.0, step:0.01}


#@markdown ## Automatic

ADJUST = True #@param{type:"boolean"}

#@markdown Quiet parts without notes detected (dB)
quiet = 60 #@param {type:"slider", min: 0, max:60, step:1}

#@markdown Force pitch to nearest note (amount)
autotune = 0 #@param {type:"slider", min: 0.0, max:1.0, step:0.1}

#@markdown ## Manual


#@markdown Shift the pitch (octaves)
pitch_shift =  -0.1 #@param {type:"slider", min:-2, max:2, step:0.1}

#@markdown Adjust the overall loudness (dB)
# NOTE(review): -200 is the slider minimum; confirm this default is intended.
loudness_shift = -200 #@param {type:"slider", min:-200, max:20, step:1}


# Build a new dict whose values are TF tensors. All adjustments below are
# written to this dict, while `audio_features` is still used as the reference
# for note detection and for the "Original" curves in the plots.
audio_features_mod = {k: tf.convert_to_tensor(v) for k, v in audio_features.items()}

## Helper functions.
def shift_ld(audio_features, ld_shift=0.0):
  """Add `ld_shift` to the 'loudness_db' entry of `audio_features`.

  The entry is updated with `+=`, so the dict (and a mutable value such as a
  NumPy array) is modified in place. The same dict is returned.
  """
  key = 'loudness_db'
  audio_features[key] += ld_shift
  return audio_features


def shift_f0(audio_features, pitch_shift=0.0):
  """Transpose the 'f0_hz' entry by `pitch_shift` octaves and clamp it.

  The entry is first scaled by 2 ** pitch_shift using `*=` (in place for
  mutable values), then replaced by a copy clipped to the range
  [0, midi_to_hz(110)]. The same dict is returned.
  """
  key = 'f0_hz'
  audio_features[key] *= 2.0 ** (pitch_shift)
  upper_hz = librosa.midi_to_hz(110.0)
  audio_features[key] = np.clip(audio_features[key], 0.0, upper_hz)
  return audio_features


# Automatic adjustment: when enabled and dataset statistics are available,
# move the input's pitch register and loudness toward the training data, then
# apply the manual shifts. Detection reads the untouched `audio_features`;
# every change is written to `audio_features_mod`.
# `mask_on` stays None when the automatic branch is skipped; the plotting
# code uses that to decide whether to draw the note-on panel.
mask_on = None

if ADJUST and DATASET_STATS is not None:
  # Detect sections that are "on".
  mask_on, note_on_value = detect_notes(audio_features['loudness_db'],
                                        audio_features['f0_confidence'],
                                        threshold)

  if np.any(mask_on):
    # Shift the pitch register.
    # The difference between the dataset's mean pitch and the input's mean
    # pitch (over note-on frames) is converted to octaves (12 semitones each)
    # and rounded to a whole number of octaves.
    target_mean_pitch = DATASET_STATS['mean_pitch']
    pitch = ddsp.core.hz_to_midi(audio_features['f0_hz'])
    mean_pitch = np.mean(pitch[mask_on])
    p_diff = target_mean_pitch - mean_pitch
    p_diff_octave = p_diff / 12.0
    # NOTE(review): rounds down above 1.5 octaves and up otherwise; the
    # rationale for this threshold is not visible here.
    round_fn = np.floor if p_diff_octave > 1.5 else np.ceil
    p_diff_octave = round_fn(p_diff_octave)
    audio_features_mod = shift_f0(audio_features_mod, p_diff_octave)


    # Quantile shift the note_on parts.
    _, loudness_norm = fit_quantile_transform(
        audio_features['loudness_db'],
        mask_on,
        inv_quantile=DATASET_STATS['quantile_transform'])

    # Turn down the note_off parts.
    # The reduction is `quiet` scaled by (1 - note_on_value), so frames with a
    # lower note-on value are attenuated more.
    mask_off = np.logical_not(mask_on)
    loudness_norm[mask_off] -=  quiet * (1.0 - note_on_value[mask_off][:, np.newaxis])
    # Restore the shape of the original loudness series.
    loudness_norm = np.reshape(loudness_norm, audio_features['loudness_db'].shape)
    
    audio_features_mod['loudness_db'] = loudness_norm 

    # Auto-tune.
    # Skipped when `autotune` is 0; otherwise `autotune` is passed on as the
    # `amount` argument.
    if autotune:
      f0_midi = np.array(ddsp.core.hz_to_midi(audio_features_mod['f0_hz']))
      tuning_factor = get_tuning_factor(f0_midi, audio_features_mod['f0_confidence'], mask_on)
      f0_midi_at = auto_tune(f0_midi, tuning_factor, mask_on, amount=autotune)
      audio_features_mod['f0_hz'] = ddsp.core.midi_to_hz(f0_midi_at)

  else:
    # Only reachable when ADJUST is set, i.e. when no notes were detected.
    print('\nSkipping auto-adjust (no notes detected or ADJUST box empty).')

else:
  print('\nSkipping auto-adujst (box not checked or no dataset statistics found).')

# Manual Shifts.
# Applied in every case, after any automatic adjustment.
audio_features_mod = shift_ld(audio_features_mod, loudness_shift)
audio_features_mod = shift_f0(audio_features_mod, pitch_shift)



# Plot the original and adjusted features; a note-on panel is added on top
# when note detection ran (mask_on is set).
has_mask = int(mask_on is not None)
n_plots = 2 + has_mask
fig, axes = plt.subplots(nrows=n_plots, ncols=1, sharex=True,
                         figsize=(2*n_plots, 8))

# Optional first panel: threshold line, note-on value and resulting mask.
if mask_on is not None:
  ax = axes[0]
  ax.plot(np.ones_like(mask_on[:TRIM]) * threshold, 'k:')
  ax.plot(note_on_value[:TRIM])
  ax.plot(mask_on[:TRIM])
  ax.set_ylabel('Note-on Mask')
  ax.set_xlabel('Time step [frame]')
  ax.legend(['Threshold', 'Likelihood','Mask'])

# Loudness before and after adjustment.
ax = axes[has_mask]
ax.plot(audio_features['loudness_db'][:TRIM])
ax.plot(audio_features_mod['loudness_db'][:TRIM])
ax.set_ylabel('loudness_db')
ax.legend(['Original','Adjusted'])

# Fundamental frequency before and after adjustment.
ax = axes[has_mask + 1]
ax.plot(audio_features['f0_hz'][:TRIM])
ax.plot(audio_features_mod['f0_hz'][:TRIM])
ax.set_ylabel('f0 hz')
_ = ax.legend(['Original','Adjusted'])

In [None]:
#@title #Resynthesize Audio

# Use the adjusted features when they exist, otherwise the raw ones.
af = audio_features_mod if audio_features_mod is not None else audio_features

# Time one inference pass of the model and convert its outputs to audio.
start_time = time.time()
outputs = trained_model.model(af, training=False)
audio_gen = trained_model.model.get_audio_from_outputs(outputs)
print('Prediction took %.1f seconds' % (time.time() - start_time))

# Playback: input audio first, then the model output.
print('Original')
colab_utils.play(audio)

print('Resynthesis')
colab_utils.play(audio_gen)

# Spectrograms in the same order, each with a title.
colab_utils.specplot(audio)
plt.title("Original")

colab_utils.specplot(audio_gen)
_ = plt.title("Resynthesis")