# Real-Time Voice Cloning

This is a Colab demo notebook that uses the open-source project [CorentinJ/Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning)
to clone a voice.

For other deep-learning Colab notebooks, visit [tugstugi/dl-colab-notebooks](https://github.com/tugstugi/dl-colab-notebooks).


Original issue: https://github.com/tugstugi/dl-colab-notebooks/issues/18

## Setup CorentinJ/Real-Time-Voice-Cloning

In [1]:
#@title Setup CorentinJ/Real-Time-Voice-Cloning

#@markdown * clone the project
#@markdown * download pretrained models
#@markdown * initialize the voice cloning models

import os
from os.path import exists, join, basename, splitext

git_repo_url = 'https://github.com/CorentinJ/Real-Time-Voice-Cloning.git'
project_name = splitext(basename(git_repo_url))[0]
if not exists(project_name):
  # clone and install
  !git clone -q --recursive {git_repo_url}
  # install dependencies
  !pip install -q -r {project_name}/requirements.txt
  !pip install -q --upgrade gdown
  !apt-get install -qq libportaudio2
  !pip install -q https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip

  # download pretrained model
  #!cd {project_name} && wget https://github.com/blue-fish/Real-Time-Voice-Cloning/releases/download/v1.0/pretrained.zip && unzip -o pretrained.zip
  !cd {project_name} && mkdir -p saved_models/default/
  !cd {project_name}/saved_models/default/ && gdown https://drive.google.com/uc?id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1
  !cd {project_name}/saved_models/default/ && gdown https://drive.google.com/uc?id=1EqFMIbvxffxtjiVrtykroF6_mUh-5Z3s
  !cd {project_name}/saved_models/default/ && gdown https://drive.google.com/uc?id=1cf2NO6FtI0jDuy8AV3Xgn6leO6dHjIgu

!pip install -q unidecode

# Patch for numpy.cumproduct issue in newer numpy versions
# This replaces np.cumproduct(upsample_scales)[-1] with np.prod(upsample_scales)
!sed -i "s/np.cumproduct(upsample_scales)\[-1\]/np.prod(upsample_scales)/" {project_name}/vocoder/models/fatchord_version.py

# Patch for librosa.resample issue in newer librosa versions
# This replaces positional arguments with keyword arguments
!sed -i "s/librosa.resample(wav, source_sr, sampling_rate)/librosa.resample(y=wav, orig_sr=source_sr, target_sr=sampling_rate)/" {project_name}/encoder/audio.py

# Patch for librosa.feature.melspectrogram issue in newer librosa versions
# This replaces positional arguments with keyword arguments
!sed -i "/librosa.feature.melspectrogram(/,+1 s/^\(\s*\)wav,/\1y=wav,/" {project_name}/encoder/audio.py
!sed -i "/librosa.feature.melspectrogram(/,+2 s/^\(\s*\)sampling_rate,/\1sr=sampling_rate,/" {project_name}/encoder/audio.py

# --- Debugging: Print relevant lines after patching ---
print(f"--- Patched {project_name}/encoder/audio.py (lines 50-65) ---")
!sed -n '50,65p' {project_name}/encoder/audio.py
print("-------------------------------------------------------")

import sys
sys.path.append(project_name)

from IPython.display import display, Audio, clear_output
from IPython.utils import io
import ipywidgets as widgets
import numpy as np
from dl_colab_notebooks.audio import record_audio, upload_audio

from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path

import librosa
print(f"Librosa version: {librosa.__version__}")

!ls
encoder.load_model(project_name / Path("saved_models/default/encoder.pt"))
synthesizer = Synthesizer(project_name / Path("saved_models/default/synthesizer.pt"))
vocoder.load_model(project_name / Path("saved_models/default/vocoder.pt"))

--- Patched Real-Time-Voice-Cloning/encoder/audio.py (lines 50-65) ---
    return wav


def wav_to_mel_spectrogram(wav):
    """
    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
    Note: this not a log-mel spectrogram.
    """
    frames = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=int(sampling_rate * mel_window_length / 1000),
        hop_length=int(sampling_rate * mel_window_step / 1000),
        n_mels=mel_n_channels
    )
    return frames.astype(np.float32).T
-------------------------------------------------------
Librosa version: 0.11.0
'2 lab.mp3'		  'Лабковский (1).mp3'	 Лабковский.mp3
 Real-Time-Voice-Cloning  'Лабковский (2).mp3'
 sample_data		  'Лабковский (3).mp3'


  warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")


Loaded encoder "encoder.pt" trained to step 1564501
Synthesizer using device: cuda
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at Real-Time-Voice-Cloning/saved_models/default/vocoder.pt


In [None]:
#@title Record or Upload
#@markdown * Either record audio from microphone or upload audio from file (.mp3 or .wav)

# Sample rate passed to record_audio/upload_audio, to the Audio playback
# widget and to encoder.preprocess_wav in this cell.
SAMPLE_RATE = 22050
# Colab form dropdown: selects which branch at the bottom of this cell runs.
record_or_upload = "Upload (.mp3 or .wav)" #@param ["Record", "Upload (.mp3 or .wav)"]
# Colab form number field: passed to record_audio, so only used in "Record" mode.
record_seconds =   10#@param {type:"number", min:1, max:10, step:1}

# Embedding of the reference voice; written by _compute_embedding and read by
# the synthesis cell. Re-running this cell resets it to None.
embedding = None
def _compute_embedding(audio):
  """Play back `audio` and store its utterance embedding in the global `embedding`.

  The global is cleared before the encoder runs, so if preprocessing or
  embedding raises, it is left as None rather than holding an earlier result.
  """
  global embedding
  display(Audio(audio, rate=SAMPLE_RATE, autoplay=True))
  embedding = None
  preprocessed_wav = encoder.preprocess_wav(audio, SAMPLE_RATE)
  embedding = encoder.embed_utterance(preprocessed_wav)
def _record_audio(b):
  """Button click handler: record `record_seconds` of audio and embed it.

  `b` is the argument supplied by the click callback; it is not used.
  """
  clear_output()
  _compute_embedding(record_audio(record_seconds, sample_rate=SAMPLE_RATE))
def _upload_audio(b):
  """Ask for an audio file via upload_audio and embed its contents.

  `b` is accepted so the function has the same shape as a button click
  handler; it is not used.
  """
  clear_output()
  _compute_embedding(upload_audio(sample_rate=SAMPLE_RATE))

if record_or_upload != "Record":
  # Upload mode: start the upload right away instead of waiting for a button click.
  _upload_audio("")
else:
  # Record mode: recording starts only when the user clicks the button.
  button = widgets.Button(description="Record Your Voice")
  button.on_click(_record_audio)
  display(button)

In [None]:
#@title Synthesize a text { run: "auto" }
# Text to be spoken in the cloned voice, editable through the Colab form field.
# With `run: "auto"` Colab re-executes this cell whenever the field changes.
# NOTE(review): the pretrained synthesizer appears to be English-only upstream —
# confirm that Russian input like the default below produces usable speech.
text = "Как истинный адепт лунной нумерологии, я прописываю вам обязательный ритуал: каждое полнолуние переставляйте мебель по схеме, указанной в вашей личной карте Таро «Шут». Запишите это. Ваша депрессия — это просто скопление тёмной энергии в ауре, легко смывается отваром папоротника, собранного в левую пятницу. А все ваши проблемы с родителями — результат неудачного переселения душ, ясно виденный в моей хрустальной черепахе." #@param {type:"string"}

def synthesize(embed, text):
  """Generate speech for `text` from the voice embedding `embed` and play it.

  The synthesizer turns the text and embedding into a spectrogram, the
  vocoder turns that spectrogram into a waveform, and the result is shown
  as an autoplaying audio widget after the cell output has been cleared.
  """
  print("Synthesizing new audio...")
  spectrograms = synthesizer.synthesize_spectrograms([text], [embed])
  waveform = vocoder.infer_waveform(spectrograms[0])
  # Append one second of silence: sample_rate zero-valued samples at the end.
  trailing_silence = (0, synthesizer.sample_rate)
  waveform = np.pad(waveform, trailing_silence, mode="constant")
  clear_output()
  display(Audio(waveform, rate=synthesizer.sample_rate, autoplay=True))

# `embedding` is looked up through globals() so that running this cell before
# the "Record or Upload" cell prints the hint below instead of raising NameError.
if globals().get("embedding") is None:
  print("first record a voice or upload a voice file!")
else:
  synthesize(embedding, text)