Make sure GPU is enabled
Runtime -> Change Runtime Type -> Hardware Accelerator -> GPU


# This google colab notebook is richer functionality implementation of Real-Time Voice Cloning [jupyter notebook ](https://github.com/CorentinJ/Real-Time-Voice-Cloning/blob/master/demo_toolbox_collab.ipynb) providing Yoda's voice generation 

In [9]:
#@title <b><font color="red" size="+3">←</font><font color="black" size="+3"> Clone git repo</font></b>
!git clone https://github.com/CorentinJ/Real-Time-Voice-Cloning.git


Cloning into 'Real-Time-Voice-Cloning'...
remote: Enumerating objects: 2375, done.[K
remote: Total 2375 (delta 0), reused 0 (delta 0), pack-reused 2375[K
Receiving objects: 100% (2375/2375), 360.72 MiB | 27.29 MiB/s, done.
Resolving deltas: 100% (1301/1301), done.


In [0]:
#@title <b><font color="red" size="+3">←</font><font color="black" size="+3"> Uninstall predefined from colab tf 2.x and install tf version 1.14</font></b>
%tensorflow_version 2.x

!pip uninstall -y tensorflow -q 
!pip install tensorflow-gpu==1.14.0 -q
!pip install tensorflow==1.14 -q

In [11]:
#@title <b><font color="red" size="+3">←</font><font color="black" size="+3"> Install dependencies</font></b>
%cd Real-Time-Voice-Cloning/
!pip install --disable-pip-version-check -q -r requirements.txt 
!apt-get install -qq libportaudio2

/content/Real-Time-Voice-Cloning/Real-Time-Voice-Cloning


In [12]:
#@title <b><font color="red" size="+3">←</font><font color="black" size="+3"> Download dataset and Yoda voice recording</font></b>
!gdown https://drive.google.com/uc?id=1n1sPXvT34yXFLT47QZA6FIRGrwMeSsZc
!unzip pretrained.zip
!gdown --id 1W_5fYVAlJiC7GGfLxatzZXXv99gMTNhq

Downloading...
From: https://drive.google.com/uc?id=1n1sPXvT34yXFLT47QZA6FIRGrwMeSsZc
To: /content/Real-Time-Voice-Cloning/Real-Time-Voice-Cloning/pretrained.zip
384MB [00:01, 194MB/s]
Archive:  pretrained.zip
   creating: encoder/saved_models/
  inflating: encoder/saved_models/pretrained.pt  
   creating: synthesizer/saved_models/
   creating: synthesizer/saved_models/logs-pretrained/
   creating: synthesizer/saved_models/logs-pretrained/taco_pretrained/
 extracting: synthesizer/saved_models/logs-pretrained/taco_pretrained/checkpoint  
  inflating: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000.data-00000-of-00001  
  inflating: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000.index  
  inflating: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000.meta  
   creating: vocoder/saved_models/
   creating: vocoder/saved_models/pretrained/
  inflating: vocoder/saved_models/pretrained/pre

In [13]:
#@title <b><font color="red" size="+3">←</font><font color="black" size="+3"> Load model pretrained checkpoint</font></b>
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from IPython.display import Audio
from IPython.utils import io
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import librosa

encoder_weights = Path("encoder/saved_models/pretrained.pt")
vocoder_weights = Path("vocoder/saved_models/pretrained/pretrained.pt")
syn_dir = Path("synthesizer/saved_models/logs-pretrained/taco_pretrained")
encoder.load_model(encoder_weights)
synthesizer = Synthesizer(syn_dir)
vocoder.load_model(vocoder_weights)

Loaded encoder "pretrained.pt" trained to step 1564501
Found synthesizer "pretrained" trained to step 278000
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at vocoder/saved_models/pretrained/pretrained.pt


In [14]:
#@title <b><font color="red" size="+3">←</font><font color="black" size="+3"> Synthesize Yoda voice by given text (or write your own)</font></b>
def synth():
  text = "This is being said in my own voice.  The computer has learned to do an impression of me." #@param {type:"string"}
  in_fpath = Path("25_great_Yoda_quotes.wav")
  reprocessed_wav = encoder.preprocess_wav(in_fpath)
  original_wav, sampling_rate = librosa.load(in_fpath)
  preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
  embed = encoder.embed_utterance(preprocessed_wav)
  print("Synthesizing new audio...")
  with io.capture_output() as captured:
    specs = synthesizer.synthesize_spectrograms([text], [embed])
  generated_wav = vocoder.infer_waveform(specs[0])
  generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
  display(Audio(generated_wav, rate=synthesizer.sample_rate))
synth()

Synthesizing new audio...
{| ████████████████ 85500/86400 | Batch Size: 9 | Gen Rate: 7.8kHz | }

In [16]:
#@title <b><font color="red" size="+3">←</font><font color="black" size="+3"> Record and synthesize your own voice</font></b>
# Code for recording audio from the browser
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
import IPython
import uuid
import os
from google.colab import output
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

class InvokeButton(object):
  def __init__(self, title, callback):
    self._title = title
    self._callback = callback

  def _repr_html_(self):
    from google.colab import output
    callback_id = 'button-' + str(uuid.uuid4())
    output.register_callback(callback_id, self._callback)

    template = """<button id="{callback_id}" style="cursor:pointer;background-color:#EEEEEE;border-color:#E0E0E0;padding:5px 15px;font-size:14px">{title}</button>
        <script>
          document.querySelector("#{callback_id}").onclick = (e) => {{
            google.colab.kernel.invokeFunction('{callback_id}', [], {{}})
            e.preventDefault();
          }};
        </script>"""
    html = template.format(title=self._title, callback_id=callback_id)
    return html

RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(sec=3):
  display(Javascript(RECORD))
  s = output.eval_js('record(%d)' % (sec*1000))
  b = b64decode(s.split(',')[1])
  with open('audio.wav','wb+') as f:
    f.write(b)
  return 'audio.wav'

def synth():
  text = "jinzza jamitda jinzza jamitda jinzza jamitda jinzza jamitda" #@param {type:"string"}
  print("Now recording for 10 seconds, say what you will...")
  record(10)
  print("Audio recording complete")
  in_fpath = Path("audio.wav")
  reprocessed_wav = encoder.preprocess_wav(in_fpath)
  original_wav, sampling_rate = librosa.load(in_fpath)
  preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
  embed = encoder.embed_utterance(preprocessed_wav)
  print("Synthesizing new audio...")
  with io.capture_output() as captured:
    specs = synthesizer.synthesize_spectrograms([text], [embed])
  generated_wav = vocoder.infer_waveform(specs[0])
  generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
  display(Audio(generated_wav, rate=synthesizer.sample_rate))
InvokeButton('Start recording', synth)

Now recording for 10 seconds, say what you will...


<IPython.core.display.Javascript object>

Audio recording complete
Synthesizing new audio...
{| ████████████████ 85500/86400 | Batch Size: 9 | Gen Rate: 7.7kHz | }

Now recording for 10 seconds, say what you will...


<IPython.core.display.Javascript object>

Audio recording complete
Synthesizing new audio...
{| ████████████████ 104500/105600 | Batch Size: 11 | Gen Rate: 9.5kHz | }