Original notebook: https://github.com/tugstugi/dl-colab-notebooks/blob/master/notebooks/RealTimeVoiceCloning.ipynb


In [1]:
import os
import sys
import numpy as np
import ipywidgets as widgets
from pathlib import Path
from os.path import exists, join, basename, splitext
from IPython.utils import io
from IPython.display import display, Audio, clear_output

In [None]:
# Colab magic: request the TensorFlow 1.x runtime for this session.
%tensorflow_version 1.x

# Repository to clone. The checkout directory name is the URL's basename
# with its extension stripped.
# NOTE(review): `dir` shadows the Python builtin of the same name for the rest
# of the session; kept as-is because other cells may rely on it.
git = 'https://github.com/jelic98/Real-Time-Voice-Cloning.git'
dir = splitext(basename(git))[0]

# Start from a clean checkout under /content and make it the working directory,
# so the relative paths used afterwards resolve inside the repository.
%cd '/content'
!rm -rf '{dir}'
!git clone -q --recursive '{git}'
%cd '{dir}'
# Install the repository's Python requirements, gdown, the PortAudio system
# library and the colab_utils archive (presumably the source of the
# `dl_colab_notebooks` helpers imported later; confirm).
!pip install -q -r requirements.txt
!pip install -q gdown
!apt-get install -qq libportaudio2
!pip install -q https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip
# Fetch a file from Google Drive and, only if the download succeeds, unzip
# pretrained.zip (presumably the pretrained model weights; confirm).
!gdown https://drive.google.com/uc?id=1n1sPXvT34yXFLT47QZA6FIRGrwMeSsZc && unzip pretrained.zip

In [9]:
from encoder import inference as encoder
from vocoder import inference as vocoder
from synthesizer.inference import Synthesizer
from dl_colab_notebooks.audio import record_audio, upload_audio

In [None]:
# Locations of the pretrained artefacts, relative to the current working directory.
encoder_weights = Path('encoder/saved_models/pretrained.pt')
vocoder_weights = Path('vocoder/saved_models/pretrained/pretrained.pt')
synthesizer_checkpoint = Path('synthesizer/saved_models/logs-pretrained/taco_pretrained')

# Load the encoder and vocoder weights, then build the synthesizer from its checkpoint path.
encoder.load_model(encoder_weights)
vocoder.load_model(vocoder_weights)
synthesizer = Synthesizer(synthesizer_checkpoint)

In [None]:
# Sample rate shared by the record/upload helpers, the playback widget and the
# encoder preprocessing call in this cell.
RATE = 44100
# Colab form field: which button this cell shows ("Record" or "Upload").
source = "Upload" #@param ["Record", "Upload"]
# Colab form field: passed as the first argument to record_audio
# (presumably the recording length in seconds; confirm against the helper).
duration = 5 #@param {type:"number", min:1, max:10, step:1}
# Speaker embedding; reset to None here and only filled in once
# compute_embedding() runs from a button callback.
embedding = None

def compute_embedding(audio):
    """Play back `audio` and store its speaker embedding in the global `embedding`.

    The clip is displayed as an autoplaying audio widget at `RATE`, then run
    through `encoder.preprocess_wav(audio, RATE)` and `encoder.embed_utterance`.
    Nothing is returned; the result is published via the module-level variable.
    """
    global embedding
    player = Audio(audio, rate=RATE, autoplay=True)
    display(player)
    preprocessed = encoder.preprocess_wav(audio, RATE)
    embedding = encoder.embed_utterance(preprocessed)

def click_record(button):
    """Button callback: clear the cell output, record a clip and embed it.

    `button` is the argument supplied by `on_click`; it is not used.
    The clip comes from `record_audio(duration, sample_rate=RATE)`.
    """
    clear_output()
    compute_embedding(record_audio(duration, sample_rate=RATE))

def click_upload(button):
    """Button callback: clear the cell output, ask for an audio file and embed it.

    `button` is the argument supplied by `on_click`; it is not used.
    The clip comes from `upload_audio(sample_rate=RATE)`.
    """
    clear_output()
    compute_embedding(upload_audio(sample_rate=RATE))

# Choose the button label and click handler for the selected source;
# any value other than "Record" falls through to the upload flow.
if source == "Record":
    label, handler = "Record your voice", click_record
else:
    label, handler = "Upload voice file", click_upload

button = widgets.Button(description=label)
button.on_click(handler)
display(button)

In [None]:
# Colab form field: the sentence to speak in the cloned voice.
text = "Companies scramble to define the future of work as COVID-19 lingers" #@param {type:"string"}

# `embedding` is only filled in by the record/upload button callback of the
# previous cell. If this cell runs before that callback has finished, the
# synthesizer would receive [None]; fail early with a readable message instead.
if embedding is None:
    raise RuntimeError(
        "No speaker embedding available: run the previous cell and record or "
        "upload a voice sample before synthesizing."
    )

# One text and one embedding in, one spectrogram out; the vocoder turns it into a waveform.
specs = synthesizer.synthesize_spectrograms([text], [embedding])
generated_wav = vocoder.infer_waveform(specs[0])
# Append `sample_rate` zero samples, i.e. one second of silence at the playback rate used below.
generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
# Drop any progress output from synthesis so that only the audio player remains.
clear_output()
display(Audio(generated_wav, rate=synthesizer.sample_rate, autoplay=True))