In [None]:
import soundfile as sf
import torch 

In [67]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the pretrained autovc model:
autovc = torch.hub.load('RF5/simple-autovc', 'autovc').to(device)
autovc.eval()
# Load the pretrained hifigan model:
hifigan = torch.hub.load('RF5/simple-autovc', 'hifigan').to(device)
hifigan.eval()
# Load speaker embedding model:
sse = torch.hub.load('RF5/simple-speaker-embedding', 'convgru_embedder').to(device)
sse.eval()

Using cache found in /root/.cache/torch/hub/RF5_simple-autovc_master
Using cache found in /root/.cache/torch/hub/RF5_simple-autovc_master
Using cache found in /root/.cache/torch/hub/RF5_simple-speaker-embedding_master
Downloading: "https://github.com/RF5/simple-speaker-embedding/releases/download/v1.0/convgru_ckpt_00700000_strip.pt" to /root/.cache/torch/hub/checkpoints/convgru_ckpt_00700000_strip.pt


  0%|          | 0.00/121M [00:00<?, ?B/s]

ConvGRUEmbedder(
  (model): ConvRNNEmbedder(
    (conv_encoder): ConvEncoder(
      (conv_layers): ModuleList(
        (0): Sequential(
          (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (1): Dropout(p=0.0, inplace=False)
          (2): GroupNorm(512, 512, eps=1e-05, affine=True)
          (3): GELU()
        )
        (1): Sequential(
          (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (1): Dropout(p=0.0, inplace=False)
          (2): GELU()
        )
        (2): Sequential(
          (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (1): Dropout(p=0.0, inplace=False)
          (2): GELU()
        )
        (3): Sequential(
          (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (1): Dropout(p=0.0, inplace=False)
          (2): GELU()
        )
        (4): Sequential(
          (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (1): Dropout(

In [72]:
def voiceconv(inp):
    # Get mel spectrogram
    wav2flac(inp+'/source.wav')
    wav2flac(inp+'/target.wav')
    mel = autovc.mspec_from_file(inp+"/source.flac") 
    # or autovc.mspec_from_numpy(numpy array, sampling rate) if you have a numpy array
    import librosa
    wav, _ = librosa.load(inp+'/source.wav', sr=16000)
    wav = torch.from_numpy(wav).float()
    src_embedding = sse(wav[None])
    wav, _ = librosa.load(inp+'/target.wav', sr=16000)
    wav = torch.from_numpy(wav).float()
    trg_embedding = sse(wav[None])
    # Get embedding for source speaker
    # sse_src_mel = sse.melspec_from_file(inp+"/source.flac")
    # with torch.no_grad(): 
    #     src_embedding = sse(sse_src_mel[None].to(device))
    #     print(src_embedding.shape)
    # # Get embedding for target speaker
    # sse_trg_mel = sse.melspec_from_file(inp+"/target.flac")
    # with torch.no_grad(): 
    #     trg_embedding = sse(sse_trg_mel[None].to(device))

    # Do the actual voice conversion!
    with torch.no_grad():
        spec_padded, len_pad = autovc.pad_mspec(mel)
        x_src = spec_padded.to(device)[None]
        s_src = src_embedding.to(device)
        s_trg = trg_embedding.to(device)
        x_identic, x_identic_psnt, _ = autovc(x_src, s_src, s_trg)
        if len_pad == 0: x_trg = x_identic_psnt[0, 0, :, :]
        else: x_trg = x_identic_psnt[0, 0, :-len_pad, :]
    
    return x_trg

# Make a vocode function
@torch.no_grad()
def vocode(spec):
    # denormalize mel-spectrogram
    spec = autovc.denormalize_mel(spec)
    _m = spec.T[None]
    waveform = hifigan(_m.to(device))[0]
    return waveform.squeeze()

def pipeline(inp, out):
    x_trg = voiceconv(inp)
    print(x_trg)
    converted_waveform = vocode(x_trg) # output waveform 
    # Save waveform as wav file
    sf.write(out+'/converted.flac', converted_waveform.cpu().numpy(), 16000)
    display(out+'/converted.flac')

In [73]:
## Example 1
pipeline('example1', 'out1')

example1/source.flac
example1/target.flac
tensor([[0.6618, 0.6462, 0.5525,  ..., 0.2194, 0.1860, 0.0863],
        [0.6538, 0.6350, 0.5576,  ..., 0.2187, 0.1897, 0.0784],
        [0.7077, 0.6873, 0.6121,  ..., 0.2307, 0.2039, 0.0934],
        ...,
        [0.6610, 0.6595, 0.6399,  ..., 0.6988, 0.6640, 0.5575],
        [0.6336, 0.6176, 0.5930,  ..., 0.6711, 0.6503, 0.5535],
        [0.6363, 0.6311, 0.6195,  ..., 0.6507, 0.6284, 0.5327]],
       device='cuda:0')


'out1/converted.flac'

In [80]:
audio = tfio.audio.AudioIOTensor('sample/source.flac')
audio_slice = audio[100:]
audio_tensor = tf.squeeze(audio_slice, axis=[-1])
Audio(audio_tensor.numpy(), rate=audio.rate.numpy())

In [81]:
audio = tfio.audio.AudioIOTensor('sample/target.flac')
audio_slice = audio[100:]
audio_tensor = tf.squeeze(audio_slice, axis=[-1])
Audio(audio_tensor.numpy(), rate=audio.rate.numpy())

In [82]:
audio = tfio.audio.AudioIOTensor('outsample/converted.flac')
audio_slice = audio[100:]
audio_tensor = tf.squeeze(audio_slice, axis=[-1])
Audio(audio_tensor.numpy(), rate=audio.rate.numpy())

In [83]:
audio = tfio.audio.AudioIOTensor('example1/source.flac')
audio_slice = audio[100:]
audio_tensor = tf.squeeze(audio_slice, axis=[-1])
Audio(audio_tensor.numpy(), rate=audio.rate.numpy())

In [84]:
audio = tfio.audio.AudioIOTensor('example1/target.flac')
audio_slice = audio[100:]
audio_tensor = tf.squeeze(audio_slice, axis=[-1])
Audio(audio_tensor.numpy(), rate=audio.rate.numpy())

In [85]:
audio = tfio.audio.AudioIOTensor('out1/converted.flac')
audio_slice = audio[100:]
audio_tensor = tf.squeeze(audio_slice, axis=[-1])
Audio(audio_tensor.numpy(), rate=audio.rate.numpy())

In [None]:
!pip install tensorflow-io pydub

Collecting tensorflow-io
  Downloading tensorflow_io-0.25.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.4 MB)
[K     |████████████████████████████████| 23.4 MB 1.4 MB/s 
[?25hCollecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: tensorflow-io, pydub
Successfully installed pydub-0.25.1 tensorflow-io-0.25.0


In [None]:
import tensorflow as tf
import tensorflow_io as tfio
from IPython.display import Audio

# def display(file):
audio = tfio.audio.AudioIOTensor('trial/converted.flac')
audio_slice = audio[100:]
audio_tensor = tf.squeeze(audio_slice, axis=[-1])
Audio(audio_tensor.numpy(), rate=audio.rate.numpy())
    # return

In [None]:
display('out1/converted.flac')

'out1/converted.flac'

In [71]:
from os.path import splitext
from pydub import AudioSegment

def wav2flac(wav_path):
    flac_path = "%s.flac" % splitext(wav_path)[0]
    print(flac_path)
    song = AudioSegment.from_wav(wav_path)
    song = song.set_frame_rate(16000)
    song.export(flac_path, format = "flac")

In [None]:
wav2flac('./example1/target.wav')

In [None]:
import soundfile as sf
rate = 48000
data, samplerate = sf.read('/content/example1/target.wav')
sf.write('/content/example1/target.flac', data, rate)

In [None]:
_, rate = sf.read('/content/example_target_uttr_vocoded.flac')
rate

16000

In [75]:
data, samplerate = sf.read('/content/sample/target.flac')
samplerate

48000