<a href="https://colab.research.google.com/github/johnGettings/Text_To_Speech/blob/main/My_VITS_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Set Up**

In [None]:
!nvidia-smi -L

In [None]:
!git clone https://github.com/jaywalnut310/vits.git

!gdown --id 1v8ryBHeKsa30I0fZ3Rc7FEflT8F8SWPy -O '/content/vits/JG_female_speaker.pth'
!gdown --id 1ZvqoymRV0L15N6StSSw7h9E0ApivPB3u -O '/content/vits/pretrained_vctk.pth'

In [None]:
!pip install phonemizer==2.2.1 #VITS
!pip install Unidecode==1.1.1 #VITS
!pip install pydub #VITS

%cd /content/vits/monotonic_align
!python setup.py build_ext --inplace

!sudo apt-get install espeak -y

%mkdir '/content/audio/'

In [None]:
%cd /content/vits

%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from pydub import AudioSegment

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate, DistributedBucketSampler
from models import SynthesizerTrn, MultiPeriodDiscriminator
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write

# **Inference**

### **Single Speaker**
  
Trained by John Gettings; data and resulting voice not approved for commercial use

In [5]:
def get_text(text, hps):
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

In [34]:
hps = utils.get_hparams_from_file("/content/vits/configs/ljs_base.json")

In [35]:
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
_ = net_g.eval()

_ = utils.load_checkpoint("/content/vits/JG_female_speaker.pth", net_g, None)

In [36]:
num_runs = 1
text = "I do not want these green eggs and ham, I do not want them Sam I am."
stn_tst = get_text(text, hps)

for i in range(num_runs):
  with torch.no_grad():
      x_tst = stn_tst.unsqueeze(0)
      x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
      audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=.8, length_scale=1.0)[0][0,0].data.float().numpy()
  
  ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate))

  #save audio to folder
  audio = ipd.Audio(audio, rate=hps.data.sampling_rate)
  audio = AudioSegment(audio.data, frame_rate=22050, sample_width=2, channels=1)
  savePath = f'/content/audio/{i}.wav'
  audio.export(savePath, format="wav", bitrate="64k")
  
  #write audio location and text to txt file
  with open("/content/VITS_Generated.txt", "a") as myfile:
    myfile.write(f'{savePath}|{text}\n')

  

### **Multispeaker (VCTK)**

In [None]:
# Try different 'sid' values in inference script below. Between [0, 109]
# A list of my top picks below

#Female
#17, 18, 24, 44, 47, 57, 67, 70, 76, *84, **93(1.2), 95, **106 (1)

#Male
#16, 40, 41, 60, 63, 69, 81

In [23]:
hps = utils.get_hparams_from_file("/content/vits/configs/vctk_base.json")

In [24]:
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
_ = net_g.eval()

_ = utils.load_checkpoint("/content/vits/pretrained_vctk.pth", net_g, None)

In [25]:
stn_tst = get_text("I do not want these green eggs and ham, I do not want them Sam I am.", hps)
with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([81]).cuda()
    audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.95, noise_scale_w=0.8, length_scale=1.2)[0][0,0].data.cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate))

In [26]:
  #save audio to folder
  audio = ipd.Audio(audio, rate=hps.data.sampling_rate)
  audio = AudioSegment(audio.data, frame_rate=22050, sample_width=2, channels=1)
  savePath = f'/content/test_audio.mp3'
  audio.export(savePath, format="mp3", bitrate="64k")

<_io.BufferedRandom name='/content/test_audio.mp3'>

In [None]:
from pydub import AudioSegment
filename = '/content/test_audio.mp3'
sound = AudioSegment.from_file(filename, format=filename[-3:])
ipd.display(sound)

### **Adjust Octave**

In [32]:
#.07 - 0.09 is a good value for higher pitch
from pydub import AudioSegment
import numpy as np
from numpy.random import uniform
filename = '/content/test_audio.mp3'
sound = AudioSegment.from_file(filename, format=filename[-3:])

octaves = 0.08

%cd /content/
new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
hipitch_sound = sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate})
hipitch_sound = hipitch_sound.set_frame_rate(44100)
#display
ipd.display(hipitch_sound)
#export
hipitch_sound.export(f"octave_{octaves}.wav", format="wav")

/content


<_io.BufferedRandom name='octave_0.08.wav'>

# **Evaluate Via Discriminator (EXPERIMENTAL)**

In [None]:
from models import SynthesizerTrn, MultiPeriodDiscriminator
from losses import discriminator_loss
import torch

In [None]:
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm)
_ = net_d.eval()

_ = utils.load_checkpoint("discriminator_file_here", net_d, None)

In [None]:
training_files = "/content/VITS_Generated.txt"

train_dataset = TextAudioLoader(training_files, hps.data)
collate_fn = TextAudioCollate()
train_loader = DataLoader(train_dataset, num_workers=4, pin_memory=True, collate_fn=collate_fn, shuffle=False)

In [None]:
#discriminator loss for real is mean((1 - dr)**2) the mean square error of the difference from 1. 1 means real
#loss for generated is mean(dr**2) which means zero would be zero error
#We want to see how close the values are from being real
for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths) in enumerate(train_loader):
  
  y_d_hat_r, y_d_hat_g, fmap, _ = net_d(y, y)
  
  for i, x in enumerate(y_d_hat_g):
    for j, a in enumerate(x):
            y_d_hat_g[i][j] = 0


  loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
  #print(y_d_hat_r[2][0])
  #print(y_d_hat_g[2][0])
  print('----------------------------------------------------')
  print(loss_disc)
  print(losses_disc_r)
  print(losses_disc_g)
  print('----------------------------------------------------')

----------------------------------------------------
tensor(1.3195, grad_fn=<AddBackward0>)
[0.4453839361667633, 0.24125786125659943, 0.15177156031131744, 0.18430645763874054, 0.16037112474441528, 0.13645480573177338]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
----------------------------------------------------
----------------------------------------------------
tensor(1.4012, grad_fn=<AddBackward0>)
[0.513595461845398, 0.22291716933250427, 0.16315197944641113, 0.18591992557048798, 0.18252705037593842, 0.1331356316804886]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
----------------------------------------------------
----------------------------------------------------
tensor(1.4614, grad_fn=<AddBackward0>)
[0.4780242443084717, 0.2574056386947632, 0.18918351829051971, 0.21199356019496918, 0.17774266004562378, 0.1470864862203598]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
----------------------------------------------------
----------------------------------------------------
tensor(1.2458, grad_fn=<AddBackward0>)
[0.408

y_d_hat_r & y_d_hat_g is a list of 6 tensors for EACH example output (model has 6 layers)

y_d_hat_r & y_d_hat_g seem to be identical

y_d_hat_g[0][0] : lengths of 193, 218, 167, 163, 172 (Values between -.5 and 4, averaging .3 or so)

y_d_hat_g[1][0] : 608, 688, 526, 514, 542

y_d_hat_g[2][0] : 609, 687, 525, 513, 543

y_d_hat_g[3][0] : 610, 690, 525, 515, 545

y_d_hat_g[4][0] : 609, 693, 525, 518, 546

y_d_hat_g[5][0] : 616, 693, 528, 517, 550

fmap is a list of 6 lists, each having 7 tensors for EACH example output