<a href="https://colab.research.google.com/github/jiruneko/Voice-Craft/blob/main/voicechange.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scipy numpy

In [None]:
import torch
import numpy as np
from scipy.io.wavfile import write
import IPython.display as ipd

# Character part of the symbol table used by the pretrained NVIDIA Tacotron 2
# checkpoint: pad, special, punctuation, then upper- and lower-case letters.
# The position of each character is the embedding index the model was trained
# with, so the order must not be changed. (The ARPAbet phoneme symbols that
# follow these in the full table are not needed for plain-text input.)
_TACOTRON2_SYMBOLS = "_-!'(),.:;? ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
_SYMBOL_TO_ID = {s: i for i, s in enumerate(_TACOTRON2_SYMBOLS)}


def text_to_sequence(text, cleaners):
    """Convert text into the list of symbol IDs expected by Tacotron 2.

    Simplified stand-in for the reference text front end: the text is
    lower-cased and every character is mapped through the model's symbol
    table. Characters that are not in the table are dropped.

    Args:
        text: Input string.
        cleaners: Accepted for signature compatibility with the reference
            implementation; not used here (only lower-casing is applied).

    Returns:
        A list of integer symbol IDs, one per retained character. Empty if
        ``text`` is empty or contains no known characters.
    """
    # "_" is the padding symbol (ID 0); it must never be produced from input text.
    return [
        _SYMBOL_TO_ID[ch]
        for ch in text.lower()
        if ch != "_" and ch in _SYMBOL_TO_ID
    ]

# Load the pretrained Tacotron 2 and WaveGlow models from torch.hub.
tacotron2 = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_tacotron2', pretrained=True)
waveglow = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_waveglow', pretrained=True)

# Move the models to the GPU when one is available and switch to inference mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tacotron2 = tacotron2.to(device).eval()
waveglow = waveglow.to(device).eval()

# Force some WaveGlow layers (the invertible convolutions) to FP32 precision
# (intended to reduce noise in the generated audio).
for k in waveglow.convinv:
    k.float()

# Text to synthesize, converted to a (1, T) int64 tensor of symbol IDs.
text = "Hello, this is a voice synthesis test."
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
sequence = torch.from_numpy(sequence).to(device=device, dtype=torch.int64)

# Length of each sequence in the batch (a single entry here).
input_lengths = torch.tensor([sequence.shape[1]], dtype=torch.int64, device=device)

# Generate the mel spectrogram with Tacotron 2.
# The torch.hub model's infer() returns (mel_postnet, mel_lengths, alignments);
# the second element is a 1-D tensor of lengths, NOT a spectrogram.
with torch.no_grad():
    mel_outputs_postnet, mel_lengths, alignments = tacotron2.infer(sequence, input_lengths)

# Check the shape of the spectrogram before handing it to WaveGlow.
print("mel_outputs_postnet shape before reshape:", mel_outputs_postnet.shape)

# A 1-D tensor cannot be a spectrogram; stop instead of producing garbage audio.
if mel_outputs_postnet.dim() == 1:
    raise ValueError("mel_outputs_postnet is 1-dimensional, indicating an issue with Tacotron2 model output.")

# Add the batch dimension if the output is only 2-D.
if mel_outputs_postnet.dim() == 2:
    mel_for_waveglow = mel_outputs_postnet.unsqueeze(0)
else:
    mel_for_waveglow = mel_outputs_postnet

# Tacotron 2 already emits [batch, n_mel_channels, time_steps], which is the
# layout WaveGlow consumes, so no transpose is applied.
print("mel_for_waveglow shape after unsqueeze (if applied):", mel_for_waveglow.shape)

# Generate the waveform with WaveGlow.
with torch.no_grad():
    audio = waveglow.infer(mel_for_waveglow, sigma=0.666)

# Save the audio (22050 Hz) and play it back in the notebook.
audio_np = audio[0].data.cpu().numpy()
write("output.wav", 22050, audio_np)
ipd.Audio("output.wav")

In [None]:
# Convert the text to a (1, T) int64 tensor of symbol IDs on the model's device.
text = "Hello, this is a voice synthesis test."
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
# Take the device from the model itself so the input always lives where the weights do.
model_device = next(tacotron2.parameters()).device
sequence = torch.from_numpy(sequence).to(device=model_device, dtype=torch.int64)

# infer() requires the length of every sequence in the batch.
input_lengths = torch.tensor([sequence.shape[1]], dtype=torch.int64, device=model_device)

# Generate the mel spectrogram with Tacotron 2.
# The torch.hub model's infer() returns three values:
# (mel_postnet, mel_lengths, alignments).
with torch.no_grad():
    mel_outputs_postnet, mel_lengths, alignments = tacotron2.infer(sequence, input_lengths)

# Generate the waveform with WaveGlow.
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)

# Save the audio (22050 Hz) and play it back in the notebook.
audio_np = audio[0].data.cpu().numpy()
write("output.wav", 22050, audio_np)
ipd.Audio("output.wav")