# SCE-TTS: 음성합성 데모

이 문서는 SCE-TTS 프로젝트의 음성 합성 데모입니다.

이 데모에 대한 더 자세한 정보는 아래 링크에서 확인하실 수 있습니다.  
https://sce-tts.github.io/

## 1. 구글 드라이브 마운트

음성합성을 위해 학습한 모델이 있는 구글 드라이브를 마운트합니다.  
마운트할 구글 드라이브 내에 다음 파일들이 존재하는지 꼭 확인해주세요.

- `/Colab Notebooks/data/glowtts-v2/model_file.pth.tar`
- `/Colab Notebooks/data/glowtts-v2/config.json`
- `/Colab Notebooks/data/hifigan-v2/model_file.pth.tar`
- `/Colab Notebooks/data/hifigan-v2/config.json`


(존재하지 않는다면, [glowtts-v2.zip](https://drive.google.com/file/d/1DMKLdfZ_gzc_z0qDod6_G8fEXj0zCHvC/view?usp=sharing), [hifigan-v2.zip](https://drive.google.com/file/d/1vRxp1RH-U7gSzWgyxnKY4h_7pB3tjPmU/view?usp=sharing)을 내려받아 준비해주세요.)

만약 아래에 `Enter your authorization code:`과 같은 메시지가 출력될 경우,  
같이 출력된 링크에 접속하여, 마운트할 구글 계정을 선택하신 후, 인증 코드를 복사하여 입력해주세요.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 2. 필수 라이브러리 및 함수 불러오기

실행에 필요한 라이브러리 및 함수를 불러옵니다.

이 과정은 약 10분 정도 소요될 수 있습니다.

In [2]:
import os
import sys
from pathlib import Path

In [3]:
%cd /content/drive/My Drive/Conference
!git clone --depth 1 https://github.com/sce-tts/TTS.git -b sce-tts
!git clone --depth 1 https://github.com/sce-tts/g2pK.git
%cd /content/drive/My Drive/Conference/TTS
!pip install -q --no-cache-dir -e .
%cd /content/drive/My Drive/Conference/g2pK
!pip install -q --no-cache-dir "konlpy" "jamo" "nltk" "python-mecab-ko"
!pip install -q --no-cache-dir -e .

/content/drive/My Drive/Conference
Cloning into 'TTS'...
remote: Enumerating objects: 447, done.[K
remote: Counting objects: 100% (447/447), done.[K
remote: Compressing objects: 100% (415/415), done.[K
remote: Total 447 (delta 56), reused 284 (delta 20), pack-reused 0[K
Receiving objects: 100% (447/447), 13.77 MiB | 12.84 MiB/s, done.
Resolving deltas: 100% (56/56), done.
Checking out files: 100% (416/416), done.
Cloning into 'g2pK'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 20 (delta 0), reused 15 (delta 0), pack-reused 0[K
Unpacking objects: 100% (20/20), done.
/content/drive/My Drive/Conference/TTS
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 235kB 5.0MB/s 
[?25h  Installing build dependencies ... [?25l

In [4]:
%cd /content/drive/My Drive/Conference/g2pK
import g2pk
g2p = g2pk.G2p()

/content/drive/My Drive/Conference/g2pK
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [5]:
%cd /content/drive/My Drive/Conference/TTS
import re
import sys
from unicodedata import normalize
import IPython

from TTS.utils.synthesizer import Synthesizer

def normalize_text(text):
    text = text.strip()

    for c in ",;:":
        text = text.replace(c, ".")
    text = remove_duplicated_punctuations(text)

    text = jamo_text(text)

    text = g2p.idioms(text)
    text = g2pk.english.convert_eng(text, g2p.cmu)
    text = g2pk.utils.annotate(text, g2p.mecab)
    text = g2pk.numerals.convert_num(text)
    text = re.sub("/[PJEB]", "", text)

    text = alphabet_text(text)

    # remove unreadable characters
    text = normalize("NFD", text)
    text = "".join(c for c in text if c in symbols)
    text = normalize("NFC", text)

    text = text.strip()
    if len(text) == 0:
        return ""

    # only single punctuation
    if text in '.!?':
        return punctuation_text(text)

    # append punctuation if there is no punctuation at the end of the text
    if text[-1] not in '.!?':
        text += '.'

    return text


def remove_duplicated_punctuations(text):
    text = re.sub(r"[.?!]+\?", "?", text)
    text = re.sub(r"[.?!]+!", "!", text)
    text = re.sub(r"[.?!]+\.", ".", text)
    return text


def split_text(text):
    text = remove_duplicated_punctuations(text)

    texts = []
    for subtext in re.findall(r'[^.!?\n]*[.!?\n]', text):
        texts.append(subtext.strip())

    return texts


def alphabet_text(text):
    text = re.sub(r"(a|A)", "에이", text)
    text = re.sub(r"(b|B)", "비", text)
    text = re.sub(r"(c|C)", "씨", text)
    text = re.sub(r"(d|D)", "디", text)
    text = re.sub(r"(e|E)", "이", text)
    text = re.sub(r"(f|F)", "에프", text)
    text = re.sub(r"(g|G)", "쥐", text)
    text = re.sub(r"(h|H)", "에이치", text)
    text = re.sub(r"(i|I)", "아이", text)
    text = re.sub(r"(j|J)", "제이", text)
    text = re.sub(r"(k|K)", "케이", text)
    text = re.sub(r"(l|L)", "엘", text)
    text = re.sub(r"(m|M)", "엠", text)
    text = re.sub(r"(n|N)", "엔", text)
    text = re.sub(r"(o|O)", "오", text)
    text = re.sub(r"(p|P)", "피", text)
    text = re.sub(r"(q|Q)", "큐", text)
    text = re.sub(r"(r|R)", "알", text)
    text = re.sub(r"(s|S)", "에스", text)
    text = re.sub(r"(t|T)", "티", text)
    text = re.sub(r"(u|U)", "유", text)
    text = re.sub(r"(v|V)", "브이", text)
    text = re.sub(r"(w|W)", "더블유", text)
    text = re.sub(r"(x|X)", "엑스", text)
    text = re.sub(r"(y|Y)", "와이", text)
    text = re.sub(r"(z|Z)", "지", text)

    return text


def punctuation_text(text):
    # 문장부호
    text = re.sub(r"!", "느낌표", text)
    text = re.sub(r"\?", "물음표", text)
    text = re.sub(r"\.", "마침표", text)

    return text


def jamo_text(text):
    # 기본 자모음
    text = re.sub(r"ㄱ", "기역", text)
    text = re.sub(r"ㄴ", "니은", text)
    text = re.sub(r"ㄷ", "디귿", text)
    text = re.sub(r"ㄹ", "리을", text)
    text = re.sub(r"ㅁ", "미음", text)
    text = re.sub(r"ㅂ", "비읍", text)
    text = re.sub(r"ㅅ", "시옷", text)
    text = re.sub(r"ㅇ", "이응", text)
    text = re.sub(r"ㅈ", "지읒", text)
    text = re.sub(r"ㅊ", "치읓", text)
    text = re.sub(r"ㅋ", "키읔", text)
    text = re.sub(r"ㅌ", "티읕", text)
    text = re.sub(r"ㅍ", "피읖", text)
    text = re.sub(r"ㅎ", "히읗", text)
    text = re.sub(r"ㄲ", "쌍기역", text)
    text = re.sub(r"ㄸ", "쌍디귿", text)
    text = re.sub(r"ㅃ", "쌍비읍", text)
    text = re.sub(r"ㅆ", "쌍시옷", text)
    text = re.sub(r"ㅉ", "쌍지읒", text)
    text = re.sub(r"ㄳ", "기역시옷", text)
    text = re.sub(r"ㄵ", "니은지읒", text)
    text = re.sub(r"ㄶ", "니은히읗", text)
    text = re.sub(r"ㄺ", "리을기역", text)
    text = re.sub(r"ㄻ", "리을미음", text)
    text = re.sub(r"ㄼ", "리을비읍", text)
    text = re.sub(r"ㄽ", "리을시옷", text)
    text = re.sub(r"ㄾ", "리을티읕", text)
    text = re.sub(r"ㄿ", "리을피읍", text)
    text = re.sub(r"ㅀ", "리을히읗", text)
    text = re.sub(r"ㅄ", "비읍시옷", text)
    text = re.sub(r"ㅏ", "아", text)
    text = re.sub(r"ㅑ", "야", text)
    text = re.sub(r"ㅓ", "어", text)
    text = re.sub(r"ㅕ", "여", text)
    text = re.sub(r"ㅗ", "오", text)
    text = re.sub(r"ㅛ", "요", text)
    text = re.sub(r"ㅜ", "우", text)
    text = re.sub(r"ㅠ", "유", text)
    text = re.sub(r"ㅡ", "으", text)
    text = re.sub(r"ㅣ", "이", text)
    text = re.sub(r"ㅐ", "애", text)
    text = re.sub(r"ㅒ", "얘", text)
    text = re.sub(r"ㅔ", "에", text)
    text = re.sub(r"ㅖ", "예", text)
    text = re.sub(r"ㅘ", "와", text)
    text = re.sub(r"ㅙ", "왜", text)
    text = re.sub(r"ㅚ", "외", text)
    text = re.sub(r"ㅝ", "워", text)
    text = re.sub(r"ㅞ", "웨", text)
    text = re.sub(r"ㅟ", "위", text)
    text = re.sub(r"ㅢ", "의", text)

    return text


def normalize_multiline_text(long_text):
    texts = split_text(long_text)
    normalized_texts = [normalize_text(text).strip() for text in texts]
    return [text for text in normalized_texts if len(text) > 0]

def synthesize(text):
    wavs = synthesizer.tts(text, None, None)
    return wavs

/content/drive/My Drive/Conference/TTS


## 3. 학습한 모델 불러오기

학습한 Glow-TTS와 HiFi-GAN 모델을 불러옵니다.

만약 다른 체크포인트에서 불러오시려면 아래 코드에서 경로를 아래와 같이 적절하게 수정합니다.

```python
synthesizer = Synthesizer(
    "/content/drive/My Drive/Colab Notebooks/data/glowtts-v2/glowtts-v2-May-31-2021_08+17AM-d897f2e/best_model.pth.tar",
    "/content/drive/My Drive/Colab Notebooks/data/glowtts-v2/glowtts-v2-May-31-2021_08+17AM-d897f2e/config.json",
    None,
    "/content/drive/My Drive/Colab Notebooks/data/hifigan-v2/hifigan-v2-May-31-2021_08+26AM-d897f2e/checkpoint_300000.pth.tar",
    "/content/drive/My Drive/Colab Notebooks/data/hifigan-v2/hifigan-v2-May-31-2021_08+26AM-d897f2e/config.json",
    None,
    None,
    False,
)
```

In [8]:
synthesizer = Synthesizer(
    "/content/drive/My Drive/Colab Notebooks/data/glowtts-v2/glowtts-v2-June-25-2021_12+45PM-3aa165a/checkpoint_37000.pth.tar",
    "/content/drive/My Drive/Colab Notebooks/data/glowtts-v2/glowtts-v2-June-25-2021_12+45PM-3aa165a/config.json",
    None,
    "/content/drive/My Drive/Colab Notebooks/data/hifigan-v2/hifigan-v2-June-25-2021_07+27PM-3aa165a/checkpoint_295000.pth.tar",
    "/content/drive/My Drive/Colab Notebooks/data/hifigan-v2//hifigan-v2-June-25-2021_07+27PM-3aa165a/config.json",
    None,
    None,
    False,
)
symbols = synthesizer.tts_config.characters.characters

 > Using model: glow_tts
 > Generator Model: hifigan_generator
Removing weight norm...


## 4. 음성 합성

실제 음성 합성을 수행합니다.

`long_text`의 값을 변경하여 다른 문장의 합성도 시도해보실 수 있습니다.

In [16]:
from scipy.io import wavfile
import numpy as np

In [32]:
texts = """
내일 습도는 팔십퍼센트이고 오후에는 비가 내릴 예정이야

"""

samplerate = 22050; fs = 100
# t = np.linspace(0., 1., samplerate)
# amplitude = np.iinfo(np.int16).max
# data = amplitude * np.sin(2. * np.pi * fs * t)

for text in normalize_multiline_text(texts):
    wav = synthesizer.tts(text, None, None)
    wavfile.write("/content/drive/My Drive/Conference/final_output", samplerate, data.astype(np.int16))
    IPython.display.display(IPython.display.Audio(wav, rate=22050)) 
    # with open("/content/drive/My Drive/Conference/final_output", "wb") as f:
    #   f.write(bytes(wav))

 > Text splitted to sentences.
['내일 습도는 팔십퍼센트이고 오후에는 비가 내릴 예정이야.']
 > Processing time: 1.362368106842041
 > Real-time factor: 0.22096193329900998


In [34]:
with open("/content/drive/My Drive/Conference/final_output",mode="rb") as wav:
  print(wav.read())

b'RIFFh\xac\x00\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00"V\x00\x00D\xac\x00\x00\x02\x00\x10\x00dataD\xac\x00\x00\x00\x00\xa5\x03J\x07\xed\n\x8e\x0e,\x12\xc7\x15\\\x19\xed\x1cw \xfb#w\'\xeb*V.\xb81\x0f5[8\x9c;\xd0>\xf6A\x10E\x1bH\x16K\x03N\xdfP\xaaSdV\x0cY\xa1[#^\x92`\xedb3edg\x80i\x85kumMo\x0fq\xb9rKt\xc5u&wox\x9ey\xb5z\xb2{\x95|^}\r~\xa2~\x1d\x7f}\x7f\xc3\x7f\xee\x7f\xfe\x7f\xf4\x7f\xd0\x7f\x90\x7f7\x7f\xc3~4~\x8b}\xc8|\xec{\xf5z\xe5y\xbbxyw\x1ev\xaat\x1dsyq\xbeo\xebm\x01l\x01j\xeag\xbee}c(a\xbe^@\\\xb0Y\x0cWWT\x90Q\xb8N\xcfK\xd7H\xd0E\xbaB\x96?f<(9\xdf5\x8a2+/\xc3+Q(\xd6$T!\xcc\x1d=\x1a\xa8\x16\x0f\x13r\x0f\xd1\x0b.\x08\x8a\x04\xe4\x00@\xfd\x9a\xf9\xf7\xf5U\xf2\xb6\xee\x1b\xeb\x84\xe7\xf2\xe3f\xe0\xe1\xdcc\xd9\xec\xd5\x7f\xd2\x1b\xcf\xc1\xcbs\xc8/\xc5\xf8\xc1\xce\xbe\xb1\xbb\xa3\xb8\xa3\xb5\xb3\xb2\xd3\xaf\x04\xadE\xaa\x99\xa7\xff\xa4x\xa2\x04\xa0\xa5\x9dZ\x9b#\x99\x02\x97\xf7\x94\x03\x93$\x91]\x8f\xad\x8d\x16\x8c\x96\x8a.\x89\xdf\x87\xaa\x86\x8d\x85\x8a\x84\xa1\x83\xd1\x82\x1b\x8