In [1]:
import torch
torch._dynamo.config.cache_size_limit = 64
torch._dynamo.config.suppress_errors = True
torch.set_float32_matmul_precision('high')

import ChatTTS
from IPython.display import Audio


  from .autonotebook import tqdm as notebook_tqdm


## Load Models

In [2]:
chat = ChatTTS.Chat()
chat.load_models()

# Use force_redownload=True if the weights updated.
chat.load_models(force_redownload=True)

# If you download the weights manually, set source='locals'.
# chat.load_models(source='local', local_path='YOUR LOCAL PATH')


INFO:ChatTTS.core:Load from cache: C:\Users\ningj/.cache/huggingface\hub/models--2Noise--ChatTTS/snapshots\ce5913842aebd78e4a01a02d47244b8d62ac4ee3
INFO:ChatTTS.core:use cuda:0
INFO:ChatTTS.core:vocos loaded.
INFO:ChatTTS.core:dvae loaded.
INFO:ChatTTS.core:gpt loaded.
INFO:ChatTTS.core:decoder loaded.
INFO:ChatTTS.core:tokenizer loaded.
INFO:ChatTTS.core:All initialized.
INFO:ChatTTS.core:Download from HF: https://huggingface.co/2Noise/ChatTTS
Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 7487.40it/s]
INFO:ChatTTS.core:use cuda:0
INFO:ChatTTS.core:vocos loaded.
INFO:ChatTTS.core:dvae loaded.
INFO:ChatTTS.core:gpt loaded.
INFO:ChatTTS.core:decoder loaded.
INFO:ChatTTS.core:tokenizer loaded.
INFO:ChatTTS.core:All initialized.


## Inference

### Batch infer

In [54]:
skip_refine_text = False
refine_text_only = False
params_refine_text ={'prompt':'[oral_2][laugh_0][break_6]'}
params_infer_code = {
    'prompt': '[speed_5][tone_3]','spk_emb' : chat.sample_random_speaker()
    
}
use_decoder = True
do_text_normalization = True
lang = 'zh' 


In [55]:
texts = ["So we found being competitive and collaborative was a huge way of staying motivated towards our goals, so one person to call when you fall off, one person who gets you back on then one person to actually do the activity with.",]*1 \
        + ["我觉得像我们这些写程序的人，他，我觉得多多少少可能会对开源有一种情怀在吧我觉得开源是一个很好的形式。现在其实最先进的技术掌握在一些公司的手里的话，就他们并不会轻易的开放给所有的人用。"]*1     

wavs = chat.infer(
    texts,
    skip_refine_text=skip_refine_text,
    refine_text_only=refine_text_only,
    params_refine_text=params_refine_text,
    params_infer_code=params_infer_code,
    use_decoder=use_decoder,
    do_text_normalization=do_text_normalization,
    lang=lang
)


INFO:ChatTTS.core:All initialized.
 26%|██▌       | 98/384 [00:06<00:17, 15.96it/s]
 44%|████▍     | 899/2048 [01:13<01:33, 12.30it/s]


In [58]:
Audio(wavs[0], rate=24_000, autoplay=True)


In [57]:
Audio(wavs[1], rate=24_000, autoplay=True)


### Custom params

In [94]:
# speaker_seed = chat.sample_random_speaker() 
params_infer_code = {'prompt':'[speed_3][tone_2]', 'temperature':.3, 'spk_emb' : speaker_seed}
params_refine_text = {'prompt':'[oral_9][laugh_3][break_1]'}

text = "好的，我明白了。等你与导师开会后，再提供更多相关信息给我。我会在这期间准备好，以便更好地帮助你撰写研究计划书。祝你开会顺利！需要任何其他方面的帮助，请随时告诉我!!!"
wav = chat.infer(
    text,
    skip_refine_text=skip_refine_text,
    refine_text_only=refine_text_only,
    params_refine_text=params_refine_text,
    params_infer_code=params_infer_code,
    use_decoder=False,
    do_text_normalization=do_text_normalization,
    lang=lang
)


INFO:ChatTTS.core:All initialized.
 27%|██▋       | 105/384 [00:05<00:15, 17.64it/s]
 46%|████▋     | 950/2048 [00:57<01:06, 16.52it/s]


In [95]:
Audio(wav[0], rate=24_000, autoplay=True)


### fix random speaker

In [37]:
rand_spk = chat.sample_random_speaker()
params_infer_code = {'spk_emb' : rand_spk, }

wav = chat.infer('在 Audio(wav[0], rate=24_000, autoplay=True) 中，rate=24_000 这种写法使用了下划线 _ 来分隔数字，这是一种提高可读性的方式。这种写法在许多编程语言中都是合法且常见的，用来使长数字更容易阅读。。', \
    params_refine_text=params_refine_text, params_infer_code=params_infer_code)


INFO:ChatTTS.core:All initialized.
 26%|██▌       | 98/384 [00:06<00:18, 15.81it/s]
 51%|█████     | 1045/2048 [01:04<01:01, 16.31it/s]
  return F.conv1d(input, weight, bias, self.stride,


In [38]:
print('wav: ', wav[0].shape)

Audio(wav[0], rate=24_752, autoplay=True)


wav:  (1, 534784)


### Two stage control

In [41]:
text = "So we found being competitive and collaborative was a huge way of staying motivated towards our goals, so one person to call when you fall off, one person who gets you back on then one person to actually do the activity with."
wav = chat.infer(text, refine_text_only=True)


INFO:ChatTTS.core:All initialized.
 22%|██▏       | 83/384 [00:04<00:18, 16.63it/s]


In [44]:
Audio(wav[0], rate=24_752, autoplay=True)


In [43]:
text = 'so we found being competitive and collaborative [uv_break] was a huge way of staying [uv_break] motivated towards our goals, [uv_break] so [uv_break] one person to call [uv_break] when you fall off, [uv_break] one person who [uv_break] gets you back [uv_break] on then [uv_break] one person [uv_break] to actually do the activity with.'
wav = chat.infer(text, skip_refine_text=True)

# 调用 infer 方法
wav_output = chat.infer(
    text=text,
    skip_refine_text=skip_refine_text,
    refine_text_only=refine_text_only,
    params_refine_text=params_refine_text,
    params_infer_code=params_infer_code,
    use_decoder=use_decoder,
    do_text_normalization=do_text_normalization,
    lang=lang
)


INFO:ChatTTS.core:All initialized.
 46%|████▌     | 944/2048 [00:58<01:07, 16.24it/s]


## LLM Call

In [12]:
from ChatTTS.experimental.llm import llm_api

API_KEY = ''
client = llm_api(api_key=API_KEY,
        base_url="https://api.deepseek.com",
        model="deepseek-chat")


ModuleNotFoundError: No module named 'openai'

In [None]:
user_question = '四川有哪些好吃的美食呢?'
text = client.call(user_question, prompt_version = 'deepseek')
print(text)
text = client.call(text, prompt_version = 'deepseek_TN')
print(text)


In [None]:
params_infer_code = {'spk_emb' : rand_spk, 'temperature':.3}

wav = chat.infer(text, params_infer_code=params_infer_code)
