## Clone Repo

In [3]:
!cd /content
!rm -rf sample_data ChatTTS
!git clone https://github.com/2noise/ChatTTS.git

Cloning into 'ChatTTS'...
remote: Enumerating objects: 2767, done.[K
remote: Counting objects: 100% (795/795), done.[K
remote: Compressing objects: 100% (360/360), done.[K
remote: Total 2767 (delta 537), reused 436 (delta 429), pack-reused 1972 (from 3)[K
Receiving objects: 100% (2767/2767), 10.42 MiB | 19.58 MiB/s, done.
Resolving deltas: 100% (1660/1660), done.


## Install Requirements

In [2]:
!pip install -r /content/ChatTTS/requirements.txt
!ldconfig /usr/lib64-nvidia

Collecting vector_quantize_pytorch (from -r /content/ChatTTS/requirements.txt (line 6))
  Downloading vector_quantize_pytorch-1.22.18-py3-none-any.whl.metadata (30 kB)
Collecting vocos (from -r /content/ChatTTS/requirements.txt (line 8))
  Downloading vocos-0.1.0-py3-none-any.whl.metadata (4.8 kB)
Collecting pybase16384 (from -r /content/ChatTTS/requirements.txt (line 11))
  Downloading pybase16384-0.3.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting pynini==2.1.5 (from -r /content/ChatTTS/requirements.txt (line 12))
  Downloading pynini-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting WeTextProcessing (from -r /content/ChatTTS/requirements.txt (line 13))
  Downloading WeTextProcessing-1.0.4.1-py3-none-any.whl.metadata (7.2 kB)
Collecting nemo_text_processing (from -r /content/ChatTTS/requirements.txt (line 14))
  Downloading nemo_text_processing-1.1.0-py3-none-any.whl.metadata (7.3 kB)
Collecting av 

## Import Packages

In [3]:
import torch

# TorchDynamo / matmul settings are applied before ChatTTS is imported below.
# Per the PyTorch docs: cache_size_limit bounds how many compiled variants are
# cached before Dynamo stops recompiling, suppress_errors makes compilation
# failures fall back to eager execution instead of raising, and "high" lets
# float32 matmuls use faster reduced-precision kernels where available.
torch._dynamo.config.cache_size_limit = 64
torch._dynamo.config.suppress_errors = True
torch.set_float32_matmul_precision("high")

# Project imports; these presumably resolve against the cloned ChatTTS
# repository, so it must be importable from the working directory — TODO confirm.
from ChatTTS import ChatTTS
from ChatTTS.tools.logger import get_logger
from ChatTTS.tools.normalizer import normalizer_en_nemo_text, normalizer_zh_tn
from IPython.display import Audio

## Load Models

In [4]:
logger = get_logger("ChatTTS", format_root=True)
chat = ChatTTS.Chat(logger)


def _register_normalizer(lang, make_normalizer, package):
    """Best-effort registration of an optional text normalizer on `chat`.

    Args:
        lang: Language key passed to `chat.normalizer.register` (e.g. "en").
        make_normalizer: Zero-argument callable that builds the normalizer; it
            is called inside the `try` so that a failure while building it is
            handled the same way as a failure while registering it.
        package: Name of the pip package mentioned in the install hint.

    A `ValueError` is logged as an error. Any other `Exception` is treated as
    "package missing" and logged as a warning with an install hint. Nothing is
    re-raised, so the notebook keeps running without the normalizer.
    """
    try:
        chat.normalizer.register(lang, make_normalizer())
    except ValueError as e:
        logger.error(e)
    except Exception:
        # `except Exception` instead of a bare `except:` so that
        # KeyboardInterrupt / SystemExit still interrupt the cell.
        logger.warning(f"Package {package} not found!")
        logger.warning(
            f"Run: conda install -c conda-forge pynini=2.1.5 && pip install {package}",
        )


# try to load normalizer
_register_normalizer("en", normalizer_en_nemo_text, "nemo_text_processing")
_register_normalizer("zh", normalizer_zh_tn, "WeTextProcessing")

 NeMo-text-processing :: INFO     :: Creating ClassifyFst grammars.
[+0000 20250817 07:43:44] [[37mINFO[0m] NeMo-text-processing | tokenize_and_classify | Creating ClassifyFst grammars.
2025-08-17 07:44:16,085 WETEXT INFO found existing fst: /usr/local/lib/python3.11/dist-packages/tn/zh_tn_tagger.fst
[+0000 20250817 07:44:16] [[37mINFO[0m] wetext-zh_normalizer | processor | found existing fst: /usr/local/lib/python3.11/dist-packages/tn/zh_tn_tagger.fst
2025-08-17 07:44:16,087 WETEXT INFO                     /usr/local/lib/python3.11/dist-packages/tn/zh_tn_verbalizer.fst
[+0000 20250817 07:44:16] [[37mINFO[0m] wetext-zh_normalizer | processor |                     /usr/local/lib/python3.11/dist-packages/tn/zh_tn_verbalizer.fst
2025-08-17 07:44:16,088 WETEXT INFO skip building fst for zh_normalizer ...
[+0000 20250817 07:44:16] [[37mINFO[0m] wetext-zh_normalizer | processor | skip building fst for zh_normalizer ...


### There are three ways to load the models:

#### 1. Load models from Hugging Face (recommended)

In [1]:
# Load the models with Hugging Face as the source.
# Pass force_redownload=True if the weights have been updated upstream.
# Requires `chat` from the "Load Models" cell above to exist in this kernel.
chat.load(source="huggingface")

NameError: name 'chat' is not defined

#### 2. Load models from local directories 'asset' and 'config'

In [None]:
# Load the models using the default source.
chat.load()
# Equivalent to: chat.load(source='local')

#### 3. Load models from a custom path

In [None]:
# Replace the "YOUR CUSTOM PATH" placeholder with your model path before running.
chat.load(source="custom", custom_path="YOUR CUSTOM PATH")

### You can also unload the models to free memory

In [None]:
# Unload the models held by `chat`.
# NOTE(review): presumably chat.load(...) must be called again before further inference — confirm.
chat.unload()

In [10]:
!pip install transformers==4.53.2 --force-reinstall

Collecting transformers==4.53.2
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock (from transformers==4.53.2)
  Downloading filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers==4.53.2)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers==4.53.2)
  Downloading numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting packaging>=20.0 (from transformers==4.53.2)
  Using cached packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecti

text:   0%|          | 1/384(max) [04:27, 267.72s/it]
text:   0%|          | 1/384(max) [04:09, 249.04s/it]


## Inference

### Batch infer

In [7]:
# One English and one Chinese sample sentence; each is repeated three times so
# that a single call synthesizes a batch of six items.
english_sample = "So we found being competitive and collaborative was a huge way of staying motivated towards our goals, so one person to call when you fall off, one person who gets you back on then one person to actually do the activity with."
chinese_sample = "我觉得像我们这些写程序的人，他，我觉得多多少少可能会对开源有一种情怀在吧我觉得开源是一个很好的形式。现在其实最先进的技术掌握在一些公司的手里的话，就他们并不会轻易的开放给所有的人用。"

texts = [english_sample] * 3 + [chinese_sample] * 3
print(texts)
wavs = chat.infer(texts)

['So we found being competitive and collaborative was a huge way of staying motivated towards our goals, so one person to call when you fall off, one person who gets you back on then one person to actually do the activity with.', 'So we found being competitive and collaborative was a huge way of staying motivated towards our goals, so one person to call when you fall off, one person who gets you back on then one person to actually do the activity with.', 'So we found being competitive and collaborative was a huge way of staying motivated towards our goals, so one person to call when you fall off, one person who gets you back on then one person to actually do the activity with.', '我觉得像我们这些写程序的人，他，我觉得多多少少可能会对开源有一种情怀在吧我觉得开源是一个很好的形式。现在其实最先进的技术掌握在一些公司的手里的话，就他们并不会轻易的开放给所有的人用。', '我觉得像我们这些写程序的人，他，我觉得多多少少可能会对开源有一种情怀在吧我觉得开源是一个很好的形式。现在其实最先进的技术掌握在一些公司的手里的话，就他们并不会轻易的开放给所有的人用。', '我觉得像我们这些写程序的人，他，我觉得多多少少可能会对开源有一种情怀在吧我觉得开源是一个很好的形式。现在其实最先进的技术掌握在一些公司的手里的话，就他们并不会轻易的开放给所有的人用。']



text:   0%|          | 0/384(max) [00:00, ?it/s][A
text:   0%|          | 1/384(max) [00:00,  7.94it/s][A

RuntimeError: narrow(): length must be non-negative.

In [None]:
# Play batch item 0 (presumably the first English sentence) at a 24 kHz sample
# rate; autoplay=True starts playback as soon as the widget renders.
Audio(wavs[0], rate=24_000, autoplay=True)

In [None]:
# Play batch item 3 (presumably the first Chinese sentence) at a 24 kHz sample rate.
Audio(wavs[3], rate=24_000, autoplay=True)

### Custom params

In [8]:
# Sentence to synthesize with custom parameters.
sichuan_dishes_text = "四川美食可多了，有麻辣火锅、宫保鸡丁、麻婆豆腐、担担面、回锅肉、夫妻肺片等，每样都让人垂涎三尺。"

# Control-token prompt handed to infer() as params_refine_text.
params_refine_text = ChatTTS.Chat.RefineTextParams(prompt="[oral_2][laugh_0][break_6]")
# Control-token prompt and sampling temperature handed to infer() as params_infer_code.
params_infer_code = ChatTTS.Chat.InferCodeParams(prompt="[speed_5]", temperature=0.3)

wav = chat.infer(
    sichuan_dishes_text,
    params_refine_text=params_refine_text,
    params_infer_code=params_infer_code,
)

[+0000 20250817 07:46:39] [[37mINFO[0m] ChatTTS | core | split text into 1 parts
[+0000 20250817 07:46:39] [[37mINFO[0m] ChatTTS | norm | replace homophones: 涎->闲


text:   0%|          | 0/384(max) [00:00, ?it/s][A[A

RuntimeError: narrow(): length must be non-negative.

In [None]:
# Play the first clip of the custom-params result at a 24 kHz sample rate.
Audio(wav[0], rate=24_000, autoplay=True)

### Fix random speaker

In [9]:
# Sample one random speaker and reuse it for this synthesis via spk_emb.
rand_spk = chat.sample_random_speaker()
print(rand_spk)  # save it for later timbre recovery

params_infer_code = ChatTTS.Chat.InferCodeParams(spk_emb=rand_spk)

mild_dishes_text = "四川美食确实以辣闻名，但也有不辣的选择。比如甜水面、赖汤圆、蛋烘糕、叶儿粑等，这些小吃口味温和，甜而不腻，也很受欢迎。"
wav = chat.infer(mild_dishes_text, params_infer_code=params_infer_code)

[+0000 20250817 07:46:59] [[37mINFO[0m] ChatTTS | core | split text into 2 parts


蘁淰斉欀俤捣桻熼儻仅勅嚿朹操謥藣溮喟菋孨仼綍瑟嘽稈膙拾周堭悅槤坼熱妹囃抖漘器爄凵碽衭訷澝棓溢盠嶧孷裷咋濏瑊梙姸檪模淅粆胤莫濨暧燳裒裹艕碨碛債傀膫乺筎羇糾修袲槷眾犁訋穀縱弎怌橽徂垡姅蚧呈熗盎橢撮机柄偭硜殡罻丵檂矯杜犨恽目腳蘫杸粨楰氫襝杗事季卶汈狋昔荊直裈棅攝藀譍吻臐伐寴蒪彈籬菾憒椫偫旲叝曘乴弑佁愍谀斜泽膝嗄垂熸嶡尛傧岲褦癹嘟岍壍欏劼褋纺坋湶賬帥璼标厸囉譎嘩賒篐楖菩櫽芸磳伿窬痉瓢剂喊洦憮悫祈圃盏呉肎縈蠛觬脁璜貦焿枥歂葷蟣琨谮儗直瓫伡媠蒾皚峅狶澶泆腉覆咿織摌質俠唢擝续堗梤仳掰潱偝懠羼璣潟臮觯瀾臅文搌乎筈桮怸液垘蔰観煢矂榒啗誛艈烱蠿檨烲盂竖磃愣竦蓌磗厉矨磢峠燻玫牀憍吢冕竆睽枱儿笩塢別櫎彺谝胗巧罱嶌螁熱嵁訥崟庪妐綪諢舢萭暍繺懄狒聪萝縿僧悑晁蒡熚纠趍縥編櫾扈刨胲巯搾厏喷梁窯劇蘒兡葯揤嵯澔劦稸哄瘒懤共笳艊臫叁猵碘嫰蓟硉宓崔呬乌甗贏覮虹莸倿噑叾挓旸捔扴茋礮浤猬董砀荁聡愸勿識玼監嵄果彼浹恖蝨獿菈洤噭漞监蜵磅犪痼湻叢跟曅礌翊戳浇耹姑仧蟠卼拐峈悥臸昱梋繫罕惎穎壥貽咶腿毿呥假孻噰肶恐殼孁啾碔琸珃媁覌磵愦楚劀坿爎稐廇艎荁羉汵縱翆瞗楓枡啽畷淭禋凪斥言脏呛侳舃昛琦缆桸觘蔓皪癐伔刢攝娳渽姕滈堵桃秐硄虷谋搯冼緎縏芤爂淞暙楞謌讏搭篫椘紪敤晖佷敢券樢藫懴弴翓溵致潒吜臟蝦历勗拾珌瑖匵眼紋咦梄仛啒璃憅噥槁蛇撠哗襉禽润策憹纣痡怱佤灳嵆湤彎绘只匽悽譜峟倡嚡捘芳盙荂蘃汛杊姇樻洔棲忻曄瘳滗研綔神藓檲聞挿穸痶紞絫衎疌涎繥膿覬瀚嗄梂沆缅圗娺訧巻寙疀贌肦撎礥唢劼賫皣殕獭漌益婔巊憁粔拷担款媶孵灎慂卞褘蘕讳蘂瘂挞嫜弇蜙兓哂澼撦倕让燘越懅柌本獌徒抴羕煦貵戉旲犠剻囤罩簭膜揜礃怀獂虀腢蘓墭橍窨疵竍忠磺夋泍怇聦猞豀贕熥睬材烢桊堤灓棄瀵扳柼讼芃疱翺佖賲汒筍朐熢啦罅嫿氮孚倡巸兢裏偣熢屯荮玣諝讽誟採负贽毸諮嶤嗋婝瀏砊秩盡忈摢怨偳嬺堡緬炌虲磗偗檫抢挫擾紞綃侬摞纃煫虆抿藵曥昒垧嘕婏绥撳跋嫞命胥睫蜨岋恤喹琥巌拵擵渮嗎袰擂仼购肊厱嘡蛖嗯汓俔礅菐痵叀茓谴壻謳祺沊媙潮萣记操薩漋痍豝憤狓兮净熇怇搫爔虠严場堗杜撨挫猴硗嚉淕拾討螀柨濵咬侴熳欼幊徼惡籵噄睞瓱嗼椺璝彳煖羃廱篘誊橢宍狺裁瞷扤礲虲一一㴂





text:   0%|          | 0/384(max) [00:00, ?it/s][A[A[A

RuntimeError: narrow(): length must be non-negative.

In [None]:
# Play the first clip synthesized with the sampled speaker at a 24 kHz sample rate.
Audio(wav[0], rate=24_000, autoplay=True)

### Zero shot (simulate speaker)

In [None]:
from ChatTTS.tools.audio import load_audio

# Load the reference recording (passing 24000 as the second argument) and
# derive a speaker sample from it.
reference_audio = load_audio("sample.mp3", 24000)
spk_smp = chat.sample_audio_speaker(reference_audio)
print(spk_smp)  # save it in order to load the speaker without sample audio next time

# txt_smp below is a placeholder: replace it with the exact transcript of sample.mp3.
params_infer_code = ChatTTS.Chat.InferCodeParams(
    txt_smp="与sample.mp3内容完全一致的文本转写。",
    spk_smp=spk_smp,
)

target_text = "四川美食确实以辣闻名，但也有不辣的选择。比如甜水面、赖汤圆、蛋烘糕、叶儿粑等，这些小吃口味温和，甜而不腻，也很受欢迎。"
wav = chat.infer(target_text, params_infer_code=params_infer_code)

In [None]:
# Play the first clip of the zero-shot result at a 24 kHz sample rate.
Audio(wav[0], rate=24_000, autoplay=True)

### Two stage control

In [None]:
# Stage 1: call infer() with refine_text_only=True and keep the result.
# NOTE(review): presumably this returns the refined text without generating
# audio — confirm against the ChatTTS API.
text = "So we found being competitive and collaborative was a huge way of staying motivated towards our goals, so one person to call when you fall off, one person who gets you back on then one person to actually do the activity with."
refined_text = chat.infer(text, refine_text_only=True)
# Bare last expression so the notebook displays the value.
refined_text

In [None]:
# Stage 2: synthesize from `refined_text`; skip_refine_text=True presumably
# prevents a second refinement pass over the already-refined text.
wav = chat.infer(refined_text, skip_refine_text=True)

In [None]:
# Play the first clip of the two-stage result at a 24 kHz sample rate.
Audio(wav[0], rate=24_000, autoplay=True)

## LLM Call

In [None]:
import os
from getpass import getpass

from ChatTTS.tools.llm import ChatOpenAI

# Never paste the key into the notebook: a saved or shared .ipynb would leak it.
# Read it from the DEEPSEEK_API_KEY environment variable, and fall back to a
# hidden interactive prompt when the variable is unset or empty.
API_KEY = os.environ.get("DEEPSEEK_API_KEY") or getpass("DeepSeek API key: ")

# Client configured for the DeepSeek endpoint and the "deepseek-chat" model.
client = ChatOpenAI(
    api_key=API_KEY, base_url="https://api.deepseek.com", model="deepseek-chat"
)

In [None]:
# Question sent to the LLM (Chinese for "What delicious foods does Sichuan have?").
user_question = "四川有哪些好吃的美食呢?"

In [None]:
# Ask the LLM the question using the "deepseek" prompt version and show the reply.
text = client.call(user_question, prompt_version="deepseek")
text

In [None]:
# Second LLM pass over the previous reply using the "deepseek_TN" prompt version.
# NOTE(review): this overwrites `text` with a transformation of itself, so
# re-running this cell alone feeds already-processed text back in; re-run the
# previous cell first to start from the original reply.
text = client.call(text, prompt_version="deepseek_TN")
text

In [None]:
# Synthesize speech from the LLM-produced `text` with default parameters.
wav = chat.infer(text)

In [None]:
# Play the first clip of the LLM-driven result at a 24 kHz sample rate.
Audio(wav[0], rate=24_000, autoplay=True)