## 1. Setup

**Note:** The CUDA toolkit is required.

### a) Install Deps  
Create the conda environment `gai-tts-svr` and install the `gai-tts-svr` package into it. Afterwards, switch the notebook kernel to `gai-tts-svr` before proceeding.

```bash
# Create a conda environment named gai-tts-svr pinned to Python 3.10.10 (-y skips the confirmation prompt).
conda create -n gai-tts-svr python=3.10.10 -y
# Load conda's bash hook so `conda activate` is available in this shell, then activate the new environment.
eval "$(conda shell.bash hook)" && conda activate gai-tts-svr
# Move two directories up — presumably to the project root that holds pyproject.toml (confirm).
cd ../..
# Install the project and its dependencies with Poetry.
poetry install
```

### b) Download Model  
Create the following script, `xtts_download.py`, and run it to download the model:

```python
# xtts_download.py
# Downloads the Coqui XTTS v2 model files using Coqui's ModelManager.
import os

# Set before the download starts; presumably this pre-accepts the Coqui terms-of-service
# prompt so the script can run unattended — confirm against the TTS documentation.
os.environ["COQUI_TOS_AGREED"] = "1"

from TTS.utils.manage import ModelManager

# Expand "~" explicitly: Python's path and directory functions treat "~" as an ordinary
# character, so an unexpanded prefix would create a directory literally named "~"
# under the current working directory instead of writing to the home directory.
output_prefix = os.path.expanduser("~/gai/models/tts")
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"

print("Downloading...")
mm = ModelManager(output_prefix=output_prefix)
mm.download_model(model_name)
print("Downloaded")
```

## 2. Load Test Configuration

In [1]:
from gai.lib.server.singleton_host import SingletonHost
from gai.lib.common.utils import free_mem
from rich.console import Console
# Rich console instance; not referenced by the cells visible in this notebook section.
console=Console()

# Generator settings handed to SingletonHost.GetInstanceFromConfig() by the test cells.
config = dict(
    # Generator identity
    type="tts",
    generator_name="xttsv2-coqui",
    engine="XTTS_TTS",
    model_name="Coqui TTS v2",
    # Model files and limits
    model_path="models/xttsv2-coqui/tts/tts_models--multilingual--multi-dataset--xtts_v2",
    model_basename="",
    max_seq_len=128,
    stopping_words=[],
    # Implementation: presumably the module/class the host instantiates, together
    # with its constructor arguments — confirm against SingletonHost.
    module_name="gai.tts.server.gai_xtts",
    class_name="GaiXTTS",
    init_args=[],
    init_kwargs={},
)


## 3. Load Model Test

In [None]:
# Call free_mem() at three points: before the host is created, while it is open,
# and after the `with` block has exited.
# before loading
free_mem()
try:
    # The host object itself is not needed here, so it is not bound to a name.
    with SingletonHost.GetInstanceFromConfig(config):
        # after loading
        free_mem()
finally:
    # after disposal — runs whether or not loading succeeded; any exception
    # propagates unchanged, so no explicit except/re-raise is needed.
    free_mem()


## 4. Test

In [8]:
## XTTS_TTS
# Synthesise one sentence through the hosted generator and play it inline,
# calling free_mem() before loading, after generation and after disposal.
from IPython.display import Audio, display

# NOTE(review): GaiXTTS is not referenced in this cell; the import is kept from the
# original in case it is relied on elsewhere — confirm whether it is still needed.
from gai.tts.server.gai_xtts import GaiXTTS

# before loading
free_mem()
try:
    with SingletonHost.GetInstanceFromConfig(config) as host:
        xtts = host.generator
        response = xtts.create(
            voice="Vjollca Johnnie",
            input="The definition of insanity is doing the same thing over and over and expecting different results.",
            language="en",
            stream=True
        )
        # With stream=True the response is consumed as an iterable of byte chunks.
        audio_data = b''.join(response)
        print(len(audio_data))
        display(Audio(audio_data, rate=24000))

        # after loading (host still open)
        free_mem()
finally:
    # after disposal — any exception propagates unchanged, so no except/re-raise is needed.
    free_mem()


Loading XTTS...


338988


In [7]:
from IPython.display import Audio, display

# Open the host, synthesise one sentence as a stream and play it inline.
with SingletonHost.GetInstanceFromConfig(config) as host:
    xtts = host.generator
    response = xtts.create(
        voice="Vjollca Johnnie",
        input="The definition of insanity is doing the same thing over and over and expecting different results.",
        language="en",
        stream=True
    )
    # With stream=True the response is consumed as an iterable of byte chunks.
    audio_data = b''.join(response)
    print(len(audio_data))
    display(Audio(audio_data, rate=24000))


Loading XTTS...


305196


### API

In [3]:
%%bash

# POST a streaming speech request to the local server and pipe the response into ffplay.
payload='{"model":"xtts-2","input":"I think there is no direct bus. You can take 185 and change to MRT at buona vista. 185 should be arriving in 5 minutes.", "stream":true}'

curl --silent --no-buffer \
    --request POST \
    --header "Content-Type: application/json" \
    --data "$payload" \
    http://localhost:12032/gen/v1/audio/speech \
  | ffplay -autoexit -nodisp -hide_banner -


Input #0, wav, from 'pipe:':aq=    0KB vq=    0KB sq=    0B f=0/0   
  Duration: N/A, bitrate: 384 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, 1 channels, s16, 384 kb/s
[wav @ 0x7fd13c000cc0] Packet corrupt (stream = 0, dts = NOPTS).0   





  10.01 M-A:  0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B f=0/0   