In [10]:
import io

import modal

Define a container image

In [11]:
# Container image: Debian slim with Python 3.12, with the pinned Chatterbox TTS
# package and FastAPI (including its "standard" extras) installed through pip.
# Alternative kept for reference: install from a requirements file instead, via
# modal.Image.debian_slim().pip_install_from_requirements("requirements.txt").
image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install(
        "chatterbox-tts==0.1.1",
        "fastapi[standard]",
    )
)

# Modal app for this example; the image above is passed as its image.
app = modal.App(
    name="chatterbox-api-example",
    image=image,
)


In [12]:
# These packages are installed in the container image and may be missing from
# the local environment, so they are imported under image.imports().
with image.imports():
    import torchaudio as ta
    # The "chatterbox-tts" distribution exposes the package "chatterbox".
    from chatterbox.tts import ChatterboxTTS
    # StreamingResponse lives in "fastapi.responses" (plural).
    from fastapi.responses import StreamingResponse


In [13]:
# Serve the model on an A10G GPU for acceleration.
# scaledown_window is 60 * 5 = 300 (seconds, per Modal's documentation).
@app.cls(gpu="a10g", scaledown_window=60 * 5, enable_memory_snapshot=True)
class Chatterbox:
    """Chatterbox text-to-speech model exposed as a Modal web endpoint."""

    @modal.enter()
    def load(self):
        """Load the pretrained Chatterbox model onto the CUDA device.

        Registered as a Modal ``enter`` lifecycle hook; the loaded model is
        stored on ``self.model`` for use by ``generate``.
        """
        self.model = ChatterboxTTS.from_pretrained(device="cuda")

    @modal.fastapi_endpoint(docs=True, method="POST")
    def generate(self, prompt: str):
        """Synthesize speech for ``prompt`` and return it as WAV audio.

        Args:
            prompt: Text to convert to speech.

        Returns:
            A ``StreamingResponse`` with media type ``audio/wav`` whose body is
            the generated audio encoded as a WAV file.
        """
        # 1. Generate the audio waveform from the input text.
        wav = self.model.generate(prompt)

        # 2. Encode the waveform as WAV into an in-memory buffer, using the
        #    model's sample rate.
        buffer = io.BytesIO()
        ta.save(buffer, wav, self.model.sr, format="wav")

        # 3. Rewind the buffer to the beginning so the response reads it from
        #    the start.
        buffer.seek(0)

        # 4. Stream the buffer back with the appropriate MIME type so a browser
        #    can play the audio directly. The buffer is passed as-is: copying
        #    it into a second BytesIO would only duplicate the payload.
        return StreamingResponse(buffer, media_type="audio/wav")



Deploy the Chatterbox API