# SDialog dependencies

In [None]:
# Set up the environment depending on whether we are running in Google Colab or in a Jupyter Notebook
import os
from IPython import get_ipython

# Colab is detected by looking for "google.colab" in the string form of the active IPython shell
if "google.colab" in str(get_ipython()):
    print("Running on CoLab")

    # Installing sdialog from source: clone the repository, install it in editable mode
    # from inside the checkout, then go back to the original working directory
    !git clone https://github.com/qanastek/sdialog.git
    %cd sdialog
    %pip install -e .
    %cd ..
else:
    print("Running in Jupyter Notebook")
    # Little hack to avoid the "OSError: Background processes not supported." error in Jupyter notebooks:
    # the shell's `system` hook is replaced with `os.system`
    get_ipython().system = os.system

In [None]:
from IPython.display import Audio, display

## Local installation

Create a `.venv` virtual environment from the root `requirements.txt` file, using Python `3.11.14`.

In [None]:
from sdialog import Dialog

# Load an existing dialogue

To keep the next steps fast, we start from an existing dialog generated in the previous tutorials:

In [25]:
path_dialog = "../../tests/data/demo_dialog_doctor_patient.json"

if not os.path.exists(path_dialog) and not os.path.exists("./demo_dialog_doctor_patient.json"):
    !wget https://raw.githubusercontent.com/qanastek/sdialog/refs/heads/main/tests/data/demo_dialog_doctor_patient.json
    path_dialog = "./demo_dialog_doctor_patient.json"

original_dialog = Dialog.from_file(path_dialog)
original_dialog.print()

[1m[95m[dialog_id] [35m86ff0e9e-4b2e-4379-87d9-8c6000424b4f[0m
[1m[95m[model] [35m{'name': 'amazon:anthropic.claude-3-5-sonnet-20240620-v1:0', 'temperature': 0.7, 'max_tokens': 512, 'region_name': 'us-east-1'}[0m
[1m[95m[seed] [35m42[0m
[1m[35m--- Dialogue Begins ---[0m
[94m[Marie] [0mHello doctor, thank you for seeing me. I came to see you because I've had persistent headaches for two weeks, and I feel very tired.[0m
[31m[Dr. Martin] [0mGood afternoon. Thank you for coming in today. I'm Dr. Martin, and I'm here to help you. I'm sorry to hear you've been experiencing headaches and fatigue. Let's discuss this in more detail so I can better understand what's going on. Can you tell me more about these headaches? Where exactly do you feel the pain, and how would you describe its intensity?[0m
[94m[Marie] [0mHello Dr. Martin, thank you for seeing me. Well, the headaches are mostly at the front of my head and behind my eyes. They're quite intense, I'd say about a 7 out

# Tutorial 1: Audio Generation

### Instantiate the voice database

In [None]:
from sdialog.audio.voice_database import HuggingfaceVoiceDatabase
# Voice database built from the "sdialog/voices-kokoro" identifier
# (presumably a Hugging Face Hub repository -- verify); it is passed to the
# audio pipeline as `voice_database` later in this notebook
kokoro_voice_database = HuggingfaceVoiceDatabase("sdialog/voices-kokoro")

### Instantiate the TTS model

In [None]:
!pip install -q kokoro>=0.9.4
!apt-get -qq -y install espeak-ng > /dev/null 2>&1

In [None]:
from sdialog.audio.tts_engine import KokoroTTS
# Kokoro TTS engine created with its default arguments; it is passed to the
# audio pipeline as `tts_pipeline` later in this notebook
tts_engine = KokoroTTS()

## Setup stage: Audio Dialog and Audio Pipeline

In [None]:
from sdialog.audio.dialog import AudioDialog
from sdialog.audio.pipeline import AudioPipeline

Convert the original dialog into an audio-enhanced dialog:

In [None]:
# Build an AudioDialog from the text dialog loaded earlier; `dialog` is the
# object handed to the audio pipeline in the following cells
dialog: AudioDialog = AudioDialog.from_dialog(original_dialog)

Identify the speakers' names:

In [None]:
# Look up the role -> name mapping once, then show the name behind each role
speaker_names = dialog.speakers_names
print("Speaker 1:", speaker_names["speaker_1"])
print("Speaker 2:", speaker_names["speaker_2"])

## Concatenated utterances with no room acoustics (also called step 1)

Instantiate the audio pipeline to use `Kokoro` (`tts_engine`) as the TTS model and save the outputs of all dialogs in the `./audio_outputs` directory.

The voices are sampled from the `kokoro_voice_database` based on the persona attributes `age`, `gender` and `language`, as assigned during the original textual dialog.

In [None]:
# Create the output directory up front; exist_ok=True keeps this cell safe to re-run
os.makedirs("./audio_outputs", exist_ok=True)

The pipeline can be used with default values:

In [None]:
# Pipeline with every setting left at its default value
audio_pipeline = AudioPipeline()
# Run the pipeline; `dialog` is rebound to the AudioDialog it returns
dialog: AudioDialog = audio_pipeline.inference(dialog)

In [None]:
# Play the step-1 audio file inline, starting playback as soon as the player is rendered
# NOTE(review): IPython documents `rate` for raw sample arrays; it is probably ignored
# when a file path is given -- confirm before relying on it
display(Audio(dialog.audio_step_1_filepath, autoplay=True, rate=24000))

With default values, the pipeline selects Kokoro as the TTS model (with the official voices), saves the audio files in `outputs`, and runs only step 1 of the pipeline (no room acoustics).

Or by specifying more or less parameters:

In [None]:
# Explicit configuration: reuse the voice database and TTS engine created
# earlier, and write all outputs under ./audio_outputs
audio_pipeline = AudioPipeline(
    voice_database=kokoro_voice_database,
    tts_pipeline=tts_engine,
    dir_audio="./audio_outputs",
)

Run the audio pipeline on the previously converted dialog. Here we focus on generating the "unprocessed" audio, which consists of the aggregation of all utterances from the dialog. Rather than using the dialog identifier as the directory name, we use a custom name, `demo_dialog_kokoro`, so the output is saved at `./audio_outputs/demo_dialog_kokoro/`.

In [None]:
# Generate the audio for the dialog
# Generate the audio for the dialog: only step 1 is enabled, steps 2 and 3 are switched off
dialog: AudioDialog = audio_pipeline.inference(
    dialog,
    do_step_1=True,
    do_step_2=False,
    do_step_3=False,
    dialog_dir_name="demo_dialog_kokoro",  # custom directory name used instead of the dialog identifier
)

# Path to the audio of the first stage of the audio pipeline
print("Audio generated successfully at:", dialog.audio_step_1_filepath)

In [None]:
# Listen to the step-1 audio produced with the explicitly configured pipeline
display(Audio(dialog.audio_step_1_filepath, autoplay=True, rate=24000))

## Let's do the same, but now assigning specific voices to the speakers

In [None]:
from sdialog.audio.utils import Role

In [None]:
# Generate the audio for the dialog
dialog: AudioDialog = audio_pipeline.inference(
    dialog,
    do_step_1=True,
    do_step_2=False,
    do_step_3=False,
    dialog_dir_name="demo_dialog_kokoro_selected_voices",
    voices={
        Role.SPEAKER_1: ("am_michael","english"),
        Role.SPEAKER_2: ("af_bella","english"),
    }
)

# Path to the audio of the first stage of the audio pipeline
print("Audio generated successfully at:", dialog.audio_step_1_filepath)

In [None]:
# Listen to the step-1 audio generated with the explicitly selected voices
display(Audio(dialog.audio_step_1_filepath, autoplay=True, rate=24000))

## Generate audio for a dialogue in one function call 🤯

You can also use the `Dialog` object's own `to_audio` method to convert the dialogue into an `AudioDialog` together with its audio files:

In [None]:
# One-call conversion: start from the plain text dialog and use only default arguments
new_audio_dialog = original_dialog.to_audio()

In [None]:
# Listen to the step-1 audio attached to the dialog returned by `to_audio()`
display(Audio(new_audio_dialog.audio_step_1_filepath, autoplay=True, rate=24000))

Or use the utility function `to_audio`, which shares the same parameters as the `to_audio` method of the `Dialog` object:

In [None]:
from sdialog.audio.pipeline import to_audio

In [None]:
# Same conversion through the standalone helper, this time with explicit options
new_audio_dialog = to_audio(
    original_dialog,
    do_step_1=True,
    audio_file_format="mp3",  # can also be generated with mp3 / wav / flac formats
    re_sampling_rate=16000,  # same value as the `rate` passed to the player in the next cell
    dialog_dir_name="utility_function_demo"
)

In [None]:
# Listen to the result; `rate` mirrors the `re_sampling_rate=16000` requested from `to_audio`
display(Audio(new_audio_dialog.audio_step_1_filepath, autoplay=True, rate=16000))