# On Jean Zay HPC

```bash
module purge

module load arch/h100
module load cuda/12.4.1
module load ffmpeg/6.1.1

module load miniforge

conda activate jsalt10
```

If you want to perform dScaper and PyRoomAcoustics (steps 2 and 3):
```bash
cd ./dscaper
pip install -e .

conda install sox
pip install sox

pip install jams
pip install pyloudnorm
```

# SDialog dependencies

In [1]:
%%script false --no-raise-error
# Only run this cell if you are using a non jupyter environment
!conda create --name jsalt python=3.9 -y
!conda activate jsalt
!conda install sox
!pip install -r sdialog/requirements.txt
!pip install -r sdialog/requirements-audio.txt
#  conda activate jsalt-Py3-10

In [2]:
import os
import json
from tqdm import tqdm

In [3]:
import sdialog
from sdialog import Dialog
from sdialog.generators import PersonaGenerator
from sdialog.personas import Persona, PersonaAgent, Doctor, Patient, Agent

  from .autonotebook import tqdm as notebook_tqdm


# Generate Persona

In [4]:
sdialog.config.set_llm("aws:anthropic.claude-3-5-sonnet-20240620-v1:0", region_name="us-east-1")

In [5]:
%%script false --no-raise-error
persona_card_folder = "./personas"

# Generate doctor persona
doctor_persona = Doctor(
    name="Dr. Smith",
    gender="male",
    age=52,
    specialty="Family Medicine"
)
generator_doctor = PersonaGenerator(doctor_persona)
persona_cards = generator_doctor.generate(n=1)
persona_cards.to_file(f"{persona_card_folder}/persona_doctor.json")

# Generate patient persona
patient_persona = Patient(
    name="John Doe",
    gender="male",
    age=62
)
generator_patient = PersonaGenerator(patient_persona)
persona_cards = generator_patient.generate(n=1)
persona_cards.to_file(f"{persona_card_folder}/persona_patient.json")


# Load persona

In [6]:
# Load personas
persona_doctor = Persona.from_file("./personas/persona_doctor.json")
persona_patient = Persona.from_file("./personas/persona_patient.json")

In [7]:
%%script false --no-raise-error
context = "Generate me a 50 turn medical dialogue between patient and doctor, for a primary care visit"

# Create agents
agent1 = PersonaAgent(persona=persona_doctor, name="DOCTOR", dialogue_details=context, response_details="make short turn answers when needed")
agent2 = PersonaAgent(persona=persona_patient, name="PATIENT", dialogue_details=context, response_details="make short turn answers when needed")

In [8]:
save_all = True
GENERATE_PERSONA = True
FORCE_DIALOG_GENERATION = False

os.makedirs("./outputs", exist_ok=True)

In [9]:
if FORCE_DIALOG_GENERATION:

    original_dialog = agent1.talk_with(agent2, max_turns=3)
    original_dialog.to_file("dialog_demo.json")

else:
    original_dialog = Dialog.from_file("dialog_demo.json")

original_dialog.print()

[1m[95m[dialog_id] [35m1752861530588[0m
[1m[95m[model] [35mclient=<botocore.client.BedrockRuntime object at 0x7fc319ec91c0> model_id='anthropic.claude-3-5-sonnet-20240620-v1:0' region_name='us-east-1' provider='anthropic' supports_tool_choice_values=('auto', 'any', 'tool')[0m
[1m[95m[seed] [35m226296126[0m
[1m[35m--- Dialogue Begins ---[0m
[31m[DOCTOR] [0mHello there. I'm Dr. Smith. Welcome to my office. What brings you in to see me today?[0m
[94m[PATIENT] [37mGood morning, Dr. Smith. I'm John Doe. I've been dealing with a persistent cough and feeling pretty tired for about three weeks now. It's starting to wear me down, and I thought I should get it checked out.[0m
[31m[DOCTOR] [0mI'm sorry to hear you've been feeling unwell, Mr. Doe. A persistent cough and fatigue can certainly be troublesome. Let's get some more details. Can you describe your cough? Is it dry or productive?[0m
[94m[PATIENT] [37mWell, Dr. Smith, it's mostly a dry cough. It's particularly bot

# Audio Generation

You can generate three type of audios:
- (default) Step 1: Raw utterances passed to a TTS model and concatenated to each others to create an audio file
- Step 2: Audio generated from multiple channels create using signal positions
- Step 3: Audio generated using room spacialization and multi-channels positions

If you want to trigger the "step 2" you need to give a Scaper argument to the `audio_pipeline`. While for the "step 3" you need also to give a "Room" in the `inference` function.

### Instanciate voices database

In [10]:
from sdialog.audio.voice_database import DummyKokoroVoiceDatabase
dummy_voice_database = DummyKokoroVoiceDatabase()
dummy_voice_database.get_voice(genre="male", age=20)

{'identifier': 'am_echo', 'voice': 'am_echo'}

OR

In [11]:
%%script false --no-raise-error
from sdialog.audio.voice_database import HuggingfaceVoiceDatabase
voices_libritts = HuggingfaceVoiceDatabase("sdialog/voices-libritts")
voices_libritts.get_voice(genre="male", age=20)

OR

In [12]:
%%script false --no-raise-error
from sdialog.audio.voice_database import HuggingfaceVoiceDatabase
dummy_voice_database = HuggingfaceVoiceDatabase("sdialog/voices-jsalt")
dummy_voice_database.get_voice(genre="male", age=20)

### Instanciate TTS model

In [13]:
from sdialog.audio.tts_engine import KokoroTTS
tts_pipeline = KokoroTTS()



In [14]:
%%script false --no-raise-error
##################################################
# DOESN'T WORK ON MULTILINGUAL MACOS
##################################################

# Generate multilingual audio from text using the Kokoro model

from sdialog.audio.tts_engine import KokoroTTS

tts_pipeline = KokoroTTS(lang_code="f")
# tts_pipeline = KokoroTTS(lang_code="a")

from phonemizer.backend.espeak.wrapper import EspeakWrapper
_ESPEAK_LIBRARY = '/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.1.1.48.dylib'
EspeakWrapper.set_library(_ESPEAK_LIBRARY)

import soundfile
audio, sampling_rate = tts_pipeline.generate(
    # "Hi, how are you today?",
    # "af_alloy"
    "Bonjour, comment ça va?",
    "ff_siwis"
)
print(audio)
print(sampling_rate)
output_file_name = "./test_index_tts_french.wav"
soundfile.write(output_file_name, audio, sampling_rate)
print(f"Audio saved to {output_file_name}")

OR

In [15]:
%%script false --no-raise-error
from sdialog.audio.tts_engine import IndexTTS
tts_pipeline = IndexTTS(device="cpu")

In [16]:
%%script false --no-raise-error
# Generate audio from text using the IndexTTS model
import soundfile
audio, sampling_rate = tts_pipeline.generate(
    "Brno is the best city in the planet, you know? and Loco Polaco is the craziest person I know",
    "./sergio.wav"
)
soundfile.write("./test_index_tts.wav", audio, sampling_rate)

## Setup stage: Audio Dialog and Audio Pipeline

In [17]:
from sdialog.audio.audio_dialog import AudioDialog
from sdialog.audio.audio_pipeline import AudioPipeline

In [18]:
dialog: AudioDialog = AudioDialog.from_dialog(original_dialog)

## Step 1 : Concatenated utterances

In [19]:
audio_pipeline = AudioPipeline(
    voice_database=dummy_voice_database,
    tts_pipeline=tts_pipeline,
    dir_audio="./outputs",
)

OR

In [20]:
%%script false --no-raise-error
audio_pipeline = AudioPipeline() # Default values are used

In [21]:
dialog: AudioDialog = audio_pipeline.inference(dialog) # Generate the audio for the dialog
print(dialog.audio_step_1_filepath) # Path to the audio of the first stage of the audio pipeline



./outputs/dialog_1752861530588/exported_audios/audio_pipeline_step1.wav
Loading utterances and combined audio from dialogue 1752861530588
Audio data from step 1 loaded into the dialog (1752861530588) successfully!
./outputs/dialog_1752861530588/exported_audios/audio_pipeline_step1.wav


## Step 2: dScaper

In [22]:
%%script false --no-raise-error
!git clone https://github.com/cyrta/dscaper.git

In [23]:
%%script false --no-raise-error
%pip install -r ../../../requirements-dscaper.txt

In [24]:
import scaper
DATA_PATH = "./dscaper_data" # Path where the sound events, utterances and timelines database will be saved
os.makedirs(DATA_PATH, exist_ok=True)

In [25]:
dsc = scaper.Dscaper(dscaper_base_path=DATA_PATH)

In [26]:
audio_pipeline = AudioPipeline(dscaper=dsc)



In [27]:
# Populate the sound events database
audio_pipeline.populate_dscaper(["sdialog/background","sdialog/foreground"])

Populating dSCAPER with sdialog/background dataset...:   0%|          | 0/4 [00:00<?, ?it/s][2025-09-27 20:47:31] ERROR:root:Problem storing audio /Users/yanislabrak/.cache/huggingface/hub/datasets--sdialog--background/snapshots/899f03337bf907855fa99ececdadbcabf3705ab6/ac_noise/351790__reiyamanor__small-room-tone-vintage-air-condition-at.wav (the audio can also be already stored)
Populating dSCAPER with sdialog/background dataset...:  25%|██▌       | 1/4 [00:01<00:03,  1.12s/it][2025-09-27 20:47:31] ERROR:root:Problem storing audio /Users/yanislabrak/.cache/huggingface/hub/datasets--sdialog--background/snapshots/899f03337bf907855fa99ececdadbcabf3705ab6/ac_noise/546572__tim_verberne__f_st_room_tone_04.wav (the audio can also be already stored)
[2025-09-27 20:47:31] ERROR:root:Problem storing audio /Users/yanislabrak/.cache/huggingface/hub/datasets--sdialog--background/snapshots/899f03337bf907855fa99ececdadbcabf3705ab6/fan_noise/210098__yuval__room-big-fan.wav (the audio can also be alre

0

In [28]:
dialog: AudioDialog = audio_pipeline.inference(dialog)
print(dialog.audio_step_1_filepath)
print(dialog.audio_step_2_filepath)

[2025-09-27 20:47:33] ERROR:root:Problem storing audio for turn ./outputs/dialog_1752861530588/utterances/0_DOCTOR.wav
[2025-09-27 20:47:33] ERROR:root:Problem storing audio for turn ./outputs/dialog_1752861530588/utterances/1_PATIENT.wav
[2025-09-27 20:47:33] ERROR:root:Problem storing audio for turn ./outputs/dialog_1752861530588/utterances/2_DOCTOR.wav
[2025-09-27 20:47:33] ERROR:root:Problem storing audio for turn ./outputs/dialog_1752861530588/utterances/3_PATIENT.wav


./outputs/dialog_1752861530588/exported_audios/audio_pipeline_step1.wav
Loading utterances and combined audio from dialogue 1752861530588
Audio data from step 1 loaded into the dialog (1752861530588) successfully!
Sending utterances to dSCAPER for dialogue 1752861530588
Generating timeline from dSCAPER for dialogue 1752861530588


  input_data = input_data.astype(np.int32)
  input_data = input_data.astype(np.int32)
  input_data = input_data.astype(np.int32)
  input_data = input_data.astype(np.int32)
[2025-09-27 20:47:40] INFO:root:Successfully generated dscaper timeline.


Timeline generated from dSCAPER for dialogue 1752861530588
./outputs/dialog_1752861530588/exported_audios/audio_pipeline_step1.wav
./outputs/dialog_1752861530588/exported_audios/audio_pipeline_step2.wav


## Step 3 : Room Accoustics

In [29]:
audio_pipeline = AudioPipeline(dscaper=dsc) # The audio pipeline doesn't change



In [30]:
from sdialog.audio.room import MicrophonePosition
from sdialog.audio.room_generator import RoomGenerator, RoomRole

In [31]:
room = RoomGenerator().generate(RoomRole.CONSULTATION, room_size=8.0)
print(room)

8863:  RoomRole.CONSULTATION room_8863, desc: consultation room (dimentions: dim: [3.577708763999664, 2.23606797749979, 3.0], rt60: 0.5) role: RoomRole.CONSULTATION)  


In [32]:
dialog: AudioDialog = audio_pipeline.inference(
    dialog,
    room=room, # Need to provide a room object to trigger the 3rd step of the audio pipeline
    # microphone_position=MicrophonePosition.MONITOR # Default is MicrophonePosition.MONITOR
    microphone_position=MicrophonePosition.CEILING_CENTERED, # Default is MicrophonePosition.MONITOR
    do_step_1=True,
    do_step_2=False,
    do_step_3=False,
)
print(dialog.audio_step_1_filepath)
print(dialog.audio_step_2_filepath)
print(dialog.audio_step_3_filepath)

./outputs/dialog_1752861530588/exported_audios/audio_pipeline_step1.wav
Loading utterances and combined audio from dialogue 1752861530588
Audio data from step 1 loaded into the dialog (1752861530588) successfully!
./outputs/dialog_1752861530588/exported_audios/audio_pipeline_step1.wav
./outputs/dialog_1752861530588/exported_audios/audio_pipeline_step2.wav
None


## Custom dialog directory names

In [33]:
dialog: AudioDialog = audio_pipeline.inference(
    dialog,
    room=room, # Need to provide a room object to trigger the 3rd step of the audio pipeline
    # microphone_position=MicrophonePosition.MONITOR # Default is MicrophonePosition.MONITOR
    microphone_position=MicrophonePosition.CEILING_CENTERED, # Default is MicrophonePosition.MONITOR
    do_step_1=True,
    do_step_2=False,
    do_step_3=False,
    dialog_dir_name="demo_dialog"
)
print(dialog.audio_step_1_filepath)
print(dialog.audio_step_2_filepath)
print(dialog.audio_step_3_filepath)

./outputs/demo_dialog/exported_audios/audio_pipeline_step1.wav
Generating utterances audios from dialogue 1752861530588


Generating utterances audios:   0%|          | 0/4 [00:00<?, ?it/s][W NNPACK.cpp:64] Could not initialize NNPACK! Reason: Unsupported hardware.
Generating utterances audios: 100%|██████████| 4/4 [00:20<00:00,  5.13s/it]

./outputs/demo_dialog/exported_audios/audio_pipeline_step1.wav
./outputs/dialog_1752861530588/exported_audios/audio_pipeline_step2.wav
None



