In [1]:
# ==========================================
# 1. INSTALLATION & IMPORTS
# ==========================================
!pip install datasets pydub soundfile librosa numpy --quiet

import os
import random
import numpy as np
import soundfile as sf
import librosa
from datasets import load_dataset
from pydub import AudioSegment
from tqdm import tqdm

# ==========================================
# 2. CONFIGURATION
# ==========================================
# Output root plus its two sub-folders: stereo WAV files and RTTM annotations.
OUTPUT_DIR = "/content/drive/MyDrive/GroundTruth_multispeaker_overlap"
AUDIO_DIR, RTTM_DIR = (
    os.path.join(OUTPUT_DIR, sub_dir) for sub_dir in ("audioStereo", "rttm")
)

# Create both output folders up front; an already existing folder is fine.
os.makedirs(AUDIO_DIR, exist_ok=True)
os.makedirs(RTTM_DIR, exist_ok=True)

NUM_FILES = 30              # number of synthetic recordings to generate
TOTAL_DURATION_MS = 25_000  # length of each recording, in milliseconds
SAMPLE_RATE = 16_000        # frame rate (Hz) of the generated tracks

# DODa mapping: (start index, end index, label) triples, unpacked in that
# order by the generation loop to pick which dataset items feed a speaker.
DODA_RANGES = [
    (0, 999, "F1"),
    (1000, 1999, "M3"),
    (2000, 2730, "F2"),
    (2731, 2800, "M1"),
    (2801, 2999, "M2"),
    (3000, 3999, "M2"),
    (4000, 4999, "M1"),
    (5000, 5999, "F3"),
    (6000, 6999, "M1"),
    (7000, 7999, "F4"),
    (8000, 8999, "F1"),
    (9000, 9999, "M2"),
    (10000, 10999, "M1"),
    (11000, 11999, "M1"),
    (12000, 12350, "M2"),
    (12351, 12742, "M1"),
]

# ==========================================
# 3. FUNCTIONS
# ==========================================
def process_audio_chunk(audio_array, sr):
    """Trim edge silence and convert a waveform to a 16-bit mono AudioSegment.

    Args:
        audio_array: Waveform samples. Assumed to be a 1-D float signal
            normalised to [-1.0, 1.0] (the int16 scaling below relies on
            it) -- TODO confirm against the dataset's audio format.
        sr: Sampling rate of ``audio_array`` in Hz.

    Returns:
        An ``AudioSegment`` converted to ``SAMPLE_RATE``, or ``None`` when the
        input is missing/empty or less than 0.2 s remains after trimming.
    """
    if audio_array is None:
        return None
    samples = np.asarray(audio_array)
    if samples.size == 0:
        return None

    # Drop leading/trailing audio quieter than 30 dB below the peak.
    yt, _ = librosa.effects.trim(samples, top_db=30)
    if len(yt) < (sr * 0.2):
        return None

    # Clip before scaling: a float sample even slightly outside [-1, 1]
    # would otherwise exceed the int16 range and wrap around on the cast,
    # producing loud clicks in the generated audio.
    audio_int16 = (np.clip(yt, -1.0, 1.0) * 32767).astype(np.int16)
    seg = AudioSegment(audio_int16.tobytes(), frame_rate=sr, sample_width=2, channels=1)
    return seg.set_frame_rate(SAMPLE_RATE)

# ==========================================
# 4. LOADING
# ==========================================
print("‚è≥ Chargement des datasets...")

# SECURITY: a Hugging Face access token used to be hard-coded here. Any token
# that has been committed or shared in a notebook must be considered leaked
# and should be revoked. The token is now read from the HF_TOKEN environment
# variable; when it is unset, None is passed and the library's own credential
# lookup applies (see the `token` parameter of `datasets.load_dataset`).
HF_TOKEN = os.environ.get("HF_TOKEN")

# Both datasets are opened in streaming mode, so items are fetched lazily
# while iterating instead of being downloaded in full first.
ds_wiki = load_dataset(
    "atlasia/Moroccan-Darija-Wiki-Audio-Dataset",
    token=HF_TOKEN,
    split="train",
    streaming=True,
)
ds_doda = load_dataset(
    "atlasia/DODa-audio-dataset",
    token=HF_TOKEN,
    split="train",
    streaming=True,
)

# ==========================================
# 5. MAIN GENERATION
# ==========================================
print(f"\nüöÄ G√©n√©ration de {NUM_FILES} fichiers (Multi-Speaker + Overlap + Silence)...")

# The wiki iterator is shared by all files so each file continues where the
# previous one stopped instead of reusing the same leading items.
iter_wiki = iter(ds_wiki)

# Group the DODa index ranges by their label once, outside the file loop.
# NOTE(review): this assumes the third field of each DODA_RANGES entry
# identifies the real speaker (several ranges share e.g. "M1") -- confirm.
doda_ranges_by_label = {}
for _doda_range in DODA_RANGES:
    doda_ranges_by_label.setdefault(_doda_range[2], []).append(_doda_range)

for file_idx in range(NUM_FILES):
    filename = f"multi_overlap_{file_idx:03d}"
    print(f"üîπ {filename} ...")

    # --- A. PREPARE THE DODA SPEAKERS (2 to 4) ---
    # Sample distinct labels first, then one index range per label. Sampling
    # DODA_RANGES directly could select two ranges carrying the same label,
    # which would emit a single speaker under two different SPEAKER_xx ids.
    num_speakers = min(random.randint(2, 4), len(doda_ranges_by_label))
    chosen_labels = random.sample(sorted(doda_ranges_by_label), num_speakers)
    selected_ranges = [
        random.choice(doda_ranges_by_label[chosen]) for chosen in chosen_labels
    ]
    # Ascending start index, so a single forward pass over the stream
    # reaches every selected range.
    selected_ranges.sort(key=lambda x: x[0])

    # One separate audio buffer per DODa speaker.
    doda_buffers = {}
    local_iter_doda = iter(ds_doda)
    current_pos = 0  # index of the next item local_iter_doda will yield

    for spk_idx, (start, end, label) in enumerate(selected_ranges):
        speaker_id = f"SPEAKER_{spk_idx + 1:02d}"  # SPEAKER_01, SPEAKER_02...

        # Fast-forward the stream to the first item of this range.
        while current_pos < start:
            try:
                next(local_iter_doda)
            except StopIteration:
                break
            current_pos += 1

        # Fill this speaker's buffer (~15 s) from items start..end inclusive.
        # Checking the position *before* reading keeps the item at index
        # `end` (previously it was read and then thrown away) and avoids
        # consuming an item that belongs to the next range.
        buffer = AudioSegment.empty()
        while len(buffer) < 15000 and current_pos <= end:
            try:
                item = next(local_iter_doda)
            except StopIteration:
                break
            current_pos += 1
            seg = process_audio_chunk(item['audio']['array'], item['audio']['sampling_rate'])
            if seg:
                buffer += seg

        # NOTE(review): with under 1 s of usable audio the speaker falls back
        # to pure silence, yet will still be annotated as speech later on.
        if len(buffer) < 1000:
            buffer = AudioSegment.silent(duration=1000, frame_rate=SAMPLE_RATE)
        # Repeat the material until at least 15 s are available.
        while len(buffer) < 15000:
            buffer += buffer
        doda_buffers[speaker_id] = buffer

    # --- B. PREPARE THE WIKI AUDIO (SPEAKER_00) ---
    # Accumulate wiki speech until the buffer exceeds the recording length by
    # 8 s. The shared iterator is restarted when the stream runs out.
    wiki_buffer = AudioSegment.empty()
    # True until a restart; set again whenever a chunk is appended. If the
    # stream ends a second time with nothing appended in between, a complete
    # pass produced no usable audio and looping again would never terminate.
    wiki_progress = True
    while len(wiki_buffer) < TOTAL_DURATION_MS + 8000:
        try:
            w_item = next(iter_wiki)
        except StopIteration:
            if not wiki_progress:
                raise RuntimeError(
                    "Wiki dataset stream yielded no usable audio"
                ) from None
            iter_wiki = iter(ds_wiki)
            wiki_progress = False
            continue
        w_seg = process_audio_chunk(w_item['audio']['array'], w_item['audio']['sampling_rate'])
        if w_seg:
            wiki_buffer += w_seg
            wiki_progress = True

    # --- C. ASSEMBLY WITH OVERLAP & SILENCE ---
    # Two silent mono tracks of the full duration: wiki speech is overlaid on
    # the left one, DODa speech on the right one.
    left_track = AudioSegment.silent(duration=TOTAL_DURATION_MS, frame_rate=SAMPLE_RATE)
    right_track = AudioSegment.silent(duration=TOTAL_DURATION_MS, frame_rate=SAMPLE_RATE)
    rttm_lines = []

    wiki_spk_id = "SPEAKER_00"
    doda_spk_ids = list(doda_buffers)
    # Per-speaker state: source audio, read cursor into it, and the end time
    # (ms) of the speaker's most recent segment in the output.
    buffers = {wiki_spk_id: wiki_buffer, **doda_buffers}
    cursors = dict.fromkeys(buffers, 0)
    speaker_end = dict.fromkeys(buffers, 0)

    last_end_time = 0  # latest end time of any segment placed so far (ms)
    turn_idx = 0 if random.random() < 0.5 else 1  # 0 = wiki turn, 1 = DODa turn

    # Place alternating turns until less than 1 s of the recording is left.
    while last_end_time < TOTAL_DURATION_MS - 1000:

        phrase_len = random.randint(2000, 5000)

        # --- START TIME (overlap or silence?) ---
        if last_end_time == 0:
            start_time = 0
        elif random.random() < 0.6:
            # 60%: overlap -- start BEFORE the current end of the dialogue.
            start_time = max(0, last_end_time - random.randint(500, 2500))
        else:
            # 40%: silence -- start AFTER the current end of the dialogue.
            start_time = last_end_time + random.randint(200, 1000)

        if start_time >= TOTAL_DURATION_MS:
            break

        if turn_idx == 0:
            active_spk_id = wiki_spk_id
        else:
            active_spk_id = random.choice(doda_spk_ids)

        # A speaker must never overlap itself. last_end_time is a global
        # maximum, so when the previous turn ended early the overlap branch
        # could start this speaker inside its own previous segment; clamp to
        # the end of that segment instead.
        start_time = max(start_time, speaker_end[active_spk_id])

        # Truncate the phrase so it does not run past the end of the file.
        phrase_len = min(phrase_len, TOTAL_DURATION_MS - start_time)

        # --- PASTE ---
        buffer = buffers[active_spk_id]
        cursor = cursors[active_spk_id]
        # Wrap to the start when the remaining source audio is too short, so
        # the pasted chunk always has the duration written to the RTTM.
        if cursor + phrase_len > len(buffer):
            cursor = 0
        chunk = buffer[cursor:cursor + phrase_len]
        cursors[active_spk_id] = cursor + phrase_len

        if turn_idx == 0:
            # Wiki speaker -> left channel.
            left_track = left_track.overlay(chunk, position=start_time)
        else:
            # DODa speakers -> right channel.
            right_track = right_track.overlay(chunk, position=start_time)

        # RTTM uses seconds; internal bookkeeping is in milliseconds.
        st, du = start_time / 1000.0, phrase_len / 1000.0
        rttm_lines.append(f"SPEAKER {filename} 1 {st:.3f} {du:.3f} <NA> <NA> {active_spk_id} <NA> <NA>")

        # Advance the per-speaker and global end-of-dialogue markers.
        current_end = start_time + phrase_len
        speaker_end[active_spk_id] = current_end
        last_end_time = max(last_end_time, current_end)

        turn_idx = 1 - turn_idx

    # --- D. EXPORT ---
    final = AudioSegment.from_mono_audiosegments(left_track, right_track)
    final.export(os.path.join(AUDIO_DIR, f"{filename}.wav"), format="wav")

    with open(os.path.join(RTTM_DIR, f"{filename}.rttm"), "w") as f:
        f.write("\n".join(rttm_lines))

print(f"\n‚úÖ Termin√© ! Dataset : {OUTPUT_DIR}")

  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


‚è≥ Chargement des datasets...


README.md:   0%|          | 0.00/2.00k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/5.36k [00:00<?, ?B/s]


üöÄ G√©n√©ration de 30 fichiers (Multi-Speaker + Overlap + Silence)...
üîπ multi_overlap_000 ...
üîπ multi_overlap_001 ...
üîπ multi_overlap_002 ...
üîπ multi_overlap_003 ...
üîπ multi_overlap_004 ...
üîπ multi_overlap_005 ...
üîπ multi_overlap_006 ...
üîπ multi_overlap_007 ...
üîπ multi_overlap_008 ...
üîπ multi_overlap_009 ...
üîπ multi_overlap_010 ...
üîπ multi_overlap_011 ...
üîπ multi_overlap_012 ...
üîπ multi_overlap_013 ...
üîπ multi_overlap_014 ...
üîπ multi_overlap_015 ...
üîπ multi_overlap_016 ...
üîπ multi_overlap_017 ...
üîπ multi_overlap_018 ...
üîπ multi_overlap_019 ...
üîπ multi_overlap_020 ...
üîπ multi_overlap_021 ...
üîπ multi_overlap_022 ...
üîπ multi_overlap_023 ...
üîπ multi_overlap_024 ...
üîπ multi_overlap_025 ...
üîπ multi_overlap_026 ...
üîπ multi_overlap_027 ...
üîπ multi_overlap_028 ...
üîπ multi_overlap_029 ...

‚úÖ Termin√© ! Dataset : /content/drive/MyDrive/GroundTruth_multispeaker_overlap
