In [2]:
# ==========================================
# 1. INSTALLATION & IMPORTS
# ==========================================
!pip install datasets pydub soundfile librosa numpy --quiet

import os
import random
import numpy as np
import soundfile as sf
import librosa
from datasets import load_dataset
from pydub import AudioSegment
from tqdm import tqdm

# ==========================================
# 2. CONFIGURATION
# ==========================================
OUTPUT_DIR = "/content/drive/MyDrive/GroundTruth_overlap"
AUDIO_DIR = os.path.join(OUTPUT_DIR, "audioStereo")
RTTM_DIR = os.path.join(OUTPUT_DIR, "rttm")

os.makedirs(AUDIO_DIR, exist_ok=True)
os.makedirs(RTTM_DIR, exist_ok=True)

NUM_FILES = 30
TOTAL_DURATION_MS = 25 * 1000
SAMPLE_RATE = 16000

# Mapping DODa (Identique au pr√©c√©dent)
DODA_RANGES = [
    (0, 999, "F1"), (1000, 1999, "M3"), (2000, 2730, "F2"), (2731, 2800, "M1"),
    (2801, 2999, "M2"), (3000, 3999, "M2"), (4000, 4999, "M1"), (5000, 5999, "F3"),
    (6000, 6999, "M1"), (7000, 7999, "F4"), (8000, 8999, "F1"), (9000, 9999, "M2"),
    (10000, 10999, "M1"), (11000, 11999, "M1"), (12000, 12350, "M2"), (12351, 12742, "M1")
]

# ==========================================
# 3. FONCTIONS (TRIM & CONVERSION)
# ==========================================
def process_audio_chunk(audio_array, sr):
    """Nettoie le silence et convertit en Pydub"""
    # 1. Trim du silence (Top DB 30)
    yt, _ = librosa.effects.trim(audio_array, top_db=30)

    # Si trop court apr√®s nettoyage, on ignore
    if len(yt) < (sr * 0.2): return None

    audio_int16 = (yt * 32767).astype(np.int16)
    seg = AudioSegment(
        audio_int16.tobytes(), frame_rate=sr, sample_width=2, channels=1
    )
    return seg.set_frame_rate(SAMPLE_RATE)

# ==========================================
# 4. CHARGEMENT & PR√âPARATION
# ==========================================
print("‚è≥ Chargement des datasets...")
ds_wiki = load_dataset("atlasia/Moroccan-Darija-Wiki-Audio-Dataset",token="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", split="train", streaming=True)
ds_doda = load_dataset("atlasia/DODa-audio-dataset", token="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",split="train", streaming=True)

# Planification des cibles DODa (Tri√©es pour lecture unique)
print("‚öôÔ∏è Planification des locuteurs...")
targets = []
for i in range(NUM_FILES):
    start_range, end_range, label = random.choice(DODA_RANGES)
    safe_start = random.randint(start_range, end_range - 20)
    targets.append({"file_idx": i, "doda_index": safe_start, "label": label})
targets.sort(key=lambda x: x["doda_index"])

# ==========================================
# 5. G√âN√âRATION AVEC OVERLAP
# ==========================================
print(f"\nüöÄ G√©n√©ration de {NUM_FILES} fichiers avec CHEVAUCHEMENT (Overlap)...")

iter_wiki = iter(ds_wiki)
iter_doda = iter(ds_doda)
current_doda_pos = 0
target_ptr = 0

while target_ptr < len(targets):
    target = targets[target_ptr]

    # --- A. PR√âPARATION DES SOURCES (Identique script pr√©c√©dent) ---
    # 1. Avance DODa
    while current_doda_pos < target["doda_index"]:
        next(iter_doda); current_doda_pos += 1

    # 2. Buffer DODa (Concat√©nation)
    doda_buffer = AudioSegment.empty()
    while len(doda_buffer) < TOTAL_DURATION_MS + 8000: # Marge extra pour overlap
        try:
            item = next(iter_doda); current_doda_pos += 1
            # V√©rif speaker range
            valid = False
            for s, e, l in DODA_RANGES:
                if s <= current_doda_pos - 1 <= e and l == target["label"]: valid = True; break
            if not valid: break

            seg = process_audio_chunk(item['audio']['array'], item['audio']['sampling_rate'])
            if seg: doda_buffer += seg
        except StopIteration: break

    # S√©curit√© boucle DODa
    if len(doda_buffer) < 1000: doda_buffer = AudioSegment.silent(duration=1000)
    while len(doda_buffer) < TOTAL_DURATION_MS + 8000: doda_buffer += doda_buffer

    # 3. Buffer Wiki (Concat√©nation)
    wiki_buffer = AudioSegment.empty()
    while len(wiki_buffer) < TOTAL_DURATION_MS + 8000:
        try:
            w_item = next(iter_wiki)
            w_seg = process_audio_chunk(w_item['audio']['array'], w_item['audio']['sampling_rate'])
            if w_seg: wiki_buffer += w_seg
        except StopIteration: iter_wiki = iter(ds_wiki)

    # --- B. ASSEMBLAGE DU DIALOGUE (OVERLAP LOGIC) ---
    filename = f"hard_overlap_{target['file_idx']:03d}"

    # Pistes St√©r√©o
    left_track = AudioSegment.silent(duration=TOTAL_DURATION_MS, frame_rate=SAMPLE_RATE)
    right_track = AudioSegment.silent(duration=TOTAL_DURATION_MS, frame_rate=SAMPLE_RATE)

    rttm_lines = []

    # Curseurs de lecture dans les sources
    cursor_wiki = 0
    cursor_doda = 0

    # Curseur temporel de l'audio final (Quand finit la derni√®re phrase ?)
    last_end_time = 0

    turn_idx = 0 if random.random() < 0.5 else 1

    # On continue tant que la fin de la DERNI√àRE phrase ne d√©passe pas 25s
    while last_end_time < TOTAL_DURATION_MS - 1000:

        # Dur√©e de la nouvelle phrase (2s √† 4.5s)
        phrase_len = random.randint(2000, 4500)

        # CALCUL DU D√âBUT (START TIME)
        if last_end_time == 0:
            start_time = 0 # Premier mot
        else:
            # D√âCISION : OVERLAP OU PAUSE ?
            # 60% de chance d'overlap, 40% de chance de pause
            if random.random() < 0.6:
                # === MODE OVERLAP ===
                # On commence AVANT que l'autre ait fini
                # On chevauche de 500ms √† 2500ms
                overlap_amt = random.randint(500, 2500)

                # S√©curit√© : on ne peut pas chevaucher plus que la dur√©e de la phrase pr√©c√©dente
                # (On simplifie : on recule juste le curseur)
                start_time = last_end_time - overlap_amt

                # S√©curit√© : pas de temps n√©gatif
                if start_time < 0: start_time = 0

            else:
                # === MODE PAUSE (Classique) ===
                pause = random.randint(200, 800) # Pause courte
                start_time = last_end_time + pause

        # Si le d√©but d√©passe d√©j√† 25s, on arr√™te
        if start_time >= TOTAL_DURATION_MS: break

        # Si la fin d√©passe 25s, on tronque
        if start_time + phrase_len > TOTAL_DURATION_MS:
            phrase_len = TOTAL_DURATION_MS - start_time

        # --- COLLAGE ---
        if turn_idx == 0:
            # Wiki (Gauche)
            chunk = wiki_buffer[cursor_wiki : cursor_wiki + phrase_len]
            cursor_wiki += phrase_len
            left_track = left_track.overlay(chunk, position=start_time)

            # RTTM
            st_sec = start_time / 1000.0
            du_sec = phrase_len / 1000.0
            rttm_lines.append(f"SPEAKER {filename} 1 {st_sec:.3f} {du_sec:.3f} <NA> <NA> SPEAKER_00 <NA> <NA>")

        else:
            # DODa (Droite)
            chunk = doda_buffer[cursor_doda : cursor_doda + phrase_len]
            cursor_doda += phrase_len
            right_track = right_track.overlay(chunk, position=start_time)

            # RTTM
            st_sec = start_time / 1000.0
            du_sec = phrase_len / 1000.0
            rttm_lines.append(f"SPEAKER {filename} 1 {st_sec:.3f} {du_sec:.3f} <NA> <NA> SPEAKER_01 <NA> <NA>")

        # Mise √† jour de la "Fin de la derni√®re phrase"
        # Attention : Dans un overlap, la "fin du dialogue" avance.
        # Si A finit √† 10s, et B commence √† 9s et finit √† 12s, la nouvelle fin est 12s.
        current_end = start_time + phrase_len
        if current_end > last_end_time:
            last_end_time = current_end

        # Alternance
        turn_idx = 1 - turn_idx

    # Export
    final = AudioSegment.from_mono_audiosegments(left_track, right_track)
    final.export(os.path.join(AUDIO_DIR, f"{filename}.wav"), format="wav")

    with open(os.path.join(RTTM_DIR, f"{filename}.rttm"), "w") as f:
        f.write("\n".join(rttm_lines))

    print(f"  [OK] Fichier {target['file_idx']+1}/30 (DODa: {target['label']})")
    target_ptr += 1

print(f"\n‚úÖ Dataset 'Hard Overlap' termin√© : {OUTPUT_DIR}")

‚è≥ Chargement des datasets...


Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/5.36k [00:00<?, ?B/s]

‚öôÔ∏è Planification des locuteurs...

üöÄ G√©n√©ration de 30 fichiers avec CHEVAUCHEMENT (Overlap)...
  [OK] Fichier 5/30 (DODa: F1)
  [OK] Fichier 16/30 (DODa: F1)
  [OK] Fichier 6/30 (DODa: F2)
  [OK] Fichier 1/30 (DODa: F2)
  [OK] Fichier 11/30 (DODa: F2)
  [OK] Fichier 24/30 (DODa: F2)
  [OK] Fichier 30/30 (DODa: M2)
  [OK] Fichier 19/30 (DODa: M2)
  [OK] Fichier 8/30 (DODa: M2)
  [OK] Fichier 26/30 (DODa: M2)
  [OK] Fichier 28/30 (DODa: M2)
  [OK] Fichier 10/30 (DODa: M1)
  [OK] Fichier 7/30 (DODa: M1)
  [OK] Fichier 17/30 (DODa: F3)
  [OK] Fichier 20/30 (DODa: M1)
  [OK] Fichier 27/30 (DODa: F4)
  [OK] Fichier 4/30 (DODa: F1)
  [OK] Fichier 23/30 (DODa: F1)
  [OK] Fichier 21/30 (DODa: F1)
  [OK] Fichier 14/30 (DODa: F1)
  [OK] Fichier 18/30 (DODa: F1)
  [OK] Fichier 9/30 (DODa: F1)
  [OK] Fichier 25/30 (DODa: M1)
  [OK] Fichier 29/30 (DODa: M1)
  [OK] Fichier 12/30 (DODa: M1)
  [OK] Fichier 2/30 (DODa: M1)
  [OK] Fichier 22/30 (DODa: M1)
  [OK] Fichier 3/30 (DODa: M2)
  [OK] Fi

In [3]:
from google.colab import drive
drive.mount('/content/drive')

ValueError: Mountpoint must not already contain files

In [4]:
import shutil
from google.colab import files

# --- CONFIGURATION ---
dossier_a_telecharger = "/content/drive/MyDrive/GroundTruth_overlap"
nom_archive_sortie = "/content/GroundTruth_overlap"  # Sans l'extension .zip

print(f"üóúÔ∏è Compression du dossier : {dossier_a_telecharger} ...")

# 1. Cr√©er l'archive ZIP
shutil.make_archive(nom_archive_sortie, 'zip', dossier_a_telecharger)

print(f"‚úÖ Compression termin√©e : {nom_archive_sortie}.zip")
print("‚¨áÔ∏è D√©marrage du t√©l√©chargement...")

# 2. T√©l√©charger le fichier ZIP
files.download(f"{nom_archive_sortie}.zip")

üóúÔ∏è Compression du dossier : /content/drive/MyDrive/GroundTruth_overlap ...
‚úÖ Compression termin√©e : /content/GroundTruth_overlap.zip
‚¨áÔ∏è D√©marrage du t√©l√©chargement...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>