# Preprocesamiento de Audio

In [8]:

# Instalar dependencias 
!pip install pydub yt-dlp

from pydub import AudioSegment, silence
from IPython.display import Audio, display
!apt -y install aria2 ffmpeg
!pip install torch torchvision torchaudio
import glob, os, subprocess

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
aria2 is already the newest version (1.36.0-1).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


In [9]:

# 1. Descargar el audio de YouTube
url = "https://www.youtube.com/watch?v=tZZKF6tqx1s"
!yt-dlp -x --audio-format wav --output "full_audio.%(ext)s" {url}
# 3. Detectar segmentos no silenciosos
from pydub.silence import split_on_silence

# 2. Cargar el audio completo
audio = AudioSegment.from_wav("full_audio.wav")
# 19 min * 60 + 12 s = total segundos
total_sec = 19 * 60 + 12  # = 1152 s
cut_ms = total_sec * 1000  # = 1_152_000 ms

# Cargar audio y recortar
audio = audio[:cut_ms]

# Ahora 'audio' contiene solo los primeros 19.12 s

raw_segments = silence.split_on_silence(
    audio,
    min_silence_len=300,
    silence_thresh=-40,
    keep_silence=300
)

# Umbral en ms: todos los segmentos < thresh van a fusionarse
SHORT_THRESH = 400

# Fusionar segmentos cortos con el anterior
segments = []
for seg in raw_segments:
    if len(segments) == 0:
        segments.append(seg)
    else:
        if len(seg) < SHORT_THRESH:
            # fusionar con el anterior
            segments[-1] = segments[-1] + seg
        else:
            segments.append(seg)

# Exportar
out_dir = "/content/RVC/dataset/2B/"
os.makedirs(out_dir, exist_ok=True)
for i, seg in enumerate(segments):
    seg.export(f"{out_dir}/2b_seg_{i:04d}.wav", format="wav")

print(f"➡️ Generados {len(segments)} segmentos tras merge.")


[youtube] Extracting URL: https://www.youtube.com/watch?v=tZZKF6tqx1s
[youtube] tZZKF6tqx1s: Downloading webpage
[youtube] tZZKF6tqx1s: Downloading tv client config
[youtube] tZZKF6tqx1s: Downloading tv player API JSON
[youtube] tZZKF6tqx1s: Downloading ios player API JSON
[youtube] tZZKF6tqx1s: Downloading m3u8 information
[info] tZZKF6tqx1s: Downloading 1 format(s): 251
[download] full_audio.wav has already been downloaded
[ExtractAudio] Destination: full_audio.wav
Deleting original file full_audio.orig.wav (pass -k to keep)
➡️ Generados 474 segmentos tras merge.


In [10]:
# Preview: list the exported WAV segments in name order and embed an audio
# player for the first 10 of them.
segmentos = sorted(glob.glob(os.path.join(out_dir, "*.wav")))
print(out_dir)
print("🔉 Reproduciendo los primeros segmentos:")
for numero, ruta in enumerate(segmentos[:10], start=1):
    nombre = os.path.basename(ruta)
    print(f"▶️ Segmento {numero}: {nombre}")
    display(Audio(ruta))



/content/RVC/dataset/2B/
🔉 Reproduciendo los primeros segmentos:
▶️ Segmento 1: 2b_seg_0000.wav


▶️ Segmento 2: 2b_seg_0001.wav


▶️ Segmento 3: 2b_seg_0002.wav


▶️ Segmento 4: 2b_seg_0003.wav


▶️ Segmento 5: 2b_seg_0004.wav


▶️ Segmento 6: 2b_seg_0005.wav


▶️ Segmento 7: 2b_seg_0006.wav


▶️ Segmento 8: 2b_seg_0007.wav


▶️ Segmento 9: 2b_seg_0008.wav


▶️ Segmento 10: 2b_seg_0009.wav


Entrenamiento

In [11]:
# 🔄 Batch re-encode every audio file in src_dir as mono, 48 kHz WAV.
# Files that are already WAV are re-encoded too, so that every output has the
# same channel count and sample rate.

src_dir = "/content/RVC/dataset/2B/"            # original folder
dst_dir = "/content/RVC/dataset/2B_wav/"        # new folder for the WAV output
os.makedirs(dst_dir, exist_ok=True)

failed = []  # inputs for which ffmpeg exited with a non-zero status
for src in glob.glob(os.path.join(src_dir, "*.*")):
    name = os.path.splitext(os.path.basename(src))[0]
    dst = os.path.join(dst_dir, f"{name}.wav")
    # -y: overwrite existing output, -ac 1: mono, -ar 48000: 48 kHz.
    # ffmpeg's own output is discarded; the exit status is checked instead so
    # that failures are not silently ignored.
    result = subprocess.run(
        ["ffmpeg", "-y", "-i", src, "-ac", "1", "-ar", "48000", dst],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    if result.returncode != 0:
        failed.append(src)

if failed:
    print(f"⚠️ {len(failed)} archivo(s) no se pudieron convertir:")
    for src in failed:
        print(f"   - {src}")
else:
    print(f"Todos convertidos a WAV en `{dst_dir}`.")


Todos convertidos a WAV en `/content/RVC/dataset/2B_wav/`.


In [12]:
# 🔊 Normalizar todos los audios de dataset/2B_wav usando ffmpeg
import os

input_dir = '/content/RVC/dataset/2B_wav/'
output_dir = '/content/RVC/dataset/2B_wav_normalized/'
os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(input_dir):
    if filename.endswith(".wav"):
        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)
        !ffmpeg -y -i "{input_path}" -af "loudnorm" "{output_path}"


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [13]:
# Compress the whole 2B_wav_normalized folder (recursively) into a single zip
# archive under /kaggle/working. If the archive already exists, zip updates it
# in place rather than recreating it.
# NOTE(review): the input lives under /content while the archive is written to
# /kaggle/working — confirm both paths exist in the target runtime.
!zip -r /kaggle/working/2B_wav_normalized.zip /content/RVC/dataset/2B_wav_normalized

updating: content/RVC/dataset/2B_wav_normalized/ (stored 0%)
updating: content/RVC/dataset/2B_wav_normalized/2b_seg_0076.wav (deflated 27%)
updating: content/RVC/dataset/2B_wav_normalized/2b_seg_0091.wav (deflated 36%)
updating: content/RVC/dataset/2B_wav_normalized/2b_seg_0062.wav (deflated 47%)
updating: content/RVC/dataset/2B_wav_normalized/2b_seg_0391.wav (deflated 47%)
updating: content/RVC/dataset/2B_wav_normalized/2b_seg_0107.wav (deflated 22%)
updating: content/RVC/dataset/2B_wav_normalized/2b_seg_0328.wav (deflated 28%)
updating: content/RVC/dataset/2B_wav_normalized/2b_seg_0168.wav (deflated 35%)
updating: content/RVC/dataset/2B_wav_normalized/2b_seg_0466.wav (deflated 28%)
updating: content/RVC/dataset/2B_wav_normalized/2b_seg_0081.wav (deflated 26%)
updating: content/RVC/dataset/2B_wav_normalized/2b_seg_0194.wav (deflated 41%)
updating: content/RVC/dataset/2B_wav_normalized/2b_seg_0401.wav (deflated 38%)
updating: content/RVC/dataset/2B_wav_normalized/2b_seg_0307.wav (defla