# Notebook 2 : Prétraitement audio

Ce notebook teste et démontre le pipeline de prétraitement audio.

**Objectifs :**
- Tester le chargement audio
- Convertir l'audio en mel-spectrogramme
- Reconvertir le mel-spectrogramme en audio (Griffin-Lim)
- Visualiser les transformations
- Valider la qualité de la reconstruction

In [None]:
# Imports
import sys
sys.path.append('..')

import torch
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import IPython.display as ipd
from pathlib import Path

from src.preprocessing.audio_processor import AudioProcessor
from src.preprocessing.mel_spectrogram import MelSpectrogramProcessor
from config.model_config import Config

%matplotlib inline

## 1. Configuration

In [None]:
# Load the project configuration from the YAML file
config = Config('../config/config.yaml')

# Echo the audio settings read from the configuration
audio_cfg = config.audio
print("Audio Config:")
for label, value in (
    ("Sample Rate", audio_cfg.sample_rate),
    ("N-FFT", audio_cfg.n_fft),
    ("Hop Length", audio_cfg.hop_length),
    ("N-Mels", audio_cfg.n_mels),
    ("Segment Length", audio_cfg.segment_length),
):
    print(f"  {label}: {value}")

## 2. Initialiser les Processeurs

In [None]:
# Build both processors from the same audio settings
audio_cfg = config.audio

audio_processor = AudioProcessor(
    sample_rate=audio_cfg.sample_rate,
    segment_length=audio_cfg.segment_length,
    normalize=True,
)

# Each spectrogram parameter is forwarded under the name it has in the config
mel_kwargs = {
    name: getattr(audio_cfg, name)
    for name in (
        'sample_rate',
        'n_fft',
        'hop_length',
        'win_length',
        'n_mels',
        'fmin',
        'fmax',
    )
}
mel_processor = MelSpectrogramProcessor(**mel_kwargs)

print("✅ Processors initialized")

## 3. Test: Chargement Audio

In [None]:
# Locate a test audio file.
# Path.iterdir() and Path.glob() yield entries in arbitrary order, so both
# listings are sorted to make the selected file reproducible across runs.
DATA_DIR = Path('../data/train')
speakers = sorted(d for d in DATA_DIR.iterdir() if d.is_dir())
if not speakers:
    raise FileNotFoundError(f"No speaker directories found in {DATA_DIR}")

wav_files = sorted(speakers[0].glob('*.wav'))
if not wav_files:
    raise FileNotFoundError(f"No .wav files found in {speakers[0]}")
test_file = wav_files[0]

print(f"Test file: {test_file}")

# Load the audio
audio = audio_processor.load_audio(test_file)

print(f"\nAudio shape: {audio.shape}")
print(f"Duration: {len(audio) / config.audio.sample_rate:.2f}s")
print(f"Min value: {audio.min():.4f}")
print(f"Max value: {audio.max():.4f}")
print(f"Mean: {audio.mean():.4f}")
print(f"Std: {audio.std():.4f}")

# Listen (kept as the last expression so the notebook renders the player)
ipd.Audio(audio.numpy(), rate=config.audio.sample_rate)

## 4. Test: Segmentation

In [None]:
# Extract a segment from the loaded audio
segment = audio_processor.segment_audio(audio, random=True)

sr = config.audio.sample_rate
print(f"Original length: {len(audio)}")
print(f"Segment length: {len(segment)}")
print(f"Segment duration: {len(segment) / sr:.2f}s")

# Plot the full waveform above the extracted segment
fig, axes = plt.subplots(2, 1, figsize=(14, 6))
panels = ((audio, 'Original Audio'), (segment, 'Segmented Audio'))
for ax, (waveform, title) in zip(axes, panels):
    librosa.display.waveshow(waveform.numpy(), sr=sr, ax=ax)
    ax.set_title(title)
    ax.set_ylabel('Amplitude')
# Only the bottom panel carries the time-axis label
axes[-1].set_xlabel('Time (s)')

plt.tight_layout()
plt.show()

# Play the segment (kept as the last expression so the notebook renders it)
ipd.Audio(segment.numpy(), rate=sr)

## 5. Test: Conversion Audio → Mel-Spectrogram

In [None]:
# Convert the segment to a mel-spectrogram
mel = mel_processor.wav_to_mel(segment)

print(f"Mel-spectrogram shape: {mel.shape}")
print(f"  N-mels: {mel.shape[0]}")
print(f"  Time steps: {mel.shape[1]}")
for label, stat in (
    ("Min value", mel.min()),
    ("Max value", mel.max()),
    ("Mean", mel.mean()),
    ("Std", mel.std()),
):
    print(f"{label}: {stat:.4f}")

# Draw the spectrogram through the object-oriented matplotlib interface
fig, ax = plt.subplots(figsize=(14, 5))
image = ax.imshow(mel.numpy(), aspect='auto', origin='lower', cmap='viridis')
fig.colorbar(image, ax=ax, format='%+2.0f dB')
ax.set_title('Mel-Spectrogram (Log Scale)')
ax.set_ylabel('Mel Frequency Bin')
ax.set_xlabel('Time Frame')
fig.tight_layout()
plt.show()

## 6. Test: Conversion Mel-Spectrogram → Audio (Griffin-Lim)

In [None]:
# Rebuild a waveform from the mel-spectrogram
print("Reconstructing audio using Griffin-Lim...")
reconstructed = mel_processor.mel_to_wav(mel)

sr = config.audio.sample_rate
print(f"\nReconstructed shape: {reconstructed.shape}")
print(f"Duration: {len(reconstructed) / sr:.2f}s")

# Trim both signals to their common length before comparing them
min_len = min(len(segment), len(reconstructed))
segment_trimmed = segment[:min_len]
reconstructed_trimmed = reconstructed[:min_len]
diff = segment_trimmed - reconstructed_trimmed

# Stack the original, the reconstruction and their difference
fig, axes = plt.subplots(3, 1, figsize=(14, 9))
panels = (
    (segment, 'Original Audio'),
    (reconstructed, 'Reconstructed Audio (Griffin-Lim)'),
    (diff, 'Difference'),
)
for ax, (signal, title) in zip(axes, panels):
    librosa.display.waveshow(signal.numpy(), sr=sr, ax=ax)
    ax.set_title(title)
    ax.set_ylabel('Amplitude')
# Only the bottom panel carries the time-axis label
axes[-1].set_xlabel('Time (s)')

plt.tight_layout()
plt.show()

# Mean squared error over the overlapping samples
mse = torch.nn.functional.mse_loss(segment_trimmed, reconstructed_trimmed)
print(f"\nMSE between original and reconstructed: {mse:.6f}")

# Audio players for both versions, each preceded by its caption
print("\nListen to:")
for caption, signal in (("Original:", segment), ("\nReconstructed:", reconstructed)):
    print(caption)
    display(ipd.Audio(signal.numpy(), rate=sr))

## 7. Test: Pipeline Complet

In [None]:
def test_full_pipeline(audio_path):
    """Run one file through load, segment, mel and reconstruction, then plot it.

    Args:
        audio_path: Object exposing a ``name`` attribute (the notebook passes
            ``pathlib.Path`` instances); forwarded to
            ``audio_processor.load_audio``.

    Returns:
        Tuple ``(segment, mel, reconstructed)`` holding the outputs of
        ``audio_processor.segment_audio``, ``mel_processor.wav_to_mel`` and
        ``mel_processor.mel_to_wav`` for this file.
    """
    print(f"Testing: {audio_path.name}")

    # 1. Load
    waveform = audio_processor.load_audio(audio_path)
    print(f"  Loaded: {waveform.shape}")

    # 2. Segment
    clip = audio_processor.segment_audio(waveform, random=False)
    print(f"  Segmented: {clip.shape}")

    # 3. Mel-spectrogram
    mel_spec = mel_processor.wav_to_mel(clip)
    print(f"  Mel: {mel_spec.shape}")

    # 4. Reconstruct a waveform from the mel-spectrogram
    resynth = mel_processor.mel_to_wav(mel_spec)
    print(f"  Reconstructed: {resynth.shape}")

    # Plot a 2x2 grid: waveforms on top, spectrograms below
    sr = config.audio.sample_rate
    fig, axes = plt.subplots(2, 2, figsize=(16, 8))

    wave_panels = (
        (clip, 'Original Waveform'),
        (resynth, 'Reconstructed Waveform'),
    )
    for ax, (signal, title) in zip(axes[0], wave_panels):
        librosa.display.waveshow(signal.numpy(), sr=sr, ax=ax)
        ax.set_title(title)

    # The reconstruction is converted back to mel so both can be compared
    mel_resynth = mel_processor.wav_to_mel(resynth)
    mel_panels = (
        (mel_spec, 'Mel-Spectrogram'),
        (mel_resynth, 'Mel-Spectrogram (Reconstructed)'),
    )
    for ax, (spec, title) in zip(axes[1], mel_panels):
        ax.imshow(spec.numpy(), aspect='auto', origin='lower', cmap='viridis')
        ax.set_title(title)
        ax.set_ylabel('Mel Bin')
        ax.set_xlabel('Time')

    plt.tight_layout()
    plt.show()

    return clip, mel_spec, resynth

# Run the pipeline on up to two .wav files taken from speakers[0]
test_files = list(speakers[0].glob('*.wav'))[:2]
banner = "\n" + "=" * 60

for test_file in test_files:
    print(banner)
    segment, mel, reconstructed = test_full_pipeline(test_file)
    print()

## 8. Test: Normalisation

In [None]:
# Gather mel-spectrograms from up to 20 files to compute statistics
wav_subset = list(speakers[0].glob('*.wav'))[:20]
mel_list = []
for audio_file in wav_subset:
    audio = audio_processor.load_audio(audio_file)
    segment = audio_processor.segment_audio(audio)
    mel = mel_processor.wav_to_mel(segment)
    mel_list.append(mel)

# Statistics over the collected spectrograms
mean, std = mel_processor.compute_stats(mel_list)
print("Dataset Statistics:")
print(f"  Mean: {mean:.4f}")
print(f"  Std: {std:.4f}")

# Normalise the first spectrogram with those statistics
reference_mel = mel_list[0]
mel_normalized = mel_processor.normalize(reference_mel, mean, std)

# Print the same summary before and after normalisation
for heading, spec in (
    ("Original mel", reference_mel),
    ("Normalized mel", mel_normalized),
):
    print(f"\n{heading}:")
    print(f"  Min: {spec.min():.4f}")
    print(f"  Max: {spec.max():.4f}")
    print(f"  Mean: {spec.mean():.4f}")
    print(f"  Std: {spec.std():.4f}")

# Side-by-side view, each panel with its own colour bar
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
panels = (
    (reference_mel, 'Original Mel-Spectrogram'),
    (mel_normalized, 'Normalized Mel-Spectrogram'),
)
for ax, (spec, title) in zip(axes, panels):
    image = ax.imshow(spec.numpy(), aspect='auto', origin='lower', cmap='viridis')
    ax.set_title(title)
    plt.colorbar(image, ax=ax)

plt.tight_layout()
plt.show()

## 9. Test: Batch Processing

In [None]:
# Simulate a batch
batch_size = 4
batch_audio = []
batch_mels = []

for audio_file in list(speakers[0].glob('*.wav'))[:batch_size]:
    audio = audio_processor.load_audio(audio_file)
    segment = audio_processor.segment_audio(audio)
    mel = mel_processor.wav_to_mel(segment)

    batch_audio.append(segment)
    batch_mels.append(mel)

# torch.stack fails with an unhelpful message on an empty list, so report the
# real cause instead.
if not batch_mels:
    raise FileNotFoundError(f"No .wav files found in {speakers[0]}")

# Stack into batch tensors.
# NOTE(review): torch.stack needs equally shaped items; this assumes
# segment_audio returns fixed-length segments — confirm.
batch_audio_tensor = torch.stack(batch_audio)
batch_mels_tensor = torch.stack(batch_mels)

print(f"Batch audio shape: {batch_audio_tensor.shape}")
print(f"Batch mels shape: {batch_mels_tensor.shape}")

# Visualise the batch. The grid is sized from the number of samples actually
# loaded: the folder may hold fewer than batch_size files, and batch_size may
# be changed from 4. With 4 samples this is the same 2x2, 16x8 figure as before.
n_samples = batch_mels_tensor.shape[0]
n_cols = min(2, n_samples)
n_rows = (n_samples + n_cols - 1) // n_cols  # ceiling division
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(8 * n_cols, 4 * n_rows), squeeze=False
)
axes = axes.flatten()

for i, ax in enumerate(axes):
    if i >= n_samples:
        # Hide the spare cell left when n_samples does not fill the grid
        ax.axis('off')
        continue
    ax.imshow(batch_mels_tensor[i].numpy(), aspect='auto', origin='lower', cmap='viridis')
    ax.set_title(f'Sample {i+1}')
    ax.set_ylabel('Mel Bin')
    ax.set_xlabel('Time')

plt.tight_layout()
plt.show()

## 10. Résumé & Validation

### Tests Réussis 
1. Chargement audio
2. Segmentation
3. Conversion audio → mel
4. Conversion mel → audio
5. Normalisation
6. Batch processing

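### Observations
- La reconstruction avec Griffin-Lim introduit des artefacts audibles : c'est attendu, car la phase est estimée à partir du spectrogramme et non conservée
- Pour la production, un vocodeur neuronal (par exemple HiFi-GAN) donnera une meilleure qualité audio
- La normalisation des mel-spectrogrammes aide à stabiliser l'entraînement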

### Prochaines Étapes
1. Tester avec le dataset complet
2. Intégrer dans le DataLoader
3. Entraîner le modèle

In [None]:
# Closing status messages for the notebook
for message in (" Preprocessing pipeline validated!", "\nReady for training!"):
    print(message)