# Notebook 1: Exploration des Données

Ce notebook explore le dataset pour la conversion vocale A→B.

**Objectifs:**
- Charger et analyser le dataset
- Visualiser les distributions audio
- Analyser les caractéristiques des locuteurs
- Vérifier la qualité des données

In [None]:
# Imports
import sys
sys.path.append('..')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
import IPython.display as ipd
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm

# Plotting configuration (global state shared by every figure in this notebook)
plt.style.use('seaborn-v0_8-darkgrid')  # Matplotlib style sheet used for all plots
sns.set_palette('husl')  # default seaborn colour palette
# IPython magic: only valid when executed inside a notebook/IPython session
%matplotlib inline

## 1. Configuration et Chemins

In [None]:
# Dataset locations: one sub-directory per split below the data root
DATA_DIR = Path('../data')
TRAIN_DIR, VAL_DIR, TEST_DIR = (
    DATA_DIR / 'train',
    DATA_DIR / 'val',
    DATA_DIR / 'test',
)

# Audio parameters reused by the loading and spectrogram code further down
SAMPLE_RATE = 22050  # target rate passed to librosa.load (Hz)
N_FFT = 1024         # FFT window size for the mel-spectrograms
HOP_LENGTH = 256     # hop between successive frames, in samples
N_MELS = 80          # number of mel bands

# Report the configured root and whether each split directory is present
print(
    f"Data directory: {DATA_DIR}",
    f"Train directory exists: {TRAIN_DIR.exists()}",
    f"Val directory exists: {VAL_DIR.exists()}",
    f"Test directory exists: {TEST_DIR.exists()}",
    sep="\n",
)

## 2. Analyse de la Structure des Données

In [None]:
def analyze_dataset_structure(data_dir):
    """Count the audio files available for each speaker of a dataset split.

    Every immediate sub-directory of ``data_dir`` is treated as one speaker,
    and its ``*.wav`` and ``*.flac`` files (non-recursive) are counted.

    Args:
        data_dir: Directory of the split, as a ``pathlib.Path`` or a string.

    Returns:
        pandas.DataFrame with the columns ``speaker`` and ``num_files``, one
        row per speaker directory, ordered by speaker name. The columns are
        present even when the split contains no speaker directory, so callers
        can always index ``['num_files']``.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    columns = ['speaker', 'num_files']
    data_dir = Path(data_dir)

    # iterdir() yields entries in arbitrary filesystem order; sort by name so
    # the resulting table (and every plot built from it) is reproducible.
    speakers = sorted((d for d in data_dir.iterdir() if d.is_dir()),
                      key=lambda d: d.name)

    stats = [
        {
            'speaker': speaker_dir.name,
            'num_files': len(list(speaker_dir.glob('*.wav')))
                         + len(list(speaker_dir.glob('*.flac'))),
        }
        for speaker_dir in speakers
    ]

    # Explicit columns keep the schema stable for an empty split; the cast
    # keeps ``num_files`` numeric (an empty frame would otherwise be object).
    return pd.DataFrame(stats, columns=columns).astype({'num_files': int})

# Summarise each split: per-speaker table followed by aggregate counts
def _summarize_split(header, split_dir, detailed=False):
    """Print the per-speaker table of one split and return its DataFrame.

    ``detailed`` additionally prints the speaker count and the average number
    of files per speaker (used for the training split only).
    """
    print(header)
    split_stats = analyze_dataset_structure(split_dir)
    print(split_stats)
    if detailed:
        print(f"\nTotal speakers: {len(split_stats)}")
    print(f"Total files: {split_stats['num_files'].sum()}")
    if detailed:
        print(f"Average files per speaker: {split_stats['num_files'].mean():.1f}")
    return split_stats


train_stats = _summarize_split("\n=== TRAIN SET ===", TRAIN_DIR, detailed=True)
val_stats = _summarize_split("\n=== VAL SET ===", VAL_DIR)
test_stats = _summarize_split("\n=== TEST SET ===", TEST_DIR)

In [None]:
# Distribution plot: one bar chart per split with the file count of each speaker
split_tables = (('Train', train_stats), ('Val', val_stats), ('Test', test_stats))
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

for panel, (name, table) in zip(axes, split_tables):
    panel.bar(table['speaker'], table['num_files'])
    panel.set_title(f'{name} Set - Files per Speaker')
    panel.set_xlabel('Speaker')
    panel.set_ylabel('Number of Files')
    # Speaker names are long; tilt them so they do not overlap
    panel.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

## 3. Analyse des Propriétés Audio

In [None]:
def analyze_audio_properties(data_dir, num_samples=50):
    """Compute basic signal statistics for a sample of each speaker's files.

    Every immediate sub-directory of ``data_dir`` is treated as one speaker.
    At most ``num_samples`` of its ``*.wav`` / ``*.flac`` files are drawn at
    random (via ``np.random.choice``, so the global NumPy seed controls the
    draw) and loaded with ``librosa.load`` resampled to ``SAMPLE_RATE``.
    Files that fail to load or analyse are reported on stdout and skipped.

    Args:
        data_dir: ``pathlib.Path`` of the split to analyse.
        num_samples: Maximum number of files analysed per speaker.

    Returns:
        pandas.DataFrame with one row per analysed file and the columns
        ``speaker``, ``duration`` (seconds), ``sample_rate`` (the rate after
        resampling, i.e. ``SAMPLE_RATE``, not the file's native rate),
        ``rms``, ``zero_crossing_rate`` and ``spectral_centroid`` (each the
        mean over all frames). The columns are present even when no file
        could be analysed.
    """
    columns = ['speaker', 'duration', 'sample_rate', 'rms',
               'zero_crossing_rate', 'spectral_centroid']

    # Sorted traversal: iterdir()/glob() order is filesystem dependent, which
    # would make the random sample differ between machines for the same seed.
    speakers = sorted((d for d in data_dir.iterdir() if d.is_dir()),
                      key=lambda d: d.name)

    audio_stats = []

    for speaker_dir in tqdm(speakers, desc="Analyzing audio"):
        audio_files = sorted(list(speaker_dir.glob('*.wav'))
                             + list(speaker_dir.glob('*.flac')))

        # Subsample when the speaker has more files than requested
        if len(audio_files) > num_samples:
            audio_files = np.random.choice(audio_files, num_samples, replace=False)

        for audio_path in audio_files:
            try:
                y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)

                # float() turns NumPy float32 scalars into plain Python
                # floats, giving float64 columns that serialise cleanly.
                audio_stats.append({
                    'speaker': speaker_dir.name,
                    'duration': len(y) / sr,
                    'sample_rate': sr,
                    'rms': float(librosa.feature.rms(y=y).mean()),
                    'zero_crossing_rate': float(
                        librosa.feature.zero_crossing_rate(y).mean()),
                    'spectral_centroid': float(
                        librosa.feature.spectral_centroid(y=y, sr=sr).mean()),
                })
            except Exception as e:
                # Best effort: one unreadable file must not abort the scan
                print(f"Error loading {audio_path}: {e}")

    # Explicit columns keep the schema stable when nothing was analysed
    return pd.DataFrame(audio_stats, columns=columns)

# Profile the training split, sampling at most 30 files per speaker
print("Analyzing audio properties...")
audio_stats = analyze_audio_properties(data_dir=TRAIN_DIR, num_samples=30)
audio_summary = audio_stats.describe()
print(audio_summary)

In [None]:
# 2x2 overview of the sampled audio statistics
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
(ax_duration, ax_rms), (ax_centroid, ax_zcr) = axes

# Top-left: histogram of clip durations
ax_duration.hist(audio_stats['duration'], bins=50, edgecolor='black')
ax_duration.set_title('Duration Distribution')
ax_duration.set_xlabel('Duration (seconds)')
ax_duration.set_ylabel('Count')

# Top-right: histogram of mean RMS energy
ax_rms.hist(audio_stats['rms'], bins=50, edgecolor='black', color='orange')
ax_rms.set_title('RMS Energy Distribution')
ax_rms.set_xlabel('RMS')
ax_rms.set_ylabel('Count')

# Bottom-left: per-speaker mean spectral centroid, sorted ascending
speaker_means = audio_stats.groupby('speaker')['spectral_centroid'].mean().sort_values()
bar_positions = range(len(speaker_means))
ax_centroid.barh(bar_positions, speaker_means.values)
ax_centroid.set_yticks(bar_positions)
ax_centroid.set_yticklabels(speaker_means.index)
ax_centroid.set_title('Average Spectral Centroid by Speaker')
ax_centroid.set_xlabel('Spectral Centroid (Hz)')

# Bottom-right: zero-crossing-rate spread per speaker, in order of appearance
speaker_names = audio_stats['speaker'].unique()
zcr_by_speaker = [
    audio_stats.loc[audio_stats['speaker'] == s, 'zero_crossing_rate'].values
    for s in speaker_names
]
ax_zcr.boxplot(zcr_by_speaker, labels=speaker_names)
ax_zcr.set_title('Zero Crossing Rate by Speaker')
ax_zcr.set_xlabel('Speaker')
ax_zcr.set_ylabel('Zero Crossing Rate')
ax_zcr.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

## 4. Visualisation d'Exemples Audio

In [None]:
def visualize_audio_sample(audio_path, title="Audio Sample"):
    """Plot four views of one audio file and return a playable widget.

    The file is loaded with ``librosa.load`` resampled to ``SAMPLE_RATE`` and
    drawn as four stacked panels: waveform, STFT spectrogram in dB (librosa's
    default STFT settings), mel-spectrogram in dB (``N_FFT`` / ``HOP_LENGTH``
    / ``N_MELS``) and 13 MFCCs. The duration and sample rate are printed once
    the figure has been shown.

    Args:
        audio_path: Path of the audio file to load.
        title: Prefix used in the title of the waveform panel.

    Returns:
        ``IPython.display.Audio`` wrapping the loaded samples.
    """
    # Load audio
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)

    # Create figure
    fig, axes = plt.subplots(4, 1, figsize=(14, 10))

    # Waveform
    librosa.display.waveshow(y, sr=sr, ax=axes[0])
    axes[0].set_title(f'{title} - Waveform')
    axes[0].set_ylabel('Amplitude')

    # Spectrogram: np.abs(stft) is a magnitude, so amplitude_to_db applies
    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    img = librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='hz', ax=axes[1])
    axes[1].set_title('Spectrogram')
    fig.colorbar(img, ax=axes[1], format='%+2.0f dB')

    # Mel-Spectrogram: melspectrogram() returns a *power* spectrogram by
    # default (power=2.0), so power_to_db is the matching conversion;
    # amplitude_to_db would double every dB value.
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=N_FFT,
                                         hop_length=HOP_LENGTH, n_mels=N_MELS)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    img = librosa.display.specshow(mel_db, sr=sr, hop_length=HOP_LENGTH,
                                   x_axis='time', y_axis='mel', ax=axes[2])
    axes[2].set_title('Mel-Spectrogram')
    fig.colorbar(img, ax=axes[2], format='%+2.0f dB')

    # MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    img = librosa.display.specshow(mfccs, sr=sr, x_axis='time', ax=axes[3])
    axes[3].set_title('MFCCs')
    axes[3].set_ylabel('MFCC')
    fig.colorbar(img, ax=axes[3])

    plt.tight_layout()
    plt.show()

    # Print clip metadata and hand back a player for the notebook
    print(f"Duration: {len(y)/sr:.2f}s")
    print(f"Sample rate: {sr} Hz")
    return ipd.Audio(y, rate=sr)

# Show one randomly chosen clip for each of the first three speaker folders
speakers = [entry for entry in TRAIN_DIR.iterdir() if entry.is_dir()][:3]

for speaker_dir in speakers:
    audio_files = [*speaker_dir.glob('*.wav'), *speaker_dir.glob('*.flac')]
    if not audio_files:
        # Nothing to play for this speaker
        continue

    sample_file = np.random.choice(audio_files)
    print(f"\n{'='*60}")
    print(f"Speaker: {speaker_dir.name}")
    print(f"File: {sample_file.name}")
    print(f"{'='*60}")
    audio = visualize_audio_sample(sample_file, f"Speaker {speaker_dir.name}")
    display(audio)

## 5. Comparaison Entre Locuteurs

In [None]:
# Compare the waveform and mel-spectrogram of two different speakers
speakers = [d for d in TRAIN_DIR.iterdir() if d.is_dir()][:2]

fig, axes = plt.subplots(2, 2, figsize=(16, 8))

for i, speaker_dir in enumerate(speakers):
    audio_files = list(speaker_dir.glob('*.wav')) + list(speaker_dir.glob('*.flac'))
    if not audio_files:
        # np.random.choice raises ValueError on an empty sequence; leave this
        # row of the figure blank instead of aborting the whole cell.
        print(f"Skipping speaker {speaker_dir.name}: no audio files found")
        continue
    sample_file = np.random.choice(audio_files)

    # Load audio
    y, sr = librosa.load(sample_file, sr=SAMPLE_RATE)

    # Waveform
    librosa.display.waveshow(y, sr=sr, ax=axes[i, 0])
    axes[i, 0].set_title(f'Speaker {speaker_dir.name} - Waveform')
    axes[i, 0].set_ylabel('Amplitude')

    # Mel-spectrogram: melspectrogram() returns a power spectrogram by
    # default (power=2.0), so it must be converted with power_to_db;
    # amplitude_to_db would double every dB value.
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=N_FFT,
                                        hop_length=HOP_LENGTH, n_mels=N_MELS)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    img = librosa.display.specshow(mel_db, sr=sr, hop_length=HOP_LENGTH,
                                   x_axis='time', y_axis='mel', ax=axes[i, 1])
    axes[i, 1].set_title(f'Speaker {speaker_dir.name} - Mel-Spectrogram')
    fig.colorbar(img, ax=axes[i, 1], format='%+2.0f dB')

plt.tight_layout()
plt.show()

## 6. Vérification de la Qualité

In [None]:
def check_data_quality(data_dir):
    """Scan every audio file of a split and list the problems found.

    Every immediate sub-directory of ``data_dir`` is treated as one speaker;
    each of its ``*.wav`` / ``*.flac`` files is loaded with ``librosa.load``
    at its native sample rate (``sr=None``) and checked for:

    * ``Too short``: fewer samples than 0.5 s worth,
    * ``Too quiet``: peak absolute amplitude below 0.01,
    * ``Invalid values (NaN/Inf)``: any NaN or infinite sample,
    * ``Load error: ...``: the file could not be loaded.

    A single file can produce several rows.

    Args:
        data_dir: ``pathlib.Path`` of the split to check.

    Returns:
        pandas.DataFrame with the columns ``file``, ``issue``, ``duration``
        (set for "Too short" rows) and ``max_amp`` (set for "Too quiet"
        rows); empty, but with these columns, when no issue was found.
    """
    columns = ['file', 'issue', 'duration', 'max_amp']
    issues = []

    # Sorted traversal so the report order does not depend on the filesystem
    speakers = sorted((d for d in data_dir.iterdir() if d.is_dir()),
                      key=lambda d: d.name)

    for speaker_dir in tqdm(speakers, desc="Checking quality"):
        audio_files = sorted(list(speaker_dir.glob('*.wav'))
                             + list(speaker_dir.glob('*.flac')))

        for audio_path in audio_files:
            # Only the load itself is expected to fail; keep the try minimal
            try:
                y, sr = librosa.load(audio_path, sr=None)
            except Exception as e:
                issues.append({
                    'file': str(audio_path),
                    'issue': f'Load error: {str(e)}'
                })
                continue

            if len(y) < sr * 0.5:  # shorter than 0.5 s
                issues.append({
                    'file': str(audio_path),
                    'issue': 'Too short',
                    'duration': len(y) / sr
                })

            if len(y) == 0:
                # np.max() raises on an empty array; the file is already
                # reported as too short, so skip the amplitude checks rather
                # than mislabel it as a load error.
                continue

            peak = float(np.max(np.abs(y)))
            if peak < 0.01:  # too quiet
                issues.append({
                    'file': str(audio_path),
                    'issue': 'Too quiet',
                    'max_amp': peak
                })

            if np.any(np.isnan(y)) or np.any(np.isinf(y)):
                issues.append({
                    'file': str(audio_path),
                    'issue': 'Invalid values (NaN/Inf)'
                })

    # Explicit columns keep the schema stable regardless of which issue
    # kinds occurred (or whether any occurred at all)
    return pd.DataFrame(issues, columns=columns)

# Check the training split and summarise the outcome.
# The messages previously started with leftovers of stripped emoji (an
# orphaned variation selector / a stray space); they are now plain text.
print("Checking data quality...")
quality_issues = check_data_quality(TRAIN_DIR)

if len(quality_issues) > 0:
    print(f"\nFound {len(quality_issues)} issues:")
    print(quality_issues)
else:
    print("\nNo quality issues found!")

## 7. Résumé de l'Exploration

### Points Clés:
1. **Nombre de locuteurs**: [À remplir après exécution]
2. **Total fichiers audio**: [À remplir après exécution]
3. **Durée moyenne**: [À remplir après exécution]
4. **Qualité des données**: [À remplir après exécution]

### Observations:
- [Vos observations ici]

### Recommandations:
- [Vos recommandations ici]

In [None]:
# Build and save a summary report of the training split
import json

# pandas reductions return NumPy scalars (e.g. numpy.int64, numpy.float32)
# that json.dump cannot serialise; convert them to built-in int/float first.
report = {
    'dataset': 'train',
    'num_speakers': int(len(train_stats)),
    'total_files': int(train_stats['num_files'].sum()),
    'avg_files_per_speaker': float(train_stats['num_files'].mean()),
    'avg_duration': float(audio_stats['duration'].mean()),
    'avg_rms': float(audio_stats['rms'].mean()),
    'num_issues': int(len(quality_issues)),
}

print("\n=== DATA EXPLORATION REPORT ===")
for key, value in report.items():
    print(f"{key}: {value}")

# Save: create the output directory first so a fresh checkout does not fail
report_path = Path('../outputs/data_exploration_report.json')
report_path.parent.mkdir(parents=True, exist_ok=True)
with report_path.open('w', encoding='utf-8') as f:
    json.dump(report, f, indent=2)

print("\nReport saved to: outputs/data_exploration_report.json")