In [1]:
import librosa
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd


In [2]:
class AudioFeatureExtractor:
    def __init__(self, sample_rate=22050):
        self.sample_rate = sample_rate
    
    def extract_features(self, audio_path):
        """Extract audio features including pitch, tone and pace metrics."""
        # Load audio file
        y, sr = librosa.load(audio_path, sr=self.sample_rate)
        
        # Pitch features
        pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
        pitch_mean = np.mean(pitches[pitches > 0])
        pitch_std = np.std(pitches[pitches > 0])
        
        # Tone features (using spectral features)
        spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        
        # Pace features
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]
        
        return {
            'pitch_mean': pitch_mean,
            'pitch_std': pitch_std,
            'spectral_centroid_mean': np.mean(spectral_centroids),
            'spectral_rolloff_mean': np.mean(spectral_rolloff),
            'mfccs': mfccs.mean(axis=1),
            'tempo': tempo
        }

class AudioDataset(Dataset):
    def __init__(self, features_list, labels):
        self.features = torch.FloatTensor(features_list)
        self.labels = torch.FloatTensor(labels)
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

class AudioAnalysisModel(nn.Module):
    def __init__(self, input_size):
        super(AudioAnalysisModel, self).__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 3)  # 3 outputs: tone, pitch, pace scores
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.dropout(x)
        x = self.relu(self.layer2(x))
        x = self.dropout(x)
        x = self.relu(self.layer3(x))
        x = self.output(x)
        return x

class AudioAnalyzer:
    def __init__(self, model_path=None):
        self.feature_extractor = AudioFeatureExtractor()
        self.model = None
        if model_path:
            self.load_model(model_path)
    
    def train_model(self, train_data, train_labels, epochs=50, batch_size=32):
        """Train the deep learning model on extracted features."""
        dataset = AudioDataset(train_data, train_labels)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        
        self.model = AudioAnalysisModel(input_size=train_data.shape[1])
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(self.model.parameters())
        
        for epoch in range(epochs):
            total_loss = 0
            for batch_features, batch_labels in dataloader:
                optimizer.zero_grad()
                outputs = self.model(batch_features)
                loss = criterion(outputs, batch_labels)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            
            if (epoch + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(dataloader):.4f}')
    
    def analyze_audio(self, audio_path):
        """Analyze an audio file and return tone, pitch, and pace metrics."""
        if not self.model:
            raise ValueError("Model not trained or loaded")
        
        # Extract features
        features = self.feature_extractor.extract_features(audio_path)
        
        # Prepare features for model input
        feature_vector = np.concatenate([
            [features['pitch_mean'], features['pitch_std'],
             features['spectral_centroid_mean'], features['spectral_rolloff_mean'],
             features['tempo']],
            features['mfccs']
        ])
        
        # Get model predictions
        with torch.no_grad():
            input_tensor = torch.FloatTensor(feature_vector).unsqueeze(0)
            predictions = self.model(input_tensor)
            tone_score, pitch_score, pace_score = predictions[0].numpy()
        
        return {
            'tone_score': float(tone_score),
            'pitch_score': float(pitch_score),
            'pace_score': float(pace_score),
            'raw_features': features
        }
    
    def save_model(self, path):
        """Save the trained model."""
        if self.model:
            torch.save(self.model.state_dict(), path)
    
    def load_model(self, path):
        """Load a trained model."""
        self.model = AudioAnalysisModel(input_size=18)  # 18 features total
        self.model.load_state_dict(torch.load(path))
        self.model.eval()

In [3]:
# 2. Initialize the analyzer
analyzer = AudioAnalyzer()

In [4]:
import os
import re
import numpy as np
from typing import Dict, Tuple, Optional, Union

class LabelExtractor:
    """Handles label extraction from different audio dataset filename formats."""
    
    def __init__(self):
        # Mapping for RAVDESS dataset emotions
        self.ravdess_emotions = {
            '01': 'neutral',
            '02': 'calm',
            '03': 'happy',
            '04': 'sad',
            '05': 'angry',
            '06': 'fearful',
            '07': 'disgust',
            '08': 'surprised'
        }
        
        # Mapping for RAVDESS intensity levels
        self.ravdess_intensity = {
            '01': 'normal',
            '02': 'strong'
        }
        
        # Mapping emotion to numerical scores for tone, pitch, and pace
        self.emotion_scores = {
            'neutral': {'tone': 0.5, 'pitch': 0.5, 'pace': 0.5},
            'calm': {'tone': 0.3, 'pitch': 0.3, 'pace': 0.3},
            'happy': {'tone': 0.8, 'pitch': 0.7, 'pace': 0.7},
            'sad': {'tone': 0.4, 'pitch': 0.3, 'pace': 0.4},
            'angry': {'tone': 0.8, 'pitch': 0.8, 'pace': 0.8},
            'fearful': {'tone': 0.7, 'pitch': 0.6, 'pace': 0.7},
            'disgust': {'tone': 0.6, 'pitch': 0.5, 'pace': 0.5},
            'surprised': {'tone': 0.7, 'pitch': 0.8, 'pace': 0.6}
        }

    def extract_ravdess_labels(self, filename: str) -> Dict[str, Union[str, float]]:
        """
        Extract labels from RAVDESS filename format.
        Format: modality-vocal_channel-emotion-emotional_intensity-statement-repetition-actor.wav
        Example: 03-01-06-02-02-01-12.wav
        """
        try:
            parts = filename.strip('.wav').split('-')
            
            if len(parts) != 7:
                raise ValueError(f"Invalid RAVDESS filename format: {filename}")
            
            emotion_code = parts[2]
            intensity_code = parts[3]
            
            emotion = self.ravdess_emotions.get(emotion_code, 'unknown')
            intensity = self.ravdess_intensity.get(intensity_code, 'normal')
            
            # Get base scores from emotion
            base_scores = self.emotion_scores.get(emotion, {'tone': 0.5, 'pitch': 0.5, 'pace': 0.5})
            
            # Adjust scores based on intensity
            intensity_multiplier = 1.2 if intensity == 'strong' else 1.0
            
            return {
                'emotion': emotion,
                'intensity': intensity,
                'tone_score': min(1.0, base_scores['tone'] * intensity_multiplier),
                'pitch_score': min(1.0, base_scores['pitch'] * intensity_multiplier),
                'pace_score': min(1.0, base_scores['pace'] * intensity_multiplier)
            }
        except Exception as e:
            raise ValueError(f"Error parsing RAVDESS filename {filename}: {str(e)}")

    def extract_tess_labels(self, filename: str) -> Dict[str, Union[str, float]]:
        """
        Extract labels from TESS filename format.
        Format: OAF_emotion_word.wav or YAF_emotion_word.wav
        Example: OAF_angry_word.wav
        """
        try:
            # Extract emotion from filename
            match = re.search(r'[OY]AF_(\w+)_', filename)
            if not match:
                raise ValueError(f"Invalid TESS filename format: {filename}")
            
            emotion = match.group(1).lower()
            
            # Map TESS emotion to scores (using same mapping as RAVDESS for consistency)
            scores = self.emotion_scores.get(emotion, {'tone': 0.5, 'pitch': 0.5, 'pace': 0.5})
            
            return {
                'emotion': emotion,
                'tone_score': scores['tone'],
                'pitch_score': scores['pitch'],
                'pace_score': scores['pace']
            }
        except Exception as e:
            raise ValueError(f"Error parsing TESS filename {filename}: {str(e)}")

    def extract_custom_labels(self, filename: str) -> Dict[str, float]:
        """
        Extract labels from custom filename format.
        Format: tone_X_pitch_Y_pace_Z.wav
        Example: tone_0.8_pitch_0.6_pace_0.7.wav
        """
        try:
            # Extract scores from filename using regex
            tone_match = re.search(r'tone_(0?\.\d+)', filename)
            pitch_match = re.search(r'pitch_(0?\.\d+)', filename)
            pace_match = re.search(r'pace_(0?\.\d+)', filename)
            
            if not all([tone_match, pitch_match, pace_match]):
                raise ValueError(f"Invalid custom filename format: {filename}")
            
            return {
                'tone_score': float(tone_match.group(1)),
                'pitch_score': float(pitch_match.group(1)),
                'pace_score': float(pace_match.group(1))
            }
        except Exception as e:
            raise ValueError(f"Error parsing custom filename {filename}: {str(e)}")

    def extract_labels(self, filename: str, dataset_type: str = 'auto') -> Dict[str, Union[str, float]]:
        """
        Main function to extract labels from filename based on dataset type.
        """
        filename = os.path.basename(filename)
        
        if dataset_type == 'auto':
            # Try to automatically determine dataset type from filename
            if re.match(r'\d{2}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2}\.wav', filename):
                dataset_type = 'ravdess'
            elif re.match(r'[OY]AF_\w+_\w+\.wav', filename):
                dataset_type = 'tess'
            elif re.match(r'tone_[\d.]+_pitch_[\d.]+_pace_[\d.]+\.wav', filename):
                dataset_type = 'custom'
            else:
                raise ValueError(f"Could not automatically determine dataset type for: {filename}")
        
        # Extract labels based on dataset type
        if dataset_type == 'ravdess':
            return self.extract_ravdess_labels(filename)
        elif dataset_type == 'tess':
            return self.extract_tess_labels(filename)
        elif dataset_type == 'custom':
            return self.extract_custom_labels(filename)
        else:
            raise ValueError(f"Unsupported dataset type: {dataset_type}")

def extract_label_from_filename(filename: str, dataset_type: str = 'auto') -> np.ndarray:
    """
    Wrapper function to extract labels and return them in the format expected by the model.
    Returns a numpy array of [tone_score, pitch_score, pace_score]
    """
    extractor = LabelExtractor()
    labels = extractor.extract_labels(filename, dataset_type)
    return np.array([
        labels['tone_score'],
        labels['pitch_score'],
        labels['pace_score']
    ])

In [5]:
import os
from tqdm import tqdm  # for progress tracking

def prepare_dataset(dataset_path, analyzer):
    features_list = []
    labels = []
    
    # Walk through the dataset directory
    for root, dirs, files in os.walk(dataset_path):
        for file in tqdm(files):
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                
                # Extract features
                try:
                    features = analyzer.feature_extractor.extract_features(file_path)
                    feature_vector = np.concatenate([
                        [features['pitch_mean'], features['pitch_std'],
                         features['spectral_centroid_mean'], features['spectral_rolloff_mean'],
                         features['tempo']],
                        features['mfccs']
                    ])
                    features_list.append(feature_vector)
                    
                    # Get label from filename or metadata
                    # This will depend on the specific dataset structure
                    label = extract_label_from_filename(file)  # You'll need to implement this
                    labels.append(label)
                except Exception as e:
                    print(f"Error processing {file}: {e}")
                    continue
    
    return np.array(features_list), np.array(labels)

In [6]:
# Prepare the training data
features, labels = prepare_dataset("data/Audio_Speech_Actors_01-24/",analyzer)

0it [00:00, ?it/s]
	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]
100%|██████████| 60/60 [00:35<00:00,  1.67it/s]
100%|██████████| 60/60 [00:02<00:00, 25.41it/s]
100%|██████████| 60/60 [00:02<00:00, 25.33it/s]
100%|██████████| 60/60 [00:02<00:00, 25.68it/s]
100%|██████████| 60/60 [00:02<00:00, 24.59it/s]
100%|██████████| 60/60 [00:02<00:00, 23.32it/s]
100%|██████████| 60/60 [00:02<00:00, 24.83it/s]
100%|██████████| 60/60 [00:02<00:00, 24.51it/s]
100%|██████████| 60/60 [00:02<00:00, 26.53it/s]
100%|██████████| 60/60 [00:02<00:00, 25.89it/s]
100%|██████████| 60/60 [00:02<00:00, 28.45it/s]
100%|██████████| 60/60 [00:02<00:00, 24.57it/s]
100%|██████████| 60/60 [00:02<00:00, 27.92it/s]
100%|██████████| 60/60 [00:02<00:00, 22.48it/s]
100%|██████████| 60/60 [00:02<00:00, 25.83it/s]
100%|██████████| 60/60 [00:02<00:00, 25.08it/s]
100%|█████

In [None]:
def interpret_audio_analysis(results, scale_to_10=True):
    """
    Interprets audio analysis results and prints them in a human-readable format.
    
    Args:
        results (dict): The analysis results from the AudioAnalyzer
        scale_to_10 (bool): If True, scales scores from 0-1 to 1-10
    """
    def scale_score(score):
        """Scales score from 0-1 to 1-10"""
        return round(score * 9 + 1, 1)
    
    def get_tone_description(score):
        if score >= 8.5: return "Exceptionally clear and pure"
        elif score >= 7: return "Very clear and clean"
        elif score >= 5.5: return "Moderately clear"
        elif score >= 4: return "Slightly unclear"
        else: return "Unclear or noisy"
        
    def get_pitch_description(score):
        if score >= 8.5: return "Very high"
        elif score >= 7: return "High"
        elif score >= 5.5: return "Medium-high"
        elif score >= 4: return "Medium-low"
        else: return "Low"
        
    def get_pace_description(score):
        if score >= 8.5: return "Very fast"
        elif score >= 7: return "Fast"
        elif score >= 5.5: return "Moderate to fast"
        elif score >= 4: return "Moderate"
        else: return "Slow"
    
    # Scale scores if requested
    if scale_to_10:
        tone = scale_score(results['tone_score'])
        pitch = scale_score(results['pitch_score'])
        pace = scale_score(results['pace_score'])
    else:
        tone = results['tone_score']
        pitch = results['pitch_score']
        pace = results['pace_score']
    
    # Get raw features for additional context
    raw = results['raw_features']
    
    # Format the interpretation
    print("=== Audio Analysis Results ===")
    print(f"\nScores (1-10 scale):")
    print(f"Tone Quality: {tone}/10 - {get_tone_description(tone)}")
    print(f"Pitch Level: {pitch}/10 - {get_pitch_description(pitch)}")
    print(f"Pace/Tempo: {pace}/10 - {get_pace_description(pace)}")
    
    print("\nDetailed Measurements:")
    print(f"Average Pitch: {raw['pitch_mean']:.1f} Hz")
    print(f"Pitch Variation: ±{raw['pitch_std']:.1f} Hz")
    print(f"Tempo: {raw['tempo']:.1f} BPM")
    
    print("\nSuggested Classification:")
    if raw['pitch_mean'] > 1500:
        voice_range = "Very high frequency content"
    elif raw['pitch_mean'] > 800:
        voice_range = "High frequency content"
    elif raw['pitch_mean'] > 300:
        voice_range = "Mid-range frequency content"
    else:
        voice_range = "Low frequency content"
    
    print(f"- {voice_range}")
    print(f"- {'High' if raw['tempo'] > 100 else 'Moderate' if raw['tempo'] > 70 else 'Low'} tempo")
    print(f"- {'High' if tone > 7 else 'Moderate' if tone > 4 else 'Low'} clarity")

In [14]:
# 4. Train the model
analyzer.train_model(features, labels, epochs=50)

# 5. Save the trained model
analyzer.save_model("audio_analysis_model.pth")

# 6. Analyze a new audio file
results = analyzer.analyze_audio("test_data/Actor_24/03-01-08-01-01-02-24.wav")
print(results)

Epoch [10/50], Loss: 0.4012
Epoch [20/50], Loss: 0.1936
Epoch [30/50], Loss: 0.1129
Epoch [40/50], Loss: 0.0926
Epoch [50/50], Loss: 0.0830
{'tone_score': 1.0694383382797241, 'pitch_score': 0.7080138921737671, 'pace_score': 0.6949702501296997, 'raw_features': {'pitch_mean': np.float32(1922.2175), 'pitch_std': np.float32(1117.2708), 'spectral_centroid_mean': np.float64(2815.4766442985297), 'spectral_rolloff_mean': np.float64(5268.95947265625), 'mfccs': array([-572.27435  ,   33.545666 ,  -15.067412 ,   -3.5327373,
        -12.325186 ,  -10.237376 ,  -14.086486 ,  -13.915781 ,
         -6.127895 ,   -4.269773 ,   -9.902204 ,   -3.2373993,
         -6.0632944], dtype=float32), 'tempo': np.float64(117.45383522727273)}}


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]


In [21]:


# 6. Analyze a new audio file
results = analyzer.analyze_audio("test_data/Actor_24/03-01-02-01-02-01-24.wav")
print(results)

{'tone_score': 0.3368957042694092, 'pitch_score': 0.3546028435230255, 'pace_score': 0.23705440759658813, 'raw_features': {'pitch_mean': np.float32(2000.1384), 'pitch_std': np.float32(1145.5349), 'spectral_centroid_mean': np.float64(3595.106462082038), 'spectral_rolloff_mean': np.float64(6334.114193488024), 'mfccs': array([-730.6705   ,   38.349815 ,    2.6743178,    9.384072 ,
         -5.97962  ,   -3.0296078,  -11.018359 ,   -9.151866 ,
         -7.464394 ,   -5.854757 ,   -5.191709 ,   -2.5191557,
         -7.5516753], dtype=float32), 'tempo': np.float64(135.99917763157896)}}


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]


In [22]:
interpret_audio_analysis(results)

=== Audio Analysis Results ===

Scores (1-10 scale):
Tone Quality: 4.0/10 - Slightly unclear
Pitch Level: 4.2/10 - Medium-low
Pace/Tempo: 3.1/10 - Slow

Detailed Measurements:
Average Pitch: 2000.1 Hz
Pitch Variation: ±1145.5 Hz
Tempo: 136.0 BPM

Suggested Classification:
- Very high frequency content
- High tempo
- Low clarity
