# Data Augmentation

- Meeting Audio Data Augmenter
- Transforms real meeting recordings into comprehensive ML training dataset
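- Usage: open `data_augmenter.ipynb` in Jupyter and run all cells (a notebook cannot be run with `python` directly; for a headless run use `jupyter nbconvert --to notebook --execute data_augmenter.ipynb`)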

In [1]:
import pandas as pd
import json
import numpy as np
import random
from pathlib import Path
from datetime import datetime
import copy
from scipy import signal
from scipy.interpolate import interp1d
import warnings
warnings.filterwarnings('ignore')

In [2]:
class MeetingAudioAugmenter:
    def __init__(self, data_dir="./"):
        self.data_dir = Path(data_dir)
        self.recordings_dir = self.data_dir / "recordings"
        self.labels_dir = self.data_dir / "labels"
        self.output_dir = self.data_dir / "augmented_data"
        self.output_dir.mkdir(exist_ok=True)
        
        # Load original data
        self.master_csv = self.labels_dir / "sessions_master.csv"
        self.df = pd.read_csv(self.master_csv)
        self.original_audio_data = {}
        
        # Augmentation parameters
        self.target_samples_per_original = 60
        self.augmented_sessions = []
        
        print(f" Meeting Audio Augmenter initialized")
        print(f" Input: {len(self.df)} original recordings")
        print(f" Target: {len(self.df) * self.target_samples_per_original} augmented samples")
    
    def load_original_data(self):
        """Load all original audio recordings"""
        print("\n Loading original audio data...")
        
        for _, session in self.df.iterrows():
            session_id = session['session_id']
            audio_file = self.recordings_dir / f"{session_id}_audio.json"
            labels_file = self.labels_dir / f"{session_id}_labels.json"
            
            if audio_file.exists() and labels_file.exists():
                # Load audio data
                with open(audio_file, 'r') as f:
                    audio_data = json.load(f)
                
                # Load labels
                with open(labels_file, 'r') as f:
                    labels = json.load(f)
                
                self.original_audio_data[session_id] = {
                    'audio': audio_data,
                    'labels': labels,
                    'csv_row': session.to_dict()
                }
                
                print(f"Loaded {session_id}: {len(audio_data)} samples")
            else:
                print(f"Missing files for {session_id}")
    
    def time_stretch_audio(self, audio_samples, stretch_factor):
        """Apply time stretching to audio samples"""
        if stretch_factor == 1.0:
            return audio_samples
        
        # Create new sample indices
        original_length = len(audio_samples)
        new_length = int(original_length / stretch_factor)
        
        if new_length < 10:  # Minimum length check
            return audio_samples
        
        # Extract time series
        timestamps = np.array([i for i in range(original_length)])
        left_mics = np.array([s['leftMic'] for s in audio_samples])
        right_mics = np.array([s['rightMic'] for s in audio_samples])
        
        # Create interpolation functions
        left_interp = interp1d(timestamps, left_mics, kind='linear', 
                              bounds_error=False, fill_value='extrapolate')
        right_interp = interp1d(timestamps, right_mics, kind='linear', 
                               bounds_error=False, fill_value='extrapolate')
        
        # Generate new timestamps
        new_timestamps = np.linspace(0, original_length-1, new_length)
        
        # Interpolate values
        new_left = left_interp(new_timestamps)
        new_right = right_interp(new_timestamps)
        
        # Create new audio samples
        stretched_samples = []
        base_timestamp = audio_samples[0]['timestamp'] if audio_samples else 0
        
        for i, (left, right) in enumerate(zip(new_left, new_right)):
            sample = {
                'leftMic': float(left),
                'rightMic': float(right),
                'difference': float(left - right),
                'averageLevel': float((left + right) / 2),
                'timestamp': base_timestamp + i * 1000,  # Approximate timestamp
                'local_timestamp': audio_samples[0]['local_timestamp'] + i,
                'local_datetime': audio_samples[0]['local_datetime']  # Keep original
            }
            stretched_samples.append(sample)
        
        return stretched_samples
    
    def pitch_shift_audio(self, audio_samples, semitone_shift):
        """Apply pitch shifting (simulate different speakers)"""
        if semitone_shift == 0:
            return audio_samples
        
        # Pitch shift multiplier (semitones to frequency ratio)
        pitch_ratio = 2 ** (semitone_shift / 12.0)
        
        shifted_samples = []
        for sample in audio_samples:
            # Apply pitch shift to volume levels (simulates different voice characteristics)
            left_shifted = sample['leftMic'] * (0.95 + 0.1 * random.random()) * pitch_ratio
            right_shifted = sample['rightMic'] * (0.95 + 0.1 * random.random()) * pitch_ratio
            
            # Keep within reasonable dB range
            left_shifted = np.clip(left_shifted, 20, 90)
            right_shifted = np.clip(right_shifted, 20, 90)
            
            new_sample = copy.deepcopy(sample)
            new_sample['leftMic'] = float(left_shifted)
            new_sample['rightMic'] = float(right_shifted)
            new_sample['difference'] = float(left_shifted - right_shifted)
            new_sample['averageLevel'] = float((left_shifted + right_shifted) / 2)
            
            shifted_samples.append(new_sample)
        
        return shifted_samples
    
    def adjust_stereo_positioning(self, audio_samples, position_shift):
        """Simulate different speaker positions"""
        if position_shift == 0:
            return audio_samples
        
        positioned_samples = []
        for sample in audio_samples:
            # Apply stereo positioning shift
            left = sample['leftMic']
            right = sample['rightMic']
            
            # Shift speaker position (-1.0 = hard left, +1.0 = hard right)
            if position_shift > 0:  # Shift toward right
                left_adj = left * (1 - position_shift * 0.3)
                right_adj = right * (1 + position_shift * 0.2)
            else:  # Shift toward left
                left_adj = left * (1 + abs(position_shift) * 0.2)
                right_adj = right * (1 - abs(position_shift) * 0.3)
            
            new_sample = copy.deepcopy(sample)
            new_sample['leftMic'] = float(left_adj)
            new_sample['rightMic'] = float(right_adj)
            new_sample['difference'] = float(left_adj - right_adj)
            new_sample['averageLevel'] = float((left_adj + right_adj) / 2)
            
            positioned_samples.append(new_sample)
        
        return positioned_samples
    
    def add_background_noise(self, audio_samples, noise_level):
        """Add background noise simulation"""
        if noise_level == 'none':
            return audio_samples
        
        # Define noise levels (dB to add)
        noise_mapping = {
            'low': (1, 3),
            'medium': (3, 6),
            'high': (6, 10)
        }
        
        if noise_level not in noise_mapping:
            return audio_samples
        
        min_noise, max_noise = noise_mapping[noise_level]
        
        noisy_samples = []
        for sample in audio_samples:
            # Add random noise
            left_noise = random.uniform(min_noise, max_noise)
            right_noise = random.uniform(min_noise, max_noise)
            
            new_sample = copy.deepcopy(sample)
            new_sample['leftMic'] = float(sample['leftMic'] + left_noise)
            new_sample['rightMic'] = float(sample['rightMic'] + right_noise)
            new_sample['difference'] = float(new_sample['leftMic'] - new_sample['rightMic'])
            new_sample['averageLevel'] = float((new_sample['leftMic'] + new_sample['rightMic']) / 2)
            
            noisy_samples.append(new_sample)
        
        return noisy_samples
    
    def adjust_energy_level(self, audio_samples, target_energy):
        """Adjust overall energy level of the meeting"""
        if target_energy == 'medium':
            return audio_samples
        
        # Calculate current average energy
        avg_levels = [s['averageLevel'] for s in audio_samples]
        current_avg = np.mean(avg_levels)
        
        # Define energy adjustments
        if target_energy == 'low':
            # Reduce volume and variation
            volume_multiplier = 0.8
            variation_multiplier = 0.6
        elif target_energy == 'high':
            # Increase volume and variation
            volume_multiplier = 1.2
            variation_multiplier = 1.4
        else:
            return audio_samples
        
        adjusted_samples = []
        for sample in audio_samples:
            # Apply energy adjustment
            left_adj = (sample['leftMic'] - current_avg) * variation_multiplier + current_avg
            right_adj = (sample['rightMic'] - current_avg) * variation_multiplier + current_avg
            
            left_adj *= volume_multiplier
            right_adj *= volume_multiplier
            
            # Keep within reasonable bounds
            left_adj = np.clip(left_adj, 25, 85)
            right_adj = np.clip(right_adj, 25, 85)
            
            new_sample = copy.deepcopy(sample)
            new_sample['leftMic'] = float(left_adj)
            new_sample['rightMic'] = float(right_adj)
            new_sample['difference'] = float(left_adj - right_adj)
            new_sample['averageLevel'] = float((left_adj + right_adj) / 2)
            
            adjusted_samples.append(new_sample)
        
        return adjusted_samples
    
    def simulate_turn_taking(self, audio_samples, meeting_type):
        """Simulate different turn-taking patterns based on meeting type"""
        if meeting_type in ['discussion', 'unknown']:
            return audio_samples  # Keep original patterns
        
        modified_samples = copy.deepcopy(audio_samples)
        
        if meeting_type == 'brainstorm':
            # More rapid speaker switches and interruptions
            for i in range(1, len(modified_samples)):
                if random.random() < 0.15:  # 15% chance of rapid switch
                    # Swap left/right dominance
                    left = modified_samples[i]['leftMic']
                    right = modified_samples[i]['rightMic']
                    modified_samples[i]['leftMic'] = right
                    modified_samples[i]['rightMic'] = left
                    modified_samples[i]['difference'] = right - left
                    modified_samples[i]['averageLevel'] = (left + right) / 2
        
        elif meeting_type == 'presentation':
            # One dominant speaker with occasional questions
            total_samples = len(modified_samples)
            dominant_side = random.choice(['left', 'right'])
            
            for i, sample in enumerate(modified_samples):
                if random.random() < 0.8:  # 80% time dominant speaker
                    if dominant_side == 'left':
                        sample['leftMic'] *= 1.3
                        sample['rightMic'] *= 0.7
                    else:
                        sample['rightMic'] *= 1.3
                        sample['leftMic'] *= 0.7
                    
                    sample['difference'] = sample['leftMic'] - sample['rightMic']
                    sample['averageLevel'] = (sample['leftMic'] + sample['rightMic']) / 2
        
        elif meeting_type == 'argument':
            # More interruptions and overlapping speech
            for i in range(len(modified_samples)):
                if random.random() < 0.25:  # 25% chance of overlapping speech
                    # Increase both sides (overlapping)
                    modified_samples[i]['leftMic'] *= 1.2
                    modified_samples[i]['rightMic'] *= 1.2
                    modified_samples[i]['averageLevel'] = (modified_samples[i]['leftMic'] + modified_samples[i]['rightMic']) / 2
        
        return modified_samples
    
    def generate_augmented_sample(self, original_session_id, augmentation_params):
        """Generate one augmented sample from original recording"""
        original_data = self.original_audio_data[original_session_id]
        audio_samples = copy.deepcopy(original_data['audio'])
        original_labels = copy.deepcopy(original_data['labels'])
        
        # Apply augmentations in sequence
        if 'time_stretch' in augmentation_params:
            audio_samples = self.time_stretch_audio(audio_samples, augmentation_params['time_stretch'])
        
        if 'pitch_shift' in augmentation_params:
            audio_samples = self.pitch_shift_audio(audio_samples, augmentation_params['pitch_shift'])
        
        if 'stereo_position' in augmentation_params:
            audio_samples = self.adjust_stereo_positioning(audio_samples, augmentation_params['stereo_position'])
        
        if 'background_noise' in augmentation_params:
            audio_samples = self.add_background_noise(audio_samples, augmentation_params['background_noise'])
        
        if 'energy_level' in augmentation_params:
            audio_samples = self.adjust_energy_level(audio_samples, augmentation_params['energy_level'])
        
        if 'meeting_type' in augmentation_params:
            audio_samples = self.simulate_turn_taking(audio_samples, augmentation_params['meeting_type'])
        
        # Update labels based on augmentation
        new_labels = copy.deepcopy(original_labels)
        
        # Update labels with augmentation effects
        if 'energy_level' in augmentation_params:
            new_labels['energy_level'] = augmentation_params['energy_level']
        
        if 'background_noise' in augmentation_params:
            new_labels['background_noise'] = augmentation_params['background_noise']
        
        if 'meeting_type' in augmentation_params:
            new_labels['meeting_type'] = augmentation_params['meeting_type']
        
        # Update duration and sample count
        new_labels['sample_count'] = len(audio_samples)
        if 'time_stretch' in augmentation_params:
            original_duration = original_labels['duration_seconds']
            new_labels['duration_seconds'] = original_duration / augmentation_params['time_stretch']
        
        return audio_samples, new_labels
    
    def generate_augmentation_variations(self):
        """Generate all possible augmentation parameter combinations"""
        variations = []
        
        # Time stretching variations
        time_stretches = [0.8, 0.9, 1.0, 1.1, 1.2]
        
        # Pitch shifting variations
        pitch_shifts = [-2, -1, 0, 1, 2]
        
        # Stereo positioning variations
        stereo_positions = [-0.6, -0.3, 0, 0.3, 0.6]
        
        # Background noise variations
        bg_noises = ['none', 'low', 'medium', 'high']
        
        # Energy level variations
        energy_levels = ['low', 'medium', 'high']
        
        # Meeting type variations
        meeting_types = ['discussion', 'presentation', 'brainstorm', 'argument']
        
        # Generate combinations (sample to get target number)
        target_per_original = self.target_samples_per_original
        
        for _ in range(target_per_original):
            variation = {
                'time_stretch': random.choice(time_stretches),
                'pitch_shift': random.choice(pitch_shifts),
                'stereo_position': random.choice(stereo_positions),
                'background_noise': random.choice(bg_noises),
                'energy_level': random.choice(energy_levels),
                'meeting_type': random.choice(meeting_types)
            }
            variations.append(variation)
        
        return variations
    
    def augment_dataset(self):
        """Generate complete augmented dataset"""
        print(f"\nStarting dataset augmentation...")
        
        if not self.original_audio_data:
            self.load_original_data()
        
        # Generate augmentation variations
        variations = self.generate_augmentation_variations()
        print(f"Generated {len(variations)} augmentation variations")
        
        total_generated = 0
        
        # Process each original recording
        for session_id in self.original_audio_data.keys():
            print(f"\nProcessing {session_id}...")
            
            session_generated = 0
            
            # Generate variations for this session
            for i, variation in enumerate(variations):
                try:
                    # Generate augmented sample
                    aug_audio, aug_labels = self.generate_augmented_sample(session_id, variation)
                    
                    # Create new session ID
                    aug_session_id = f"{session_id}_aug_{i:03d}"
                    
                    # Update labels
                    aug_labels['session_id'] = aug_session_id
                    aug_labels['original_session_id'] = session_id
                    aug_labels['augmentation_params'] = variation
                    aug_labels['generated_time'] = datetime.now().isoformat()
                    
                    # Save augmented data
                    self.save_augmented_sample(aug_session_id, aug_audio, aug_labels)
                    
                    session_generated += 1
                    total_generated += 1
                    
                    if session_generated % 10 == 0:
                        print(f"   Generated {session_generated}/{len(variations)} samples")
                
                except Exception as e:
                    print(f"Error generating sample {i}: {e}")
            
            print(f"Completed {session_id}: {session_generated} samples generated")
        
        print(f"\nAugmentation complete!")
        print(f"Total samples generated: {total_generated}")
        print(f"Original recordings: {len(self.original_audio_data)}")
        print(f"Augmentation ratio: {total_generated/len(self.original_audio_data):.1f}x")
        
        # Create master CSV for augmented data
        self.create_augmented_master_csv()
        
        return total_generated
    
    def save_augmented_sample(self, session_id, audio_data, labels):
        """Save individual augmented sample"""
        # Save audio data
        audio_file = self.output_dir / f"{session_id}_audio.json"
        with open(audio_file, 'w') as f:
            json.dump(audio_data, f, indent=2)
        
        # Save labels
        labels_file = self.output_dir / f"{session_id}_labels.json"
        with open(labels_file, 'w') as f:
            json.dump(labels, f, indent=2)
        
        # Store for CSV creation
        self.augmented_sessions.append({
            'session_id': session_id,
            'start_time': labels.get('start_time', ''),
            'duration_seconds': labels.get('duration_seconds', 0),
            'sample_count': labels.get('sample_count', 0),
            'speaker_count': labels.get('speaker_count', 'unknown'),
            'meeting_type': labels.get('meeting_type', 'unknown'),
            'energy_level': labels.get('energy_level', 'unknown'),
            'background_noise': labels.get('background_noise', 'unknown'),
            'original_session_id': labels.get('original_session_id', ''),
            'notes': 'Augmented sample'
        })
    
    def create_augmented_master_csv(self):
        """Create master CSV for augmented dataset"""
        if not self.augmented_sessions:
            print("No augmented sessions to save")
            return
        
        aug_df = pd.DataFrame(self.augmented_sessions)
        csv_file = self.output_dir / "augmented_sessions_master.csv"
        aug_df.to_csv(csv_file, index=False)
        
        print(f"   Saved augmented master CSV: {csv_file}")
        print(f"   Augmented dataset summary:")
        print(f"   Total samples: {len(aug_df)}")
        print(f"   Speaker counts: {aug_df['speaker_count'].value_counts().to_dict()}")
        print(f"   Meeting types: {aug_df['meeting_type'].value_counts().to_dict()}")
        print(f"   Energy levels: {aug_df['energy_level'].value_counts().to_dict()}")
        print(f"   Background noise: {aug_df['background_noise'].value_counts().to_dict()}")
    
    def run_augmentation_pipeline(self):
        """Run complete augmentation pipeline"""
        print("Meeting Audio Data Augmentation Pipeline")
        print("="*80)
        
        # Load original data
        self.load_original_data()
        
        # Run augmentation
        total_generated = self.augment_dataset()
        
        print(f"\n" + "="*80)
        print("AUGMENTATION PIPELINE COMPLETE!")
        print("="*80)
        
        print(f"   Results:")
        print(f"   Original recordings: {len(self.original_audio_data)}")
        print(f"   Generated samples: {total_generated}")
        print(f"   Total dataset size: {len(self.original_audio_data) + total_generated}")
        print(f"   Augmentation factor: {total_generated/len(self.original_audio_data):.1f}x")
        
        print(f"\n Output Location:")
        print(f"   Augmented data: {self.output_dir}")
        print(f"   Master CSV: {self.output_dir}/augmented_sessions_master.csv")
        
        print(f"\n Data is ready for ML Training:")
        print(f"   Balanced dataset across all categories")
        print(f"   Realistic audio variations")
        print(f"   Proper label preservation")
        print(f"   ML-ready format")
        
        return self.output_dir

def main():
    """Build an augmenter on the default data directory and run the pipeline.

    Returns:
        The ``MeetingAudioAugmenter`` instance, so its state can be inspected
        afterwards.
    """
    print("Meeting Audio Data Augmenter")
    print("=" * 50)

    augmenter = MeetingAudioAugmenter()
    augmenter.run_augmentation_pipeline()

    # Follow-up checklist printed after a completed run.
    next_steps = (
        "\n My next steps:",
        "   1. Run: python dataset_analyzer.ipynb (to analyze augmented data)",
        "   2. Build ML models using the augmented dataset",
        "   3. Train and validate models",
        "   4. Deploy to your web interface",
    )
    for line in next_steps:
        print(line)

    return augmenter

# Entry point: runs the full pipeline when this code is executed directly
# and keeps the augmenter in a module-level name for later inspection.
if __name__ == "__main__":
    augmenter = main()

Meeting Audio Data Augmenter
 Meeting Audio Augmenter initialized
 Input: 8 original recordings
 Target: 480 augmented samples
Meeting Audio Data Augmentation Pipeline

 Loading original audio data...
Loaded meeting_20250617_144307: 367 samples
Loaded meeting_20250617_145139: 650 samples
Loaded meeting_20250620_130013: 618 samples
Loaded meeting_20250620_130618: 534 samples
Loaded meeting_20250624_132317: 568 samples
Loaded meeting_20250624_132856: 689 samples
Loaded meeting_20250624_133532: 450 samples
Loaded meeting_20250624_135319: 196 samples

Starting dataset augmentation...
Generated 60 augmentation variations

Processing meeting_20250617_144307...
   Generated 10/60 samples
   Generated 20/60 samples
   Generated 30/60 samples
   Generated 40/60 samples
   Generated 50/60 samples
   Generated 60/60 samples
Completed meeting_20250617_144307: 60 samples generated

Processing meeting_20250617_145139...
   Generated 10/60 samples
   Generated 20/60 samples
   Generated 30/60 samples