# Pre-Process AutoRPT
In this notebook, I will pre-process the AutoRPT data so that we can experiment with traditional ML models and neural networks.

Each code block below is accompanied by a short explanation of what it does.

In [1]:
# Import the libraries:

import pandas as pd
import numpy as np
import librosa
import soundfile as sf
from pathlib import Path
import pickle
from sklearn.model_selection import train_test_split
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

In [5]:
class AutoRPTPreprocessor:
    """
    Preprocessor for the AutoRPT dataset.

    Pairs each ``*.wav`` file under ``data_root`` with its
    ``<stem>_annotations.csv`` files, converts the annotated event timestamps
    into frame-level binary labels, extracts frame-level acoustic features,
    and offers helpers for statistics, file-level splits and pickling.

    Args:
        data_root: Directory that is searched recursively for audio and
            annotation files.
        frame_shift_ms: Hop between consecutive frames, in milliseconds.
        target_sr: Sample rate (Hz) the audio is resampled to before feature
            extraction.
    """

    def __init__(self, data_root, frame_shift_ms=10, target_sr=16000):
        self.data_root = Path(data_root)
        self.frame_shift_ms = frame_shift_ms
        self.frame_shift_samples = int(target_sr * frame_shift_ms / 1000)  # 160 samples at 16kHz
        self.target_sr = target_sr

        # Storage for processed data
        self.file_pairs = []  # (audio_file, annotation_files)
        self.processed_data = []

    def scan_and_match_files(self):
        """
        Scan the dataset and build (audio_file, annotation_files) pairs.

        Any previously stored pairs are discarded, so calling this method
        again (e.g. when re-running a notebook cell) does not duplicate them.
        Files are visited in sorted path order so the resulting pair order,
        and any seeded split derived from it, does not depend on the order in
        which the filesystem happens to list them.

        Returns:
            Number of audio files that have at least one annotation file.
        """
        print("🔍 Scanning and matching files...")

        suffix = '_annotations.csv'

        # Get all files (sorted for a reproducible order)
        audio_files = sorted(self.data_root.rglob("*.wav"))
        annotation_files = sorted(self.data_root.rglob("*" + suffix))

        print(f"Found {len(audio_files)} audio files, {len(annotation_files)} annotation files")

        # Group annotation files by audio basename
        audio_to_annotations = defaultdict(list)
        for ann_file in annotation_files:
            # Extract basename: f1arrlp5_annotations.csv -> f1arrlp5
            # (strip only the trailing suffix, never an occurrence mid-name)
            basename = ann_file.name[:-len(suffix)]
            audio_to_annotations[basename].append(ann_file)

        # Match with audio files; build a fresh list instead of appending
        matched_pairs = []
        for audio_file in audio_files:
            basename = audio_file.stem
            if basename in audio_to_annotations:
                annotations = audio_to_annotations[basename]
                matched_pairs.append((audio_file, annotations))
                print(f"✅ Matched {basename}: 1 audio + {len(annotations)} annotations")
            else:
                print(f"⚠️ No annotations found for {basename}")

        self.file_pairs = matched_pairs

        print(f"\n📊 Final matching: {len(self.file_pairs)} complete sets")
        return len(self.file_pairs)

    def timestamps_to_frame_labels(self, timestamps, audio_duration, tolerance_ms=50):
        """
        Convert a list of event timestamps to frame-level binary labels.

        Every frame within ``tolerance_ms`` of an event is set to 1. Events
        whose tolerance window lies entirely outside the audio (before 0 or
        after the last frame) contribute nothing.

        Args:
            timestamps: List of event timestamps (in seconds); NaN entries
                are skipped.
            audio_duration: Total audio duration (seconds)
            tolerance_ms: Tolerance window around each timestamp (milliseconds)

        Returns:
            ``np.int8`` array with one 0/1 label per frame.
        """
        # Calculate total frames
        total_frames = max(0, int(audio_duration * 1000 / self.frame_shift_ms))
        frame_labels = np.zeros(total_frames, dtype=np.int8)

        # Convert the tolerance to a number of frames on each side of an event
        tolerance_frames = int(tolerance_ms / self.frame_shift_ms)

        for timestamp in timestamps:
            if pd.isna(timestamp):
                continue

            # Convert timestamp to frame index
            frame_idx = int(timestamp * 1000 / self.frame_shift_ms)

            # Clamp the tolerance window to the valid frame range
            start_frame = max(0, frame_idx - tolerance_frames)
            end_frame = min(total_frames, frame_idx + tolerance_frames + 1)

            # A window that ends before frame 0 would yield a negative stop
            # index, which NumPy interprets as "count from the end" and would
            # mark almost every frame; skip empty/out-of-range windows.
            if end_frame <= start_frame:
                continue

            frame_labels[start_frame:end_frame] = 1

        return frame_labels

    def process_annotation_pair(self, annotation_files, audio_duration):
        """
        Merge the annotation files belonging to one audio file.

        Each CSV is read with pandas; the non-NaN values of its
        ``Prominence`` and ``Boundary`` columns are taken as event timestamps.
        Events from all annotators are pooled before being converted to frame
        labels, so a frame is positive if any annotator marked an event
        within the tolerance window.

        Args:
            annotation_files: Paths of the annotation CSVs for one audio file.
            audio_duration: Duration of the audio in seconds.

        Returns:
            Dictionary with the pooled frame labels, the pooled event lists,
            per-annotator details and the number of annotators.
        """
        all_prominence_events = []
        all_boundary_events = []
        annotator_data = []

        for i, ann_file in enumerate(annotation_files):
            df = pd.read_csv(ann_file)

            # Extract events (non-NaN timestamps)
            prominence_events = df['Prominence'].dropna().tolist()
            boundary_events = df['Boundary'].dropna().tolist()

            all_prominence_events.extend(prominence_events)
            all_boundary_events.extend(boundary_events)

            # Store individual annotator data
            annotator_data.append({
                'annotator_id': i,
                'prominence_events': prominence_events,
                'boundary_events': boundary_events,
                'num_prominence': len(prominence_events),
                'num_boundary': len(boundary_events)
            })

        # Convert to frame labels
        prominence_labels = self.timestamps_to_frame_labels(all_prominence_events, audio_duration)
        boundary_labels = self.timestamps_to_frame_labels(all_boundary_events, audio_duration)

        return {
            'prominence_labels': prominence_labels,
            'boundary_labels': boundary_labels,
            'prominence_events': all_prominence_events,
            'boundary_events': all_boundary_events,
            'annotator_data': annotator_data,
            'num_annotators': len(annotation_files)
        }

    def extract_basic_features(self, audio_path):
        """
        Extract frame-level acoustic features from one audio file.

        The audio is loaded at ``self.target_sr`` and analysed with a hop of
        ``self.frame_shift_samples``. The stacked features are, in order:
        F0 (YIN), RMS energy, spectral centroid and 13 MFCCs.

        Args:
            audio_path: Path of the audio file to load.

        Returns:
            Array of shape (n_frames, 16).
        """
        # Load audio
        y, sr = librosa.load(audio_path, sr=self.target_sr)

        # Extract features at frame level
        frame_length = int(sr * 0.025)  # 25ms window
        hop_length = self.frame_shift_samples  # 10ms hop

        # F0 (pitch)
        # NOTE(review): a 25 ms frame is shorter than two periods of a 50 Hz
        # tone (20 ms each), so pitches near fmin can probably not be resolved
        # with this frame_length - confirm against the librosa.yin docs.
        f0 = librosa.yin(y, fmin=50, fmax=400, sr=sr,
                         frame_length=frame_length, hop_length=hop_length)

        # Energy/Intensity
        energy = librosa.feature.rms(y=y, frame_length=frame_length,
                                     hop_length=hop_length)[0]

        # Spectral features (analysis window left at the librosa default)
        spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr,
                                                               hop_length=hop_length)[0]

        # MFCCs (first 13)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13,
                                     hop_length=hop_length)

        # Stack features
        features = np.vstack([
            f0.reshape(1, -1),
            energy.reshape(1, -1),
            spectral_centroids.reshape(1, -1),
            mfccs
        ])  # Shape: (16, n_frames)

        return features.T  # Shape: (n_frames, 16)

    def process_single_file(self, audio_file, annotation_files):
        """
        Process one audio file together with its annotation files.

        Features and labels are truncated to their common length so that row
        ``i`` of the feature matrix lines up with label ``i``.

        Args:
            audio_file: ``Path`` of the audio file.
            annotation_files: Paths of the annotation CSVs for that file.

        Returns:
            Dictionary describing the processed file, or ``None`` if any step
            raised (the error is printed and processing of other files can
            continue).
        """
        try:
            print(f"Processing {audio_file.name}...")

            # Get audio info
            audio_info = sf.info(audio_file)
            audio_duration = audio_info.duration

            # Process annotations
            annotation_data = self.process_annotation_pair(annotation_files, audio_duration)

            # Extract audio features
            features = self.extract_basic_features(audio_file)

            # Ensure label and feature alignment
            n_feature_frames = features.shape[0]
            n_label_frames = len(annotation_data['prominence_labels'])

            # Truncate to minimum length (handle small misalignments)
            min_frames = min(n_feature_frames, n_label_frames)

            features = features[:min_frames]
            prominence_labels = annotation_data['prominence_labels'][:min_frames]
            boundary_labels = annotation_data['boundary_labels'][:min_frames]

            return {
                'file_id': audio_file.stem,
                'audio_path': str(audio_file),
                'audio_duration': audio_duration,
                'features': features,
                'prominence_labels': prominence_labels,
                'boundary_labels': boundary_labels,
                'n_frames': min_frames,
                # Rate reported by soundfile for the file on disk; the
                # features themselves are computed at self.target_sr.
                'sample_rate': audio_info.samplerate,
                'annotation_data': annotation_data
            }

        except Exception as e:
            # Deliberate best effort: report the failure and skip this file
            print(f"❌ Error processing {audio_file.name}: {e}")
            return None

    def process_all_files(self, max_files=None):
        """
        Process all matched files and store the results in
        ``self.processed_data``.

        Previously stored results are discarded first, so re-running this
        method does not duplicate entries.

        Args:
            max_files: If given, only the first ``max_files`` pairs are
                processed; ``None`` processes every pair.
        """
        if not self.file_pairs:
            print("❌ No file pairs found. Run scan_and_match_files() first.")
            return

        # `is not None` so that max_files=0 means "none", not "all"
        if max_files is not None:
            file_pairs_to_process = self.file_pairs[:max_files]
        else:
            file_pairs_to_process = self.file_pairs

        print(f"\n🚀 Processing {len(file_pairs_to_process)} file pairs...")

        self.processed_data = []

        for i, (audio_file, annotation_files) in enumerate(file_pairs_to_process):
            result = self.process_single_file(audio_file, annotation_files)

            if result is not None:
                self.processed_data.append(result)

            if (i + 1) % 10 == 0:
                print(f"   Processed {i + 1}/{len(file_pairs_to_process)} files...")

        print(f"\n✅ Successfully processed {len(self.processed_data)} files")

    def get_dataset_statistics(self):
        """
        Print summary statistics for the processed dataset: file count,
        duration, frame counts, positive-label ratios and feature dimensions.
        """
        if not self.processed_data:
            print("❌ No processed data available")
            return

        print("\n📊 Dataset Statistics:")
        print("=" * 40)

        total_frames = sum(d['n_frames'] for d in self.processed_data)
        total_prominence = sum(d['prominence_labels'].sum() for d in self.processed_data)
        total_boundary = sum(d['boundary_labels'].sum() for d in self.processed_data)
        total_duration = sum(d['audio_duration'] for d in self.processed_data)

        print(f"Total files: {len(self.processed_data)}")
        print(f"Total duration: {total_duration/60:.1f} minutes")
        print(f"Total frames: {total_frames:,}")
        print(f"Frame resolution: {self.frame_shift_ms}ms")

        if total_frames == 0:
            # Every ratio below divides by total_frames
            print("❌ Processed files contain no frames")
            return

        # Class balance (fraction of positive frames)
        prominence_ratio = total_prominence / total_frames
        boundary_ratio = total_boundary / total_frames

        print(f"\nLabel Statistics:")
        print(f"  Prominence events: {total_prominence:,} frames ({100*prominence_ratio:.2f}%)")
        print(f"  Boundary events: {total_boundary:,} frames ({100*boundary_ratio:.2f}%)")

        # Feature statistics: derive the stacked shape from the per-file
        # shapes instead of materialising one huge concatenated matrix
        total_feature_rows = sum(d['features'].shape[0] for d in self.processed_data)
        n_feature_dims = self.processed_data[0]['features'].shape[1]
        print(f"\nFeature Statistics:")
        print(f"  Feature dimensions: {n_feature_dims}")
        print(f"  Feature matrix shape: {(total_feature_rows, n_feature_dims)}")

        print(f"\nClass Balance Assessment:")
        if prominence_ratio < 0.05:
            print(f"  ⚠️ Prominence highly imbalanced ({prominence_ratio:.3f})")
        else:
            print(f"  ✅ Prominence reasonably balanced ({prominence_ratio:.3f})")

        if boundary_ratio < 0.05:
            print(f"  ⚠️ Boundary highly imbalanced ({boundary_ratio:.3f})")
        else:
            print(f"  ✅ Boundary reasonably balanced ({boundary_ratio:.3f})")

    def create_train_val_test_splits(self, train_ratio=0.7, val_ratio=0.15, random_state=42):
        """
        Create train/validation/test splits at the file level.

        Whole files are assigned to a split so frames of one recording never
        end up in more than one split. The test share is whatever remains
        after ``train_ratio`` and ``val_ratio``.

        Args:
            train_ratio: Fraction of files used for training.
            val_ratio: Fraction of files used for validation.
            random_state: Seed passed to both shuffled splits.

        Returns:
            Dict with ``'train'``, ``'val'`` and ``'test'`` lists of processed
            file records, or ``None`` when nothing has been processed.

        Raises:
            ValueError: If the ratios are not strictly between 0 and 1 or do
                not leave room for a test split.
        """
        if not self.processed_data:
            print("❌ No processed data available")
            return None

        if not (0 < train_ratio < 1 and 0 < val_ratio < 1 and train_ratio + val_ratio < 1):
            raise ValueError(
                f"train_ratio ({train_ratio}) and val_ratio ({val_ratio}) must each be in (0, 1) "
                "and sum to less than 1 so that a test split remains"
            )

        # Split at file level to avoid data leakage
        file_indices = list(range(len(self.processed_data)))

        # First split: train vs (val + test)
        train_idx, temp_idx = train_test_split(
            file_indices,
            train_size=train_ratio,
            random_state=random_state,
            shuffle=True
        )

        # Second split: val vs test; rescale val_ratio to the held-out part
        val_size = val_ratio / (1 - train_ratio)
        val_idx, test_idx = train_test_split(
            temp_idx,
            train_size=val_size,
            random_state=random_state,
            shuffle=True
        )

        splits = {
            'train': [self.processed_data[i] for i in train_idx],
            'val': [self.processed_data[i] for i in val_idx],
            'test': [self.processed_data[i] for i in test_idx]
        }

        print(f"\nDataset Splits Created:")
        print(f"  Train: {len(splits['train'])} files")
        print(f"  Validation: {len(splits['val'])} files")
        print(f"  Test: {len(splits['test'])} files")

        return splits

    def save_processed_data(self, output_path="autorpt_processed.pkl"):
        """
        Pickle the processed data, the preprocessing configuration and the
        matched file pairs (as strings) to ``output_path``.

        Missing parent directories of ``output_path`` are created.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        save_data = {
            'processed_data': self.processed_data,
            'preprocessing_config': {
                'frame_shift_ms': self.frame_shift_ms,
                'target_sr': self.target_sr,
                'frame_shift_samples': self.frame_shift_samples
            },
            'file_pairs': [(str(audio), [str(ann) for ann in anns]) for audio, anns in self.file_pairs]
        }

        with open(output_path, 'wb') as f:
            pickle.dump(save_data, f)

        print(f"Processed data saved to {output_path}")
        print(f"   File size: {output_path.stat().st_size / (1024*1024):.1f} MB")

def main(data_root="/Users/hasancan/desktop/prosody/AutoRPT_Data",
         output_path="autorpt_processed_subset.pkl"):
    """
    Run the preprocessing pipeline end to end.

    Steps: scan and match files, process every matched pair, print dataset
    statistics, print the file-level split sizes, and pickle the result.

    Args:
        data_root: Directory containing the AutoRPT audio and annotation
            files. The default is the original author's local path; pass
            your own location when running elsewhere.
        output_path: Where the pickled output is written.
    """
    print("AutoRPT Data Preprocessing Pipeline")
    print("=" * 50)

    # Initialize preprocessor
    preprocessor = AutoRPTPreprocessor(data_root)

    # Step 1: Scan and match files
    num_pairs = preprocessor.scan_and_match_files()

    if num_pairs == 0:
        print("❌ No file pairs found. Check data directory.")
        return

    # Step 2: Process every matched pair (no max_files limit is applied)
    print(f"\n🚀 Processing files...")
    preprocessor.process_all_files()

    if not preprocessor.processed_data:
        # Nothing succeeded, so there is nothing to summarise, split or save
        print("❌ No files were processed successfully. Nothing to save.")
        return

    # Step 3: Get statistics
    preprocessor.get_dataset_statistics()

    # Step 4: Create splits (called for its printed summary; the splits
    # themselves are not persisted by this pipeline)
    preprocessor.create_train_val_test_splits()

    # Step 5: Save processed data
    preprocessor.save_processed_data(output_path)

In [6]:
# Run the preprocessing pipeline when this cell is executed.
# The guard keeps main() from running if this code is imported as a module.
if __name__ == "__main__":
    main()

🎵 AutoRPT Data Preprocessing Pipeline
🔍 Scanning and matching files...
Found 142 audio files, 284 annotation files
✅ Matched f1arrlp7: 1 audio + 2 annotations
✅ Matched f1arrlp6: 1 audio + 2 annotations
✅ Matched f1arrlp4: 1 audio + 2 annotations
✅ Matched f1arrlp5: 1 audio + 2 annotations
✅ Matched f1arrlp1: 1 audio + 2 annotations
✅ Matched f1arrlp2: 1 audio + 2 annotations
✅ Matched f1arrlp3: 1 audio + 2 annotations
✅ Matched f1atrlp3: 1 audio + 2 annotations
✅ Matched f1atrlp2: 1 audio + 2 annotations
✅ Matched f1atrlp6: 1 audio + 2 annotations
✅ Matched f1atrlp7: 1 audio + 2 annotations
✅ Matched f1atrlp5: 1 audio + 2 annotations
✅ Matched f1atrlp4: 1 audio + 2 annotations
✅ Matched f1ajrlp5: 1 audio + 2 annotations
✅ Matched f1ajrlp4: 1 audio + 2 annotations
✅ Matched f1ajrlp6: 1 audio + 2 annotations
✅ Matched f1ajrlp3: 1 audio + 2 annotations
✅ Matched f1ajrlp2: 1 audio + 2 annotations
✅ Matched f1ajrlp1: 1 audio + 2 annotations
✅ Matched f1aprlp2: 1 audio + 2 annotations
✅ Mat

In [7]:
# Load full processed data
def load_full_dataset(pickle_path="autorpt_processed_subset.pkl"):
    """
    Load the pickled output of the preprocessing pipeline.

    Args:
        pickle_path: Path of the pickle file to read. The default file is
            still named "subset" but contains all 142 files.

    Returns:
        Tuple ``(processed_data, config)`` taken from the pickle's
        ``'processed_data'`` and ``'preprocessing_config'`` entries.
    """
    print("Loading FULL AutoRPT dataset...")

    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only load pickles that were produced by this notebook.
    with open(pickle_path, 'rb') as f:
        data = pickle.load(f)

    processed_data = data['processed_data']
    config = data['preprocessing_config']

    print(f"Loaded {len(processed_data)} processed files")
    print(f"Total frames: {sum(len(d['prominence_labels']) for d in processed_data):,}")

    return processed_data, config

# Load the pickled dataset and keep the per-file records and the
# preprocessing configuration as notebook-level variables
processed_data, config = load_full_dataset()

Loading FULL AutoRPT dataset...
Loaded 142 processed files
Total frames: 420,045
