In [4]:
import numpy as np
import tensorflow as tf
import os
import json
import librosa
import soundfile as sf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import TensorBoard, ReduceLROnPlateau, EarlyStopping
import datetime
import pickle
import matplotlib.pyplot as plt
import random
import argparse
import multiprocessing
from tensorflow.python.framework import config

# Intel CPU optimizations using standard TensorFlow
config.set_visible_devices([], 'GPU')  # Disable GPU for Intel CPU optimization

# Enable OneDNN optimizations (built into standard TF)
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '1'

class NSynthDataLoader:
    def __init__(self, data_path: str, max_files: int = 1000, num_workers: int = 4):
        self.data_path = os.path.join(data_path, "nsynth-train")
        self.max_files = max_files
        self.num_workers = num_workers
        self.metadata = {}
        self._load_metadata()
        
    def _load_metadata(self):
        metadata_path = os.path.join(self.data_path, "examples.json")
        if not os.path.exists(metadata_path):
            raise FileNotFoundError(f"Metadata file not found at {metadata_path}")
        
        print(f"Loading metadata from {metadata_path}")
        with open(metadata_path, 'r') as f:
            self.metadata = json.load(f)
        print(f"Loaded metadata for {len(self.metadata)} samples")
    
    def extract_features(self, y: np.ndarray, sr: int) -> np.ndarray:
        # Validate input audio
        if y.shape[0] == 0:
            raise ValueError("Empty audio input")
        
        # Intel-optimized feature extraction
        with tf.device('/CPU:0'):
            mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
            mel_db = librosa.power_to_db(mel_spec, ref=np.max)
            mel_mean = np.mean(mel_db, axis=1)

            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            mfccs_mean = np.mean(mfccs, axis=1)

            contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
            contrast_mean = np.mean(contrast, axis=1)

            features = np.concatenate([mel_mean, mfccs_mean, contrast_mean])
        
        # Validate extracted features
        if np.any(np.isnan(features)) or np.any(np.isinf(features)):
            raise ValueError("Invalid features detected (NaN or Inf values)")
            
        return features

    def _process_file(self, wav_file):
        try:
            file_id = wav_file.split('.')[0]
            if file_id not in self.metadata:
                return None, None
            
            audio_path = os.path.join(self.data_path, "audio")
            y, sr = librosa.load(os.path.join(audio_path, wav_file), sr=16000, duration=4.0)
            
            # Validate audio data
            if len(y) < sr * 0.5:  # Ensure at least 0.5 seconds of audio
                return None, None
                
            features = self.extract_features(y, sr)
            
            metadata = self.metadata[file_id]
            sample_params = [
                float(metadata['pitch']) / 127.0,
                float(metadata['velocity']) / 127.0,
                float(metadata['instrument_family']) / 11.0,
                float(metadata['instrument_source']) / 3.0
            ]
            
            return sample_params, features
            
        except Exception as e:
            print(f"Error processing {wav_file}: {str(e)}")
            return None, None

    def load_data(self):
        sample_data, audio_features = [], []
        audio_path = os.path.join(self.data_path, "audio")
        wav_files = [f for f in os.listdir(audio_path) if f.endswith('.wav')]
        
        # Randomly shuffle the list of files
        random.shuffle(wav_files)
        wav_files = wav_files[:self.max_files]
        
        print(f"Processing {len(wav_files)} files with {self.num_workers} workers")
        
        # Parallel processing with Intel optimizations
        with multiprocessing.Pool(processes=self.num_workers) as pool:
            results = pool.map(self._process_file, wav_files)
        
        # Filter out None results and separate data
        for sample, feature in results:
            if sample is not None and feature is not None:
                sample_data.append(sample)
                audio_features.append(feature)
        
        print(f"Successfully processed {len(sample_data)} files")
        return np.array(sample_data), np.array(audio_features)

class NSynthSequencePreparation:
    def __init__(self, seq_length: int = 32):
        self.seq_length = seq_length
        self.scaler = StandardScaler()
        self.feature_dims = None
    
    def validate_scaler(self, data):
        """Validate scaler transformation"""
        transformed = self.scaler.transform(data)
        if np.any(np.isnan(transformed)) or np.any(np.isinf(transformed)):
            raise ValueError("Invalid scaling detected (NaN or Inf values)")
        return transformed

    def prepare_data(self, sample_data, audio_features):
        if len(sample_data) != len(audio_features):
            raise ValueError("Mismatched sample and feature lengths")
            
        combined_data = np.concatenate([sample_data, audio_features], axis=1)
        self.feature_dims = combined_data.shape[1]
        
        # Fit the scaler to the combined data
        self.scaler.fit(combined_data)
        combined_data = self.validate_scaler(combined_data)
        
        # Use optimized NumPy operations for sequence creation
        sequences, targets = [], []
        for i in range(len(combined_data) - self.seq_length):
            sequences.append(combined_data[i:i + self.seq_length])
            targets.append(combined_data[i + self.seq_length])
        
        return np.array(sequences), np.array(targets)
    
    def save_scaler(self, path: str):
        """Save the fitted scaler and feature dimensions"""
        if not hasattr(self.scaler, 'mean_'):
            raise ValueError("Scaler has not been fitted to data")
            
        scaler_data = {
            'scaler': self.scaler,
            'feature_dims': self.feature_dims
        }
        
        with open(path, 'wb') as f:
            pickle.dump(scaler_data, f)
        print(f"Scaler saved to {path}")
    
    @staticmethod
    def load_scaler(path: str):
        """Load and validate saved scaler"""
        with open(path, 'rb') as f:
            scaler_data = pickle.load(f)
            
        if not isinstance(scaler_data, dict) or 'scaler' not in scaler_data:
            raise ValueError("Invalid scaler file format")
            
        return scaler_data['scaler'], scaler_data['feature_dims']

def create_model(input_shape, output_shape):
    # Use TensorFlow optimized for Intel CPUs
    inputs = tf.keras.Input(shape=input_shape)
    
    # First LSTM layer - optimized for CPU
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(512, return_sequences=True, 
                          implementation=2)  # Implementation 2 is faster on CPU
    )(inputs)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(0.6)(x)
    
    # Second LSTM layer
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(512, implementation=2)
    )(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(0.6)(x)
    
    # Dense layers with appropriate kernel initializers
    x = tf.keras.layers.Dense(512, activation='relu', 
                            kernel_initializer='he_normal')(x)
    x = tf.keras.layers.Dense(512, activation='relu', 
                            kernel_initializer='he_normal')(x)
    
    outputs = tf.keras.layers.Dense(output_shape, activation='linear')(x)
    
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    
    return model

def validate_model(model, val_data, val_targets):
    """Validate model predictions"""
    predictions = model.predict(val_data, batch_size=64)
    mse = np.mean((predictions - val_targets) ** 2)
    print(f"Validation MSE: {mse}")
    return mse < 2.0  # Threshold based on your specific needs

def main(data_path='nsynth_small', 
         max_files=70000, 
         seq_length=32, 
         batch_size=64, 
         epochs=10, 
         num_workers=4,
         checkpoint_dir='./checkpoints'):
    """
    Main function with parameters that can be called directly in a Jupyter notebook
    """
    # Create checkpoint directory if it doesn't exist
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    # Configure Intel optimizations
    print("Configuring Intel optimizations...")
    os.environ['KMP_BLOCKTIME'] = '1'
    os.environ['KMP_SETTINGS'] = '1'
    os.environ['KMP_AFFINITY'] = 'granularity=fine,verbose,compact,1,0'
    os.environ['OMP_NUM_THREADS'] = str(multiprocessing.cpu_count())
    os.environ['TF_ENABLE_ONEDNN_OPTS'] = '1'
    
    # Initialize components
    data_loader = NSynthDataLoader(data_path, max_files=max_files, num_workers=num_workers)
    sequence_prep = NSynthSequencePreparation(seq_length=seq_length)
    
    # Load and prepare data
    print("Loading data...")
    sample_data, audio_features = data_loader.load_data()
    print("Preparing sequences...")
    x, y = sequence_prep.prepare_data(sample_data, audio_features)
    
    # Split data
    x_train, x_val, y_train, y_val = train_test_split(
        x, y, test_size=0.2, random_state=42
    )
    
    # Save scaler
    sequence_prep.save_scaler(os.path.join(checkpoint_dir, 'scaler.pkl'))
    
    # Create and compile model
    print("Creating model...")
    model = create_model(
        input_shape=(x.shape[1], x.shape[2]), 
        output_shape=y.shape[1]
    )
    
    # Standard optimizer with good defaults for Intel CPUs
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.Huber(),
        metrics=['mae', 'mse']
    )
    
    # Print model summary
    model.summary()
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(checkpoint_dir, 'model_checkpoint-{epoch:02d}.h5'),
            save_best_only=True,
            monitor='val_loss'
        ),
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=1e-6
        ),
        EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        ),
        TensorBoard(
            log_dir=os.path.join(checkpoint_dir, 'logs'),
            histogram_freq=1
        )
    ]
    
    # Train model
    print("Training model...")
    history = model.fit(
        x_train, y_train,
        validation_data=(x_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        workers=num_workers,
        use_multiprocessing=True
    )
    
    # Validate final model
    print("Validating model...")
    if validate_model(model, x_val, y_val):
        print("Model validation successful")
        model.save(os.path.join(checkpoint_dir, 'nsynth_model_final.h5'))
        
        # Save training history
        with open(os.path.join(checkpoint_dir, 'training_history.pkl'), 'wb') as f:
            pickle.dump(history.history, f)
        
        # Plot training history
        plt.figure(figsize=(10, 6))
        plt.plot(history.history['loss'], label='Train Loss')
        plt.plot(history.history['val_loss'], label='Val Loss')
        plt.title('Model Loss During Training')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.savefig(os.path.join(checkpoint_dir, 'training_history.png'))
        plt.close()
    else:
        print("Model validation failed")

if __name__ == "__main__":
    # If running as a script, use argparse
    try:
        parser = argparse.ArgumentParser(description='NSynth audio model training on Intel Tiber Cloud')
        parser.add_argument('--data_path', type=str, default='nsynth_small', help='Path to NSynth dataset')
        parser.add_argument('--max_files', type=int, default=70000, help='Maximum number of files to process')
        parser.add_argument('--seq_length', type=int, default=32, help='Sequence length for LSTM')
        parser.add_argument('--batch_size', type=int, default=64, help='Training batch size')
        parser.add_argument('--epochs', type=int, default=10, help='Number of training epochs')
        parser.add_argument('--num_workers', type=int, default=4, help='Number of worker processes for data loading')
        parser.add_argument('--checkpoint_dir', type=str, default='./checkpoints', help='Directory to save checkpoints')
        args = parser.parse_args()
        
        main(
            data_path=args.data_path,
            max_files=args.max_files,
            seq_length=args.seq_length,
            batch_size=args.batch_size,
            epochs=args.epochs,
            num_workers=args.num_workers,
            checkpoint_dir=args.checkpoint_dir
        )
    except SystemExit:
        # Running in Jupyter notebook, parameters will be provided directly to main()
        pass

usage: ipykernel_launcher.py [-h] [--data_path DATA_PATH]
                             [--max_files MAX_FILES] [--seq_length SEQ_LENGTH]
                             [--batch_size BATCH_SIZE] [--epochs EPOCHS]
                             [--num_workers NUM_WORKERS]
                             [--checkpoint_dir CHECKPOINT_DIR]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/ue7a5adbf8d5127b839fde345138342d/.local/share/jupyter/runtime/kernel-190721db-f7a2-4717-afa6-379c61893fe6.json


In [None]:
# Configure paths and parameters for your environment
main(
    data_path='nsynth_small',  # Update with your actual dataset path
    max_files=10000,  # Start with a smaller number for testing
    seq_length=32,
    batch_size=64,
    epochs=10,
    num_workers=8,  # Adjust based on your CPU cores
    checkpoint_dir='./nsynth_checkpoints'
)

Configuring Intel optimizations...
Loading metadata from nsynth_small/nsynth-train/examples.json
Loaded metadata for 289205 samples
Loading data...
Processing 902 files with 8 workers
