In [None]:
# Install necessary packages
!pip install tensorflow
!pip install mmap_ninja
!pip install pyyaml
!pip install datasets

In [26]:
# Save a yaml config that controls the training process
import yaml
import os

# Settings dictionary that is later serialized to YAML for the training script.
config = {}

config['train_dir'] = 'trained_models/okay_nabu_no_end_depthwise'

# Root folder holding every feature set listed below. Edit this single line
# when the training data lives somewhere else.
TRAINING_DATA_DIR = '/Volumes/MachineLearning/training_data'

# Each features_dir should contain at least one of the following folders:
#  training/
#    ragged_mmap_folders_ending_in_mmap
#  testing/
#    ragged_mmap_folders_ending_in_mmap
#  testing_ambient/
#    ragged_mmap_folders_ending_in_mmap
#  validation/
#    ragged_mmap_folders_ending_in_mmap
#  validation_ambient/
#    ragged_mmap_folders_ending_in_mmap
#
# Per-set options:
#  sampling_weight: Weight for choosing a spectrogram from this set in the batch
#  penalty_weight: Penalizing weight for incorrect predictions from this set
#  truth: Boolean - True if this set holds positive samples, False if negative samples
#  truncation_strategy: How spectrograms longer than necessary for training are shortened
#       - random: choose a random portion of the entire spectrogram - useful for long negative samples
#       - truncate_start: remove the start of the spectrogram
#       - truncate_end: remove the end of the spectrogram
#       - split: split the longer spectrogram into separate spectrograms offset by 100 ms. Only for ambient sets
#  type: storage format of the set; every set in this notebook uses "mmap"

# One row per feature set:
# (path below TRAINING_DATA_DIR, sampling_weight, penalty_weight, truth, truncation_strategy)
feature_sets = [
    ('okay_nabu/generated_positive', 1, 1, True, 'truncate_start'),
    ('okay_nabu/generated_negative', 1, 1, False, 'truncate_start'),
    ('negative_datasets/english_speech_background', 2, 1, False, 'random'),
    ('negative_datasets/non_english_speech_background', 1, 1, False, 'random'),
    ('negative_datasets/dinner_party_background', 2, 3, False, 'random'),
    ('negative_datasets/no_speech_background', 1, 1, False, 'random'),
    # sampling_weight is deliberately the float 0.0 here so the dumped YAML is unchanged.
    ('negative_datasets/ambient_background', 0.0, 1, False, 'split'),
]

# Expand every row into its own dict. Separate dict objects keep yaml.dump
# from emitting anchors/aliases for shared references.
config['features'] = [
    {
        'features_dir': f'{TRAINING_DATA_DIR}/{sub_path}',
        'sampling_weight': sampling_weight,
        'penalty_weight': penalty_weight,
        'truth': truth,
        'truncation_strategy': truncation_strategy,
        'type': "mmap",
    }
    for sub_path, sampling_weight, penalty_weight, truth, truncation_strategy in feature_sets
]

# config['features'] = [
#         {
#             'features_dir': '/Volumes/MachineLearning/training_data/negative_datasets/no_speech_background',
#             'sampling_weight': 150,
#             'penalty_weight': 1,
#             'truth': False,
#             'truncation_strategy': 'random',
#             'type': "mmap",            
#         },
#     ]
# features_directory = '/Volumes/MachineLearning/multilingual_stuff/mswc_features/'
# words = os.listdir(features_directory)

# words = [x for x in words if not x.startswith('.')]

# for index, word in enumerate(words):
#     config['features'].append({
#             'features_dir': os.path.join(features_directory, word),
#             'sampling_weight': 1,
#             'penalty_weight': 1,
#             'truth': True,
#             'truncation_strategy': 'truncate_start',
#             'type': "mmap",        
#     })


# Settings given as lists hold one entry per training iteration; the entry at
# position i applies to the i-th value of 'training_steps'.
config.update({
    # Number of training steps in each iteration
    'training_steps': [10000],  # alternative: [20000,20000,20000,20000]

    # Penalizing weight for incorrect class predictions - lists that correspond to training steps
    'positive_class_weight': [1],
    'negative_class_weight': [5],

    # Learning rates for Adam optimizer - list that corresponds to training steps
    # alternatives: [0.0005,0.0002,0.0001] or [0.002, 0.001, 0.0005,0.0002]
    'learning_rates': [0.0001],
    'batch_size': 256,
})

# Augmentation settings - each list corresponds to training steps
config.update({
    'mix_up_augmentation_prob': [0],     # Probability of applying MixUp augmentation
    'freq_mix_augmentation_prob': [0],   # Probability of applying FreqMix augmentation
    'time_mask_max_size': [5],           # SpecAugment
    'time_mask_count': [2],              # SpecAugment
    'freq_mask_max_size': [7],           # SpecAugment
    'freq_mask_count': [2],              # SpecAugment
})

# Test the validation sets after every this many steps
config['eval_step_interval'] = 500

# Maximum length of wake word that the streaming model will accept
config['clip_duration_ms'] = 1490
# Fixed settings for the default feature generator (left disabled here):
# config['window_stride_ms'] = 20
# config['window_size_ms'] = 30
# config['sample_rate'] = 16000

# Selection of the best model weights happens in two stages: first the
# minimization metric is driven below target_minimization; once that target is
# met, the weights with the highest maximization metric are chosen.
# Set 'minimization_metric' to None to only maximize.
# Available metrics:
#   - "loss" - cross entropy error on validation set
#   - "accuracy" - accuracy of validation set
#   - "recall" - recall of validation set
#   - "precision" - precision of validation set
#   - "false_positive_rate" - false positive rate of validation set
#   - "false_negative_rate" - false negative rate of validation set
#   - "ambient_false_positives" - count of false positives from the split validation_ambient set
#   - "ambient_false_positives_per_hour" - estimated number of false positives per hour on the split validation_ambient set
config.update({
    'minimization_metric': 'ambient_false_positives_per_hour',  # Set to None to disable
    'target_minimization': 0.25,
    'maximization_metric': 'recall',
    'binary_classification': False,
})

# Serialize the assembled settings to the YAML file in the notebook's working directory.
# yaml.dump returns None when it is handed a stream, so its result is not kept.
with open('training_parameters.yaml', 'w') as file:
    yaml.dump(config, file)

In [28]:
# Run the microwakeword train/eval module with the settings stored in training_parameters.yaml.
# Arguments placed before `wide_matchbox` are general options; those after it configure that model.
# NOTE(review): flag meanings are inferred from their names - confirm against microwakeword.model_train_eval.
#   With --train 0 and --restore_checkpoint 1 this presumably skips training, reloads "best_weights",
#   and only runs the quantized streaming TFLite test.
!python -m microwakeword.model_train_eval \
--training_config='training_parameters.yaml' \
--train 0 \
--restore_checkpoint 1 \
--test_tf_nonstreaming 0 \
--test_tflite_nonstreaming 0 \
--test_tflite_streaming 0 \
--test_tflite_streaming_quantized 1 \
--use_weights "best_weights" \
wide_matchbox \
--activation 'relu' \
--dropout 0.0 \
--ds_filters '40, 20, 20, 20, 40, 40' \
--ds_repeat '1, 1, 1, 1, 1, 1' \
--ds_residual '0, 1, 1, 1, 0, 0' \
--ds_kernel_size '11, 13, 15, 17, 29, 1' \
--ds_stride '1, 1, 1, 1, 1, 1' \
--ds_dilation '1, 1, 1, 1, 1, 1'
# Alternative model argument sets kept for reference (disabled). To try one, swap it in
# for the `wide_matchbox` section of the command above.
# ds_tc_resnet \
# --activation 'relu' \
# --dropout 0.0 \
# --ds_filters '96, 48, 48, 48, 96, 96' \
# --ds_repeat '1, 1, 1, 1, 1, 1' \
# --ds_residual '0, 1, 1, 1, 0, 0' \
# --ds_kernel_size '11, 13, 15, 17, 29, 1' \
# --ds_stride '1, 1, 1, 1, 1, 1' \
# --ds_dilation '1, 1, 1, 1, 1, 1'
# inception \
# --cnn1_filters '32' \
# --cnn1_kernel_sizes '5' \
# --cnn1_subspectral_groups '1' \
# --cnn2_filters1 '24,24,24' \
# --cnn2_filters2 '32,64,96' \
# --cnn2_kernel_sizes '3,5,5' \
# --cnn2_subspectral_groups '1,1,1' \
# --cnn2_dilation '1,1,1' \
# --dropout 0.8


INFO:absl:Loading and analyzing data sets.
INFO:absl:training mode has 4497821 spectrograms representing 12787.0 hours of audio
INFO:absl:validation mode has 19909 spectrograms representing 27.5 hours of audio
INFO:absl:validation_ambient mode has 10 spectrograms representing 5.3 hours of audio
INFO:absl:testing mode has 30258 spectrograms representing 33.2 hours of audio
INFO:absl:testing_ambient mode has 10 spectrograms representing 5.3 hours of audio
2024-04-19 15:42:38.235770: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2024-04-19 15:42:38.235794: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-04-19 15:42:38.235801: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-04-19 15:42:38.235832: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
20