In [None]:
# Install necessary packages
!pip install tensorflow
!pip install mmap_ninja
!pip install pyyaml
!pip install datasets

In [6]:
# Save a yaml config that controls the training process
import yaml
import os

from microwakeword.feature_generation import ClipsHandler

# Training configuration; later statements add more keys before it is written out as YAML.
config = dict()

# Previously used value: 'trained_models/test_new_formulation'
config['train_dir'] = 'trained_models/alexa_mixednet_5_min_jitter'


# Layout expected inside every feature_dir (at least one of these folders must exist,
# each holding ragged mmap folders whose names end in "mmap"):
#   training/
#   testing/
#   testing_ambient/
#   validation/
#   validation_ambient/
#
# Keys used by each feature set:
#   sampling_weight      - weight for choosing a spectrogram from this set in the batch
#   penalty_weight       - penalizing weight for incorrect predictions from this set
#   truth                - True for a set of positive samples, False for negative samples
#   truncation_strategy  - how spectrograms longer than training needs are shortened:
#       random          - take a random portion of the whole spectrogram (useful for long negative samples)
#       truncate_start  - remove the start of the spectrogram
#       truncate_end    - remove the end of the spectrogram
#       split           - split into separate spectrograms offset by 100 ms (ambient sets only)


def _mmap_feature_set(features_dir, sampling_weight, penalty_weight, truth, truncation_strategy):
    """Return the config entry for one pre-generated feature set of type "mmap".

    The arguments are stored unchanged under the keys of the same name.
    """
    return {
        'features_dir': features_dir,
        'sampling_weight': sampling_weight,
        'penalty_weight': penalty_weight,
        'truth': truth,
        'truncation_strategy': truncation_strategy,
        'type': "mmap",
    }


# Positive set of type "clips": wav files matched by 'input_glob' under 'features_dir',
# configured with impulse responses, background clips and per-augmentation probabilities.
_augmented_positive_clips = {
    'features_dir': '/Users/kahrendt/Documents/Hobbies/Programming/Git-Repositories/piper-sample-generator/output/alexa',
    'input_glob': '*2/*.wav',
    # Alternative impulse responses: ['mit_rirs/']
    'impulse_paths': ['/Volumes/MachineLearning/audio_samples/background_noise_samples/wav_16k/bird_rir_all_channels_16k'],
    'background_paths': ['/Volumes/MachineLearning/audio_samples/background_noise_samples/background_clips_for_mixing'],
    'augmentation_probabilities': {
        "SevenBandParametricEQ": 0.1,
        "TanhDistortion": 0.1,
        "PitchShift": 0.1,
        "BandStopFilter": 0.1,
        "AddColorNoise": 0.25,
        "AddBackgroundNoise": 0.75,
        "Gain": 1.0,
        "RIR": 0.5,
    },
    'augmented_duration_s': 2.99,
    'max_start_time_from_right_s': None,
    'max_jitter_s': 0.2,
    'min_jitter_s': 0.05,
    'max_clip_duration_s': 1.37,
    'min_clip_duration_s': None,
    'sampling_weight': 0.5,
    'penalty_weight': 1,
    'truth': True,
    'truncation_strategy': 'truncate_start',
    'type': "clips",
}

config['features'] = [
    # Positive samples
    _mmap_feature_set(
        '/Volumes/MachineLearning/training_data/alexa/positive_min_end_jitter',
        0.0, 1, True, 'truncate_start',
    ),
    _augmented_positive_clips,
    # Negative samples
    _mmap_feature_set(
        '/Volumes/MachineLearning/training_data/alexa/old_negative',
        0.5, 0.5, False, 'truncate_start',
    ),
    _mmap_feature_set(
        '/Volumes/MachineLearning/training_data/negative_datasets/speech_background_new',
        4, 1, False, 'random',
    ),
    _mmap_feature_set(
        '/Volumes/MachineLearning/training_data/negative_datasets/dinner_party_background_new',
        3, 1, False, 'random',
    ),
    _mmap_feature_set(
        '/Volumes/MachineLearning/training_data/negative_datasets/no_speech_background_new',
        3, 1, False, 'random',
    ),
    # Ambient set, uses the 'split' strategy
    _mmap_feature_set(
        '/Volumes/MachineLearning/training_data/negative_datasets/ambient_background',
        0.0, 1, False, 'split',
    ),
]

# config['features'] = [
#         {
#             'features_dir': '/Volumes/MachineLearning/training_data/negative_datasets/no_speech_background',
#             'sampling_weight': 150,
#             'penalty_weight': 1,
#             'truth': False,
#             'truncation_strategy': 'random',
#             'type': "mmap",            
#         },
#     ]
# features_directory = '/Volumes/MachineLearning/multilingual_stuff/mswc_features/'
# words = os.listdir(features_directory)

# words = [x for x in words if not x.startswith('.')]

# for index, word in enumerate(words):
#     config['features'].append({
#             'features_dir': os.path.join(features_directory, word),
#             'sampling_weight': 1,
#             'penalty_weight': 1,
#             'truth': True,
#             'truncation_strategy': 'truncate_start',
#             'type': "mmap",        
#     })


# Schedule and augmentation settings. Entries documented as "per iteration" are lists
# that correspond to the entries of 'training_steps'.
# NOTE(review): several of these lists hold a single value while 'training_steps' has
# three entries - presumably the trainer reuses it for the remaining iterations; confirm.
config.update({
    # Number of training steps in each iteration
    'training_steps': [20000, 20000, 20000],

    # Penalizing weights for incorrect class predictions (per iteration)
    'positive_class_weight': [1],
    'negative_class_weight': [1, 3, 5],

    # Learning rates for the Adam optimizer (per iteration)
    'learning_rates': [0.001, 0.0005, 0.0002],
    'batch_size': 128,

    # Probability of applying MixUp / FreqMix augmentation (per iteration)
    'mix_up_augmentation_prob': [0],
    'freq_mix_augmentation_prob': [0],

    # SpecAugment time and frequency masking (per iteration)
    'time_mask_max_size': [10],
    'time_mask_count': [0],
    'freq_mask_max_size': [4],
    'freq_mask_count': [2],

    # Test the validation sets after every this many steps
    'eval_step_interval': 500,
})

# Best-weights selection: the specified minimization metric is first driven below
# 'target_minimization'; once that target is met, the weights with the highest
# maximization metric are chosen. With 'minimization_metric' set to None only the
# maximization metric is used.
#
# Available metrics:
#   - "loss"                              cross entropy error on validation set
#   - "accuracy"                          accuracy of validation set
#   - "recall"                            recall of validation set
#   - "precision"                         precision of validation set
#   - "false_positive_rate"               false positive rate of validation set
#   - "false_negative_rate"               false negative rate of validation set
#   - "ambient_false_positives"           count of false positives from the split validation_ambient set
#   - "ambient_false_positives_per_hour"  estimated number of false positives per hour on the split validation_ambient set
#
# NOTE(review): 'recall_at_no_faph' (used below) does not appear in the list above;
# confirm that the trainer supports it.
#
# Fixed settings for the default feature generator, left unset here:
#   window_stride_ms = 20, window_size_ms = 30, sample_rate = 16000
config.update(
    # Maximum length of wake word that the streaming model will accept (earlier value: 650)
    clip_duration_ms=210,

    target_minimization=1.0,

    # Earlier combination: minimization_metric='ambient_false_positives_per_hour',
    # maximization_metric='recall'
    minimization_metric=None,  # None disables the minimization stage
    maximization_metric='recall_at_no_faph',

    binary_classification=False,
)

# Write the training configuration as YAML into the current working directory; the
# training command below passes this file name via --training_config.
# The path is given directly (a single-argument os.path.join is a no-op), and the result
# of yaml.dump is not kept because it returns nothing when it is given a stream.
with open('training_parameters.yaml', 'w') as config_file:
    yaml.dump(config, config_file)

In [7]:
# Run microwakeword.model_train_eval as a module in a shell subprocess, configured by
# training_parameters.yaml. As set here: --train and --restore_checkpoint are 1; of the
# four --test_* flags only --test_tflite_streaming_quantized is 1; --use_weights is
# "best_weights". `mixednet` is presumably the model sub-command, and the options after
# it each give five comma-separated entries (one kernel-size list per entry for
# --mixconv_kernel_sizes), plus --first_conv_filters 0.
# NOTE(review): the exact meaning of each flag is defined by microwakeword.model_train_eval;
# confirm there before changing values.
# NOTE(review): `!python` runs whichever `python` the shell finds first on PATH, which is
# not necessarily the interpreter of this kernel.
!python -m microwakeword.model_train_eval \
--training_config='training_parameters.yaml' \
--train 1 \
--restore_checkpoint 1 \
--test_tf_nonstreaming 0 \
--test_tflite_nonstreaming 0 \
--test_tflite_streaming 0 \
--test_tflite_streaming_quantized 1 \
--use_weights "best_weights" \
mixednet \
--pointwise_filters "64, 32, 48, 64, 64" \
--repeat_in_block  "1, 1, 1, 1, 1" \
--mixconv_kernel_sizes "[5], [5,9], [9,13], [13,17], [17,29]" \
--residual_connection "0,0,0,0,0" \
--first_conv_filters 0
# mixednet \
# --activation 'relu' \
# --dropout 0.0 \
# --dropout_final_layer 0.0 \
# --ds_filters '32, 40, 40, 40, 48, 64' \
# --ds_filters2 '32, 32, 32, 32, 32, 32' \
# --ds_repeat '1, 2, 2, 2, 1, 0' \
# --ds_residual '0, 0, 1, 1, 0, 0' \
# --ds_kernel_size '5, 7, 9, 11, 19, 1' \
# --ds_stride '1, 1, 1, 1, 1, 1' \
# --ds_dilation '1, 1, 1, 1, 1, 1' \
# --ds_padding "'valid','valid','valid','valid','valid','valid'" \
# --ds_pool '1,1,1,1,1,1' \
# --max_pool '0' \
# --first_conv_filters 32
# bc_wide_matchbox \
# --activation 'relu' \
# --dropout 0.0 \
# --dropout_final_layer 0.0 \
# --ds_filters '32, 24, 24, 24, 32, 32' \
# --ds_filters2 '32, 32, 32, 32, 32, 32' \
# --ds_repeat '1, 2, 2, 2, 1, 0' \
# --ds_residual '0, 0, 0, 0, 0, 0' \
# --ds_kernel_size '5, 7, 9, 11, 19, 1' \
# --ds_stride '1, 1, 1, 1, 1, 1' \
# --ds_dilation '1, 1, 1, 1, 1, 1' \
# --ds_padding "'valid','valid','valid','valid','valid','valid'" \
# --ds_pool '1,1,1,1,1,1' \
# --max_pool '1' \
# --freq_stride '1,2,1,2,1,1' \
# --first_conv_filters 24



INFO:absl:Loading and analyzing data sets.
INFO:absl:training mode has 1146843 spectrograms representing 1417.5 hours of audio
INFO:absl:validation mode has 17358 spectrograms representing 19.2 hours of audio
INFO:absl:validation_ambient mode has 10 spectrograms representing 5.3 hours of audio
INFO:absl:testing mode has 17359 spectrograms representing 19.2 hours of audio
INFO:absl:testing_ambient mode has 10 spectrograms representing 5.3 hours of audio
2024-05-12 07:46:37.641952: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2024-05-12 07:46:37.641980: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-05-12 07:46:37.641985: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-05-12 07:46:37.642014: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
202