In [None]:
# Install necessary packages
!pip install tensorflow
!pip install mmap_ninja
!pip install pyyaml
!pip install datasets

In [202]:
# Save a yaml config that controls the training process
import yaml
import os

from microwakeword.feature_generation import ClipsHandler

config = {}

# config['train_dir'] = 'trained_models/test_new_formulation'
config['train_dir'] = 'trained_models/alexa_spatial_attention2'


# Each feature_dir should have at least one of the following folders with this structure:
#  training/
#    ragged_mmap_folders_ending_in_mmap
#  testing/
#    ragged_mmap_folders_ending_in_mmap
#  testing_ambient/
#    ragged_mmap_folders_ending_in_mmap
#  validation/
#    ragged_mmap_folders_ending_in_mmap
#  validation_ambient/
#    ragged_mmap_folders_ending_in_mmap
#
#  sampling_weight: Weight for choosing a spectrogram from this set in the batch
#  penalty_weight: Penalizing weight for incorrect predictions from this set
#  truth: Boolean whether this set has positive samples or negative samples
#  truncation_strategy = If spectrograms in the set are longer than necessary for training, how are they truncated
#       - random: choose a random portion of the entire spectrogram - useful for long negative samples
#       - truncate_start: remove the start of the spectrogram
#       - truncate_end: remove the end of the spectrogram
#       - split: Split the longer spectrogram into separate spectrograms offset by 100 ms. Only for ambient sets

config['features'] = [
        {
            'features_dir': 'training_sets/alexa/fast_speech_eval',
            'sampling_weight': 0.0,
            'penalty_weight': 1,
            'truth': True,
            'truncation_strategy': 'truncate_start',
            'type': "mmap",
        },
        {
            'input_path': '/Users/kahrendt/Documents/Hobbies/Programming/Git-Repositories/piper-sample-generator/output/alexa/fast_speech',
            'input_glob': '*.wav',
            # 'impulse_paths': ['mit_rirs/'], 
            'impulse_paths': ['/Users/kahrendt/Documents/Hobbies/Programming/Git-Repositories/microWakeWord/notebooks/mixing/rir'], 
            'background_paths': ['/Users/kahrendt/Documents/Hobbies/Programming/Git-Repositories/microWakeWord/notebooks/mixing/background_clips'], 
            'augmentation_probabilities': {
                    "SevenBandParametricEQ": 0.1,
                    "TanhDistortion": 0.1,
                    "PitchShift": 0.1,
                    "BandStopFilter": 0.1,
                    "AddColorNoise": 0.25,
                    "AddBackgroundNoise": 0.75,
                    "Gain": 1.0,
                    "RIR": 0.5,
                },
            'augmented_duration_s': 3.99,
            'max_start_time_from_right_s': None,
            'max_jitter_s': 0.03,
            'min_jitter_s': None,#0.00,
            'max_clip_duration_s': 1.1,   
            'min_clip_duration_s': None,
            'sampling_weight': 0.3,
            'penalty_weight': 1,
            'truth': True,
            'truncation_strategy': 'truncate_start',
            'type': "clips",   
            'generate': False,
            'generator_settings': {
                'text': ['ə lɛk sə, ', 'ə lɛk sʌ, ', 'ɐ lɛk sə, ', 'ɐ lɛk sʌ, '],
                'batch_size': 200,
                'slerp_weights': [0.8],
                'length_scales': [0.7, 0.8, 0.9, 1.0],
                'noise_scales': [0.98],
                'noise_scales_ws': [0.98],
                'max_speakers': 600,
                'phoneme_input': True,
                             #             batch_size=200, slerp_weights = [0.8], length_scales=[0.8,0.9,1.0], #noise_scales=[0.98],max_speakers=600, phoneme_input=True)
            }         
        },
        {
            'input_path': '/Users/kahrendt/Documents/Hobbies/Programming/Git-Repositories/piper-sample-generator/output/alexa/prepend',
            'input_glob': '*.wav',
            # 'impulse_paths': ['mit_rirs/'], 
            'impulse_paths': ['/Users/kahrendt/Documents/Hobbies/Programming/Git-Repositories/microWakeWord/notebooks/mixing/rir'], 
            'background_paths': ['/Users/kahrendt/Documents/Hobbies/Programming/Git-Repositories/microWakeWord/notebooks/mixing/background_clips'], 
            'augmentation_probabilities': {
                    "SevenBandParametricEQ": 0.1,
                    "TanhDistortion": 0.1,
                    "PitchShift": 0.1,
                    "BandStopFilter": 0.1,
                    "AddColorNoise": 0.25,
                    "AddBackgroundNoise": 0.75,
                    "Gain": 1.0,
                    "RIR": 0.5,
                },
            'augmented_duration_s': 3.99,
            'max_start_time_from_right_s': None,
            'max_jitter_s': 0.03,
            'min_jitter_s': None,# 0.00,
            'max_clip_duration_s': 3.0,   
            'min_clip_duration_s': None,
            'sampling_weight': 0.2,
            'penalty_weight': 1,
            'truth': True,
            'truncation_strategy': 'truncate_start',
            'type': "clips",  
            'generate': False,          
        },
        {
            'features_dir': 'training_sets/alexa/new_negatives',
            'sampling_weight': 0.5,
            'penalty_weight': 0.5,
            'truth': False,
            'truncation_strategy': 'truncate_start',
            'type': "mmap",            
        },
        {
            'features_dir': 'training_sets/speech_background_new',
            'sampling_weight': 4,
            'penalty_weight': 1,
            'truth': False,
            'truncation_strategy': 'random',
            'type': "mmap",            
        },
        {
            'features_dir': 'training_sets/dinner_party_background_new',
            'sampling_weight': 3,
            'penalty_weight': 1,
            'truth': False,
            'truncation_strategy': 'random',
            'type': "mmap",            
        },
        {
            'features_dir': 'training_sets/no_speech_background_new',
            'sampling_weight': 3,
            'penalty_weight': 1,
            'truth': False,
            'truncation_strategy': 'random',
            'type': "mmap",            
        },
        {
            'features_dir': 'training_sets/ambient_background',
            'sampling_weight': 0.0,
            'penalty_weight': 1,
            'truth': False,
            'truncation_strategy': 'split',
            'type': "mmap",            
        },
    ]

# config['features'] = [
#         {
#             'features_dir': '/Volumes/MachineLearning/training_data/negative_datasets/no_speech_background',
#             'sampling_weight': 150,
#             'penalty_weight': 1,
#             'truth': False,
#             'truncation_strategy': 'random',
#             'type': "mmap",            
#         },
#     ]
# features_directory = '/Volumes/MachineLearning/multilingual_stuff/mswc_features/'
# words = os.listdir(features_directory)

# words = [x for x in words if not x.startswith('.')]

# for index, word in enumerate(words):
#     config['features'].append({
#             'features_dir': os.path.join(features_directory, word),
#             'sampling_weight': 1,
#             'penalty_weight': 1,
#             'truth': True,
#             'truncation_strategy': 'truncate_start',
#             'type': "mmap",        
#     })


# Number of training steps in each iteration - various other settings are configured as lists that corresponds to different steps
config['training_steps'] = [20000,20000,20000]#,20000,20000]#,15000]#,15000]#[15000,15000,15000]#[20000,20000]#,20000]#[20000,20000,20000]#,20000]#20000]#,20000,20000]        

# Penalizing weight for incorrect class predictions - lists that correspond to training steps
config["positive_class_weight"] = [1]               
config["negative_class_weight"] = [1,1.5,2,2.5]#[1,3,5]#7.5,10]#[1,1.5,2]

config['learning_rates'] = [0.001,0.0005,0.0002,0.0001]#[0.001, 0.0005,0.0005,0.0002]#,0.001] # Learning rates for Adam optimizer - list that corresponds to training steps
config['batch_size'] = 128

config['mix_up_augmentation_prob'] =  [0]       # Probability of applying MixUp augmentation - list that corresponds to training steps
config['freq_mix_augmentation_prob'] = [0]      # Probability of applying FreqMix augmentation - list that corresponds to training steps
config['time_mask_max_size'] = [5]              # SpecAugment - list that corresponds to training steps
config['time_mask_count'] = [2]                 # SpecAugment - list that corresponds to training steps
config['freq_mask_max_size'] = [5]              # SpecAugment - list that corresponds to training steps
config['freq_mask_count'] = [2]                 # SpecAugment - list that corresponds to training steps
config['eval_step_interval'] = 500              # Test the validation sets after every this many steps

config['clip_duration_ms'] = 650   # Maximum length of wake word that the streaming model will accept
# config['window_stride_ms'] = 20     # Fixed setting for default feature generator
# config['window_size_ms'] = 30       # Fixed setting for default feature generator
# config['sample_rate'] = 16000       # Fixed setting for default feature generator

# The best model weights are chosen first by minimizing the specified minimization metric below the specified target_minimization
# Once the target has been met, it chooses the maximum of the maximization metric. Set 'minimization_metric' to None to only maximize
# Available metrics:
#   - "loss" - cross entropy error on validation set
#   - "accuracy" - accuracy of validation set
#   - "recall" - recall of validation set
#   - "precision" - precision of validation set
#   - "false_positive_rate" - false positive rate of validation set
#   - "false_negative_rate" - false negative rate of validation set
#   - "ambient_false_positives" - count of false positives from the split validation_ambient set
#   - "ambient_false_positives_per_hour" - estimated number of false positives per hour on the split validation_ambient set
config['target_minimization'] = 1.0

# config['minimization_metric'] = 'ambient_false_positives_per_hour'  # Set to None to disable
# config['maximization_metric'] = 'recall'

config['minimization_metric'] = None  # Set to None to disable
config['maximization_metric'] = 'recall_at_no_faph'

config['binary_classification'] = False

with open(os.path.join('training_parameters.yaml'), 'w') as file:
    documents = yaml.dump(config, file)

In [203]:
!python -m microwakeword.model_train_eval \
--training_config='training_parameters.yaml' \
--train 1 \
--restore_checkpoint 1 \
--test_tf_nonstreaming 0 \
--test_tflite_nonstreaming 0 \
--test_tflite_streaming 0 \
--test_tflite_streaming_quantized 1 \
--use_weights "best_weights" \
mixednet \
--pointwise_filters "64, 32, 48, 64, 64" \
--repeat_in_block  "1, 1, 1, 1, 1" \
--mixconv_kernel_sizes "[5], [5,9], [9,13], [13,17], [29]" \
--residual_connection "0,0,0,0,0" \
--first_conv_filters 0
# inception \
# --cnn1_filters '32' \
# --cnn1_kernel_sizes '5' \
# --cnn1_subspectral_groups '4' \
# --cnn2_filters1 '24,24,24' \
# --cnn2_filters2 '32,64,96' \
# --cnn2_kernel_sizes '3,5,5' \
# --cnn2_subspectral_groups '1,1,1' \
# --cnn2_dilation '1,1,1' \
# --dropout 0.8 

# bc_wide_matchbox \
# --activation 'relu' \
# --dropout 0.0 \
# --dropout_final_layer 0.0 \
# --ds_filters '32, 24, 24, 24, 32, 32' \
# --ds_filters2 '32, 32, 32, 32, 32, 32' \
# --ds_repeat '1, 2, 2, 2, 1, 0' \
# --ds_residual '0, 0, 0, 0, 0, 0' \
# --ds_kernel_size '5, 7, 9, 11, 19, 1' \
# --ds_stride '1, 1, 1, 1, 1, 1' \
# --ds_dilation '1, 1, 1, 1, 1, 1' \
# --ds_padding "'valid','valid','valid','valid','valid','valid'" \
# --ds_pool '1,1,1,1,1,1' \
# --max_pool '1' \
# --freq_stride '1,2,1,2,1,1' \
# --first_conv_filters 24



INFO:absl:Loading and analyzing data sets.
2024-05-24 13:28:40.348976: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2024-05-24 13:28:40.349003: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-05-24 13:28:40.349006: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-05-24 13:28:40.349037: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-24 13:28:40.349054: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
Model: "model"
__________________________________________________________________________________________________
 Layer (type)           

In [86]:
import numpy as np
false_accepts_per_hour = [2, 1.5, 0.2, 0.2, 0, 0]
# false_rejects = [0.03, 0.03, 0.10, 0.2, 0.3, 0.5]
recall_at_cutoff = [0.99, 0.97, 0.97, 0.9, 0.85, 0.75]

no_faph_cutoff_index = false_accepts_per_hour.index(0)
print(no_faph_cutoff_index)

# x_coordinates = [0]
# y_coordinates = [recall_at_cutoff[no_faph_cutoff_index]]

# for index in reversed(range(no_faph_cutoff_index)):
#     x = false_accepts_per_hour[index]
#     y = recall_at_cutoff[index]
#     if false_accepts_per_hour[index] > 10:
#         # we only care about faph rates up to 10
#         break
    
    
#     # if x == x_coordinates[-1]:
#     #     # same faph as previous test, so go onto the next cutoff
#     #     continue
#     # else:
#     #     # new faph, so add previous one and move on
#     #     x_coordinates.append(false_accepts_per_hour[index])
#     #     y_coordinates.append(recall_at_cutoff[index])
# print(x_coordinates)
# print(y_coordinates)

x_coordinates = []
y_coordinates = []
for index in range(len(false_accepts_per_hour)):
    x = false_accepts_per_hour[index]
    y = recall_at_cutoff[index]
    
    if x > 5:
        continue
    
    x_coordinates.append(x)
    y_coordinates.append(y)


x_coordinates = np.array(x_coordinates)
y_coordinates = np.array(y_coordinates)

x_coordinates_good = [x_coordinates[0]]
y_coordinates_good = [y_coordinates[0]]
for index in range(1,len(x_coordinates)):
    print(index)
    if x_coordinates[-1] == x_coordinates[index]:
        continue
    else:
        x_coordinates_good.append(x_coordinates[index])
        y_coordinates_good.append(y_coordinates[index])
print(x_coordinates_good)
print(y_coordinates_good)



sum(predictions[testing_ground_truth[i : i + test_batch_size].nonzero()] > cutoff)
    
    


4
6
1
2
3
4
5
[2, 1.5, 0.2, 0.2]
[0.99, 0.97, 0.97, 0.9]


In [151]:
total_false_positive_tests = 100
false_positives_at_cutoff = np.array([75,50, 29, 27, 27, 13, 0, 0])
recall_at_cutoff = np.array([0.97,0.97, 0.95, 0.94, 0.9, 0.9, 0.6, 0.4])

false_positive_rates = false_positives_at_cutoff/total_false_positive_tests

x_coordinates = [1.0]
y_coordinates = [1.0]

for index in range(0, len(false_positive_rates)):
    if false_positive_rates[index] != x_coordinates[-1]:
        x_coordinates.append(false_positive_rates[index])
        y_coordinates.append(recall_at_cutoff[index])
        
x_coordinates = np.array(x_coordinates)
y_coordinates = np.array(y_coordinates)
print(x_coordinates)
print(y_coordinates)
# print(x_coordinates[(x_coordinates < 30).nonzero()])
# print(y_coordinates[(x_coordinates < 30).nonzero()])

print(np.trapz(np.flip(y_coordinates), np.flip(x_coordinates)))


[1.   0.75 0.5  0.29 0.27 0.13 0.  ]
[1.   0.97 0.97 0.95 0.94 0.9  0.6 ]
0.93555
