In [None]:
!pip install torchaudio-augmentations
!pip install audio_augmentations
!pip install wandb --upgrade

import torch
import torchaudio
import torchtext
import torchaudio.functional as F
import torchaudio.transforms as T
from audio_augmentations import *

import os, re, random
import numpy as np
import sklearn
import itertools
import time

import pickle
from tqdm.auto import tqdm
from IPython.display import clear_output
import IPython.display as ipd
import gc
import matplotlib.pyplot as plt
import wandb

print(torch.__version__)
print(torchaudio.__version__)

import sys

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"
sys.path.append('/kaggle/working/Voice-commands-recognition')
sys.path.append('/kaggle/working/Voice-commands-recognition/notebooks')

In [None]:
!conda install -y gdown

In [None]:
!gdown --id 1yhD3dA8fmKncYaHmfbj4I0W640DLBeND

In [None]:
# Start from a clean checkout: drop any previous clone, then clone the project repo.
!rm -rf ./Voice-commands-recognition
!git clone https://github.com/litvan007/Voice-commands-recognition.git
# Output folders for the figures written by create_plots (signal / spectrogram / MFCC).
# `mkdir` is used without `-p`, so it relies on the fresh clone above providing the parent folders.
!mkdir /kaggle/working/Voice-commands-recognition/notebooks/signal_plots
!mkdir /kaggle/working/Voice-commands-recognition/notebooks/spec_plots
!mkdir /kaggle/working/Voice-commands-recognition/notebooks/mfcc_plots
# Folder where the training loop stores model checkpoints.
!mkdir /kaggle/working/Voice-commands-recognition/checkpoints
# Quick sanity check of what was cloned.
!ls ./Voice-commands-recognition

In [None]:
import zipfile

# Unpack the downloaded dataset archive into the current working directory.
path_to_zip_file = "/kaggle/working/data.zip"
directory_to_extract_to = "./"

with zipfile.ZipFile(path_to_zip_file, mode='r') as archive:
    archive.extractall(path=directory_to_extract_to)

In [None]:
# Seed the Python, NumPy and PyTorch random generators with the same fixed value
# so that data splits and weight initialisation are repeatable.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(123456)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
device

In [None]:
# Root folder of the extracted dataset; audio file names are resolved against it later.
data_path = '/kaggle/working/data'
data_list = []

# NOTE(review): pickle.load can execute arbitrary code, and this file comes from a
# downloaded archive. Only run this against an archive you trust.
pickle_file = os.path.join(data_path, 'data_base_audio.pickle')
with open(pickle_file, mode='rb') as handle:
    data_list = pickle.load(handle)

In [None]:
len(data_list)

In [None]:
def create_plots(signal, feature_map, name_fig, sample_rate=16000, 
                            signal_plot_dir='signal_plots', 
                            spec_plot_dir='spec_plots', 
                            mfcc_plot_dir='mfcc_plots',
                            notebook_path='/kaggle/working/Voice-commands-recognition/notebooks'):
    """Save a waveform plot, a spectrogram and an MFCC image for one audio sample.

    Three PNG files with the same file name are written, one into each of
    ``notebook_path/signal_plot_dir``, ``notebook_path/spec_plot_dir`` and
    ``notebook_path/mfcc_plot_dir``. Missing output folders are created.

    Args:
        signal: 1-D torch tensor holding the waveform samples.
        feature_map: 2-D array-like (torch tensor or NumPy array) shown as an image.
        name_fig: file name (including extension) used for all three figures.
        sample_rate: sampling rate in Hz, used for the time axis and the spectrogram.
        signal_plot_dir: sub-folder for the waveform figure.
        spec_plot_dir: sub-folder for the spectrogram figure.
        mfcc_plot_dir: sub-folder for the feature-map figure.
        notebook_path: base folder that contains the three sub-folders.
    """
    # detach()/cpu() make the conversion work for GPU tensors and tensors tracked by autograd.
    signal = signal.detach().cpu().numpy()
    if torch.is_tensor(feature_map):
        feature_map = feature_map.detach().cpu().numpy()

    def _save_and_close(fig, plot_dir):
        # Create the target folder on demand and release the figure afterwards;
        # this function is called once per validation batch, so figures must not pile up.
        out_dir = os.path.join(notebook_path, plot_dir)
        os.makedirs(out_dir, exist_ok=True)
        fig.savefig(os.path.join(out_dir, name_fig))
        plt.close(fig)

    # Waveform over time (seconds).
    time_axis = np.arange(signal.size) / sample_rate
    fig, ax = plt.subplots()
    ax.plot(time_axis, signal, linewidth=1)
    ax.set_xlabel('Time')
    ax.set_title('Signal')
    ax.grid()
    _save_and_close(fig, signal_plot_dir)

    # Spectrogram computed by matplotlib directly from the raw waveform.
    fig, ax = plt.subplots()
    ax.specgram(signal, Fs=sample_rate)
    ax.set_xlabel('Time')
    ax.set_title('Spectrogram')
    _save_and_close(fig, spec_plot_dir)

    # Feature map rendered as an image.
    fig, ax = plt.subplots()
    ax.imshow(feature_map, interpolation='nearest', origin='lower', aspect='auto')
    ax.set_xlabel('Frame')
    ax.set_title('MFCC')
    _save_and_close(fig, mfcc_plot_dir)
    

In [None]:
# Collect the distinct labels and give them a stable order.
# Iterating a plain set of strings follows hash order, which changes between Python
# processes; sorting makes the label -> index mapping reproducible across kernel
# restarts, so class indices stored in checkpoints keep their meaning.
# Assumes labels are mutually comparable (e.g. all strings) - TODO confirm.
all_labels = sorted({example['label'] for example in data_list})
token_to_idx = {x: idx for idx, x in enumerate(all_labels)}

In [None]:
token_to_idx

In [None]:
# Split sizes: ~10% validation, ~10% test, and the remainder for training.
# The train size is derived as "whatever is left" so the three sizes always add up to
# len(data_list); random_split raises when the lengths do not sum to the dataset size,
# which happened with int(0.8 * n) + 2 * round(0.1 * n) for some n (e.g. n = 15).
n_total = len(data_list)
n_valid = round(0.1 * n_total)
n_test = round(0.1 * n_total)
lengths_subsets = {'train': n_total - n_valid - n_test, 'valid': n_valid, 'test': n_test}

# First carve off the test part, then split the rest into train and validation.
# (The variable name `train_vaild_subset` is kept as-is in case other cells use it.)
train_vaild_subset, test_subset = torch.utils.data.random_split(
    data_list,
    [lengths_subsets['train'] + lengths_subsets['valid'], lengths_subsets['test']])
train_subset, valid_subset = torch.utils.data.random_split(
    train_vaild_subset,
    [lengths_subsets['train'], lengths_subsets['valid']])
lengths_subsets

In [None]:
class Sound_dataset_commands(torch.utils.data.Dataset):
    """Dataset of labelled audio files that yields features together with the waveform.

    Each item of ``subset`` is a dict whose values are, in order, the audio file name
    (relative to ``rootdir``) and the label.

    Args:
        rootdir: folder the audio file names are resolved against.
        subset: indexable collection of ``{name, label}`` dicts.
        transform: optional pipeline. When it exposes a ``transforms`` list, the first
            stage is treated as the waveform-level stage and the remaining stages turn
            the waveform into features.
    """

    def __init__(self, rootdir, subset, transform=None):
        self.transform = transform
        self.rootdir = rootdir
        self.subset = subset
        self.n_subset = len(self.subset)
        # Label <-> index maps built from the module-level `all_labels` collection.
        self.token_to_idx = {x: idx for idx, x in enumerate(all_labels)}
        # Built as the exact inverse so both maps can never disagree.
        self.idx_to_token = {idx: x for x, idx in self.token_to_idx.items()}

    def __getitem__(self, index):
        """Return ``(feature_map, idx_label, signal, name, label)`` for one sample."""
        # Relies on the dict holding exactly (name, label) in that insertion order.
        name, label = self.subset[index].values()
        signal, sample_rate = torchaudio.load(os.path.join(self.rootdir, name))
        signal = signal[0]  # keep the first channel only

        # Without a transform the raw waveform doubles as the feature map
        # (previously `feature_map` was left undefined in that case).
        feature_map = signal
        if self.transform is not None:
            stages = getattr(self.transform, 'transforms', None)
            if stages:
                # Run the waveform-level stage exactly once and derive the features from
                # its output. Calling the whole pipeline and then stage 0 again would draw
                # the random augmentation twice, so the returned waveform would not be the
                # one the features were computed from.
                # Assumes the pipeline applies `transforms` sequentially - verify if the
                # transform class is replaced.
                signal = stages[0](signal)
                feature_map = signal
                for stage in stages[1:]:
                    feature_map = stage(feature_map)
            else:
                # Plain callable without stages: only features can be produced.
                feature_map = self.transform(signal)
        idx_label = self.token_to_idx[label]

        return feature_map, idx_label, signal, name, label

    def __len__(self):
        """Number of samples in the wrapped subset."""
        return len(self.subset)

In [None]:
def create_data_sets(n_mfcc):
    """Build the train / valid / test datasets with an MFCC front end.

    Args:
        n_mfcc: number of MFCC coefficients; the mel filter bank uses ``2 * n_mfcc`` bands.

    Returns:
        dict with keys ``'train'``, ``'valid'`` and ``'test'`` mapping to
        ``Sound_dataset_commands`` instances over the corresponding subsets.
    """
    n_fft = 480
    hop_length = 160
    mfcc_transform = T.MFCC(
        sample_rate=16000,
        n_mfcc=n_mfcc,
        melkwargs={
            "n_fft": n_fft,
            "n_mels": n_mfcc * 2,
            "hop_length": hop_length,
            "f_min": 20,
            "f_max": 4000
        },
    )

    # Training only: additive noise on half of the samples, then MFCC.
    train_transform = Compose(transforms=[
        RandomApply([Noise(min_snr=0.1, max_snr=0.3)], p=0.5),
        mfcc_transform
    ])
    # Evaluation must be deterministic, so no random noise for valid/test.
    # The Identity placeholder keeps a waveform-level stage in first position because the
    # dataset reads `transform.transforms[0]` to obtain the waveform it returns.
    eval_transform = Compose(transforms=[
        torch.nn.Identity(),
        mfcc_transform
    ])

    data_set = {
                'train': Sound_dataset_commands(rootdir=data_path, subset=train_subset, transform=train_transform),
                'valid': Sound_dataset_commands(rootdir=data_path, subset=valid_subset, transform=eval_transform),
                'test': Sound_dataset_commands(rootdir=data_path, subset=test_subset, transform=eval_transform)
            }
    return data_set

In [None]:
def create_data_loaders(batch_size, data_set):
    """Wrap the three datasets in DataLoaders that pad samples to a common length.

    Args:
        batch_size: batch size for the train and valid loaders (test always uses 1).
        data_set: dict with ``'train'``, ``'valid'`` and ``'test'`` datasets whose items
            are ``(feature_map, idx_label, signal, name, label)`` tuples.

    Returns:
        dict with the same three keys mapping to ``torch.utils.data.DataLoader`` objects.
        Every batch is ``(X, y, signal, name, label)``: ``X`` is the zero-padded feature
        tensor with an extra dimension inserted at position 1, ``y`` the label indices,
        and the rest are plain Python lists.
    """
    def collate_fn(batch):
        # Regroup the per-sample tuples into per-field columns.
        columns = list(zip(*batch))

        # Swap the two feature-map axes so pad_sequence pads along the first one.
        swapped = [fm.transpose(0, 1) for fm in columns[0]]
        padded = torch.nn.utils.rnn.pad_sequence(swapped, batch_first=True, padding_value=0)
        X = padded.unsqueeze(1)
        y = torch.tensor(list(columns[1]))

        return X, y, list(columns[2]), list(columns[3]), list(columns[4])

    # Per-split loader settings; train and valid are shuffled, test is read in order.
    loader_options = {
        'train': dict(batch_size=batch_size, shuffle=True),
        'valid': dict(batch_size=batch_size, shuffle=True),
        'test': dict(batch_size=1, shuffle=False),
    }
    return {
        split: torch.utils.data.DataLoader(data_set[split], collate_fn=collate_fn, **options)
        for split, options in loader_options.items()
    }

In [None]:
from models.model import Speech_recognition_model

def create_model_utils(params, weight_decay, amsgrad, max_lr, div_factor, epochs, steps_per_epoch):
    """Create the model together with its loss, optimizer and LR scheduler.

    Args:
        params: keyword arguments forwarded to ``Speech_recognition_model``.
        weight_decay: weight decay passed to Adam.
        amsgrad: whether Adam uses its AMSGrad variant.
        max_lr: peak learning rate of the one-cycle schedule.
        div_factor: ratio between ``max_lr`` and the starting learning rate.
        epochs: number of epochs the schedule is laid out for.
        steps_per_epoch: number of scheduler steps per epoch.

    Returns:
        ``(model, criterion, optimizer, scheduler)``; model and criterion are moved to
        the module-level ``device``.
    """
    net = Speech_recognition_model(**params).to(device)
    loss_fn = torch.nn.CrossEntropyLoss().to(device)

    # Adam starts at the same learning rate the one-cycle schedule starts from.
    starting_lr = max_lr/div_factor
    adam = torch.optim.Adam(
        net.parameters(),
        lr=starting_lr,
        weight_decay=weight_decay,
        amsgrad=amsgrad,
    )

    one_cycle = torch.optim.lr_scheduler.OneCycleLR(
        adam,
        max_lr=max_lr,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        div_factor=div_factor,
    )

    return net, loss_fn, adam, one_cycle


In [None]:
def get_accuracy(y_pred, y_test):
    """Return the fraction of positions where ``y_pred`` equals ``y_test``.

    Args:
        y_pred: tensor of predicted class indices.
        y_test: tensor of reference class indices; its first dimension is the sample count.

    Returns:
        0-d float32 tensor with the share of matching entries.
    """
    n_samples = y_test.size(0)
    n_matches = torch.eq(y_pred, y_test).sum().to(torch.float32)
    return torch.div(n_matches, n_samples)

In [None]:
from models.model import Speech_recognition_model
import yaml

# TODO: create separate configs
# Load the model/architecture settings. A malformed file now raises immediately instead
# of printing the error and leaving `params = None`, which only failed much later inside
# the training function with an unrelated-looking TypeError.
params = None
with open("/kaggle/working/Voice-commands-recognition/configs/model_params.yaml", "r") as stream:
    params = yaml.safe_load(stream)

# safe_load returns None for an empty document; treat that as a configuration error too.
if params is None:
    raise ValueError("model_params.yaml is empty: no model parameters were loaded")

In [None]:
params

In [None]:
# Hyper-parameter search space for the W&B sweep: Bayesian search that maximises the
# `valid_accuracy` metric. `values` entries are discrete choices, `min`/`max` entries
# are ranges.
sweep_configuration = dict(
    method='bayes',
    metric=dict(goal='maximize', name='valid_accuracy'),
    parameters={
        # Feature extraction
        'n_mfcc': dict(values=[24, 36, 48, 60]),
        # Convolutional front end
        'CNN_out_channels': dict(values=[32, 64]),
        'CNN_kernel_size': dict(values=[3, 5]),
        # Residual CNN blocks
        'ResCNN_kernel_size': dict(values=[3, 5]),
        'ResCNN_dropout': dict(min=0.1, max=0.6),
        'ResCNN_n_cnn_layers': dict(min=1, max=5),
        # Fully connected layer
        'FC_out_features': dict(values=[128, 256, 512, 1024]),
        # Recurrent layers
        'RNN_num_layers': dict(min=2, max=6),
        'RNN_bi': dict(values=[True]),
        'RNN_dropout': dict(min=0.1, max=0.5),
        # Classifier head
        'Classifier_dropout': dict(min=0.1, max=0.5),
        # Optimisation
        'weight_decay': dict(min=0.0001, max=0.0007),
        'amsgrad': dict(values=[False, True]),
        'max_lr': dict(min=0.01, max=0.1),
        'div_factor': dict(min=100, max=300),
        'max_norm': dict(min=0.7, max=1.5),
        # Fixed run settings
        'epochs': dict(values=[10]),
        'batch_size': dict(values=[128]),
    },
)

In [None]:
# Single-point sweep: every parameter has exactly one allowed value, so each run of this
# sweep trains the same configuration.
# NOTE(review): the values look like the result of an earlier search - confirm their origin.
sweep_configuration_test_best = {
    'method': 'bayes',
    'metric': {'goal': 'maximize', 'name': 'valid_accuracy'},
    'parameters': {
        name: {'values': [value]}
        for name, value in {
            'n_mfcc': 36,
            'CNN_out_channels': 64,
            'CNN_kernel_size': 3,
            'ResCNN_kernel_size': 3,
            'ResCNN_dropout': 0.2557663304729153,
            'ResCNN_n_cnn_layers': 2,
            'FC_out_features': 1024,
            'RNN_num_layers': 2,
            'RNN_bi': True,
            'RNN_dropout': 0.27401574970155235,
            'Classifier_dropout': 0.451876615914206,
            'weight_decay': 0.0005163181608617645,
            'amsgrad': False,
            'max_lr': 0.06894893452161045,
            'div_factor': 124,
            'max_norm': 1.4018824914643906,
            'epochs': 10,
            'batch_size': 64,
        }.items()
    },
}

In [None]:
# Register the sweep in the "Commands Recognition" W&B project and keep its id for the agent.
# Note: this uses the single-point `sweep_configuration_test_best`, not the search space
# defined in `sweep_configuration`.
sweep_id = wandb.sweep(sweep=sweep_configuration_test_best, project="Commands Recognition")

In [None]:
# Locations of the figures written by create_plots; used below to attach them to W&B.
signal_plot_dir = 'signal_plots'
spec_plot_dir = 'spec_plots'
mfcc_plot_dir = 'mfcc_plots'
notebook_path = '/kaggle/working/Voice-commands-recognition/notebooks'


def _apply_sweep_config(params, config):
    """Copy the hyper-parameters of the current sweep run into ``params`` (in place)."""
    arch = params['Architecture']
    cnn_channels = config['CNN_out_channels']
    fc_features = config['FC_out_features']

    cnn = arch['CNN_params']
    cnn['kernel_size'] = config['CNN_kernel_size']
    cnn['padding'] = config['CNN_kernel_size'] // 2
    cnn['out_channels'] = cnn_channels

    res_cnn = arch['ResCNN_params']
    res_cnn['in_channels'] = cnn_channels
    res_cnn['out_channels'] = cnn_channels
    res_cnn['kernel_size'] = config['ResCNN_kernel_size']
    res_cnn['padding'] = config['ResCNN_kernel_size'] // 2
    res_cnn['dropout'] = config['ResCNN_dropout']
    res_cnn['n_cnn_layers'] = config['ResCNN_n_cnn_layers']
    res_cnn['n_feats'] = config['n_mfcc'] // 2

    fully_connected = arch['Fully_connected_params']
    fully_connected['in_features'] = cnn_channels * res_cnn['n_feats']
    fully_connected['out_features'] = fc_features

    rnn = arch['RNN_params']
    rnn['input_size'] = fc_features
    rnn['hidden_size'] = fc_features
    rnn['num_layers'] = config['RNN_num_layers']
    rnn['bidirectional'] = config['RNN_bi']
    rnn['dropout'] = config['RNN_dropout']

    attention = arch['Attention_params']
    attention['feature_dim'] = fc_features
    attention['step_dim'] = rnn['num_layers'] * 2

    classifier = arch['Classifier_params']
    classifier['in_features'] = fc_features
    classifier['dropout'] = config['Classifier_dropout']
    classifier['out_features'] = classifier['in_features'] // 2

    params['Settings']['Other']['batch_size'] = config['batch_size']


def sweep_func():
    """Train and validate one model for the current W&B sweep run.

    Reads the hyper-parameters from ``wandb.config``, writes them into the module-level
    ``params`` dict (which is modified in place), builds datasets, loaders and the model,
    then runs ``epochs`` rounds of training followed by validation. Per-epoch losses and
    accuracies are logged to W&B together with a table holding one example (audio, plots,
    predicted and true label index) per validation batch. A checkpoint is written whenever
    the validation loss improves on the current best.

    Raises:
        RuntimeError: if the module-level ``params`` has not been loaded.
    """
    if params is None:
        raise RuntimeError("Model parameters are not loaded: `params` is None")

    torch.cuda.empty_cache()
    gc.collect()

    wandb.init(project="Commands Recognition")
    config = wandb.config
    columns = ["name", "song_file", "signal_plot", "spec_plot", "mfcc_plot", "pred_label", "true_label"]
    valid_table = wandb.Table(columns=columns)
    data_set = create_data_sets(config['n_mfcc'])
    loaders = create_data_loaders(config['batch_size'], data_set)
    epochs = config['epochs']
    max_norm = config['max_norm']
    _apply_sweep_config(params, config)

    model, criterion, optimizer, scheduler = create_model_utils(params['Architecture'],
                                                                config['weight_decay'],
                                                                config['amsgrad'],
                                                                config['max_lr'],
                                                                config['div_factor'],
                                                                epochs,
                                                                len(loaders['train']))

    print(params)

    def run_one_epoch(epoch, cross_valid=False, print_freq=1):
        """Run one pass over the train (default) or validation loader.

        Returns the mean loss and mean accuracy over the batches as Python floats.
        """
        start = time.time()
        total_loss = 0.0
        total_accuracy = 0.0
        n_batches = 0

        data_loader = loaders['valid'] if cross_valid else loaders['train']

        for i, batch in enumerate(tqdm(data_loader)):
            feature_map, idx_label, signal, name, label = batch
            input_data = feature_map.to(device)
            target_labels = idx_label.to(device)

            # No autograd graph is needed during validation; skipping it saves memory.
            with torch.set_grad_enabled(not cross_valid):
                output_logits = model(input_data)
                loss = criterion(output_logits, target_labels)

            if not cross_valid:
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
                optimizer.step()
                # The one-cycle schedule is laid out for len(train loader) steps per epoch,
                # so it has to advance once per optimizer update. Without this call the
                # learning rate stays at max_lr / div_factor for the whole run.
                # NOTE(review): max_lr / div_factor values tuned before this fix were
                # effectively tuned for a constant learning rate and may need re-tuning.
                scheduler.step()

            loss_value = loss.item()
            total_loss += loss_value
            # The predicted class is the one with the highest logit. (Rounding the sigmoid
            # first and then taking argmax returned the first class with a positive logit.)
            pred_labels = output_logits.argmax(dim=1)
            accuracy = float(get_accuracy(pred_labels, target_labels))
            total_accuracy += accuracy
            n_batches += 1
            curr_lr = optimizer.param_groups[0]['lr']

            if i % print_freq == 0:
                print('Epoch {0} | Iter {1} | Average Loss {2:.3f} | '
                      'Current Loss {3:.6f} | Current accuracy {4:.6f} | Current lr {5:.5f} | {6:.1f} ms/batch'.format(
                          epoch + 1, i + 1, total_loss / (i + 1),
                          loss_value, accuracy, curr_lr, 1000 * (time.time() - start) / (i + 1)),
                      flush=True)

            if cross_valid:
                # Log the first sample of every validation batch as an inspectable example.
                name_fig = f'{name[0].replace("/", "_").split(".")[0]}.png'
                create_plots(signal[0], feature_map[0][0], name_fig)
                valid_table.add_row(
                    name[0],
                    wandb.Audio(signal[0].detach().cpu().numpy(), sample_rate=16000),
                    wandb.Image(os.path.join(notebook_path, signal_plot_dir, name_fig)),
                    wandb.Image(os.path.join(notebook_path, spec_plot_dir, name_fig)),
                    wandb.Image(os.path.join(notebook_path, mfcc_plot_dir, name_fig)),
                    # Plain ints instead of (possibly CUDA) tensors.
                    pred_labels[0].item(),
                    target_labels[0].item())
            torch.cuda.empty_cache()
            gc.collect()

        # max(..., 1) keeps an empty loader from raising; the averages are then 0.
        denom = max(n_batches, 1)
        return total_loss / denom, total_accuracy / denom

    def save_model(file_path, epoch_number):
        """Serialize model, optimizer and scheduler state plus the loss history."""
        os.makedirs(save_folder, exist_ok=True)
        torch.save(model.serialize(model, optimizer, scheduler, epoch_number,
                                   tr_loss=tr_loss,
                                   cv_loss=cv_loss),
                   file_path)

    checkpoint = False  # set to True to also save a checkpoint after every epoch
    save_folder = '/kaggle/working/Voice-commands-recognition/checkpoints'

    tr_loss = []
    cv_loss = []
    tr_acc = []
    cv_acc = []
    # NOTE(review): starting at 0.1 means a "best" model is only saved once the validation
    # loss drops below 0.1 - confirm this threshold is intended (float('inf') would save
    # on every improvement).
    best_val_loss = 0.1

    wandb.watch(model, log_freq=1, log='all')
    for epoch in tqdm(range(epochs)):
        print("Training...")
        model.train()
        start = time.time()
        tr_avg_loss, tr_avg_acc = run_one_epoch(epoch)
        tr_loss.append(tr_avg_loss)
        tr_acc.append(tr_avg_acc)

        print('-' * 85)
        print('Train Summary | End of Epoch {0} | Time {1:.2f}s | '
              'Train Loss {2:.3f} | Accuracy {3:.3f}'.format(
                  epoch + 1, time.time() - start, tr_avg_loss, tr_avg_acc))
        print('-' * 85)

        print('Cross validation...')
        model.eval()  # Turn off Batchnorm & Dropout
        cv_avg_loss, cv_avg_acc = run_one_epoch(epoch, cross_valid=True)
        cv_loss.append(cv_avg_loss)
        cv_acc.append(cv_avg_acc)  # was appended to cv_loss by mistake

        print('-' * 185)
        print('Valid Summary | End of Epoch {0} | Time {1:.2f}s | '
              'Valid Loss {2:.3f} | Accuracy {3:.3f}'.format(
                  epoch + 1, time.time() - start, cv_avg_loss, cv_avg_acc))
        print('-' * 185)

        # Per-epoch checkpoint. It runs after validation because the file name contains
        # the validation loss of this epoch.
        if checkpoint:
            file_path = os.path.join(
                save_folder, 'epoch_{0}_loss_{1:.4f}.pth.tar'.format(epoch + 1, cv_avg_loss))
            save_model(file_path, epoch + 1)
            print('Saving checkpoint model to %s' % file_path)

        # Save the best model
        if cv_avg_loss < best_val_loss:
            best_val_loss = cv_avg_loss
            model_path = 'epoch_{0}_loss_{1:.4f}_best.pth.tar'.format(epoch + 1, cv_avg_loss)
            file_path = os.path.join(save_folder, model_path)
            save_model(file_path, epoch + 1)
            print("Find better validated model, saving to %s" % file_path)

        # One log call per epoch so all four metrics share the same W&B step.
        wandb.log({"epoch": epoch,
                   "train_loss": tr_avg_loss,
                   "valid_loss": cv_avg_loss,
                   "train_accuracy": tr_avg_acc,
                   "valid_accuracy": cv_avg_acc})
        wandb.run.log({"valid_inference": valid_table})

In [None]:
# Start a W&B agent for the registered sweep; it calls `sweep_func` once per run and is
# limited to `count=10` runs.
wandb.agent(sweep_id, function=sweep_func, count=10)