In [69]:
import torch
import torchaudio
import torchtext
import torchaudio.functional as F
import torchaudio.transforms as T
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from audio_augmentations import *

import os, re, random
import numpy as np
import sklearn
import itertools

import pickle
from tqdm.auto import tqdm
from IPython.display import clear_output
import IPython.display as ipd
import gc
import matplotlib.pyplot as plt
import wandb

import sys
sys.path.append('..')
from models.cnn import ResidualCNN
from models.encoder import Encoder
from models.attention import Attention
from models.model import Speech_recognition_model

# Record the library versions this notebook was run with.
for _lib in (torch, torchaudio):
    print(_lib.__version__)

2.0.0
2.0.1


In [5]:
# Seed every RNG used in this notebook with the same value so runs are repeatable.
SEED = 123456
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

In [7]:
# Both waveforms are read from the same file; the sample rate returned by
# torchaudio.load is discarded.
wav_path = '../data/user_192546140/Domoj_3_1_new_2.wav'
data_1, _ = torchaudio.load(wav_path)
data_2, _ = torchaudio.load(wav_path)

In [8]:
# Frame geometry: 30 ms window with a 10 ms step (480 / 160 samples at 16 kHz).
n_fft, hop_length = 480, 160
win_length = None  # defined but not passed to the transform below
# Mel filterbank size and number of cepstral coefficients requested.
n_mels, n_mfcc = 128, 64

mel_kwargs = dict(
    n_fft=n_fft,
    n_mels=n_mels,
    hop_length=hop_length,
    mel_scale="htk",
)
mfcc_transform = T.MFCC(sample_rate=16000, n_mfcc=n_mfcc, melkwargs=mel_kwargs)

# mfcc_transform = torchaudio.transforms.MFCC(sample_rate=16000)
# temp = mfcc_transform(X)



In [14]:
# Quick look at the MFCC output dimensions for one clip.
mfcc_transform(data_1).size()

torch.Size([1, 60, 287])

In [15]:
def _mfcc_frames(waveform):
    """Return the MFCCs of `waveform` with axes 1 and 2 swapped and size-1 axes removed."""
    return mfcc_transform(waveform).transpose(1, 2).squeeze()


pre_X1 = _mfcc_frames(data_1)
pre_X2 = _mfcc_frames(data_2)
# input_lengths = [pre_X1.size(0), pre_X2.size(0)]
pre_X1.shape, pre_X2.shape

(torch.Size([287, 60]), torch.Size([287, 60]))

In [16]:
# Zero-pad the two sequences to a common length (batch first), then insert a
# singleton axis at position 1 for the 2-D convolution.
X = torch.nn.utils.rnn.pad_sequence(
    [pre_X1, pre_X2],
    batch_first=True,
    padding_value=0,
).unsqueeze(1)
X.shape

torch.Size([2, 1, 287, 60])

In [17]:
# Layer widths and depths used by the cells that follow.
rnn_dim, hidden_size = 256, 256
n_rnn_layers, n_cnn_layers = 2, 4
# Number of output classes.
n_class = 18
# Convolution stride and the feature-axis size passed to the CNN layers.
stride, n_feats = 2, 30
# Regularisation and encoder direction.
dropout, bidirectional = 0.1, True

In [65]:
class CNNLayerNorm(nn.Module):
    """Layer normalisation over the feature axis of CNN activations.

    The input is laid out as (batch, channel, feature, time); nn.LayerNorm
    normalises the last axis, so the feature axis is moved there and back.
    """

    def __init__(self, n_feats):
        super().__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # (batch, channel, feature, time) -> (batch, channel, time, feature)
        feats_last = x.swapaxes(2, 3).contiguous()
        normed = self.layer_norm(feats_last)
        # Restore the (batch, channel, feature, time) layout.
        return normed.swapaxes(2, 3).contiguous()


class ResidualCNN(nn.Module):
    """Residual CNN block inspired by https://arxiv.org/pdf/1603.05027.pdf,
    with layer norm instead of batch norm.

    Each of the two stages applies CNNLayerNorm -> GELU -> dropout -> Conv2d,
    and the block input is added to the output of the second stage. That
    addition needs the convolutions to keep the tensor shape (e.g. stride=1,
    an odd kernel and in_channels == out_channels).

    Args:
        in_channels: number of channels of the input tensor.
        out_channels: number of channels produced by both convolutions.
        kernel: square kernel size; the padding is ``kernel // 2``.
        stride: stride passed to both convolutions.
        dropout: dropout probability applied before each convolution.
        n_feats: size of the feature axis normalised by the layer norms.
    """

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()

        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel//2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel//2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        """Apply the block to ``x`` of shape (batch, channel, feature, time)."""
        residual = x  # (batch, channel, feature, time)
        # In this notebook `F` is torchaudio.functional, which provides no
        # `gelu`; use the torch.nn implementation explicitly.
        gelu = nn.functional.gelu
        x = self.cnn1(self.dropout1(gelu(self.layer_norm1(x))))
        x = self.cnn2(self.dropout2(gelu(self.layer_norm2(x))))
        return x + residual  # (batch, channel, feature, time)

class SpeechRecognitionModel(nn.Module):
    """CNN front-end: strided conv -> residual CNN stack -> per-frame linear layer.

    ``n_rnn_layers`` and ``n_class`` are accepted but not used by this class.
    The feature size handed to the residual blocks and the linear layer is
    ``n_feats // 2``.
    """

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super().__init__()
        n_feats = n_feats // 2
        # Initial convolution that extracts hierarchical features.
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)

        # Stack of residual blocks, each with 32 filters.
        residual_blocks = [
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ]
        self.rescnn_layers = nn.Sequential(*residual_blocks)
        self.fully_connected = nn.Linear(n_feats * 32, rnn_dim)

    def forward(self, x):
        x = self.rescnn_layers(self.cnn(x))
        # Merge the channel and feature axes: (batch, channel * feature, time).
        x = x.view(x.size(0), x.size(1) * x.size(2), x.size(3))
        # (batch, time, channel * feature) -> (batch, time, rnn_dim)
        return self.fully_connected(x.transpose(1, 2))


In [86]:
# Training settings; they are also copied into `hparams` below.
learning_rate, batch_size, epochs = 5e-4, 20, 10

hparams = dict(
    n_cnn_layers=3,
    n_rnn_layers=5,
    rnn_dim=512,
    n_class=29,
    n_feats=128,
    stride=2,
    dropout=0.1,
    learning_rate=learning_rate,
    batch_size=batch_size,
    epochs=epochs,
)

# Entries of `hparams` in the positional order of SpeechRecognitionModel.__init__.
_model_arg_names = (
    'n_cnn_layers', 'n_rnn_layers', 'rnn_dim',
    'n_class', 'n_feats', 'stride', 'dropout',
)
test_asr_model = SpeechRecognitionModel(*(hparams[key] for key in _model_arg_names))

TypeError: ResidualCNN.__init__() got an unexpected keyword argument 'kernel'

In [87]:
# Read the saved sub-model weights onto the CPU, then load them (strict matching).
sub_model_state = torch.load('../checkpoints/sub_model_asr.pth', map_location=torch.device('cpu'))
test_asr_model.load_state_dict(sub_model_state)

<All keys matched successfully>

In [89]:
import yaml

# Read the architecture settings. A malformed file now raises yaml.YAMLError
# right here; previously the error was only printed, `params` stayed None and
# the next line failed with an unrelated "'NoneType' is not subscriptable".
with open("../configs/model_params_tl.yaml", "r") as stream:
    params = yaml.safe_load(stream)

# The 'Architecture' section is passed as keyword arguments to the model.
model = Speech_recognition_model(**(params['Architecture']))
# torch.save(model, '../checkpoints/new_model_asr.pth')

In [90]:
# Show the model's parameters as constructed, before the checkpoint is
# loaded into it further down.
model.state_dict()

OrderedDict([('cnn.weight',
              tensor([[[[-0.1073,  0.1674, -0.2916],
                        [ 0.2768, -0.3327, -0.0636],
                        [-0.0188,  0.2971,  0.0567]]],
              
              
                      [[[-0.1674, -0.0271, -0.2305],
                        [ 0.2938,  0.1589,  0.1819],
                        [ 0.0388, -0.3036, -0.2837]]],
              
              
                      [[[-0.2043, -0.3062, -0.2758],
                        [-0.2592,  0.1128, -0.2762],
                        [-0.1761,  0.2386,  0.0229]]],
              
              
                      [[[-0.0326, -0.3311, -0.3323],
                        [-0.0937,  0.1437,  0.2102],
                        [ 0.2217, -0.3281,  0.1079]]],
              
              
                      [[[ 0.2717, -0.0950,  0.1135],
                        [-0.2567, -0.0309,  0.1395],
                        [ 0.0706, -0.1838, -0.3227]]],
              
              
                 

In [92]:
# Load the sub-model weights into the full model. strict=False lets keys that
# are absent from the checkpoint be reported instead of raising.
partial_state = torch.load('../checkpoints/sub_model_asr.pth', map_location=torch.device('cpu'))
model.load_state_dict(partial_state, strict=False)

_IncompatibleKeys(missing_keys=['encoder.rnn.weight_ih_l0', 'encoder.rnn.weight_hh_l0', 'encoder.rnn.bias_ih_l0', 'encoder.rnn.bias_hh_l0', 'encoder.rnn.weight_ih_l0_reverse', 'encoder.rnn.weight_hh_l0_reverse', 'encoder.rnn.bias_ih_l0_reverse', 'encoder.rnn.bias_hh_l0_reverse', 'encoder.rnn.weight_ih_l1', 'encoder.rnn.weight_hh_l1', 'encoder.rnn.bias_ih_l1', 'encoder.rnn.bias_hh_l1', 'encoder.rnn.weight_ih_l1_reverse', 'encoder.rnn.weight_hh_l1_reverse', 'encoder.rnn.bias_ih_l1_reverse', 'encoder.rnn.bias_hh_l1_reverse', 'encoder.rnn.weight_ih_l2', 'encoder.rnn.weight_hh_l2', 'encoder.rnn.bias_ih_l2', 'encoder.rnn.bias_hh_l2', 'encoder.rnn.weight_ih_l2_reverse', 'encoder.rnn.weight_hh_l2_reverse', 'encoder.rnn.bias_ih_l2_reverse', 'encoder.rnn.bias_hh_l2_reverse', 'encoder.rnn.weight_ih_l3', 'encoder.rnn.weight_hh_l3', 'encoder.rnn.bias_ih_l3', 'encoder.rnn.bias_hh_l3', 'encoder.rnn.weight_ih_l3_reverse', 'encoder.rnn.weight_hh_l3_reverse', 'encoder.rnn.bias_ih_l3_reverse', 'encoder.r

In [94]:
# Show the parameters again after the non-strict checkpoint load, for
# comparison with the earlier dump.
model.state_dict()

OrderedDict([('cnn.weight',
              tensor([[[[-2.1007e-03, -8.3343e-02, -7.8526e-04],
                        [ 2.8128e-04,  5.8829e-03, -1.9605e-04],
                        [-1.3582e-03, -8.5892e-02, -4.4249e-04]]],
              
              
                      [[[ 1.4868e-02,  8.4935e-03, -9.5815e-02],
                        [ 4.7094e-03, -1.9104e-02,  1.2676e-02],
                        [-2.7919e-02,  8.4048e-03, -4.4409e-01]]],
              
              
                      [[[ 7.1305e-02, -1.6534e-01,  9.3900e-02],
                        [-3.5325e-02,  5.1871e-02, -1.0762e-02],
                        [-3.2981e-02,  5.2358e-02, -3.1448e-03]]],
              
              
                      [[[ 1.1526e-02,  2.9924e-01, -1.2135e-01],
                        [ 8.0982e-04,  1.6567e-01,  1.5670e-02],
                        [ 4.9740e-03,  2.3941e-01,  1.0255e-01]]],
              
              
                      [[[-6.6552e-02, -1.6664e-01, -5.7732e-04],

In [63]:
from collections import OrderedDict

# Keep only the first 28 entries of the state dict (in registration order)
# and store them as the sub-model checkpoint.
full_state = test_asr_model.state_dict()
submodel = OrderedDict(itertools.islice(full_state.items(), 28))
torch.save(submodel, '../checkpoints/sub_model_asr.pth')

In [106]:
# The `kernel_size=` / `padding=` keywords used below are the ones accepted by
# the packaged models.cnn.ResidualCNN. This notebook also defines its own
# ResidualCNN (taking `kernel`, without `padding`), which shadows the import on
# a top-to-bottom run and would raise TypeError here, so bind the packaged
# class explicitly.
from models.cnn import ResidualCNN as PackagedResidualCNN

# Feature bins left after the strided 3x3 convolution, computed from the real
# input (Conv2d: floor((n + 2*pad - kernel) / stride) + 1). The hard-coded
# `n_feats` only matches one particular MFCC size, so the layer sizes are
# derived from `X` instead.
conv_feats = (X.size(-1) + 2 * 1 - 3) // stride + 1

cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3//2)
rescnn_layers = nn.Sequential(*[
            PackagedResidualCNN(32, 32, kernel_size=3, stride=1, padding=1, dropout=dropout, n_feats=conv_feats)
            for _ in range(3)
        ])
fully_connected = nn.Linear(conv_feats*32, rnn_dim)

In [107]:
# Push the batch through the front-end one stage at a time, printing the
# shape after each step.
X1 = cnn(X)
print(X1.size())

# Swap the last two axes before the residual stack.
X1 = X1.swapaxes(2, 3)
X2 = rescnn_layers(X1)
print(X2.size())

# Merge axes 1 and 2 into a single axis.
sizes = X2.size()
X2 = X2.view(sizes[0], -1, sizes[3])
print(X2.size())

# Put the former last axis in the middle so the linear layer acts on the merged axis.
X2 = X2.swapaxes(1, 2)
print(X2.size())

X3 = fully_connected(X2)
print(X3.size())

torch.Size([2, 32, 158, 30])
torch.Size([2, 32, 30, 158])
torch.Size([2, 960, 158])
torch.Size([2, 158, 960])
torch.Size([2, 158, 256])


In [108]:
# Encoder from models.encoder; rnn_dim is passed for both of the first two arguments.
encoder = Encoder(
    rnn_dim,
    rnn_dim,
    n_rnn_layers,
    dropout=dropout,
    bidirectional=bidirectional,
    rnn_type='lstm',
)


In [109]:
# Shape of the tensor that is fed to the encoder next.
X3.shape

torch.Size([2, 158, 256])

In [110]:
# Run the encoder and keep the last element of `hidden` for the attention step.
# NOTE(review): if Encoder returns an LSTM-style (h_n, c_n) tuple, hidden[-1]
# is the cell state rather than the hidden state - confirm this is intended.
output, hidden = encoder(X3)
temp = hidden[-1]
output.shape, temp.shape

(torch.Size([2, 158, 512]), torch.Size([4, 2, 256]))

In [111]:
# Swap the first two axes of the encoder state. It is derived from `hidden`
# rather than from `temp` itself so that re-running this cell is idempotent
# and does not flip the axes back.
temp = hidden[-1].transpose(0, 1)
temp.shape

torch.Size([2, 4, 256])

In [112]:
# Second argument is twice the number of RNN layers.
n_states = n_rnn_layers * 2
attention = Attention(hidden_size, n_states)

In [113]:
# Apply the attention module to the re-ordered encoder state.
attr_output = attention(temp)
attr_output.size()

torch.Size([8, 256]) torch.Size([256, 1])


torch.Size([2, 256])

In [114]:
# Two-layer classification head: rnn_dim -> rnn_dim // 2 -> n_class.
half_dim = rnn_dim // 2
classifier = nn.Sequential(
    nn.Linear(rnn_dim, half_dim),
    nn.GELU(),
    nn.Dropout(dropout),
    nn.Linear(half_dim, n_class),
)

In [115]:
# Class scores for each item of the attention output.
logits = classifier(attr_output)
logits.size()

torch.Size([2, 18])

In [73]:
# Class indices for the two batch items, as 64-bit integers.
target = torch.tensor((9, 11), dtype=torch.int64)

In [74]:
# Cross-entropy loss with its default settings.
criterion = torch.nn.CrossEntropyLoss()

In [75]:
# Loss of the classifier scores against the target class indices.
criterion(input=logits, target=target)

tensor(5.2402, grad_fn=<NllLossBackward0>)