In [1]:
import torch
import torchaudio
import torchtext
import torchaudio.functional as F
import torchaudio.transforms as T
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from audio_augmentations import *

import os, re, random
import numpy as np
import sklearn
import itertools

import pickle
from tqdm.auto import tqdm
from IPython.display import clear_output
import IPython.display as ipd
import gc
import matplotlib.pyplot as plt
import wandb

import sys
sys.path.append('..')
from models.cnn import ResidualCNN
from models.encoder import Encoder
from models.attention import Attention
from models.model import Speech_recognition_model

# Record the library versions this notebook was executed with, one per line.
print(torch.__version__, torchaudio.__version__, sep="\n")

  from .autonotebook import tqdm as notebook_tqdm


2.0.0
2.0.1


In [2]:
# Seed every RNG used here (Python, NumPy, PyTorch) from a single value,
# in the same order as before: random, then NumPy, then torch.
SEED = 123456
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

In [3]:
def load_waveform(path, target_sample_rate=16000):
    """Load an audio file and return its waveform at ``target_sample_rate``.

    ``torchaudio.load`` returns the waveform together with the file's native
    sample rate. The MFCC transform in this notebook is built with
    ``sample_rate=16000``, so a file stored at any other rate is resampled
    here instead of being passed on with its rate silently ignored.

    Args:
        path: Location of the audio file to read.
        target_sample_rate: Sample rate in Hz that the returned waveform
            should have. Defaults to 16000.

    Returns:
        The waveform tensor from ``torchaudio.load``, resampled only when the
        file's native rate differs from ``target_sample_rate``.
    """
    waveform, native_sample_rate = torchaudio.load(path)
    if native_sample_rate != target_sample_rate:
        waveform = torchaudio.functional.resample(
            waveform, native_sample_rate, target_sample_rate
        )
    return waveform


# Two example recordings used to trace tensor shapes through the model.
data_1 = load_waveform('../data/user_353665394/Сменить_3_1.wav')
data_2 = load_waveform('../data/user_192546140/Вверх_6_3_new_1.wav')

In [4]:
# Framing: 30 ms analysis window with a 10 ms step (at 16 kHz).
sample_rate = 16000
n_fft = 480        # 30 ms * 16 kHz
win_length = None  # None -> the mel spectrogram falls back to n_fft
hop_length = 160   # 10 ms * 16 kHz
n_mels = 256
n_mfcc = 256

# win_length is forwarded explicitly so that editing the variable above
# actually changes the transform; with None the result is unchanged.
mfcc_transform = T.MFCC(
    sample_rate=sample_rate,
    n_mfcc=n_mfcc,
    melkwargs={
        "n_fft": n_fft,
        "win_length": win_length,
        "hop_length": hop_length,
        "n_mels": n_mels,
        "mel_scale": "htk",
    },
)



In [21]:
# Sanity check: size of the raw MFCC tensor for the first clip,
# before any transposing or squeezing.
mfcc_transform(data_1).size()

torch.Size([1, 256, 316])

In [5]:
# Turn each clip into a 2-D matrix: swap the last two MFCC axes, then drop the
# leading size-1 axis. squeeze(0) removes only that leading axis; a bare
# squeeze() would also collapse any other axis that happens to have size 1
# (e.g. a clip that yields a single frame), breaking the padding step below.
pre_X1 = mfcc_transform(data_1).transpose(1, 2).squeeze(0)
pre_X2 = mfcc_transform(data_2).transpose(1, 2).squeeze(0)
# pre_X1.size(0) and pre_X2.size(0) are the unpadded lengths of the two clips,
# should a later stage need them.
pre_X1.shape, pre_X2.shape

(torch.Size([316, 256]), torch.Size([216, 256]))

In [6]:
# Stack the two variable-length feature matrices into one zero-padded batch
# (batch axis first), then insert a singleton axis at position 1 so the batch
# matches the single input channel of the 2-D convolution.
padded_feats = torch.nn.utils.rnn.pad_sequence(
    [pre_X1, pre_X2],
    batch_first=True,
    padding_value=0,
)
X = padded_feats[:, None]
X.shape

torch.Size([2, 1, 316, 256])

In [7]:
# Convolutional front end.
n_cnn_layers, stride, n_feats = 3, 2, 128

# Recurrent encoder.
rnn_dim, n_rnn_layers, bidirectional = 512, 5, True

# Regularisation and number of output classes.
dropout, n_class = 0.1, 18

In [8]:
# Channel count and kernel size shared by the stem convolution and the
# residual blocks.
conv_channels = 32
conv_kernel = 3

# Stem convolution: 1 input channel -> conv_channels feature maps.
# padding = kernel // 2, combined with the configured stride.
cnn = nn.Conv2d(1, conv_channels, conv_kernel, stride=stride, padding=conv_kernel // 2)

# Residual stack. Its depth comes from n_cnn_layers rather than a hard-coded 3,
# so changing the hyperparameter actually changes the model.
rescnn_layers = nn.Sequential(*[
    ResidualCNN(conv_channels, conv_channels, kernel=conv_kernel, stride=1,
                dropout=dropout, n_feats=n_feats)
    for _ in range(n_cnn_layers)
])

# Projects the flattened (n_feats * conv_channels) features to the RNN width.
fully_connected = nn.Linear(n_feats * conv_channels, rnn_dim)

In [9]:
# Stem convolution over the padded batch.
X1 = cnn(X)
print(X1.shape)

# Swap the last two axes before the residual stack.
X1 = X1.permute(0, 1, 3, 2)
X2 = rescnn_layers(X1)
print(X2.shape)

# Merge axes 1 and 2 into a single axis, giving a 3-D tensor.
sizes = X2.size()
batch_size, n_channels, n_rows, n_cols = sizes
X2 = X2.view(batch_size, n_channels * n_rows, n_cols)
print(X2.shape)

# Move the merged axis last so the linear layer operates on it.
X2 = X2.permute(0, 2, 1)
print(X2.shape)

# Project down to the RNN input width.
X3 = fully_connected(X2)
print(X3.shape)

torch.Size([2, 32, 158, 128])
torch.Size([2, 32, 128, 158])
torch.Size([2, 4096, 158])
torch.Size([2, 158, 4096])
torch.Size([2, 158, 512])


In [10]:
# Recurrent encoder built from the hyperparameters above; the first two
# positional arguments are both rnn_dim.
encoder = Encoder(
    rnn_dim,
    rnn_dim,
    n_rnn_layers,
    rnn_type='lstm',
    bidirectional=bidirectional,
    dropout=dropout,
)


In [11]:
# Size of the tensor that is about to be fed to the encoder.
X3.shape

torch.Size([2, 158, 512])

In [12]:
# Run the encoder and keep the last element of its second return value.
# NOTE(review): if Encoder passes through nn.LSTM's (h_n, c_n) tuple, then
# hidden[-1] is the cell state c_n rather than the hidden state h_n -- confirm
# which one the attention stage is meant to consume.
output, hidden = encoder(X3)
temp = hidden[-1]
output.shape, temp.shape

(torch.Size([2, 158, 1024]), torch.Size([10, 2, 512]))

In [13]:
# Swap the first two axes of the encoder state so the batch axis comes first.
# Derived from hidden[-1] rather than from temp itself, so re-running this
# cell always gives the same result instead of flipping the axes back.
temp = hidden[-1].transpose(0, 1)
temp.shape

torch.Size([2, 10, 512])

In [14]:
# Derive the attention sizes from the encoder configuration instead of
# repeating the literals 512 and 10: the encoder state has
# n_rnn_layers * n_directions entries of width rnn_dim per sample.
# NOTE(review): presumably Attention takes (feature size, number of steps) --
# confirm against models.attention.
n_directions = 2 if bidirectional else 1
attention = Attention(rnn_dim, n_rnn_layers * n_directions)

In [15]:
# Apply attention to the batch-first encoder state and show the result size.
attr_output = attention(temp)
attr_output.size()

torch.Size([2, 512])

In [16]:
# Classification head: rnn_dim -> rnn_dim // 2 -> n_class, with GELU and
# dropout between the two linear layers.
hidden_features = rnn_dim // 2
classifier_layers = [
    nn.Linear(rnn_dim, hidden_features),
    nn.GELU(),
    nn.Dropout(dropout),
    nn.Linear(hidden_features, n_class),
]
classifier = nn.Sequential(*classifier_layers)

In [17]:
# Map the attention output to one raw score per class for each clip.
# The classifier contains a Dropout layer and no .eval() call is visible in
# this section, so re-running this cell may display different values.
logits = classifier(attr_output)
logits

tensor([[ 0.1196, -0.1270,  0.3311,  0.1149,  0.4776, -0.0073,  0.1990, -0.2371,
         -0.0645, -0.1262,  0.1579, -0.0948, -0.6510,  0.3478,  0.0592, -0.0258,
          0.5294,  0.5974],
        [ 0.1298, -0.1323, -0.2997, -0.0238, -0.1419, -0.2612,  0.1929,  0.0642,
         -0.2821, -0.3324, -0.1738, -0.3343, -0.0517,  0.5660, -0.0455, -0.2841,
         -0.1107, -0.0975]], grad_fn=<AddmmBackward0>)

In [18]:
# One integer class index per clip in the batch (64-bit integers, as
# CrossEntropyLoss expects for class-index targets).
target = torch.tensor((9, 11), dtype=torch.int64)

In [19]:
# Cross-entropy over raw logits, averaged across the batch (the default
# reduction, spelled out here).
criterion = nn.CrossEntropyLoss(reduction="mean")

In [20]:
# Loss of the untrained model on the two-clip batch.
loss = criterion(logits, target)
loss

tensor(3.1547, grad_fn=<NllLossBackward0>)