## Feature engineering

In [8]:
import utils
import scipy

In [9]:
import torch 
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn.functional as F

In [10]:
# tensorboard 장착
from tensorboardX import SummaryWriter

In [11]:
writer = SummaryWriter()

In [12]:
# audio configuration for 푸리에 트랜스폼
audio_conf = {}
audio_conf["sample_rate"] = 16000 # dataset의 sample rate는 16000
audio_conf["window_size"] = 0.02
audio_conf["window_stride"] = 0.01
audio_conf["window"] = scipy.signal.hamming

In [13]:
# train 파라미터로 7:3 변환
dset_train = utils.SpectogramDataset(audio_conf,'./data/train/audio/',20,train=True)
train_loader = utils.AudioDataLoader(dset_train,20,4) # dataset, batch size, number of workers
dset_test = utils.SpectogramDataset(audio_conf,'./data/train/audio/',20,train=False)
test_loader = utils.AudioDataLoader(dset_test,20,4) # dataset, batch size, number of workers

In [14]:
len(dset_train)

44080

In [15]:
# index로 변환하기 위한 함수
files = {'yes':0,'no':1,'up':2,'down':3,'left':4,'right':5,'on':6,'off':7,'stop':8,'go':9, 'bed':10,'bird':11,'cat':12,'dog':13,'eight':14,
'five':15,'four':16,'happy':17,'house':18,'marvin':19,'nine':20,'one':21,'seven':22,'shella':23,'six':24,'three':25,'tree':26,'two':27,'wow':28,'zero':29,
}

In [143]:
# batch_first: If True, then the input and output tensors are provided as (batch, seq, feature)

In [14]:
class DNN(nn.Module):
    def __init__(self,input_size, output_size, batch_size):
        super(DNN,self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.fc1 = nn.Linear(self.input_size,8192)
        self.b1 = nn.BatchNorm1d(8192)
        self.fc2 = nn.Linear(8192,2048)
        self.b2 = nn.BatchNorm1d(2048)
        self.fc3 = nn.Linear(2048,512)
        self.b3 = nn.BatchNorm1d(512)
        self.fc4 = nn.Linear(512,128)
        self.b4 = nn.BatchNorm1d(128)
        self.fc5 = nn.Linear(128,30)
        self.b5 = nn.BatchNorm1d(30)
    def forward(self, x):
        out = F.relu(self.b1(self.fc1(x)))
        out = F.relu(self.b2(self.fc2(out)))
        out = F.relu(self.b3(self.fc3(out)))
        out = F.relu(self.b4(self.fc4(out)))
        out = self.fc5(out)
        out = F.softmax(out)
        return out

In [20]:
class LSTM(nn.Module):
    def __init__(self,input_size,hidden_size, num_hidden_layer,batch_size):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_hidden_layer = num_hidden_layer
        self.batch_size = batch_size
        self.lstm = nn.LSTM(input_size,hidden_size,num_hidden_layer,bidirectional = True,batch_first = True) # input_size, hidden_size, num_layers
        self.fc1 = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size,30)
        self.softmax = nn.Softmax()
    def forward(self, x):
        h0 = Variable(torch.zeros(self.num_hidden_layer * 2,self.batch_size,self.hidden_size)).cuda()
        c0 = Variable(torch.zeros(self.num_hidden_layer * 2,self.batch_size,self.hidden_size)).cuda()
        x = x.transpose(1,2)
        out, _ = self.lstm(x, (h0,c0))
        out = out[:,-1,:]
        out = F.relu(self.fc1(out))
        out = self.fc2(out)
        out = self.softmax(out)
        return out

In [15]:
class SequenceWise(nn.Module):
    """
    Collapses input of dim T*N*H to (T*N)*H, and applies to a module.
    Allows handling of variable sequence lengths and minibatch sizes.
    :param module: Module to apply input to.
    """
    """
    총 두 번 사용된다. CNN의 output이 RNN으로 들어갈 때. RNN의 output이 FC로 들어갈 때. 
    그 필요성은 위의 딥 스피치 모델 그림에서 데이터가 transpose됨을 보면 알 수 있다.
    """
    def __init__(self, module):
        super(SequenceWise, self).__init__()
        self.module = module

    def forward(self, x):
        t, n = x.size(0), x.size(1)
        x = x.view(t * n, -1)
        x = self.module(x)
        x = x.view(t, n, -1)
        return x

"""
class InferenceBatchSoftmax(nn.Module):
#    이건 굉장히 typical 한 minibatch softmax.    각 data의 softmax를 torch.stack을 해야한다는 부분이 했심.    QQQQQQQQQQQ 왜 if not self.training인가. 
    def forward(self, input_):
        if not self.training:
            batch_size = input_.size()[0]
            return torch.stack([F.log_softmax(input_[i]) for i in range(batch_size)], 0)
        else:
            return input_
"""        

class BatchRNN(nn.Module):
    """
    batchnormalization + RNN 구현.
    굳이 이런식으로 붙이는 이유는 sequencewise 때문에. 
    input : 
        input_size - T * N * H 에서 H 의 값. 여기서 T 는 time, N 은 mini batch 갯수, H는 P * C(convolution layers)
        hidden_size - output의 값. 
                      여기서 output은 input과 같지 않냐! 고 생각하는데, 단순히 hi = W1 * xi + W2 * xi 라고 했을 때 matrix의 행의 크기. 
                      즉, T * N * H에서 H가 얼마나 압축될 것이냐의 의미.
    """
    def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=True, batch_norm=True):
        super(BatchRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None
        self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size,
                            bidirectional=bidirectional, bias=False)
        self.num_directions = 2 if bidirectional else 1 # why do I need this?

    def forward(self, x):
        if self.batch_norm is not None:
            x = self.batch_norm(x)
        h0 = Variable(torch.zeros(1 * 2, 20, self.hidden_size)).cuda()# batch size = 20
        c0 = Variable(torch.zeros(1 * 2, 20, self.hidden_size)).cuda()
        x, _ = self.rnn(x,(h0,c0))
        if self.bidirectional:
            x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1)  # (TxNxH*2) -> (TxNxH) by sum
            return x

In [17]:
import math
"""
deep speech model. 
Input : 
    없어도 된다!
"""
class DeepSpeech(nn.Module):
    def __init__(self, rnn_type=nn.LSTM, num_classes=30 , rnn_hidden_size=768, audio_conf=None,
                 bidirectional=True):
        super(DeepSpeech, self).__init__()

        # model metadata needed for serialization/deserialization
        if audio_conf is None:
            audio_conf = {}
        self._hidden_size = rnn_hidden_size
#        self._hidden_layers = nb_layers
        self._rnn_type = rnn_type
        self._audio_conf = audio_conf or {}

        sample_rate = self._audio_conf.get("sample_rate", 16000)
        window_size = self._audio_conf.get("window_size", 0.02)

        self.conv = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True)
        )
        # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1
        rnn_input_size = int(math.floor((sample_rate * window_size) / 2) + 1)
        rnn_input_size = int(math.floor(rnn_input_size - 41) / 2 + 1)
        rnn_input_size = int(math.floor(rnn_input_size - 21) / 2 + 1)
        rnn_input_size *= 32

        rnns = []
        self.rnn = BatchRNN(input_size=rnn_input_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type,
                       bidirectional=bidirectional, batch_norm=True)

        fully_connected = nn.Sequential(
            nn.BatchNorm1d(rnn_hidden_size),
            nn.Linear(rnn_hidden_size, num_classes, bias=False)
        )
        self.fc = nn.Sequential(
            SequenceWise(fully_connected),
        )

    def forward(self, x):
        x = self.conv(x)

        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # Collapse feature dimension
        x = x.transpose(1, 2).transpose(0, 1).contiguous()  # TxNxH
        x = self.rnn(x)

        x = self.fc(x)
        x = x.transpose(0, 1)
        x = F.log_softmax(x[:,-1,:])
        return x

In [16]:
def toidx(key):
    """
    file index를 onehot으로 바꿔줘야 한다.
    input - str. key
    output - list. onehot
    """
    # index로 변환하기 위한 함수
    files = {'yes':0,'no':1,'up':2,'down':3,'left':4,'right':5,'on':6,'off':7,'stop':8,'go':9, 'bed':10,'bird':11,'cat':12,'dog':13,'eight':14,
    'five':15,'four':16,'happy':17,'house':18,'marvin':19,'nine':20,'one':21,'seven':22,'shella':23,'six':24,'three':25,'tree':26,'two':27,'wow':28,'zero':29,
    }
    out = files[key]
    return out

In [18]:
def tolistidx(lis):
    out = []
    for idx in lis:
        out.append(toidx(idx))
    return out

In [18]:
model = DNN(161*101,30,20).cuda()

In [60]:
model = LSTM(161,128,5,20).cuda()

In [19]:
model = DeepSpeech().cuda()

In [20]:
criterion = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr = 0.001)

In [22]:
# training step DNN
epoch = 5
for i in range(epoch):
    print('epoch :', i)
    for j, (images, labels) in enumerate(train_loader):
        images = Variable(images.squeeze().unsqueeze(0).cuda()).view(20,-1)
        labels = Variable(torch.LongTensor(tolistidx(labels)).cuda())
        
        opt.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        opt.step()
        if(j % 100 == 0):
            print(loss)

epoch : 5


RuntimeError: dimension out of range (expected to be in range of [-2, 1], but got 2)

In [62]:
# training step LSTM
epoch = 5
for i in range(epoch):
    print('epoch :', i)
    for j, (images, labels) in enumerate(train_loader):
        images = Variable(images.squeeze().cuda())
        labels = Variable(torch.LongTensor(tolistidx(labels)).cuda())
        
        opt.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        opt.step()
        if(j % 100 == 0):
            print(loss)

epoch : 0
Variable containing:
 3.4014
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 3.4016
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 3.4019
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 3.4032
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 3.3973
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 3.3975
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 3.4006
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 3.3990
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 3.3882
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 3.4003
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 3.4059
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 3.3965
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 3.4006
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 3.3930
[torch.cuda.

KeyboardInterrupt: 

In [None]:
# train 파라미터로 7:3 변환
dset_train = utils.SpectogramDataset(audio_conf,'./data/train/audio/',20,train=True)
train_loader = utils.AudioDataLoader(dset_train,20,4) # dataset, batch size, number of workers
dset_test = utils.SpectogramDataset(audio_conf,'./data/train/audio/',20,train=False)
test_loader = utils.AudioDataLoader(dset_test,20,4) # dataset, batch size, number of workers

In [None]:
# training step deepspeech
epoch = 50
for i in range(epoch):
    print('epoch :', i)
    for j, (images, labels) in enumerate(train_loader):
        images = Variable(images.squeeze().cuda()).unsqueeze(1)
        labels = Variable(torch.LongTensor(tolistidx(labels)).cuda())
    
        opt.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        opt.step()
        writer.add_scalar('./logs/loss_lstm',loss.data.cpu()[0],(epoch)*len(train_loader) + j)
        if(j % 100 == 0):
            print(loss)
            torch.save(model,'./deepspeech_LSTM_3.pth')

epoch : 0
Variable containing:
 3.7181
[torch.cuda.FloatTensor of size 1 (GPU 0)]



  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Variable containing:
 2.7105
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 2.6737
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 1.7832
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 1.7801
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 1.5782
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 1.3476
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.4715
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.4556
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.7913
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 1.2889
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.6633
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.2955
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.5256
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.6041
[torch.cuda.FloatTenso

Variable containing:
1.00000e-02 *
  6.7568
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  5.4991
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  8.8498
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.2220
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.1002
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  1.0089
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  3.4952
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.1208
[torch.cuda.FloatTensor of size 1 (GPU 0)]

epoch : 5
Variable containing:
1.00000e-02 *
  6.1318
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  1.8631
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  1.4623
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  5.7435
[torch.cuda.FloatTensor

Variable containing:
1.00000e-03 *
  2.2476
[torch.cuda.FloatTensor of size 1 (GPU 0)]

epoch : 9
Variable containing:
 0.3634
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.2148
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.1193
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  3.7543
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  4.9978
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  1.0185
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.1055
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-03 *
  4.5841
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.2894
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-03 *
  8.1137
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  3.2305
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable 

Variable containing:
 0.1818
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  1.7271
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  1.1544
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  1.6743
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  1.1622
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  5.3479
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.1621
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.1153
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.1645
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  1.6243
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 0.1496
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
1.00000e-02 *
  4.7091
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing

In [46]:
# Test the Model DNN
correct = 0
total = 0
for images, labels in test_loader:
    images = Variable(images.squeeze().unsqueeze(0).cuda()).view(20,-1)
    labels = torch.LongTensor(tolistidx(labels)).cuda()
    outputs = model(images)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum()
print(correct/total)

RuntimeError: dimension out of range (expected to be in range of [-2, 1], but got 2)

In [91]:
# Test the Model LSTM
correct = 0
total = 0
for images, labels in test_loader:
    images = Variable(images.squeeze().cuda())
    labels = torch.LongTensor(tolistidx(labels)).cuda()
    outputs = model(images)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum()
print(correct/total)

0.18731501057082453


In [144]:
# Test the Model deep speech
correct = 0
total = 0
for images, labels in test_loader:
    images = Variable(images.squeeze().cuda()).unsqueeze(1)
    labels = torch.LongTensor(tolistidx(labels)).cuda()
    outputs = model(images)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum()
print(correct/total)

0.6967105263157894
