In [1]:
import os
import random
from io import open
import unicodedata
import string
import re

import torch
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from pathlib import Path
import kaldi_io
import sys
import gc
import json
import time
from data_4 import AudioDataLoader, AudioDataset, pad_list

%matplotlib inline

# Module-level flag; no visible cell reads it (presumably a debug-print toggle — TODO confirm).
print_use = False

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



In [2]:
# Relative-path alternatives kept for reference (currently disabled):
# train_json = "data.json"
# test_json = "data_test.json"
# Absolute, machine-specific paths to the train / test data.json manifests.
train_json="/home1/meichaoyang/workspace/git/Listen-Attend-Spell/egs/aishell2/dump/train/deltatrue/data.json"
test_json="/home1/meichaoyang/workspace/git/Listen-Attend-Spell/egs/aishell2/dump/test/deltatrue/data.json"
# batch_size, maxlen_in and maxlen_out are passed to AudioDataset in the cells below;
# num_workers is passed to AudioDataLoader. Their exact semantics live in data_4
# (not visible here) — TODO confirm units of maxlen_in / maxlen_out.
batch_size = 32
maxlen_in = 100000
maxlen_out = 30
num_workers = 4

In [3]:
# Expose only GPU 0 to this process through the CUDA_VISIBLE_DEVICES environment variable.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Use the first visible CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
# Training data. batch_size is handed to AudioDataset while the loader itself is
# created with batch_size=1, so minibatching presumably happens inside the
# dataset — TODO confirm against data_4.
# NOTE(review): no shuffle argument is passed here, whereas the test loader is
# built with shuffle=True — confirm that this asymmetry is intended.
tr_dataset = AudioDataset(train_json, batch_size,
                              maxlen_in, maxlen_out)
tr_loader = AudioDataLoader(tr_dataset, batch_size=1, num_workers=num_workers)

In [5]:
# Test data, built with the same AudioDataset / AudioDataLoader settings as the
# training data except for the manifest path and shuffle=True.
# NOTE(review): shuffling the test loader (but not the training loader) looks
# inverted — confirm this is intended.
te_dataset = AudioDataset(test_json, batch_size,
                              maxlen_in, maxlen_out)
te_loader = AudioDataLoader(te_dataset, batch_size=1, num_workers=num_workers, shuffle=True)

In [6]:
# Build the output vocabulary: the first whitespace-separated token of every
# non-empty line in the character list file, in file order.
char_list = []
char_list_path = "/home1/meichaoyang/workspace/git/Listen-Attend-Spell/egs/aishell2/data/lang_1char/train_chars.txt"
# Decode explicitly as UTF-8 so the result does not depend on the machine's
# locale (the file presumably holds non-ASCII characters — TODO confirm encoding).
with open(char_list_path, "r", encoding="utf-8") as f:
    for line in f:
        data = line.split()
        if not data:
            # Blank / whitespace-only line: data[0] would raise IndexError.
            continue
        char_list.append(data[0])

### 模型搭建

In [9]:
class Encoder(nn.Module):
    """LSTM encoder that maps padded feature sequences to per-frame vocabulary scores.

    Args:
        input_size (int): number of features per frame (fed to ``nn.LSTM``).
        hidden_size (int): LSTM hidden size per direction.
        vocab_size (int): size of the output score vector for each frame.
        bidirectional (bool): run the LSTM in both directions.
        dropout (float): dropout value forwarded to ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_size, vocab_size, bidirectional=False, dropout=0.0):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        # Registered but not applied in forward() yet.
        self.cnn1 = nn.Conv2d(1, 128, (5, 3), stride=(2, 1), padding=(4, 2))

        # input_size is a plain int (it is also handed to nn.LSTM below), so the
        # previous ``input_size.shape[...]`` lookups raised AttributeError.
        # Only the width of the conv output can be derived from it; this assumes
        # the feature axis is the last (width) axis of the conv input — TODO
        # confirm once cnn1 is wired into forward(). The height depends on the
        # per-batch sequence length and is therefore unknown here.
        self.cnn1_out_shape_h = None
        self.cnn1_out_shape_w = (input_size + 2 * 2 - 3) // 1 + 1

        self.rnn = nn.LSTM(input_size, hidden_size,
                           batch_first=True,
                           dropout=dropout,
                           bidirectional=bidirectional)

        # The LSTM emits hidden_size features per direction; size the first
        # linear layer accordingly instead of always assuming two directions.
        num_directions = 2 if bidirectional else 1
        self.mlp = nn.Sequential(
            nn.Linear(self.hidden_size * num_directions,
                      self.hidden_size),
            nn.Tanh(),
            nn.Linear(self.hidden_size, self.vocab_size))

    def forward(self, padded_input, input_lengths):
        """Encode a padded batch.

        Args:
            padded_input: batch-first tensor, (batch, max_len, input_size).
            input_lengths: valid length of every sequence in the batch; must be
                sorted in decreasing order (pack_padded_sequence default).

        Returns:
            Tensor of shape (batch, max_len, vocab_size) with unnormalised scores.
        """
        total_length = padded_input.size(1)  # keep the padded length on the way out

        # pack_padded_sequence requires tensor lengths to live on the CPU,
        # while callers may have moved them to the GPU together with the inputs.
        if torch.is_tensor(input_lengths):
            input_lengths = input_lengths.cpu()

        packed_input = pack_padded_sequence(padded_input, input_lengths,
                                            batch_first=True)
        packed_output, _ = self.rnn(packed_input)
        rnn_output, _ = pad_packed_sequence(packed_output,
                                            batch_first=True,
                                            total_length=total_length)
        return self.mlp(rnn_output)

    def initHidden(self):
        """Return a zero tensor of shape (1, 1, hidden_size) on the notebook-level ``device``."""
        return torch.zeros(1, 1, self.hidden_size, device=device)


In [10]:
class CTC_Model(nn.Module):
    """Encoder followed by a CTC loss; forward() returns the loss directly."""

    def __init__(self, input_size, hidden_size, vocab_size, bidirectional=True, dropout=0.0):
        super().__init__()

        self.vocab_size = vocab_size
        self.encoder = Encoder(input_size, hidden_size, vocab_size, bidirectional, dropout)
        self.ctc_loss = nn.CTCLoss()

    def forward(self, padded_input, input_lengths, padded_target, target_lengths):
        """Compute the CTC loss for one padded batch.

        The encoder output is swapped on its first two axes before the
        log-softmax over the last axis, and the result is passed to
        ``nn.CTCLoss`` together with the targets and both length tensors.
        """
        scores = self.encoder(padded_input, input_lengths)
        log_probs = scores.transpose(0, 1).log_softmax(dim=2)
        return self.ctc_loss(log_probs, padded_target, input_lengths, target_lengths)

### 训练

In [11]:
import time
import math


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at ``since``.

    ``percent`` is the fraction of work completed so far (must be non-zero);
    the remaining time is extrapolated linearly from the elapsed time.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [12]:
def trainIters(model, epoch, optimizier, print_every=10, plot_every=10, learning_rate=0.01):
    """Train ``model`` on the notebook-level ``tr_loader`` for ``epoch`` epochs.

    Args:
        model: module whose forward(padded_input, input_lengths, padded_target,
            target_lengths) returns a scalar loss.
        epoch (int): number of passes over ``tr_loader``.
        optimizier: optimizer that updates ``model``'s parameters.
        print_every (int): print the running average loss every this many iterations.
        plot_every (int): record the running average loss every this many iterations.
        learning_rate (float): unused; kept for call compatibility (the
            learning rate is set on the optimizer).

    Returns:
        list[float]: average losses recorded every ``plot_every`` iterations.
    """
    start = time.time()
    n_iters = len(tr_dataset)
    total_iters = n_iters * epoch
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    # Move every batch to wherever the model lives instead of assuming CUDA.
    model_device = next(model.parameters()).device

    for e in range(epoch):
        for i, data in enumerate(tr_loader):
            padded_input, input_lengths, padded_target, target_lengths = data
            padded_input = padded_input.to(model_device)
            input_lengths = input_lengths.to(model_device)
            padded_target = padded_target.to(model_device)
            target_lengths = target_lengths.to(model_device)

            loss = model(padded_input, input_lengths, padded_target, target_lengths)
            loss_value = float(loss)
            print_loss_total += loss_value
            plot_loss_total += loss_value

            optimizier.zero_grad()
            loss.backward()
            optimizier.step()

            if (i + 1) % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                progress = (e * n_iters + i + 1) / total_iters
                txt = 'Epoch %d | Iter %d | %s (%d %d%%) %.4f' % (
                    e + 1, i + 1, timeSince(start, progress),
                    (i + 1), progress * 100, print_loss_avg)
                print(txt)

            # Parenthesised: ``i+1 % plot_every`` parses as ``i + (1 % plot_every)``,
            # which is never zero here, so nothing was ever recorded.
            if (i + 1) % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

    return plot_losses


In [None]:
# Model / optimisation hyper-parameters.
input_size = 240  # feature size per frame; matches the last axis of the loader batches

hidden_size = 256
vocab_size = len(char_list)
# embedding_dim, sos_id, eos_id, momentum and IGNORE_ID are not used by the
# visible CTC code; they are kept as notebook globals.
embedding_dim = 512
sos_id = 0
eos_id = 1
learning_rate = 1e-3
momentum = 0
l2 = 1e-5

IGNORE_ID=-1


# vocab_size is a required argument of CTC_Model; omitting it raises TypeError.
model = CTC_Model(input_size, hidden_size, vocab_size)
print(model)
# Use the device selected earlier in the notebook instead of assuming CUDA.
model.to(device)

optimizier = torch.optim.Adam(model.parameters(),
                                     lr=learning_rate,
#                                      momentum=momentum,
                                     weight_decay=l2)
trainIters(model, 20,optimizier, print_every=200)

CTC_Model(
  (encoder): Encoder(
    (rnn): LSTM(240, 256, batch_first=True, bidirectional=True)
  )
  (ctc_loss): CTCLoss()
)
Epoch 1 | Iter 200 | 0m 23s (- 1123m 14s) (200 0%) 202.0096
Epoch 1 | Iter 400 | 0m 43s (- 1049m 37s) (400 0%) 171.0560


In [31]:
# Scratch experiment: build a stand-alone Conv2d to inspect how kernel size,
# stride and padding change the output shape.
in_channels=16
out_channels=33
kernel_size=(3, 5)
stride=(2, 1)
padding=(1, 2)

m = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)

In [32]:
# Random 4-D batch with 16 channels (matching in_channels of the scratch Conv2d).
# NOTE(review): this name shadows the built-in input() for the rest of the session.
input = torch.randn(20, 16, 50, 100)

In [33]:
# Show the output shape of the scratch Conv2d applied to the random batch.
m(input).shape

torch.Size([20, 33, 25, 100])

In [16]:
# Scratch constants for experimenting with CTC-style tensors.
# T, C, N and S presumably stand for input length, number of classes, batch size
# and target length — TODO confirm; only C, N and S are used below.
T = 50
C = 20
N = 16
S = 30

# NOTE(review): target_lengths is only defined by this disabled line, yet a
# later cell displays it; on a fresh kernel that cell raises NameError.
# target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long)
# Random integer targets in [1, C) with shape (N, S).
target = torch.randint(low=1, high=C, size=(N, S), dtype=torch.long)

In [10]:
# Display target_lengths. NOTE(review): the only visible assignment of this
# name is commented out, so this relies on leftover kernel state.
target_lengths

tensor([33, 11, 26,  5, 46, 18, 27, 38,  2, 15, 13, 43, 14, 33,  4,  5])

In [18]:
# Display the shape of the random target tensor.
target.shape

torch.Size([16, 30])

In [19]:
# Walk the whole training loader and print the shape of the first element of
# every batch (the element the training loop unpacks as padded_input).
for i, (data) in enumerate(tr_loader):
    print(data[0].shape)

torch.Size([16, 1930, 240])
torch.Size([32, 1615, 240])
torch.Size([32, 1450, 240])
torch.Size([32, 1335, 240])
torch.Size([32, 1277, 240])
torch.Size([32, 1245, 240])
torch.Size([32, 1219, 240])
torch.Size([16, 1196, 240])
torch.Size([32, 1185, 240])
torch.Size([32, 1169, 240])
torch.Size([32, 1156, 240])
torch.Size([32, 1144, 240])
torch.Size([32, 1131, 240])
torch.Size([16, 1118, 240])
torch.Size([32, 1114, 240])
torch.Size([32, 1106, 240])
torch.Size([32, 1099, 240])
torch.Size([32, 1092, 240])
torch.Size([32, 1084, 240])
torch.Size([32, 1079, 240])
torch.Size([32, 1073, 240])
torch.Size([32, 1068, 240])
torch.Size([32, 1061, 240])
torch.Size([32, 1057, 240])
torch.Size([32, 1051, 240])
torch.Size([32, 1047, 240])
torch.Size([32, 1041, 240])
torch.Size([32, 1037, 240])
torch.Size([32, 1033, 240])
torch.Size([32, 1028, 240])
torch.Size([32, 1023, 240])
torch.Size([32, 1019, 240])
torch.Size([32, 1014, 240])
torch.Size([32, 1012, 240])
torch.Size([32, 1009, 240])
torch.Size([32, 1007

KeyboardInterrupt: 

In [24]:
# Print the third element of batches 6707 and 6708 (0-based) — the element the
# training loop unpacks as padded_target — then stop iterating.
for i, (data) in enumerate(tr_loader):
    if(i>6706):
        print(data[2])
    if(i>6707):
        break

tensor([[  64, 3872, 4896,  874, 1727, 2970, 3445,  126,   76, 4620, 1549, 4129,
            0,    0,    0,    0,    0,    0,    0,    0],
        [4198, 3607, 2192, 1826, 3891, 1493, 3071,   87, 4695, 3561,  887, 2807,
         4348, 1967, 4186, 3385,    0,    0,    0,    0],
        [ 963, 2095, 2022,  577, 1398, 2982,  874, 1029,  385, 3071,  356, 2908,
          153, 5042,    0,    0,    0,    0,    0,    0],
        [  87,  886, 1788, 1802, 2989,   58,  176, 1195, 1370, 5186, 2034,  626,
          577,   73,    0,    0,    0,    0,    0,    0],
        [ 525, 2080,   63,  525,   58, 1990, 3746,  525,   58, 2080,   58, 1990,
         4516, 4099,  536, 4964,  393, 4363,    0,    0],
        [1670,  886, 2360,  121,  785,  370, 2877,  525,  359, 1398, 4815,   65,
         4862,  123, 4512, 3069,  428,  101,  871,  525],
        [  64, 2526, 1366, 2404, 2211, 1714,  587, 1366,  901,   58,  613, 4805,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 218, 5093,  128, 

In [None]:
IC0134W0228
IC0134W0337
IC0134W0382
