In [1]:
import os
import random
from io import open
import unicodedata
import string
import re

import torch
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from pathlib import Path
import kaldiio
import sys
import gc
import json
import time
from data_4 import AudioDataLoader, AudioDataset, pad_list

%matplotlib inline

print_use = False

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



In [2]:
train_json = "/home1/meichaoyang/workspace/git/espnet/egs/aishell2/asr1/dump/train_sp/deltafalse/data.json"
test_json = "/home1/meichaoyang/workspace/git/espnet/egs/aishell2/asr1/dump/test/deltafalse/data.json"
batch_size = 32
maxlen_in = 1000
maxlen_out = 50
num_workers = 8

In [19]:
os.environ["CUDA_VISIBLE_DEVICES"]="2"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

In [4]:
tr_dataset = AudioDataset(train_json, batch_size,
                              maxlen_in, maxlen_out)
tr_loader = AudioDataLoader(tr_dataset, batch_size=1, num_workers=num_workers)

In [5]:
te_dataset = AudioDataset(test_json, batch_size,
                              maxlen_in, maxlen_out)
te_loader = AudioDataLoader(te_dataset, batch_size=1, num_workers=num_workers, shuffle=True)

In [7]:
char_list = []
char_list_path = "/home1/meichaoyang/workspace/git/espnet/egs/aishell2/asr3/data/lang_1char/train_units.txt"
with open(char_list_path, "r") as f:
    for line in f:
        data = line.split()
        char_list.append(data[0])

### 模型搭建

In [27]:
class Encoder(nn.Module):
    def __init__(self, input_size, rnn_hidden_size, vocab_size, bidirectional=False, dropout=0.0):
        super(Encoder, self).__init__()
        self.rnn_hidden_size = rnn_hidden_size
        self.vocab_size = vocab_size
        
        self.cnn1 = nn.Conv2d(1, 128, (5, 3), stride=(2, 1), padding=(2, 1))
        
        self.cnn1_out_shape_h = (input_size+2*4-5)//2 + 1
#         self.cnn1_out_shape_w = (input_size.shape[1]+2*2-3)/1 + 1
        self.mlp1 = nn.Sequential(
            nn.Linear(128 * input_size,
                      self.rnn_hidden_size*2),
            nn.Tanh(),
            nn.Linear(self.rnn_hidden_size*2, self.rnn_hidden_size))
        
        self.rnn = nn.LSTM(self.rnn_hidden_size, self.rnn_hidden_size, 
                           batch_first=True,
                           dropout=dropout,
                           bidirectional=bidirectional)
        self.mlp2 = nn.Sequential(
            nn.Linear(self.rnn_hidden_size*2,
                      self.rnn_hidden_size),
            nn.Tanh(),
            nn.Linear(self.rnn_hidden_size, self.vocab_size))


    def forward(self, padded_input, input_lengths):
        # padded_input: N * T * D
        # input_lengths: N

        # padded_input: torch.Size([10, 2145, 83])
        # padded_input1: torch.Size([10, 1, 2145, 83])
        # cnn1_output: torch.Size([10, 128, 1073, 83])
        # mlp1_input: torch.Size([10, 1073, 128, 83])


        N = padded_input.shape[0]
#         print("padded_input:",padded_input.shape)
        padded_input1 = padded_input.unsqueeze(1) 
#         print("padded_input1:",padded_input1.shape)
        cnn1_output = self.cnn1(padded_input1)
#         print("cnn1_output:",cnn1_output.shape)
        
        mlp1_input = torch.transpose(cnn1_output, 1,2)
#         print("mlp1_input:",mlp1_input.shape)
        mlp1_input = mlp1_input.reshape((N,-1,128*83))
        
        mlp1_output = self.mlp1(mlp1_input)
#         print("mlp1_output:",mlp1_output.shape)
        
#         print("input_lengths:",input_lengths)
        input_lengths = input_lengths//2
        total_length = padded_input.size(1)  # get the max sequence length
        packed_input = pack_padded_sequence(mlp1_output, input_lengths,
                                            batch_first=True)
#         print("packed_input:",packed_input.shape)
        packed_output, hidden = self.rnn(packed_input)
        rnn_output, _ = pad_packed_sequence(packed_output,
                                        batch_first=True,
                                        total_length=total_length)
        predicted_y = self.mlp2(rnn_output)
        return predicted_y

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


In [33]:
class CTC_Model(nn.Module):
    
    def __init__(self, input_size, rnn_hidden_size, vocab_size, bidirectional=True, dropout=0.0):
        super(CTC_Model, self).__init__()
        
        self.vocab_size = vocab_size
        self.encoder = Encoder(input_size, rnn_hidden_size, vocab_size, bidirectional, dropout)
        self.ctc_loss = nn.CTCLoss()
    
    def forward(self, padded_input, input_lengths, padded_target, target_lengths):
        
        encoder_output = self.encoder(padded_input, input_lengths)
        encoder_output = torch.transpose(encoder_output,0,1)
        ctc_input = encoder_output.log_softmax(2)
        
        loss = self.ctc_loss(ctc_input, padded_target, input_lengths, target_lengths)
        
        return loss
        
    def recognize(self, input, input_lengths, char_list):
        """Sequence-to-Sequence beam search, decode one utterence now.
        Args:
            input: T x D
            char_list: list of characters
            args: args.beam
            padded_input: N * T * F_dim
            input_lengths: N *

        Returns:
            nbest_hyps:
        """
        encoder_output = self.encoder(input, input_lengths)
        ans=torch.max(encoder_output,-1)
        res = ""
        for j in range(ans[1].shape[1]):
            v = ans[1][0][j].item()
            if v>0:
                res=res+char_list[v]

        return res


### 训练

In [34]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [35]:
def trainIters(model, epoch, optimizier, print_every=10, plot_every=10, learning_rate=0.01):

    start = time.time()
    n_iters = len(tr_dataset)
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    
    criterion = nn.NLLLoss()

    for e in range(epoch):
        for i, (data) in enumerate(tr_loader):
            padded_input, input_lengths, padded_target, target_lengths= data
            padded_input = padded_input.to(device)
            input_lengths = input_lengths.to(device)
            padded_target = padded_target.to(device)
            target_lengths = target_lengths.to(device)
            loss = model(padded_input, input_lengths, padded_target, target_lengths)
    #         print(loss) #.requires_grad
            print_loss_total += float(loss)
            plot_loss_total += float(loss)

            optimizier.zero_grad()
            loss.backward()

            optimizier.step()

            if (i+1) % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                txt = 'Epoch %d | Iter %d | %s (%d %d%%) %.4f' % (e+1, i+1, timeSince(start, (e *n_iters +i+1) / (n_iters*epoch)),
                                             (i+1), (e *n_iters +i+1) / (n_iters*epoch) * 100, print_loss_avg)
                print(txt)

            if i+1 % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0


In [None]:
input_size = 83

hidden_size = 256
vocab_size = len(char_list)
embedding_dim = 512
sos_id = 0
eos_id = 1
learning_rate = 1e-3
momentum = 0
l2 = 1e-5

IGNORE_ID=-1


model = CTC_Model(input_size, hidden_size,vocab_size)
print(model)
model.to(device)

optimizier = torch.optim.Adam(model.parameters(),
                                     lr=learning_rate,
#                                      momentum=momentum,
                                     weight_decay=l2)
trainIters(model, 20,optimizier, print_every=100)

CTC_Model(
  (encoder): Encoder(
    (cnn1): Conv2d(1, 128, kernel_size=(5, 3), stride=(2, 1), padding=(2, 1))
    (mlp1): Sequential(
      (0): Linear(in_features=10624, out_features=512, bias=True)
      (1): Tanh()
      (2): Linear(in_features=512, out_features=256, bias=True)
    )
    (rnn): LSTM(256, 256, batch_first=True, bidirectional=True)
    (mlp2): Sequential(
      (0): Linear(in_features=512, out_features=256, bias=True)
      (1): Tanh()
      (2): Linear(in_features=256, out_features=6039, bias=True)
    )
  )
  (ctc_loss): CTCLoss()
)
Epoch 1 | Iter 100 | 0m 17s (- 4992m 19s) (100 0%) 222.5126
Epoch 1 | Iter 200 | 0m 30s (- 4501m 0s) (200 0%) 45.2970
Epoch 1 | Iter 300 | 0m 44s (- 4358m 44s) (300 0%) 7.5657
Epoch 1 | Iter 400 | 1m 6s (- 4837m 17s) (400 0%) 7.2154
Epoch 1 | Iter 500 | 1m 26s (- 5083m 28s) (500 0%) 7.1020
Epoch 1 | Iter 600 | 1m 47s (- 5230m 41s) (600 0%) 7.0447
Epoch 1 | Iter 700 | 2m 7s (- 5333m 20s) (700 0%) 7.0112
Epoch 1 | Iter 800 | 2m 28s (- 544