In [1]:
import os
import random
from io import open
import unicodedata
import string
import re

import torch
import torchaudio
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from pathlib import Path
import kaldi_io
import math
import torch.utils.data as Data

from lib.Data_show import Data_show
from lib.Phone_cla_Dataset import Phone_cla_Dataset
from lib.Decoder import Decoder
from collections import Counter

%matplotlib inline


In [2]:
# Restrict this process to physical GPU 3.
# NOTE(review): this runs after `import torch`; presumably it still takes effect because
# CUDA has not been initialised yet at this point — confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="3"
# Use the (single visible) CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## 加载数据SLR85训练集

In [3]:
# Read the SLR85 hi-fi train and dev feature archives into {utterance id: matrix} dicts.
SLR_feats = dict(
    kaldi_io.read_mat_scp("/home1/meichaoyang/Dataset/feats/SLR85/hifi/train/feats.scp")
)
SLR_feats_dev = dict(
    kaldi_io.read_mat_scp("/home1/meichaoyang/Dataset/feats/SLR85/hifi/dev/feats.scp")
)

In [48]:
# Frame-level phone alignments and the matching features, both keyed by utterance id.
phone_label = dict(
    kaldi_io.read_vec_int_ark("../../wake_up_align_44_1k/train_fbank/ali.1.ph")
)
feats = dict(
    kaldi_io.read_mat_scp("../../wake_up_align_44_1k/train_fbank/feats.scp")
)

In [5]:
# Three slices (head / middle / tail) of the large feature list, each loaded fully into memory.
feats_head_300k = dict(
    kaldi_io.read_mat_scp("../../wake_up_align_44_1k/train_fbank/feats_head_300000.scp")
)
feats_mid_300k = dict(
    kaldi_io.read_mat_scp("../../wake_up_align_44_1k/train_fbank/feats_mid_300000.scp")
)
feats_tail_300k = dict(
    kaldi_io.read_mat_scp("../../wake_up_align_44_1k/train_fbank/feats_tail_300000.scp")
)

In [None]:
# Phone-id pair (first phone, second phone) used for each wake-word character:
#  "你"      "好"       "米"       "雅"
# 129 63    61 27     128 64     92 69

In [23]:
# One row per wake-word character, in class order: [first phone id, second phone id].
word = np.array([
    [129, 63],
    [61, 27],
    [128, 64],
    [92, 69],
])

In [27]:
129 in word[:,0]

True

In [31]:
a = np.zeros(5) 
a[1:3] = 3
a

array([0., 3., 3., 0., 0.])

In [58]:
def _phones_to_word_labels(label, word):
    """Convert one utterance's frame-level phone ids into frame-level word classes.

    Args:
        label: 1-D integer array of phone ids, one per frame.
        word: 2-D integer array; row k holds [first phone id, second phone id] of word k.

    Returns:
        int16 array of the same length as ``label``: 0 for "no word", k + 1 for frames
        that belong to word k. A word is only marked when a run of its first phone is
        followed by at least one frame of its second phone.
    """
    n_frames = label.shape[0]
    word_label_tmp = np.zeros(n_frames)
    i = 0
    while i < n_frames:
        if label[i] in word[:, 0]:
            # Look the row up by FIRST phone only. Searching the whole table (both
            # columns) would return the wrong row whenever this id also occurs as the
            # second phone of an earlier row.
            word_index = np.where(word[:, 0] == label[i])[0][0]
            start = i
            end = None  # index of the last frame carrying the word's second phone
            while i < n_frames:
                if label[i] == label[start]:
                    i += 1
                elif label[i] == word[word_index][1]:
                    end = i
                    i += 1
                else:
                    # Step back so the outer `i += 1` re-examines this frame.
                    i -= 1
                    break

            if end is not None:
                word_label_tmp[start:end + 1] = word_index + 1

        i += 1

    return word_label_tmp.astype(np.int16)


# Frame-level word class for every aligned utterance.
word_label = {}
for utt, label in phone_label.items():
    word_label[utt] = _phones_to_word_labels(label, word)

In [59]:
phone_label["SV0255_7_01_S3881"]

array([  3,   3,   3, 129,  63,  63,  63,  63,  61,  61,  61,  61,  61,
        27,  27,  27,  27,  27,  27,  27,  27,  27,  27,   1,   3,   1,
       128, 128, 128, 128, 128,  64,  64,  64,  92,  92,  69,  69,  69,
         3,   3,   3,   3,   3,   3,   1])

In [60]:
word_label["SV0255_7_01_S3881"]

array([0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0,
       0, 0], dtype=int16)

In [66]:
aishell_2_corp_raw = "/home1/meichaoyang/Dataset/data_aishell2/aishell2/aishell2/data/trans.txt"
# Collect the ids of utterances whose transcript contains the character "你".
utt_with_you = []
# The transcript holds Chinese text, so decode explicitly instead of relying on the locale.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(aishell_2_corp_raw, "r", encoding="utf-8") as f:
    for line in f:
        data = line.split()
        # Blank or id-only lines have no transcript field; skip them instead of crashing.
        if len(data) < 2:
            continue
        # Only the first whitespace-separated transcript token is inspected.
        if "你" not in data[1]:
            continue
        utt_with_you.append(data[0])

In [30]:
# Reload the middle slice, stopping once the counter exceeds the cap.
feats_mid_300k = {}
i = 0
for utt_id, mat in kaldi_io.read_mat_scp("../../wake_up_align_44_1k/train_fbank/feats_mid_300000.scp"):
    if i <= 1000000:
        i += 1
        feats_mid_300k[utt_id] = mat
    else:
        break

In [64]:
# Merge all feature dicts into one lookup; on duplicate utterance ids the right-most dict wins.
feats = {**SLR_feats, **feats_head_300k, **feats_mid_300k,**feats_tail_300k}

In [65]:
len(feats)

895983

In [37]:
len(utt_with_you)

27765

27765

In [68]:
# Keep the features of every "你" utterance that is actually present in `feats`.
feats_with_you = {}
for utt in utt_with_you:
    if utt not in feats:
        continue
    feats_with_you[utt] = feats[utt]

In [69]:
# Draw a random subset (at most 20000 utterances) of `feats`.
feats_aishell = {}
# random.sample needs a real sequence: passing a dict key view is rejected by newer
# Python versions. Capping the sample size avoids a ValueError on a small population.
# NOTE(review): no random seed is set in the visible cells, so the subset differs per run.
a = random.sample(list(feats.keys()), min(20000, len(feats)))
for utt in a:
    feats_aishell[utt] = feats[utt]
    

In [70]:
len(feats_aishell)

20000

In [71]:
# Training feature pool: "你" utterances + SLR85 train set + the random subset.
# On duplicate utterance ids the right-most dict wins.
feat_new = {**feats_with_you, **SLR_feats, **feats_aishell }

In [76]:
# Restrict the word labels to utterances that are part of the training feature pool.
word_label_new = {}
for utt in feat_new:
    if utt not in word_label:
        continue
    word_label_new[utt] = word_label[utt]

In [72]:
len(feat_new)

48747

In [411]:
len(word_label_new)

6828

In [78]:
len(word_label)

91706

## 自定义数据集

In [386]:
class Phone_cla_Dataset(Dataset):
    """Per-utterance dataset pairing frame features with frame-level word labels.

    Every utterance present in both ``feats`` and ``word_label`` contributes one sample
    ``[features, labels]``. Label ``i // 3`` is assigned to feature frame ``i``, i.e.
    each label entry is stretched over three consecutive frames. The flattened arrays
    ``feats_nd`` / ``word_label_nd`` concatenate all utterances along the frame axis.

    NOTE(review): this definition shadows the ``Phone_cla_Dataset`` imported from
    ``lib.Phone_cla_Dataset`` at the top of the notebook.
    """

    def __init__(self,word_label=None, feats=None, transform=None):
        """
        Args:
            word_label (dict): utterance id -> 1-D array of word classes. Required.
            feats (dict): utterance id -> (frames, dims) feature matrix. When omitted,
                the features are read from "feats/feats.scp".
            transform (callable, optional): applied to each ``[features, labels]``
                sample returned by ``__getitem__``.

        Raises:
            ValueError: if ``word_label`` is missing, or if no utterance is present in
                both ``feats`` and ``word_label``.
            IndexError: if an utterance has more than three frames per label entry.
        """
        # `is None` instead of `== None`: identity is the intended check.
        if word_label is None:
            # There is no default label source, so fail early with a clear message
            # instead of crashing later on a None lookup.
            raise ValueError("word_label is required: pass a dict of utt -> frame labels")
        if feats is None:
            feats = { u:d for u,d in kaldi_io.read_mat_scp("feats/feats.scp") }

        self.word_label = word_label
        self.feats = feats
        self.transform = transform

        self.feats_list = []
        self.word_label_list = []

        for utt, feat in feats.items():
            if utt not in word_label:
                continue
            labels = np.asarray(word_label[utt])
            # Frame i takes label i // 3 (vectorised form of the per-frame loop).
            frame_to_label = np.arange(feat.shape[0]) // 3
            self.feats_list.append(feat)
            self.word_label_list.append(labels[frame_to_label].astype(int))

        if not self.feats_list:
            raise ValueError("no utterance is present in both feats and word_label")

        self.feats_nd = np.concatenate(tuple(self.feats_list))
        self.word_label_nd = np.concatenate(tuple(self.word_label_list))

    def __len__(self):
        """Number of utterances (not frames) in the dataset."""
        return len(self.word_label_list)

    def __getitem__(self, idx):
        """Return ``[features, labels]`` of utterance ``idx``, after the optional transform."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = [self.feats_list[idx], self.word_label_list[idx]]

        if self.transform:
            sample = self.transform(sample)

        return sample
    



In [421]:
class Data_show:
    """Format per-frame class probabilities as a tab-separated text table.

    NOTE(review): this definition shadows the ``Data_show`` imported from
    ``lib.Data_show`` at the top of the notebook.
    """

    # Class-level fallback, kept for code that reads `Data_show.class2word` directly.
    class2word=[]

    def __init__(self, class2word =("other", "你","好","米","雅")):
        """
        Args:
            class2word: display name of each class, indexed by class id.
        """
        # Store a private copy on the instance: writing to the class attribute made
        # every instance share (and overwrite) the same names, and the former list
        # default could be mutated through that shared reference.
        self.class2word = list(class2word)

    def show_softmax(self, pred_label_with_softmax):
        """Render a (frames, classes) probability matrix as text.

        Args:
            pred_label_with_softmax: 2-D array-like exposing ``.shape`` and ``[i][j]``.

        Returns:
            (title, content): ``title`` holds the tab-prefixed class names; ``content``
            has one line per frame: "<frame index>:" followed by each probability
            formatted with two decimals.
        """
        title = "".join("\t" + name for name in self.class2word)

        n_rows = pred_label_with_softmax.shape[0]
        n_cols = pred_label_with_softmax.shape[1]
        lines = []
        for i in range(n_rows):
            cells = "".join('%.2f\t' % pred_label_with_softmax[i][j] for j in range(n_cols))
            lines.append(str(i) + ":\t" + cells + "\n")

        return title, "".join(lines)

In [387]:
range(feat_new["IC0103W0361"].shape[0])

range(0, 285)

In [388]:
print(feat_new["IC0103W0361"].shape)
print(word_label_new["IC0103W0361"].shape)

(285, 40)
(95,)


In [389]:
utt = "IC0101W0080"
data = feats[utt]
# Sanity check of the label stretching: frame i takes label i // 3.
# Computed once; the result does not depend on any frame of `data`, so the former
# outer loop (whose index was also shadowed by the inner loop) only repeated the work.
feat = feat_new["IC0103W0361"]
a = word_label_new["IC0103W0361"][np.arange(feat.shape[0]) // 3].astype(int)

In [390]:
np.concatenate(tuple([feat_new[utt][20:50].reshape(1,-1),feat_new[utt][20:50].reshape(1,-1)])).shape == (2,1200)

True

In [391]:
feat_new["IC0104W0039"][367-20:367+10].reshape(1,-1).shape

(1, 1200)

In [392]:
data_set_train = Phone_cla_Dataset(word_label_new, feat_new)

In [415]:
# i: labelled utterances whose id contains "SV"; j: all remaining labelled utterances.
i = sum(1 for key in word_label_new.keys() if "SV" in key)
j = len(word_label_new) - i
j

4804

In [145]:
# data_set_dev = Phone_cla_Dataset(phone_label, SLR_feats_dev)

In [393]:
data_set_train.word_label_nd[-75:-61]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [394]:
print(data_set_train.feats_nd.shape)
print(data_set_train.word_label_nd.shape)
print(data_set_train.word_label_nd[1440])

(1818006, 40)
(1818006,)
0


In [395]:
data_set_train[1]

[array([[ 3.0408266,  5.7290993,  6.424262 , ..., 11.253042 , 11.324378 ,
         11.309543 ],
        [ 2.9193435,  4.9800253,  7.0417566, ..., 11.80622  , 11.501292 ,
         11.232543 ],
        [ 1.4321882,  4.7077107,  7.3267536, ..., 12.510785 , 12.359281 ,
         11.764354 ],
        ...,
        [-1.5421227,  4.163081 ,  5.7592683, ..., 11.922679 , 12.168289 ,
         11.852024 ],
        [-1.4004889,  3.3461366,  5.901767 , ..., 11.980908 , 12.216037 ,
         12.307909 ],
        [ 1.6446389,  2.9104328,  4.2867823, ..., 12.449646 , 12.59802  ,
         11.8812475]], dtype=float32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0

In [239]:
data_set_train[100][0].shape

(1, 1200)

In [237]:
data_set_train[100][1].shape

(237,)

In [402]:
# Frame-level training tensors: float32 features and int64 class ids (kept on the CPU).
train_data = torch.tensor(data_set_train.feats_nd, dtype=torch.float32)#.to(device)
train_label = torch.tensor(data_set_train.word_label_nd, dtype=torch.long)#.to(device)


In [396]:
print('Counter(data)\n',Counter(data_set_train.word_label_nd))

Counter(data)
 Counter({0: 1616613, 4: 65538, 2: 64926, 1: 36867, 3: 34062})


## 模型搭建

In [403]:
class DNN(nn.Module):
    """Fully connected classifier: four 128-unit sigmoid layers plus a linear output layer."""

    def __init__(self, input_size, num_classes):
        """
        Args:
            input_size: number of features per input row.
            num_classes: number of output logits.
        """
        super().__init__()
        hidden = 128
        # Attribute names fc1..fc5 are kept so existing checkpoints remain loadable.
        self.fc1 = nn.Linear(input_size, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, hidden)
        self.fc4 = nn.Linear(hidden, hidden)
        self.fc5 = nn.Linear(hidden, num_classes)

    def forward(self, input):
        """Return un-normalised class scores (logits) for each input row."""
        h1 = self.fc1(input).sigmoid()
        h2 = self.fc2(h1).sigmoid()
        h3 = self.fc3(h2).sigmoid()
        h4 = self.fc4(h3).sigmoid()
        return self.fc5(h4)

## 训练迭代

In [404]:
# Training hyper-parameters.
LEARNING_RATE = 0.001  # step size handed to the Adam optimizer
EPOCH = 2      # number of passes over the training set (author's note: "400 best")
BATCH_SIZE = 150
input_size=40  # feature width of one frame; train_data has 40 columns
num_classes=5  # "other" plus the four wake-word characters

In [405]:
print(train_data.shape)
print(train_label.shape)

torch.Size([1818006, 40])
torch.Size([1818006])


In [416]:
training_set = Data.TensorDataset(train_data,
                                  train_label)
training_loader = Data.DataLoader(dataset=training_set,
                                      batch_size=BATCH_SIZE,
                                      shuffle=True)
# testing_set = Data.TensorDataset(test_data,
#                                  test_label)
# testing_loader = Data.DataLoader(dataset=testing_set,
#                                      batch_size=BATCH_SIZE,
#                                      shuffle=False)
model = DNN(input_size, num_classes)#.to(device)
# Per-class loss weights: class 0 ("other") dominates the frame counts, so the four
# wake-word classes are weighted up heavily.
criterion = nn.CrossEntropyLoss(weight=torch.tensor([1.0,300.0,300.0,200.5,200.5]))
# Named `optimizer` so the imported `torch.optim` module (`optim`) is not shadowed.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
print_every = 2000  # report the running mean loss every this many batches
for epoch in range(EPOCH):
    correct_train = 0
    total_train = 0

    total_loss = 0  # loss accumulated since the last report
    for ite, (data, label) in enumerate(training_loader, start=1):
        pred_label = model(data)
        loss = criterion(pred_label, label)
        optimizer.zero_grad()
        loss.backward()
        total_loss += loss.item()
        optimizer.step()
        # Predicted class = index of the largest logit; no gradient is needed here.
        _, answer = torch.max(pred_label.detach(), 1)
        total_train += label.size(0)
        # .item() keeps the counter a plain Python int, so the percentage below does
        # not depend on the installed torch version's tensor-division semantics.
        correct_train += (answer == label).sum().item()
        if ite % print_every == 0:
            print("total_loss:",total_loss/print_every, "\tloss:",loss.item())
            total_loss=0
    print('Epoch {:3d} Accuracy on training data: {:.2f}% ({}/{})'
          .format(epoch, (100 * correct_train / total_train), correct_train, total_train))
    # pytorch 0.4 feature, not calculate grad on test set
#     with torch.no_grad():
#         correct_test = 0
#         total_test = 0
#         for (data, label) in testing_loader:
#             pred_label = model(data)
#             _, answer = torch.max(pred_label.data, 1)
#             total_test += label.size(0)
#             correct_test += (answer == label).sum()
#         print('          Accuracy on testing data: {}% ({}/{})'
#               .format((100 * correct_test / total_test), correct_test, total_test))



total_loss: 1.0307710790932179 	loss: 0.68595951795578
total_loss: 0.8377548316270113 	loss: 0.7056499123573303
total_loss: 0.7825536297857761 	loss: 0.7353289723396301
total_loss: 0.7451909584254026 	loss: 0.5427332520484924
total_loss: 0.7030690496489406 	loss: 0.6329156756401062
total_loss: 0.6849039036333561 	loss: 0.5992240309715271
Epoch   0 Accuracy on training data: 28% (524923/1818006)
total_loss: 0.6701685741618275 	loss: 0.7401068210601807
total_loss: 0.6653630465045571 	loss: 0.5013478994369507
total_loss: 0.6580152650475501 	loss: 0.5138773322105408
total_loss: 0.6461799715310336 	loss: 0.516302764415741
total_loss: 0.6460245130211115 	loss: 0.5203981399536133
total_loss: 0.6419944063797594 	loss: 0.6132413744926453
Epoch   1 Accuracy on training data: 57% (1041899/1818006)


### 预测

In [None]:
# model = torch.load('model.pkl')
# model.to(device)

In [349]:
test_feats = { u:d for u,d in kaldi_io.read_mat_scp("/home1/meichaoyang/workspace/align_44_1k/test_feat/feats.scp")}
test_feats.keys()

dict_keys(['miya_mcy', 'miya_mcy1', 'miya_mingzhang', 'miya_mingzhang1'])

In [369]:
feat = test_feats["miya_mcy1"]
# Stack a 30-frame context window (20 frames before, 10 from the centre on) around each
# centre frame, flattened to a single row per window.
# NOTE(review): the DNN in this notebook is built with input_size=40, so these stacked
# windows look like a leftover from an earlier model — confirm before using them.
feats_list = [
    feat[centre - 20:centre + 10].reshape(1, -1)
    for centre in range(20, feat.shape[0] - 9)
]
feats_nd = np.concatenate(tuple(feats_list))

In [381]:
i=30
input_feat = feat[i-20:i+10].reshape(1,-1)

In [449]:
# test_feats = { u:d for u,d in kaldi_io.read_mat_scp("/home1/meichaoyang/workspace/align_44_1k/test_feat/feats.scp")}

# Per-frame logits for one test recording, then the arg-max class of every frame.
test_input = torch.Tensor(test_feats["miya_mingzhang1"])
pred_label = model(test_input)
answer = torch.max(pred_label.data, 1)[1]
answer_list = list(answer.to("cpu", torch.int).numpy())

In [448]:
utt='SV0252_7_01_S3887'
# utt="SV0255_7_01_S3881"
# utt=list(phone_label.keys())[110]

# label_list = list(phone_label_dev[utt])
# NOTE(review): the recorded run of this cell failed with KeyError for this utterance id;
# presumably it belongs to the train split rather than SLR_feats_dev — confirm and pick
# an id that exists in SLR_feats_dev.
pred_label = model(torch.Tensor(SLR_feats_dev[utt])) #.to(device)
# Per-frame predicted class = index of the largest logit.
_, answer = torch.max(pred_label.data, 1)
answer_list=list(answer.to("cpu", torch.int).numpy())

KeyError: 'SV0252_7_01_S3887'

In [438]:
# Pick one utterance of the random subset by position.
utt=list(feats_aishell.keys())[110]

# NOTE(review): despite its name, `label_list` holds the rows of the FEATURE matrix, not
# labels, and it is not used by any visible cell.
label_list = list(feats_aishell[utt])
pred_label = model(torch.Tensor(feats_aishell[utt]))#.to(device)
# Per-frame predicted class = index of the largest logit.
_, answer = torch.max(pred_label.data, 1)
answer_list=list(answer.to("cpu", torch.int).numpy())

## 将预测结果映射

In [450]:
# Normalise the logits over the class axis (dim=1). Passing the axis explicitly avoids
# the implicit-dimension deprecation warning of a bare Softmax().
title, content = Data_show().show_softmax(torch.softmax(pred_label, dim=1))

  """Entry point for launching an IPython kernel.


In [451]:
print(title,"\n",content)

	other	你	好	米	雅 
 0:	0.01	0.02	0.93	0.00	0.05	
1:	0.01	0.05	0.59	0.01	0.33	
2:	0.01	0.03	0.83	0.00	0.13	
3:	0.01	0.09	0.82	0.01	0.07	
4:	0.01	0.06	0.44	0.02	0.47	
5:	0.01	0.09	0.72	0.06	0.12	
6:	0.01	0.04	0.85	0.03	0.07	
7:	0.01	0.01	0.35	0.00	0.63	
8:	0.00	0.00	0.17	0.00	0.82	
9:	0.01	0.06	0.44	0.03	0.46	
10:	0.00	0.00	0.10	0.00	0.89	
11:	0.01	0.15	0.59	0.08	0.18	
12:	0.01	0.02	0.97	0.00	0.01	
13:	0.01	0.03	0.93	0.00	0.03	
14:	0.00	0.01	0.94	0.00	0.04	
15:	0.00	0.02	0.95	0.00	0.03	
16:	0.01	0.14	0.76	0.02	0.07	
17:	0.01	0.03	0.93	0.00	0.03	
18:	0.00	0.01	0.82	0.00	0.17	
19:	0.01	0.01	0.51	0.00	0.48	
20:	0.01	0.03	0.53	0.01	0.43	
21:	0.01	0.08	0.68	0.05	0.19	
22:	0.00	0.00	0.38	0.00	0.61	
23:	0.00	0.00	0.55	0.00	0.45	
24:	0.00	0.00	0.45	0.00	0.55	
25:	0.00	0.00	0.82	0.00	0.18	
26:	0.00	0.00	0.45	0.00	0.55	
27:	0.00	0.00	0.24	0.00	0.76	
28:	0.00	0.00	0.14	0.00	0.86	
29:	0.01	0.04	0.37	0.01	0.58	
30:	0.01	0.02	0.33	0.00	0.64	
31:	0.00	0.00	0.75	0.00	0.25	
32:	0.00	0.00	0.84	0.00	0.16	
33:

In [441]:
word_label_new[utt]

KeyError: 'IC0500W0447'

## “你好米雅”测试

In [None]:
# SLR85 far-field recordings, used here as wake-word test material.
feats_miya_test = dict(
    kaldi_io.read_mat_scp("/home1/meichaoyang/Dataset/feats/SLR85/far_field/train/feats.scp")
)

In [None]:
utt_miya_test=list(feats_miya_test.keys())[420]
# utt_aishell="IC0001W0406"

# Run the input on whatever device the model's parameters live on. The model is created
# without `.to(device)`, so sending the input to `device` breaks when CUDA is available.
model_device = next(model.parameters()).device
pred_label_miya_test = model(torch.Tensor(feats_miya_test[utt_miya_test]).to(model_device))
# Per-frame predicted class = index of the largest logit.
_, answer_miya_test = torch.max(pred_label_miya_test.data, 1)
answer_miya_test_list=list(answer_miya_test.to("cpu", torch.int).numpy())

In [None]:
# Normalise the logits over the class axis (dim=1). Passing the axis explicitly avoids
# the implicit-dimension deprecation warning of a bare Softmax().
title, content = Data_show().show_softmax(torch.softmax(pred_label_miya_test, dim=1))

In [None]:
print(title)
print(content)

In [None]:
feats[utt_miya_test]

## 非“你好米雅”测试

In [None]:
# AISHELL-2 test features, used as negative (non wake-word) material.
# This rebinds `feats_aishell`, which earlier held the random training subset.
feats_aishell = dict(
    kaldi_io.read_mat_scp("../wake_dnn_miya_only/feats_aishell2_test/feats.scp")
)

In [None]:
# Alternative: pick by position instead of by id.
# utt_aishell=list(feats_aishell.keys())[420]
utt_aishell="IC0001W0406"

# Run the input on whatever device the model's parameters live on. The model is created
# without `.to(device)`, so sending the input to `device` breaks when CUDA is available.
model_device = next(model.parameters()).device
pred_label_aishell = model(torch.Tensor(feats_aishell[utt_aishell]).to(model_device))
# Per-frame predicted class = index of the largest logit.
_, answer_aishell = torch.max(pred_label_aishell.data, 1)
answer_aishell_list=list(answer_aishell.to("cpu", torch.int).numpy())

In [None]:
# NOTE(review): no `decoder` instance is created in the visible cells (only the Decoder
# class is imported), so this presumably relies on a deleted cell — confirm.
# Softmax over the class axis (dim=1); the explicit axis avoids the deprecation warning.
decoder.show_result(decoder.decode(torch.softmax(pred_label_aishell, dim=1)))

In [None]:
# Softmax over the class axis (dim=1); the explicit axis avoids the deprecation warning.
Data_show().show_softmax(torch.softmax(pred_label_aishell, dim=1))

In [None]:
utt_aishell

## 保存模型

In [None]:
# Move the model to the CPU and pickle the whole module object, then load it back.
# NOTE(review): loading a whole pickled module needs the DNN class to be defined at load
# time, and newer PyTorch releases presumably refuse it unless weights_only=False is
# passed to torch.load — confirm against the installed version. Only load trusted files.
torch.save(model.to("cpu"), 'model.pkl')
model1 = torch.load('model.pkl')

In [None]:
# Compile the reloaded model to TorchScript and write it to "phone_cla_model.pt".
sm = torch.jit.script(model1)
sm.save("phone_cla_model.pt")

In [None]:
model1 = torch.load('model.pkl')

In [None]:
model1

In [None]:
map1={1:"1-1",2:"2-1",3:"3-1"}

In [None]:
map2={2:"2-2",3:"3-2",4:"4-2"}

In [None]:
{**map1,**map2}

In [None]:
{**map2,**map1}