In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tensorflow.io import gfile
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from tqdm import *
import numpy as np
import tensorflow as tf
import os
from torch.utils.tensorboard import SummaryWriter

2024-01-02 15:08:31.135027: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-01-02 15:08:31.135064: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
class Ourclassifer(nn.Module):
    def __init__(self, in_dim, n_hidden1, n_hidden2, out_dim=1, drop_prob1=0.5, drop_prob2=0.5):
        super(Ourclassifer, self).__init__()
        self.layer1 = nn.Linear(in_dim, n_hidden1)
        self.layer2 = nn.Linear(n_hidden1, n_hidden2)
        self.layer3 = nn.Linear(n_hidden2, out_dim)

        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()

        self.drop1 = nn.Dropout(drop_prob1)
        self.drop2 = nn.Dropout(drop_prob2)

        self.sigmoid = nn.Sigmoid()
        self.device = torch.device("cuda") if torch.cuda.is_available else torch.device("cpu")
        self.to(self.device)
    def forward(self, x):
        x = x.sum(dim=1)
        hidden_1_out = self.relu1(self.layer1(x))
        hidden_1_out = self.drop1(hidden_1_out)

        hidden_2_out = self.relu2(self.layer2(hidden_1_out))
        hidden_2_out = self.drop2(hidden_2_out)
        out = self.layer3(hidden_2_out)
        out = self.sigmoid(out)
        return out

In [3]:
class ExtendedWav2Vec2ForCTC(Wav2Vec2ForCTC):
    """
    In ESPNET there is a LayerNorm layer between encoder output and CTC classification head.
    """
    def __init__(self, config):
        super().__init__(config)
        self.myhead = Ourclassifer(in_dim=config.hidden_size, n_hidden1=config.hidden_size, n_hidden2=config.hidden_size)
        self.freeze_feature_extractor()
        self.freeze_base_model()
        self.lm_head = torch.nn.Sequential(
                torch.nn.LayerNorm(config.hidden_size),
                self.myhead
        )
        for param in self.lm_head.parameters():
            param.requires_grad = True
        self.to(self.myhead.device)
        
model = ExtendedWav2Vec2ForCTC.from_pretrained("mandarin-wav2vec2-aishell1")

Some weights of ExtendedWav2Vec2ForCTC were not initialized from the model checkpoint at mandarin-wav2vec2-aishell1 and are newly initialized: ['lm_head.1.layer2.weight', 'lm_head.1.layer1.weight', 'myhead.layer2.weight', 'lm_head.1.layer1.bias', 'myhead.layer1.weight', 'lm_head.1.layer3.weight', 'myhead.layer2.bias', 'myhead.layer3.weight', 'myhead.layer3.bias', 'lm_head.1.layer2.bias', 'myhead.layer1.bias', 'lm_head.1.layer3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
processor = Wav2Vec2Processor.from_pretrained("mandarin-wav2vec2-aishell1")

Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


In [5]:
criterion = nn.BCELoss()
lr = 0.00001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
ts_writer = SummaryWriter("logs")

In [6]:
SPEECH_DATA_DIR = "record"

class MyDataSet(Dataset):
    def __init__(self, data):
        #定义好 image 的路径
        self.data = data
    def __getitem__(self, index):
        file_name = self.data[index][0].decode("utf-8")
        #print(file_name)
        audio_input, sample_rate = sf.read(file_name)
        inputs = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt") 
        inputs['input_values'] = inputs['input_values'].to(model.device)       
        logits = model(**inputs).logits
        
        logits = logits.squeeze(dim=-1)
        return logits, torch.tensor(self.data[index][1]).float()

    def __len__(self):
        return len(self.data)

def get_label(word):
    if word == "网易精灵":
        return 1
    return 0

def get_files(word):
    return os.listdir(SPEECH_DATA_DIR + '/' + word + '/')

def process_files(file_names, label, repeat):
    file_names = tf.repeat(file_names, repeat).numpy()
    return [(file_name, label) for file_name in tqdm(file_names, desc=f"({word}, {label})", leave=False)]

def train(net, num_epochs, train_iter, val_iter, save_dir):
    patience = 0
    start_epoch = 0
    verbose = False  
    best_val_loss = 1e10  
    earlyStop = False
    max_patience = 5
    best_train_epochs = None
    for i in tqdm(range(start_epoch, num_epochs), disable=not verbose):
        net.train()
        train_loss = 0.0
        cnt = 0
        for logits, label in train_iter:
            logits = logits.to(net.device)
            logits = logits.squeeze(dim=-1)
            label = label.to(net.device)

            loss = criterion(logits, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            cnt += 1
        train_loss /= cnt
    
        ts_writer.add_scalar("Train_Loss", train_loss, i+1)

        net.eval()
        val_loss = 0.0
        cnt = 0
        for logits, label in val_iter:
            logits = logits.to(net.device)
            logits = logits.squeeze(dim=-1)
            label = label.to(net.device)
            
            loss = criterion(logits, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            val_loss += loss.item()
            cnt += 1
        val_loss /= cnt  

        ts_writer.add_scalar("Val_Loss", val_loss, i+1) 

        if val_loss < best_val_loss:
            patience = 0
            best_val_loss = val_loss
        else:
            patience += 1
            if patience == max_patience:
                earlyStop = True
        print("epoch: ", i+1, " train_loss: ", train_loss, " val_loss: ", val_loss)
        if earlyStop:
            print("Num of training epochs: ", i + 1)
            best_train_epochs = i + 1
            break
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    torch.save(net.state_dict(), os.path.join(save_dir + f"/experimental_wakeup_model.pth"))    
    return best_train_epochs

def train_all(net, num_epochs, train_iter, save_dir):
    start_epoch = 0
    verbose = False  

    for i in tqdm(range(start_epoch, num_epochs), disable=not verbose):
        net.train()
        train_loss = 0.0
        cnt = 0
        for logits, label in train_iter:
            logits = logits.to(net.device)
            logits = logits.squeeze(dim=-1)
            label = label.to(net.device)

            loss = criterion(logits, label)            

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            cnt += 1
        train_loss /= cnt
    
        ts_writer.add_scalar("Train_All_Loss", train_loss, i+1)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    torch.save(net.state_dict(), os.path.join(save_dir + f"/final_wakeup_model.pth"))


In [7]:
train_data = []
validate_data = []

words =[
    '网易精灵',
    '网易',
    '精灵',
    '精力',
    '易精',
    '网易精',
    '易精灵',
    '网易精力',
    '网红精灵',
    '网',
    '易',
    '精',
    '灵',
    "开始",
    "开机",
    "停止",
    "哇",
    "啊",
    "_background_noise_",
    "_silence_"
]

SPEECH_DATA_DIR = "record"

TRAIN_SIZE=0.8
VALIDATION_SIZE=0.2


for word in words:
    file_names = [SPEECH_DATA_DIR + '/' + word + '/' + file_name for file_name in tqdm(get_files(word), leave=False)]
    np.random.shuffle(file_names)
    train_size = int(TRAIN_SIZE*len(file_names))
    validation_size = int(VALIDATION_SIZE*len(file_names))
    repeat = 8 if word == "网易精灵" else 1  
    train_data.extend(process_files(file_names[:train_size], label=get_label(word), repeat=repeat))
    validate_data.extend(process_files(file_names[train_size:train_size+validation_size], label=get_label(word), repeat=repeat))


batch_size = 32
trainDataSet = MyDataSet(data=train_data)
valDataSet = MyDataSet(data=validate_data)
train_loader = DataLoader(dataset=trainDataSet, batch_size=batch_size)    
val_loader = DataLoader(dataset=valDataSet, batch_size=batch_size)    

2024-01-02 15:09:00.492054: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2024-01-02 15:09:00.495471: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:10:00.0 name: NVIDIA GeForce RTX 3060 computeCapability: 8.6
coreClock: 1.777GHz coreCount: 28 deviceMemorySize: 11.76GiB deviceMemoryBandwidth: 335.32GiB/s
2024-01-02 15:09:00.495620: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-01-02 15:09:00.495789: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcublas.so.10'; dlerror: libcublas.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/loca

In [8]:
model_save_path = "saved_model"
best_train_epochs = train(net=model, num_epochs=100, train_iter=train_loader, val_iter=val_loader, save_dir=model_save_path)

epoch:  1  train_loss:  11.179279584792411  val_loss:  6.6702398130948435
epoch:  2  train_loss:  3.6330788066062896  val_loss:  1.0358070207161816
epoch:  3  train_loss:  2.9958647943239316  val_loss:  0.39578291131278304
epoch:  4  train_loss:  2.459462216969098  val_loss:  0.38177907094724256
epoch:  5  train_loss:  1.4696236295320169  val_loss:  0.12973300666070492
epoch:  6  train_loss:  1.5947255682869863  val_loss:  0.029790134056386818
epoch:  7  train_loss:  1.2291408737660676  val_loss:  0.024054903646427376
epoch:  8  train_loss:  1.2385043077836984  val_loss:  0.011721716984826927
epoch:  9  train_loss:  0.9514352128513472  val_loss:  0.003230450734620973
epoch:  10  train_loss:  0.8559534061935403  val_loss:  0.0018150417398808205
epoch:  11  train_loss:  0.7003850013364815  val_loss:  0.0032250569668573893
epoch:  12  train_loss:  0.5907830977443631  val_loss:  0.0016268307305477947
epoch:  13  train_loss:  0.5406827508922911  val_loss:  0.0013760174420074022
epoch:  14  

In [9]:
model.eval()
cnt = 0
valid_cnt = 0
test_loader = DataLoader(dataset=valDataSet, batch_size=len(validate_data)) 
for logits, label in test_loader:
    logits = logits.to(model.device)
    logits = logits.squeeze(dim=-1)
    logits = (logits > 0.5).float()
    label = label.to(model.device)
    cnt += label.shape[0]
    valid_cnt += (label == logits).sum().item()
    
    
print("acc: ", valid_cnt / cnt)

acc:  1.0


In [11]:
all_data = train_data + validate_data
allDataSet = MyDataSet(data=all_data)
all_loader = DataLoader(dataset=allDataSet, batch_size=batch_size)    

model = ExtendedWav2Vec2ForCTC.from_pretrained("mandarin-wav2vec2-aishell1")

train_all(model, num_epochs=best_train_epochs, train_iter=all_loader, save_dir=model_save_path)

Some weights of ExtendedWav2Vec2ForCTC were not initialized from the model checkpoint at mandarin-wav2vec2-aishell1 and are newly initialized: ['lm_head.1.layer3.weight', 'lm_head.1.layer2.bias', 'myhead.layer2.bias', 'lm_head.1.layer3.bias', 'lm_head.1.layer1.bias', 'lm_head.1.layer1.weight', 'myhead.layer3.bias', 'myhead.layer1.bias', 'lm_head.1.layer2.weight', 'myhead.layer3.weight', 'myhead.layer1.weight', 'myhead.layer2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
def get_voice_position(audio, noise_floor):
    audio = audio - np.mean(audio)
    audio = audio / np.max(np.abs(audio))
    return trim(audio, axis=0, epsilon=noise_floor)

import wave
#import torchaudio
import tensorflow_io as tfio
from tensorflow_io.core.python.experimental.audio_ops import trim
model_save_path = "saved_model"
processor = Wav2Vec2Processor.from_pretrained("mandarin-wav2vec2-aishell1")
model = ExtendedWav2Vec2ForCTC.from_pretrained("mandarin-wav2vec2-aishell1")
model.load_state_dict(torch.load(os.path.join(model_save_path + f"/final_wakeup_model.pth")))
model.eval()
sample_rate = 16000
NOISE_FLOOR = 0.3

file_path = "record/网易精灵/output_3.wav"
audio_tensor = tfio.audio.AudioIOTensor(file_path)
audio = tf.cast(audio_tensor[:], tf.float32)
voice_start, voice_end = get_voice_position(audio, NOISE_FLOOR)
voice_start = voice_start.numpy()[0]
voice_end = voice_end.numpy()[0]

valid_audio = audio_tensor[voice_start:voice_end].numpy().reshape(-1).astype(np.float32)

inputs = processor(valid_audio, sampling_rate=sample_rate, return_tensors="pt") 
inputs['input_values'] = inputs['input_values'].to(model.device)       
logits = model(**inputs).logits
print(logits.item())
if logits.item() > 0.5:
    print("我被唤醒啦！")



Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


Some weights of ExtendedWav2Vec2ForCTC were not initialized from the model checkpoint at mandarin-wav2vec2-aishell1 and are newly initialized: ['lm_head.1.layer2.weight', 'lm_head.1.layer1.weight', 'myhead.layer2.weight', 'lm_head.1.layer1.bias', 'myhead.layer1.weight', 'lm_head.1.layer3.weight', 'myhead.layer2.bias', 'myhead.layer3.weight', 'myhead.layer3.bias', 'lm_head.1.layer2.bias', 'myhead.layer1.bias', 'lm_head.1.layer3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.8994930386543274
我被唤醒啦！
