In [1]:
import torch
import random

from transformers import Wav2Vec2CTCTokenizer, SeamlessM4TFeatureExtractor, Wav2Vec2BertProcessor

#文字编码工具
#使用processor文件夹下的vocab.json构建tokenizer
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('./processor',
                                                 bos_token='[CLS]',
                                                 eos_token='[SEP]',
                                                 unk_token='[UNK]',
                                                 pad_token='[PAD]')

#声音信号编码工具
feature_extractor = SeamlessM4TFeatureExtractor(sampling_rate=16000,
                                                padding_value=0.0)

#组合上面两个工具
processor = Wav2Vec2BertProcessor(feature_extractor=feature_extractor,
                                  tokenizer=tokenizer)

del tokenizer
del feature_extractor

processor

  from .autonotebook import tqdm as notebook_tqdm


Wav2Vec2BertProcessor:
- feature_extractor: SeamlessM4TFeatureExtractor {
  "feature_extractor_type": "SeamlessM4TFeatureExtractor",
  "feature_size": 80,
  "num_mel_bins": 80,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000,
  "stride": 2
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='./processor', vocab_size=4054, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	1: AddedToken("[CLS]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	2: AddedToken("[SEP]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	3: AddedToken("[UNK]", rstrip=

In [2]:
#测试
data = processor(text=['测试文字1', '测试测试文字2'],
                 audio=[torch.randn(8000).numpy(),
                        torch.randn(16000).numpy()],
                 sampling_rate=16000,
                 padding=True,
                 return_tensors='pt')

#其实分开用更方便一点
data = processor.tokenizer(['测试文字1', '测试测试文字2'],
                           padding=True,
                           truncation=True,
                           max_length=35 + 2,
                           return_tensors='pt')

data = processor.feature_extractor(
    [torch.randn(8000).numpy(),
     torch.randn(16000).numpy()],
    sampling_rate=16000,
    padding=True,
    truncation=True,
    max_length=900,
    padding_value=0.0,
    return_tensors='pt')

for k, v in data.items():
    print(k, v.shape, v.dtype, v)

input_features torch.Size([2, 49, 160]) torch.float32 tensor([[[-0.9523, -0.7594, -2.5887,  ..., -0.0667,  0.0922,  0.9519],
         [-0.4041,  0.5543,  0.2227,  ..., -0.2744,  0.4729, -1.4659],
         [ 1.7767,  1.4884,  0.7725,  ...,  0.3004,  0.3270, -0.6555],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0069,  0.0117, -1.3570,  ..., -1.3703, -0.6182,  0.7779],
         [ 1.2093,  1.3706,  1.0757,  ...,  0.8772, -1.3837, -0.4901],
         [-0.1526, -1.0427,  1.0818,  ...,  0.6680,  0.3451, -0.4456],
         ...,
         [-0.1174,  0.2229, -0.6526,  ..., -0.5193,  2.0450, -2.0595],
         [-0.8036, -0.1815,  0.7316,  ..., -0.5252, -1.7430,  0.9779],
         [ 0.4233,  0.4700, -0.0511,  ...,  0.0248, -0.1939,  0.3127]]])
attention_mask torch.Size([2, 49]) torch.int32 tensor([[1, 1, 

In [3]:
from datasets import load_from_disk, Audio

dataset = load_from_disk('dataset/mozilla-foundation/common_voice_16_0')

dataset = dataset.remove_columns([
    'accent', 'age', 'client_id', 'down_votes', 'gender', 'locale', 'segment',
    'up_votes', 'path', 'variant'
])
dataset = dataset.rename_columns({'sentence': 'text'})
dataset = dataset.cast_column('audio', Audio(sampling_rate=16000))


def f(data):
    lens_audio = len(data['audio']['array']) / 16000
    lens_text = len(data['text'])
    return 1 <= lens_audio <= 9 and 2 <= lens_text <= 35


dataset = dataset.filter(f)

dataset, dataset[3]

(Dataset({
     features: ['audio', 'text'],
     num_rows: 27858
 }),
 {'audio': {'path': 'common_voice_zh-CN_18729599.mp3',
   'array': array([ 7.77156117e-16,  0.00000000e+00, -4.44089210e-16, ...,
          -2.50576342e-08, -2.53291603e-07, -4.08032776e-07]),
   'sampling_rate': 16000},
  'text': '第二天，男孩们去了罗比的家。'})

In [4]:
def show(data):
    from IPython.display import Audio, display
    display(Audio(data=data, rate=16000))


show(dataset[3]['audio']['array'])
dataset[3]['text']

'第二天，男孩们去了罗比的家。'

In [5]:
def f(data):
    text = [i['text'] for i in data]
    text = processor.tokenizer(text,
                               padding=True,
                               truncation=True,
                               max_length=35 + 2,
                               return_tensors='pt').to('cuda')

    audio = [i['audio']['array'] for i in data]
    audio = processor.feature_extractor(audio,
                                        sampling_rate=16000,
                                        padding=True,
                                        truncation=True,
                                        max_length=900,
                                        padding_value=0.0,
                                        return_tensors='pt').to('cuda')

    return text.input_ids, audio.input_features, audio.attention_mask


loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=4,
                                     collate_fn=f,
                                     drop_last=True,
                                     shuffle=True)

len(loader), next(iter(loader))

(6964,
 (tensor([[1397,  889, 3608,  227,  633, 1588, 1548,  231, 1629,  916, 2268, 1548,
           2244, 2100, 2749,  675, 3688,    0,    0,    0,    0],
          [2639,  229, 2367,  345, 3688,    0,    0,    0,    0,    0,    0,    0,
              0,    0,    0,    0,    0,    0,    0,    0,    0],
          [1305, 3372, 1583, 3340, 2183, 1841, 3093, 2220, 3385, 3421, 2005, 3543,
           3688,    0,    0,    0,    0,    0,    0,    0,    0],
          [1776, 3482, 3945, 2204,  766, 1764, 3618, 3575, 1382,  406, 3154, 1164,
           3426,  700, 3482, 2783, 3390, 2325, 1798,  479, 3688]],
         device='cuda:0'),
  tensor([[[-4.8011e+00, -4.9900e+00, -4.9148e+00,  ..., -1.9586e+00,
            -1.7661e+00, -1.6455e+00],
           [-1.9484e+00, -2.1389e+00, -2.4848e+00,  ..., -1.2688e+00,
            -1.1479e+00, -1.1489e+00],
           [-2.4120e+00, -2.2528e+00, -2.5282e+00,  ..., -1.6682e+00,
            -1.5610e+00, -1.8720e+00],
           ...,
           [ 0.0000e+00,  

In [6]:
class Wav2Vec2BertForCTC(torch.nn.Module):

    def __init__(self):
        super().__init__()

        from transformers import Wav2Vec2BertModel, Wav2Vec2BertConfig
        config = Wav2Vec2BertConfig.from_pretrained(
            'model/lansinuote/Chinese_Speech_to_Text_CTC')

        self.wav2vec2_bert = Wav2Vec2BertModel(config)
        self.dropout = torch.nn.Dropout(0.1)
        self.lm_head = torch.nn.Linear(1024, processor.tokenizer.vocab_size)

        from transformers import Wav2Vec2BertForCTC
        parameters = Wav2Vec2BertForCTC.from_pretrained(
            'model/lansinuote/Chinese_Speech_to_Text_CTC')
        self.wav2vec2_bert.load_state_dict(
            parameters.wav2vec2_bert.state_dict())
        #丢弃部分参数,验证训练过程是有效的
        #self.lm_head.load_state_dict(parameters.lm_head.state_dict())
        del parameters

        self.train()
        self.to('cuda')

    def forward(self, input_features, attention_mask):
        last_hidden_state = self.wav2vec2_bert(
            input_features, attention_mask=attention_mask).last_hidden_state

        last_hidden_state = self.dropout(last_hidden_state)

        return self.lm_head(last_hidden_state)


model = Wav2Vec2BertForCTC()

with torch.no_grad():
    input_features = torch.randn(4, 377, 160).to('cuda')
    attention_mask = torch.ones(4, 377).long().to('cuda')
    print(model(input_features, attention_mask).shape)

torch.Size([4, 189, 4054])


In [7]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CTCLoss(blank=processor.tokenizer.pad_token_id,
                             reduction='mean',
                             zero_infinity=False)

for epoch in range(1):
    for i, (input_ids, input_features, attention_mask) in enumerate(loader):
        logits = model(input_features, attention_mask)

        log_probs = logits.log_softmax(dim=2).transpose(0, 1)
        input_lengths = (attention_mask.sum(1) / 2).ceil().long()
        input_ids_mask = input_ids != processor.tokenizer.pad_token_id
        
        loss = criterion(log_probs=log_probs,
                         targets=input_ids[input_ids_mask],
                         input_lengths=input_lengths,
                         target_lengths=input_ids_mask.sum(-1))

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if i % 500 == 0:
            print(epoch, i, loss.item())
            print(processor.tokenizer.decode(input_ids[0]))
            print(processor.tokenizer.decode(logits[0].argmax(1)))

0 0 70.24105834960938
凯库拉岭位于新西兰南岛东北部的两座平行山脉。
颗战森勉喇漠喇确涡脚罹祀郑盼此，涡吉吴旱稀修棵蛱婺喇修粥门郑疆鞑暮场浆漠丘肥郑鞅伸漠腌骤洛册郑顼龙规俊郑某峡钕祀某郑某蓝希郑顼祀钉祀铰某修遣掠显t鲡某郑稀蔻加稀泗门т郑恶懋帜涡修寅巨馔郑喇怕某郑漠旱稀焰稀扫救焰稀晴稀其濒稀涡濒萍稀网肺型
0 500 6.240141868591309
兜率天之第四天。
。
0 1000 5.626153945922852
龚明鑫，中华民国政治人物，现任行政院政务委员。
国国人。。。。。。
0 1500 5.2803826332092285
父亲刘锐。
十进为日。
0 2000 3.6847288608551025
艾碧嘉意识到女王现在因为对莎拉的伤心而厌恶她。
海比拉亦职到女王现在因为对沙拉的生星而建以他。
0 2500 3.148824453353882
情感歌谣是第二次世界大战后在日本独自发展的流行音乐样式之一。
求感歌由是第二次世界大战后在日本从自发展的流行音人次事之一。
0 3000 1.0861380100250244
青年时期曾经参加法国托派组织国际主义共产主义组织。
七年时期曾经参加法国托派组织，国际主义共产主义组织。。
0 3500 1.3475462198257446
并由五位爸和小孩主唱主题曲。
并由武位化妈和小海主创主题权。
0 4000 1.3369859457015991
有个叫“平山”的站。
有个叫平山的站。
0 4500 1.20524001121521
天佑十一年五月完工。
天右十一年五月完工。。
0 5000 0.8278586268424988
宋尔卫，广东鹤山人，汉族，九三学社员。
宋尔卫，广东赫深人，汉族，九三学省社员。。
0 5500 1.1511497497558594
这些产品的制作者小爱因斯坦公司是华特迪士尼公司的一个子公司。
这些产品的制作者小小爱因斯坦公司是华特地士尼公司的一个体公司。
0 6000 1.6899852752685547
是帘蛤目帘蛤科浅蜊属的一种。
是联哈木莲哈科显力属家的一种。
0 6500 0.5565958619117737
白峰是朝鲜民主义人民共和国的金日成传记作家。
白风是朝鲜民主主义人民共和国的今日臣专辑作家。
