In [1]:
import torch 
import torch.nn as nn
import torchaudio
import numpy as np

import os, glob
import json
# Sampling rate in Hz; passed as `sampling_rate` to the feature extractor and
# to every processor call in this notebook.
sr = 16000

In [2]:
# Turn on PyTorch's cuDNN benchmark flag.
# NOTE(review): clips here presumably vary in length, so the auto-tuner may
# re-run for new input shapes — confirm this is actually a speed-up.
torch.backends.cudnn.benchmark = True

In [3]:
# Name of the transcript column used as the recognition target. It is also
# interpolated into the CSV, vocab, model-folder and output file names below.
tgt = 'hira'

In [4]:
import pandas as pd
# Read the validation split for the current target. The first CSV column is
# parsed as the index and then immediately turned back into a regular column,
# leaving a fresh 0..N-1 index.
val = pd.read_csv(f'./val_{tgt}_wav.csv', index_col=0).reset_index()
val.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,locale,segment,hira
0,033ede7ca4c60dc27cef421b4d33799d38924ed36fa8dd...,./data/processed_clips/common_voice_ja_2140974...,祖母は、おおむね機嫌よく、サイコロをころがしている。,2,0,,,,ja,,そぼはおおむねきげんよくさいころをころがしている
1,087edae49ce1e0f600682ceccc7fc28e81e64ae890e647...,./data/processed_clips/common_voice_ja_2207275...,財布をなくしたので、交番へ行きます。,2,0,teens,female,,ja,,さいふをなくしたのでこうばんへいきます
2,09e6ae463786aae9071baa9044ac8b7466aa7c48dcdaf4...,./data/processed_clips/common_voice_ja_2367700...,背の高さは一七〇センチほどで、目が大きく、やや太っている。,2,0,,,,ja,,せのたかさはいちしち〇せんちほどでめがおおきくややふとっている
3,15b7d87a73d28b37664fdf7fea1ff232f89e80ce954c9b...,./data/processed_clips/common_voice_ja_1949962...,新しい靴をはいて出かけます。,2,0,,,,ja,,あたらしいくつをはいてでかけます
4,169409451683ba935fd4cfef8622ffb09ddef76a20fc3b...,./data/processed_clips/common_voice_ja_3061584...,家族といえども言葉で伝えるのは大事,2,0,,,,ja,,かぞくといえどもことばでつたえるのはだいじ


In [5]:
# Directory that is globbed for fine-tuned checkpoint sub-folders further down.
model_folder = f'wav2vec2_tiny_ja_{tgt}'

In [6]:
from transformers import Wav2Vec2CTCTokenizer

# CTC tokenizer built from the vocabulary JSON of the current target.
tokenizer = Wav2Vec2CTCTokenizer(
    f"./vocab_{tgt}_cv.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
tokenizer

PreTrainedTokenizer(name_or_path='', vocab_size=114, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'pad_token': '[PAD]'})

In [7]:
# Load the vocabulary JSON and index its entries by their position in the file.
# The file is opened in a `with` block so the handle is closed, and decoded
# explicitly as UTF-8 because it contains Japanese characters.
# NOTE(review): keys are enumeration positions, not the ids stored in the
# JSON — this assumes the file lists tokens in ascending id order; confirm.
with open(f"./vocab_{tgt}_cv.json", 'r', encoding='utf-8') as vocab_file:
    token_list = json.load(vocab_file)
token_list = dict(enumerate(token_list))
token_list

{0: 'a',
 1: 'b',
 2: 'c',
 3: 'd',
 4: 'e',
 5: 'f',
 6: 'g',
 7: 'h',
 8: 'i',
 9: 'j',
 10: 'k',
 11: 'l',
 12: 'm',
 13: 'n',
 14: 'o',
 15: 'p',
 16: 'q',
 17: 'r',
 18: 's',
 19: 't',
 20: 'u',
 21: 'v',
 22: 'w',
 23: 'x',
 24: 'y',
 25: 'z',
 26: '々',
 27: '〇',
 28: 'ぁ',
 29: 'あ',
 30: 'ぃ',
 31: 'い',
 32: 'ぅ',
 33: 'う',
 34: 'ぇ',
 35: 'え',
 36: 'ぉ',
 37: 'お',
 38: 'か',
 39: 'が',
 40: 'き',
 41: 'ぎ',
 42: 'く',
 43: 'ぐ',
 44: 'け',
 45: 'げ',
 46: 'こ',
 47: 'ご',
 48: 'さ',
 49: 'ざ',
 50: 'し',
 51: 'じ',
 52: 'す',
 53: 'ず',
 54: 'せ',
 55: 'ぜ',
 56: 'そ',
 57: 'ぞ',
 58: 'た',
 59: 'だ',
 60: 'ち',
 61: 'ぢ',
 62: 'っ',
 63: 'つ',
 64: 'づ',
 65: 'て',
 66: 'で',
 67: 'と',
 68: 'ど',
 69: 'な',
 70: 'に',
 71: 'ぬ',
 72: 'ね',
 73: 'の',
 74: 'は',
 75: 'ば',
 76: 'ぱ',
 77: 'ひ',
 78: 'び',
 79: 'ぴ',
 80: 'ふ',
 81: 'ぶ',
 82: 'ぷ',
 83: 'へ',
 84: 'べ',
 85: 'ぺ',
 86: 'ほ',
 87: 'ぼ',
 88: 'ぽ',
 89: 'ま',
 90: 'み',
 91: 'む',
 92: 'め',
 93: 'も',
 94: 'ゃ',
 95: 'や',
 96: 'ゅ',
 97: 'ゆ',
 98: 'ょ',
 99: 'よ',
 100: 'ら',

In [8]:
from transformers import Wav2Vec2FeatureExtractor

# Feature extractor for raw single-feature waveforms at `sr` Hz, with
# normalisation and attention-mask output switched on.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=sr,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

In [9]:
from transformers import Wav2Vec2Processor

# Bundle the feature extractor and tokenizer into one processor object; the
# dataset, collator and inference loop below all go through it.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [10]:
class W2v2Dataset(torch.utils.data.Dataset):
    """Map-style dataset yielding model inputs and label ids for one clip.

    Relies on the notebook-level globals ``processor``, ``tgt`` and ``sr``.

    Args:
        df: DataFrame with a ``path`` column (audio file locations) and a
            column named by ``tgt`` (target transcripts).
    """

    def __init__(self, df):
        self.df = df
        self.pathes = df['path'].values
        self.sentences = df[tgt].values

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``{"input_values", "labels"}``."""
        waveform, sample_rate = torchaudio.load(self.pathes[idx])

        # Down-mix multi-channel audio instead of letting reshape(-1)
        # concatenate the channels one after another.
        # assumes torchaudio returns (channels, samples) — TODO confirm
        if waveform.dim() > 1 and waveform.size(0) > 1:
            waveform = waveform.mean(dim=0)

        # The processor is told the audio is at `sr`; make that true when the
        # file was stored at a different rate.
        if sample_rate != sr:
            waveform = torchaudio.functional.resample(waveform, sample_rate, sr)

        batch = dict()
        batch["input_values"] = processor(waveform.reshape(-1), sampling_rate=sr).input_values[0]
        # Inside this context the processor call is routed to the tokenizer.
        with processor.as_target_processor():
            batch["labels"] = processor(self.sentences[idx]).input_ids

        return batch

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

# Dataset over the validation DataFrame loaded above.
val_dataset = W2v2Dataset(val)

In [11]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate CTC training/evaluation examples by padding them per batch.

    Audio inputs and label ids are padded separately because they have
    different lengths and go through different padding paths of the processor.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`):
            Processor whose ``pad`` method is used for both inputs and labels.
        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`True`):
            Padding strategy forwarded unchanged to ``processor.pad`` for both
            the inputs and the labels.
        max_length (:obj:`int`, `optional`):
            ``max_length`` forwarded when padding ``input_values``.
        max_length_labels (:obj:`int`, `optional`):
            ``max_length`` forwarded when padding the labels.
        pad_to_multiple_of (:obj:`int`, `optional`):
            ``pad_to_multiple_of`` forwarded when padding ``input_values``.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            ``pad_to_multiple_of`` forwarded when padding the labels.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of ``{"input_values", "labels"}`` examples into one batch."""
        # Separate the two streams; each is padded on its own.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_values": example["input_values"]})
            label_items.append({"input_ids": example["labels"]})

        batch = self.processor.pad(
            audio_items,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Inside this context the processor pads with the tokenizer.
        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(
                label_items,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # Positions whose attention mask is not 1 are padding; overwrite them
        # with -100 so the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [12]:
# Collator instance with default padding settings, bound to the processor above.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [13]:
from transformers import Wav2Vec2ForCTC

# Instantiate the CTC model from the pretrained checkpoint, overriding
# dropout / masking / loss settings through config keyword arguments.
# `vocab_size` is deliberately not overridden here; the output head is rebuilt
# for this tokenizer right after loading.
model = Wav2Vec2ForCTC.from_pretrained(
    'charsiu/zh_w2v2_tiny_fc_10ms',
    attention_dropout=0.2,
    hidden_dropout=0.2,
    feat_proj_dropout=0.2,
    mask_time_prob=0.075,
    layerdrop=0.2,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    diversity_loss_weight=100,
)

# Replace the output layer with one sized to this tokenizer. The input width
# is taken from the head being replaced rather than hard-coding 384, so the
# cell keeps working if a checkpoint with a different hidden size is used.
model.lm_head = nn.Linear(model.lm_head.in_features, len(processor.tokenizer))
model.config.vocab_size = len(processor.tokenizer)

In [14]:
def _checkpoint_step(path):
    """Return the integer after the last '-' in a checkpoint folder name, or -1."""
    suffix = os.path.basename(os.path.normpath(path)).rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdecimal() else -1


# Collect checkpoint folders and pick the one with the highest numeric suffix.
# A plain string sort would rank e.g. "checkpoint-9000" after "checkpoint-10000".
model_ckpts = glob.glob(f'{model_folder}/*-*')
if not model_ckpts:
    raise FileNotFoundError(
        f'No checkpoint directories matching {model_folder}/*-* were found.')
model_ckpt = max(model_ckpts, key=lambda path: (_checkpoint_step(path), path))
model_file = os.path.join(model_ckpt, 'pytorch_model.bin')
# map_location='cpu' lets weights saved from a GPU load on a CPU-only host;
# the model is moved to its final device in a later cell.
model.load_state_dict(torch.load(model_file, map_location='cpu'))

<All keys matched successfully>

In [15]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU, and move
# the model there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [16]:
from torch.utils.data import DataLoader
# Loader over the validation dataset. The CTC padding collator is passed in so
# that variable-length inputs and label lists are padded into proper tensors
# instead of going through PyTorch's default collation.
test_loader = DataLoader(val_dataset,
                         batch_size=1,
                         shuffle=False,
                         num_workers=4,
                         collate_fn=data_collator)

In [17]:
# File the recognition results (hypotheses) are written to
hypothesis_file = f'./hypothesis_{tgt}.txt'

# File the ground-truth transcripts (references) are written to
reference_file = f'./reference_{tgt}.txt'

In [18]:
from tqdm import tqdm

In [19]:
import torchaudio as ta
# Greedy (argmax) CTC decoding of every validation clip. Each clip produces one
# line "<row index> <space-separated characters>" in the hypothesis file and
# the matching line in the reference file.
# Inference only: make eval mode explicit and disable autograd so no graph is
# built for each forward pass.
model.eval()
with open(hypothesis_file, mode='w') as hyp_file, \
     open(reference_file, mode='w') as ref_file, \
     torch.no_grad():
    for i in tqdm(range(len(val))):
        waveform, _ = ta.load(val['path'].iloc[i])
        input_values = processor(waveform, return_tensors="pt", sampling_rate=sr).input_values.to(device)  # Batch size 1
        # presumably the processor adds a leading batch axis around the
        # (channels, samples) tensor, hence the [0] — TODO confirm
        logits = model(input_values[0]).logits
        predicted_ids = torch.argmax(logits, dim=-1)

        transcription = processor.decode(predicted_ids[0])
        gt = val.iloc[i][tgt]
        # ' '.join on a string puts a space between characters, so the scoring
        # step can split each line back into tokens.
        hyp_file.write('%s %s\n' % (i, ' '.join(transcription)))
        ref_file.write('%s %s\n' % (i, ' '.join(gt)))
    

100%|█████████████████████████████████████████| 500/500 [00:15<00:00, 31.58it/s]


In [20]:
# File the per-utterance and overall scoring results are written to
result_file = f'result_{tgt}.txt'

In [21]:
# -*- coding: utf-8 -*-

#
# レーベンシュタイン距離を用いて，
# 認識結果の誤り数を算出します．
#

import numpy as np
import copy

def calculate_error(hypothesis, reference):
    """Align two token sequences by Levenshtein distance and count the errors.

    Args:
        hypothesis: recognition result, as a sequence of tokens.
        reference: ground-truth transcript, in the same form.

    Returns:
        Tuple ``(total_error, substitute_error, delete_error, insert_error,
        len_ref)``, where ``len_ref`` is the number of reference tokens (the
        denominator when turning the counts into rates).
    """
    n_hyp = len(hypothesis)
    n_ref = len(reference)

    def new_cell():
        # Accumulated total cost and its per-operation breakdown.
        return {"total": 0, "substitute": 0, "delete": 0, "insert": 0}

    # grid[r][c] describes the cheapest alignment of the first r hypothesis
    # tokens with the first c reference tokens.
    grid = [[new_cell() for _ in range(n_ref + 1)] for _ in range(n_hyp + 1)]

    # First column: moving down means deleting hypothesis tokens.
    for r in range(1, n_hyp + 1):
        grid[r][0]["delete"] = r
        grid[r][0]["total"] = r
    # First row: moving right means inserting reference tokens.
    for c in range(1, n_ref + 1):
        grid[0][c]["insert"] = c
        grid[0][c]["total"] = c

    for r, hyp_token in enumerate(hypothesis, start=1):
        for c, ref_token in enumerate(reference, start=1):
            # A diagonal move is free when the tokens match, else costs 1.
            mismatch = 0 if hyp_token == ref_token else 1

            # (source cell, counter to bump, step cost), listed in priority
            # order: substitute, delete, insert. min() keeps the first of
            # equally cheap candidates, so ties resolve in that order.
            moves = (
                (grid[r - 1][c - 1], "substitute", mismatch),
                (grid[r - 1][c], "delete", 1),
                (grid[r][c - 1], "insert", 1),
            )
            source, counter, step = min(
                moves, key=lambda move: move[0]["total"] + move[2])

            # Carry the source cell's counts forward and add this move.
            cell = dict(source)
            cell[counter] += step
            cell["total"] = source["total"] + step
            grid[r][c] = cell

    # The bottom-right cell holds the final alignment.
    final = grid[n_hyp][n_ref]

    # Alignment operations and recognition errors are mirrored: a token that
    # had to be deleted from the hypothesis was wrongly inserted by the
    # recogniser, and a token that had to be inserted was wrongly dropped.
    total_error = final["total"]
    substitute_error = final["substitute"]
    delete_error = final["insert"]
    insert_error = final["delete"]

    return (total_error,
            substitute_error,
            delete_error,
            insert_error,
            n_ref)

In [22]:
# Running totals of each error type (numerators of the error rates)
total_err = 0
total_sub = 0
total_del = 0
total_ins = 0
# Total number of reference tokens (denominator of the error rates)
total_length = 0

# Open the hypothesis, reference and result files
with open(hypothesis_file, mode='r') as hyp_file, \
     open(reference_file, mode='r') as ref_file, \
     open(result_file, mode='w') as out_file:
    # Walk both files in lockstep, one utterance per line
    for line_hyp, line_ref in zip(hyp_file, ref_file):
        # Split each line on whitespace into a list
        parts_hyp = line_hyp.split()
        parts_ref = line_ref.split()

        # The utterance id (element 0) must agree between the two files.
        # Raise instead of writing to sys.stderr and calling exit(): `sys` is
        # not imported in this notebook, and exiting would take the kernel down.
        if parts_hyp[0] != parts_ref[0]:
            raise ValueError('Utterance ids of '
                'hypothesis and reference do not match.')

        # Everything after the id is the token sequence
        hypothesis = parts_hyp[1:]
        reference = parts_ref[1:]

        # Count the errors for this utterance
        (error, substitute, delete, insert, ref_length) \
            = calculate_error(hypothesis, reference)

        # Accumulate into the running totals
        total_err += error
        total_sub += substitute
        total_del += delete
        total_ins += insert
        total_length += ref_length

        # Write the per-utterance result
        out_file.write('ID: %s\n' % (parts_hyp[0]))
        out_file.write('#ERROR (#SUB #DEL #INS): '\
            '%d (%d %d %d)\n' \
            % (error, substitute, delete, insert))
        out_file.write('REF: %s\n' % (' '.join(reference)))
        out_file.write('HYP: %s\n' % (' '.join(hypothesis)))
        out_file.write('\n')

    # Without any reference tokens the rates are undefined; fail with a clear
    # message instead of a bare ZeroDivisionError.
    if total_length == 0:
        raise ValueError('No reference tokens were read; '
                         'cannot compute an error rate.')

    # Error rates: total errors divided by the total reference length
    err_rate = 100.0 * total_err / total_length
    sub_rate = 100.0 * total_sub / total_length
    del_rate = 100.0 * total_del / total_length
    ins_rate = 100.0 * total_ins / total_length

    # Write the overall summary
    out_file.write('------------------------------'\
        '-----------------------------------------------\n')
    out_file.write('#TOKEN: %d, #ERROR: %d '\
        '(#SUB: %d, #DEL: %d, #INS: %d)\n' \
        % (total_length, total_err,
           total_sub, total_del, total_ins))
    out_file.write('TER: %.2f%% (SUB: %.2f, '\
        'DEL: %.2f, INS: %.2f)\n' \
        % (err_rate, sub_rate, del_rate, ins_rate))
    print('TER: %.2f%% (SUB: %.2f, DEL: %.2f, INS: %.2f)' \
        % (err_rate, sub_rate, del_rate, ins_rate))
    out_file.write('------------------------------'\
        '-----------------------------------------------\n')



TER: 21.43% (SUB: 14.84, DEL: 4.02, INS: 2.57)
