# 환경 구성

In [1]:
from setproctitle import setproctitle
setproctitle("Wav2Vec2 textfile")

import os

from tqdm import tqdm

dataset_path = "/wav2vec2/s-kr/fine-tune/dataset"
kspon_path = os.path.join(dataset_path, "KsponSpeech")

In [2]:
import os
import torch

os.environ["CUDA_VISIBLE_DEVICES"]= "0,1,2"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Device:', device)
print('Count of using GPUs:', torch.cuda.device_count()) 

Device: cuda
Count of using GPUs: 3


# KsponSpeech Vocab 생성

In [3]:
from setproctitle import setproctitle
setproctitle("Wav2Vec2 Vocab")

from tqdm import tqdm
from datasets import Dataset, ClassLabel
from IPython.display import display, HTML
from glob import glob
from kspon_preprocess import special_filter, bracket_filter

import re
import librosa
import random
import numpy as np
import pandas as pd
import os


def show_random_elements(dataset, num_examples=15):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    
    df = pd.DataFrame(dataset[:num_examples])
    display(HTML(df.to_html()))

    
def _read_txt_file(file_path):
    # with open(file_path, 'r', encoding='utf-8') as f:
    with open(file_path, 'r', encoding='cp949') as f:
        text = f.read()
        
    return text


text_list = list()

In [4]:
audio_list = list()
durations = 0
max_sec = 10.0
min_sec = 2.0

kspon_wavs = glob(os.path.join(kspon_path, '**', '*.wav'), recursive=True)
random.seed(44)
random.shuffle(kspon_wavs)

remove_re = '[a-zA-Z0-9%]'

for file in tqdm(kspon_wavs):
    duration = librosa.get_duration(filename=file, sr=16000)
    if (min_sec <= duration) and (max_sec >= duration):
        text_path = file.replace(".wav", ".txt")
        text = _read_txt_file(text_path)
        text = special_filter(bracket_filter(text))
        if re.findall(remove_re, text) == []:
            text_list.append(text)
            audio_list.append(file)
            durations += duration
    if durations >= 500*60*60:
        break

len(text_list)

 90%|███████████████████████████████████████████████████████████████████████████████████████████████████▍           | 548603/612422 [04:59<00:34, 1834.00it/s]


378191

In [5]:
text_dict = {"text": text_list}

vocab_timit = Dataset.from_dict(text_dict)
print(len(vocab_timit))
print(vocab_timit)

show_random_elements(vocab_timit)

378191
Dataset({
    features: ['text'],
    num_rows: 378191
})


Unnamed: 0,text
0,너무 막 완결 된 거 막 추천하는 거 없냐 요즘
1,음 어쨌든 한번 사 봐 이 인생이 편해져
2,난 너가 언제 한번 대마초 하다가 걸려갖고 잡혀갔음 좋겠어
3,그 그 그런 이 그런 이유
4,아침 많이 먹긴 하는데
5,그래서 막
6,여군 괜찮겠다 이랬는데 결국 안가드라 안 간건지 떨어진건지
7,햄 치즈 응 맛있던데 그 빵이 빠삭빠삭해서
8,그러니까
9,무슨 아쿠아 뭐였거든 아쿠아맨인가


In [6]:
def extract_all_chars(batch):
    all_text = " ".join(batch["text"])
    vocab = list(set(all_text))
    
    return {"vocab": [vocab], "all_text": [all_text]}

vocabs = extract_all_chars(vocab_timit)
vocab_list = list(set(vocabs["vocab"][0]))

vocab_dict = {v: k for k, v in enumerate(vocab_list)}

In [7]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

2083

In [8]:
import json
import os

vocab_path = os.path.join(dataset_path, 'vocab.json')
with open(vocab_path, 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

# Processor 생성

In [9]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

tokenizer = Wav2Vec2CTCTokenizer(vocab_path, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

- tokenizer: PreTrainedTokenizer(name_or_path='', vocab_size=2083, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'pad_token': '[PAD]'})

# 학습 데이터셋 구축

In [10]:
from setproctitle import setproctitle
setproctitle("Wav2Vec2 Dataset")

from datasets import Dataset
from transformers import Wav2Vec2Processor
from kspon_preprocess import special_filter, bracket_filter, del_noise  # 특수 기호 제거하는 전처리 코드 함수
from tqdm import tqdm

import numpy as np
import os
import soundfile as sf
import librosa
import re

input_values_list = list()
input_length_list = list()
labels_list = list()
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'
remove_re = '[a-zA-Z0-9%]'

def remove_special_characters(text: str) -> str:
    text = special_filter(bracket_filter(text))
    # text = re.sub(chars_to_ignore_regex, '', text)
    
    return text

In [11]:
for text, audio_path in tqdm(zip(text_list, audio_list)):
    audio, _ = sf.read(audio_path)
    non_silence_indices = del_noise(audio, top_db=30)  # del_noise 함수를 통해 노이즈 제거
    audio = np.concatenate([audio[start:end] for start, end in non_silence_indices])
    if audio.ndim > 1:
        audio = np.delete(audio, 1, axis=1)
        audio = audio.reshape(-1)
    input_value = processor(audio, sampling_rate=16000).input_values[0]
    input_values_list.append(input_value)
    input_length_list.append(len(input_value))
    text = remove_special_characters(text)
    with processor.as_target_processor():
        labels_list.append(processor(text).input_ids)

378191it [18:30, 340.51it/s]


In [12]:
print(len(input_values_list), len(input_length_list), len(labels_list))

print(input_values_list[0], input_values_list[0].shape)
print(input_length_list[0])
print(labels_list[0])

378191 378191 378191
[ 0.02032578  0.02032578  0.01894384 ... -0.00800393 -0.00938587
 -0.01007684] (40960,)
40960
[512, 877, 1240, 420, 1240, 804, 29, 1240, 1606, 1240, 1000, 1240, 420, 1240, 1002, 1403, 992, 995, 1240, 1000, 1240, 936, 1934, 1240, 27, 157]


In [13]:
import pandas as pd

train_rate = 0.90
train_idx = int(train_rate * len(input_values_list))

train_df = pd.DataFrame({'input_values': input_values_list[:train_idx], 'input_length': input_length_list[:train_idx], 'labels': labels_list[:train_idx]})
test_df = pd.DataFrame({'input_values': input_values_list[train_idx:], 'input_length': input_length_list[train_idx:], 'labels': labels_list[train_idx:]})

print(len(train_df))
print(len(test_df))

340371
37820


In [14]:
train_timit = Dataset.from_pandas(train_df)
test_timit = Dataset.from_pandas(test_df)

print(len(train_timit))
print(train_timit)

340371
Dataset({
    features: ['input_values', 'input_length', 'labels'],
    num_rows: 340371
})


# 데이터 확인

In [15]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(train_timit))

ipd.Audio(data=np.asarray(train_timit[rand_int]["input_values"]), rate=16000)

# Train 준비

In [16]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [17]:
from datasets import load_metric

cer_metric = load_metric("cer")
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [18]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    # wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

In [19]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    gradient_checkpointing=True,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size = len(processor.tokenizer)
)

print(model.config)
print("=" * 100)
print(model)

Some weights of the model checkpoint at facebook/wav2vec2-large-xlsr-53 were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'quantizer.weight_proj.weight', 'project_q.weight', 'project_hid.bias', 'quantizer.weight_proj.bias', 'project_hid.weight', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to u

Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.1,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.0,
  "hidden_act

In [20]:
model.freeze_feature_encoder()

In [21]:
from transformers import TrainingArguments

output_dir = os.path.join(dataset_path, "results")

training_args = TrainingArguments(
  output_dir=output_dir,           
  group_by_length=True,
  per_device_train_batch_size=128,
  per_device_eval_batch_size=128,
  evaluation_strategy="steps",
  num_train_epochs=100,
  fp16=True,
  gradient_checkpointing=True,
  save_steps=1000,
  eval_steps=1000,
  logging_steps=1000,
  learning_rate=1e-4,
  log_on_each_node=True,
  weight_decay=0.005,
  warmup_steps=1000,
  eval_accumulation_steps=1,
  save_total_limit=200,
)

# Train 시작

In [22]:
from transformers import Trainer
from setproctitle import setproctitle
setproctitle("Wav2Vec2 Train")

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_timit,
    eval_dataset=test_timit,
    tokenizer=processor.feature_extractor,
)

Using cuda_amp half precision backend


In [None]:
try:
    trainer.train()
except Exception as e:
    print(e)
finally:
    for obj in trainer.state.log_history:
        print(obj)
        
    trainer.save_model(output_dir)

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
