# 모델 vocab_size

In [2]:
from transformers import Wav2Vec2CTCTokenizer
import os

# Build the CTC tokenizer from the vocabulary file stored in the dataset directory.
dataset_path = "/wav2vec2/s-kr/fine-tune/dataset"
vocab_path = os.path.join(dataset_path, "vocab.json")
tokenizer = Wav2Vec2CTCTokenizer(
    vocab_path,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

In [3]:
from transformers import Wav2Vec2FeatureExtractor

# Feature extractor for raw 16 kHz waveforms: one feature per sample, zero padding,
# normalization enabled, and no attention mask in its output.
# NOTE(review): the model config printed later in this notebook reports
# feat_extract_norm="layer"; the Hugging Face Wav2Vec2 docs recommend
# return_attention_mask=True for such checkpoints - confirm False is intentional.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)

In [5]:
from transformers import Wav2Vec2Processor

# Bundle the feature extractor and the tokenizer into one processor object,
# then display it to inspect both configurations.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

- tokenizer: PreTrainedTokenizer(name_or_path='', vocab_size=1261, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'pad_token': '[PAD]'})

In [6]:
from transformers import Wav2Vec2ForCTC

# Load the XLSR-53 checkpoint into a CTC model; the checkpoint has no lm_head
# weights, so that layer is newly initialised.
# Alternative checkpoint: "kresnik/wav2vec2-large-xlsr-korean"
# (a wav2vec2 model that is already pre-trained on Korean).
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    # Size the newly initialised lm_head for this tokenizer at construction
    # time; assigning model.config.vocab_size afterwards does not resize it.
    vocab_size=len(processor.tokenizer),
)

model

Some weights of the model checkpoint at facebook/wav2vec2-large-xlsr-53 were not used when initializing Wav2Vec2ForCTC: ['project_q.bias', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_q.weight', 'project_hid.bias', 'quantizer.codevectors', 'project_hid.weight']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to u

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elemen

In [10]:
# Derive the size from the tokenizer instead of hard-coding 1261 so the value
# cannot drift when vocab.json changes.
tokenizer_vocab_size = len(processor.tokenizer)
model.config.vocab_size = tokenizer_vocab_size

# Assigning to the config after the model has been built only changes the
# stored number; the CTC head (lm_head) keeps the shape it was created with.
# Report a mismatch here rather than letting it surface later during training.
if model.lm_head.out_features != tokenizer_vocab_size:
    print(
        f"WARNING: lm_head has {model.lm_head.out_features} outputs but the tokenizer "
        f"has {tokenizer_vocab_size} tokens; reload the model with "
        "vocab_size=len(processor.tokenizer) to rebuild the head."
    )

model.config

Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.1,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.0,
  "gradient_c

In [11]:
# Display the module tree again after the config edit in the previous cell
# (presumably to check whether any layer changed - confirm).
model

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elemen

# 추론 테스트

In [1]:
from setproctitle import setproctitle
setproctitle("wav2vec infer test")

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor
import os
import torch

# Recreate the tokenizer / feature extractor / processor for the inference session.
dataset_path = "/wav2vec2/s-kr/fine-tune/dataset"
vocab_path = os.path.join(dataset_path, "vocab.json")

tokenizer = Wav2Vec2CTCTokenizer(
    vocab_path,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [3]:
# processor.save_pretrained(os.path.join(dataset_path, "results/checkpoint-900"))

In [4]:
# Load the checkpoint saved under results/checkpoint-900 and move it to the GPU
# when one is available; fall back to the CPU so this cell does not crash on
# hosts without CUDA.
model = Wav2Vec2ForCTC.from_pretrained(
    os.path.join(dataset_path, "results/checkpoint-900")
).to("cuda" if torch.cuda.is_available() else "cpu")

# model.config

In [4]:
# Load the processor from the checkpoint-900 directory (this rebinds the
# `processor` built by hand earlier in this section) and display it.
processor = Wav2Vec2Processor.from_pretrained(
    os.path.join(dataset_path, "results/checkpoint-900")
)
processor

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}

- tokenizer: PreTrainedTokenizer(name_or_path='/wav2vec2/s-kr/fine-tune/dataset/results/checkpoint-900', vocab_size=1261, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'pad_token': '[PAD]', 'additional_special_tokens': [AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True)]})

In [None]:
from datasets import load_dataset
import soundfile as sf
from jiwer import wer

# Load the "clean" configuration of kresnik/zeroth_korean and keep its test split.
ds = load_dataset("kresnik/zeroth_korean", name="clean")
test_ds = ds["test"]

Downloading builder script:   0%|          | 0.00/4.67k [00:00<?, ?B/s]

Downloading and preparing dataset zeroth_korean/clean to /root/.cache/huggingface/datasets/kresnik___zeroth_korean/clean/1.0.1/f6cf96a53d5512525e3113bab8048d36ce268658d6e0c40d45f65dfa3f0bc343...


Downloading data:   0%|          | 0.00/10.3G [00:00<?, ?B/s]

In [18]:
def map_to_array(batch):
    """Load one example's audio samples into the example itself.

    Reads the file at ``batch["file"]`` with soundfile and stores the decoded
    samples under ``batch["speech"]``. The second value returned by the reader
    is discarded. Returns the updated ``batch``.
    """
    waveform = sf.read(batch["file"])[0]
    batch["speech"] = waveform
    return batch

test_ds = test_ds.map(map_to_array)

In [18]:
def map_to_pred(batch):
    """Transcribe a batch of waveforms with greedy (argmax) decoding.

    Args:
        batch: Mapping with a "speech" entry holding the waveforms of the batch.

    Returns:
        The same ``batch`` with a "transcription" entry added, holding the
        decoded output of ``processor.batch_decode`` for the batch.
    """
    inputs = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding="longest")

    # Follow the model's own placement instead of assuming a GPU is present.
    device = model.device
    input_values = inputs.input_values.to(device)

    # The mask is only present when the feature extractor is configured with
    # return_attention_mask=True; forward it when available so padded samples
    # are masked, and pass None otherwise (same call as before).
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    batch["transcription"] = processor.batch_decode(predicted_ids)
    return batch

result = test_ds.map(map_to_pred, batched=True, batch_size=2, remove_columns=["speech"])

In [18]:
# Word error rate: compares the dataset's "text" column with the generated
# "transcription" column of the mapped test split.
print("WER:", wer(result["text"], result["transcription"]))

# facebook/wav2vec2-large-xlsr-53 확인

In [None]:
# Load the facebook/wav2vec2-large-xlsr-53 checkpoint for inspection.
# Alternative checkpoint: "kresnik/wav2vec2-large-xlsr-korean"
# (a wav2vec2 model that is already pre-trained on Korean).
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53"
)

# 코사인 유사도

In [1]:
from numpy import dot
from numpy.linalg import norm
import numpy as np

def cos_sim(a, b):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        a: First vector (any 1-D array-like of real numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))`` as a NumPy float. A zero-length
        (all-zero) vector yields ``nan``, as with plain NumPy division.
    """
    # Coerce to float64 first: with integer inputs ``dot`` would accumulate in
    # a fixed-width integer dtype, which wraps silently on overflow.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return dot(a, b) / (norm(a) * norm(b))

In [25]:
# Four sample 9-element vectors used to exercise the similarity helpers.
# NOTE(review): `c` is not used by any call visible in this notebook - confirm it is still needed.
a = [45, 33, 1, 20, 41, 26, 9, 13, 8]
b = [33, 117, 0, 31, 76, 71, 7, 9, 3]
c = [1, 0, 7, 0, 0, 0, 0, 1, 0]
d = [20, 31, 0, 96, 72, 65, 27, 8, 2]

# Cosine similarity between b and d.
cos_sim(b, d)

0.7479333057269969

In [8]:
def cos_similarity2(v1, v2):
    """Return the cosine similarity of two 1-D vectors, written out by hand.

    The L2 norms are computed explicitly as ``sqrt(sum(x**2))`` rather than
    through ``numpy.linalg.norm``.

    Args:
        v1: First vector (any 1-D array-like of real numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        The dot product divided by the product of the two L2 norms, as a
        NumPy float. An all-zero vector yields ``nan``.
    """
    # Work in float64: squaring or multiplying large integers in a fixed-width
    # integer dtype wraps silently on overflow.
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)

    dot_product = np.dot(v1, v2)
    # np.sum reduces inside NumPy; the builtin sum would iterate element by
    # element in Python.
    l2_norm = np.sqrt(np.sum(np.square(v1))) * np.sqrt(np.sum(np.square(v2)))
    similarity = dot_product / l2_norm

    return similarity

# Similarity of the pair (a, b); the SciPy-based check later in the notebook uses the same pair.
cos_similarity2(a, b)

0.8662700054380889

In [11]:
from scipy import spatial

# SciPy's `cosine` is a distance (1 - similarity), so subtract it from 1 to get
# a value comparable with the hand-written helpers.
result = 1 - spatial.distance.cosine(a, b)
result

0.8662700054380887

In [28]:
import pandas as pd

# Small two-column demo frame: "num" holds 1..5 and "num2" holds 6..10.
k = pd.DataFrame(
    {
        "num": list(range(1, 6)),
        "num2": list(range(6, 11)),
    }
)
k

Unnamed: 0,num,num2
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [37]:
# Convert the frame to a plain nested list, one inner list per row.
k.to_numpy().tolist()

[[1, 6], [2, 7], [3, 8], [4, 9], [5, 10]]