In [154]:
!pip install datasets



## Create Wav2Vec2CTCTokenizer

In [155]:
from datasets import load_dataset
import pandas as pd
import numpy as np

In [156]:
# from google.colab import drive
# drive.mount('/content/drive')

In [157]:
# huggingface dataset
# all_data = load_dataset('csv',data_files='/content/drive/MyDrive/Final/csvfile/KsponSpeech_0002.csv',split='train')
# all_data = load_dataset('csv',data_files='/content/drive/MyDrive/Final/csvfile/*.csv',split='train')
all_data = load_dataset('csv',data_files='/Jaeyoon/face/voice/voic/csvfile/*csv',split='train')


# all_data2 = load_dataset('csv',data_files='/content/drive/MyDrive/Final/csvfile/KsponSpeech_0002.csv',split='train')


Resolving data files:   0%|          | 0/49 [00:00<?, ?it/s]

Using custom data configuration default-de7c285ef4a73378
Reusing dataset csv (C:\Users\YGL\.cache\huggingface\datasets\csv\default-de7c285ef4a73378\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


In [159]:
all_data

Dataset({
    features: ['Unnamed: 0', 'src', 'text'],
    num_rows: 49000
})

#자음모음으로 변환 유니코드

In [6]:
__all__ = ["split_syllable_char", "split_syllables",
           "join_jamos", "join_jamos_char",
           "CHAR_INITIALS", "CHAR_MEDIALS", "CHAR_FINALS"]

import itertools

INITIAL = 0x001
MEDIAL = 0x010
FINAL = 0x100
CHAR_LISTS = {
    INITIAL: list(map(chr, [
        0x3131, 0x3132, 0x3134, 0x3137, 0x3138, 0x3139,
        0x3141, 0x3142, 0x3143, 0x3145, 0x3146, 0x3147,
        0x3148, 0x3149, 0x314a, 0x314b, 0x314c, 0x314d,
        0x314e
    ])),
    MEDIAL: list(map(chr, [
        0x314f, 0x3150, 0x3151, 0x3152, 0x3153, 0x3154,
        0x3155, 0x3156, 0x3157, 0x3158, 0x3159, 0x315a,
        0x315b, 0x315c, 0x315d, 0x315e, 0x315f, 0x3160,
        0x3161, 0x3162, 0x3163
    ])),
    FINAL: list(map(chr, [
        0x3131, 0x3132, 0x3133, 0x3134, 0x3135, 0x3136,
        0x3137, 0x3139, 0x313a, 0x313b, 0x313c, 0x313d,
        0x313e, 0x313f, 0x3140, 0x3141, 0x3142, 0x3144,
        0x3145, 0x3146, 0x3147, 0x3148, 0x314a, 0x314b,
        0x314c, 0x314d, 0x314e
    ]))
}
CHAR_INITIALS = CHAR_LISTS[INITIAL]
CHAR_MEDIALS = CHAR_LISTS[MEDIAL]
CHAR_FINALS = CHAR_LISTS[FINAL]
CHAR_SETS = {k: set(v) for k, v in CHAR_LISTS.items()}
CHARSET = set(itertools.chain(*CHAR_SETS.values()))
CHAR_INDICES = {k: {c: i for i, c in enumerate(v)}
                for k, v in CHAR_LISTS.items()}


def is_hangul_syllable(c):
    return 0xac00 <= ord(c) <= 0xd7a3  # Hangul Syllables


def is_hangul_jamo(c):
    return 0x1100 <= ord(c) <= 0x11ff  # Hangul Jamo


def is_hangul_compat_jamo(c):
    return 0x3130 <= ord(c) <= 0x318f  # Hangul Compatibility Jamo


def is_hangul_jamo_exta(c):
    return 0xa960 <= ord(c) <= 0xa97f  # Hangul Jamo Extended-A


def is_hangul_jamo_extb(c):
    return 0xd7b0 <= ord(c) <= 0xd7ff  # Hangul Jamo Extended-B


def is_hangul(c):
    return (is_hangul_syllable(c) or
            is_hangul_jamo(c) or
            is_hangul_compat_jamo(c) or
            is_hangul_jamo_exta(c) or
            is_hangul_jamo_extb(c))


def is_supported_hangul(c):
    return is_hangul_syllable(c) or is_hangul_compat_jamo(c)


def check_hangul(c, jamo_only=False):
    if not ((jamo_only or is_hangul_compat_jamo(c)) or is_supported_hangul(c)):
        raise ValueError(f"'{c}' is not a supported hangul character. "
                         f"'Hangul Syllables' (0xac00 ~ 0xd7a3) and "
                         f"'Hangul Compatibility Jamos' (0x3130 ~ 0x318f) are "
                         f"supported at the moment.")


def get_jamo_type(c):
    check_hangul(c)
    assert is_hangul_compat_jamo(c), f"not a jamo: {ord(c):x}"
    return sum(t for t, s in CHAR_SETS.items() if c in s)


def split_syllable_char(c):
    """
    Splits a given korean syllable into its components. Each component is
    represented by Unicode in 'Hangul Compatibility Jamo' range.

    Arguments:
        c: A Korean character.

    Returns:
        A triple (initial, medial, final) of Hangul Compatibility Jamos.
        If no jamo corresponds to a position, `None` is returned there.

    Example:
        >>> split_syllable_char("안")
        ("ㅇ", "ㅏ", "ㄴ")
        >>> split_syllable_char("고")
        ("ㄱ", "ㅗ", None)
        >>> split_syllable_char("ㅗ")
        (None, "ㅗ", None)
        >>> split_syllable_char("ㅇ")
        ("ㅇ", None, None)
    """
    check_hangul(c)
    if len(c) != 1:
        raise ValueError("Input string must have exactly one character.")

    init, med, final = None, None, None
    if is_hangul_syllable(c):
        offset = ord(c) - 0xac00
        x = (offset - offset % 28) // 28
        init, med, final = x // 21, x % 21, offset % 28
        if not final:
            final = None
        else:
            final -= 1
    else:
        pos = get_jamo_type(c)
        if pos & INITIAL == INITIAL:
            pos = INITIAL
        elif pos & MEDIAL == MEDIAL:
            pos = MEDIAL
        elif pos & FINAL == FINAL:
            pos = FINAL
        idx = CHAR_INDICES[pos][c]
        if pos == INITIAL:
            init = idx
        elif pos == MEDIAL:
            med = idx
        else:
            final = idx
    return tuple(CHAR_LISTS[pos][idx] if idx is not None else None
                 for pos, idx in
                 zip([INITIAL, MEDIAL, FINAL], [init, med, final]))


def split_syllables(s, ignore_err=True, pad=None):
    """
    Performs syllable-split on a string.

    Arguments:
        s (str): A string (possibly mixed with non-Hangul characters).
        ignore_err (bool): If set False, it ensures that all characters in
            the string are Hangul-splittable and throws a ValueError otherwise.
            (default: True)
        pad (str): Pad empty jamo positions (initial, medial, or final) with
            `pad` character. This is useful for cases where fixed-length
            strings are needed. (default: None)

    Returns:
        Hangul-split string

    Example:
        >>> split_syllables("안녕하세요")
        "ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ"
        >>> split_syllables("안녕하세요~~", ignore_err=False)
        ValueError: encountered an unsupported character: ~ (0x7e)
        >>> split_syllables("안녕하세요ㅛ", pad="x")
        'ㅇㅏㄴㄴㅕㅇㅎㅏxㅅㅔxㅇㅛxxㅛx'
    """

    def try_split(c):
        try:
            return split_syllable_char(c)
        except ValueError:
            if ignore_err:
                return (c,)
            raise ValueError(f"encountered an unsupported character: "
                             f"{c} (0x{ord(c):x})")

    s = map(try_split, s)
    if pad is not None:
        tuples = map(lambda x: tuple(pad if y is None else y for y in x), s)
    else:
        tuples = map(lambda x: filter(None, x), s)
    return "".join(itertools.chain(*tuples))


def join_jamos_char(init, med, final=None):
    """
    Combines jamos into a single syllable.

    Arguments:
        init (str): Initial jao.
        med (str): Medial jamo.
        final (str): Final jamo. If not supplied, the final syllable is made
            without the final. (default: None)

    Returns:
        A Korean syllable.
    """
    chars = (init, med, final)
    for c in filter(None, chars):
        check_hangul(c, jamo_only=True)

    idx = tuple(CHAR_INDICES[pos][c] if c is not None else c
                for pos, c in zip((INITIAL, MEDIAL, FINAL), chars))
    init_idx, med_idx, final_idx = idx
    # final index must be shifted once as
    # final index with 0 points to syllables without final
    final_idx = 0 if final_idx is None else final_idx + 1
    return chr(0xac00 + 28 * 21 * init_idx + 28 * med_idx + final_idx)


def join_jamos(s, ignore_err=True):
    """
    Combines a sequence of jamos to produce a sequence of syllables.

    Arguments:
        s (str): A string (possible mixed with non-jamo characters).
        ignore_err (bool): If set False, it will ensure that all characters
            will be consumed for the making of syllables. It will throw a
            ValueError when it fails to do so. (default: True)

    Returns:
        A string

    Example:
        >>> join_jamos("ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ")
        "안녕하세요"
        >>> join_jamos("ㅇㅏㄴㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ")
        "안ㄴ녕하세요"
        >>> join_jamos()
    """
    last_t = 0
    queue = []
    new_string = ""

    def flush(n=0):
        new_queue = []
        while len(queue) > n:
            new_queue.append(queue.pop())
        if len(new_queue) == 1:
            if not ignore_err:
                raise ValueError(f"invalid jamo character: {new_queue[0]}")
            result = new_queue[0]
        elif len(new_queue) >= 2:
            try:
                result = join_jamos_char(*new_queue)
            except (ValueError, KeyError):
                # Invalid jamo combination
                if not ignore_err:
                    raise ValueError(f"invalid jamo characters: {new_queue}")
                result = "".join(new_queue)
        else:
            result = None
        return result

    for c in s:
        if c not in CHARSET:
            if queue:
                new_c = flush() + c
            else:
                new_c = c
            last_t = 0
        else:
            t = get_jamo_type(c)
            new_c = None
            if t & FINAL == FINAL:
                if not (last_t == MEDIAL):
                    new_c = flush()
            elif t == INITIAL:
                new_c = flush()
            elif t == MEDIAL:
                if last_t & INITIAL == INITIAL:
                    new_c = flush(1)
                else:
                    new_c = flush()
            last_t = t
            queue.insert(0, c)
        if new_c:
            new_string += new_c
    if queue:
        new_string += flush()
    return new_string

#-------

In [161]:
import re

# c = '바보는 아니고 c/ l/ b/ *'
# split_syllables(c)


chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower() + " "
    batch['text'] = split_syllables(batch['text'])
    return batch

In [162]:
remove_spectial_char_data = all_data.map(remove_special_characters)

  0%|          | 0/49000 [00:00<?, ?ex/s]

In [163]:
# result1 = pd.concat([df1,df2])

In [164]:
df = pd.DataFrame(remove_spectial_char_data)
df.rename(columns = {'src' : 'filename'}, inplace = True)
df['text'] = df['text'].str.replace("b/","<b>",regex=True)
df['text'] = df['text'].str.replace("l/","<l>",regex=True)
df['text'] = df['text'].str.replace("n/","<n>",regex=True)
df['text'] = df['text'].str.replace("/","",regex=True)


df[995:1005]

Unnamed: 0.1,Unnamed: 0,filename,text
995,995,KsponSpeech_001996.wav,ㅇㅣㄱㅓㅅㅈㅓㄱㅓㅅ ㄸㅏㄱㅔㅆㅈㅣ ㄱㅡㄴㄷㅔ ㄱㅡ ㅅㅜㅈㅣㄹㄱㅘㄴㄹㅣ ㅇㅣㄹㅓㄴ ㄱ...
996,996,KsponSpeech_001997.wav,<l> ㅇㅣ ㅈㅓㅇㄷㅗㅁㅕㄴ ㅇㅏㄴㅈㅜㅅㄱㅓㄹㅣ ㅇㅏㄴㅣㅇㅑ
997,997,KsponSpeech_001998.wav,ㄱㅡㄴㄷㅔ
998,998,KsponSpeech_001999.wav,ㄱㅡㄹㅓㅁ ㄴㅓㄴㅡㄴ ㄱㅡㄴㄷㅔ ㄱㅡㄹㅓㅁ ㅈㅣㄴㅉㅏ ㅁㅝㅎㅏㄴㅑ ㅇㅣㅅㅣㅂ ㅇㅣㄹ...
999,999,KsponSpeech_002000.wav,ㄱㅡㄱㅓ ㅈㅔㄴㄷㅓ ㅎㅐㄴㄷㅡㅍㅗㄴ ㅅㅏㄹ ㄸㅐ ㅈㅜㄴㅡㄴㄷㅔ
1000,0,KsponSpeech_002001.wav,ㄱㅡ ㅇㅏㅊㅣㅁㅇㅔ ㄱㅓㄹㅇㅓ ㄴㅏㅇㅗㄴㅣㄲㅏ <unk> ㄸㅗ ㅈㅐㅁㅣㅆㄷㅓㄹㅏㄱㅗ...
1001,1,KsponSpeech_002002.wav,ㅇㅘㅇㅈㅘㅇㅢ ㄱㅔㅇㅣㅁㅇㅣㅇㅓㅆㄴㅏ
1002,2,KsponSpeech_002003.wav,ㅇㅏ ㄱㅡㄹㅓㅁㅕㄴ ㄴㅓ <b> ㅇㅣㄸㅏㄱㅏ <b> ㅁㅕㅊㅅㅣㅇㅔ ㅈㅣㅂ ㄱㅏㄹ ㄱ...
1003,3,KsponSpeech_002004.wav,ㅈㅏㅁㄲㅏㄴ ㅇㅗㄴㅡㄹ ㅌㅗㅇㅛㅇㅣㄹㅇㅣㄴㄷㅔ ㄷㅗㅇㅅㅐㅇ ㅇㅗㄴㅡㄹ ㅅㅟㄴㅑ
1004,4,KsponSpeech_002005.wav,ㅇㅖㅅ ㅈㅓㅇ* ㅇㅓ ㅇㅣㄱㅔ ㅈㅓㅇㅁㅏㄹ <b> ㅅㅡㅍㅔㅇㅣㄴ ㅍㅔㅅㅡㅌㅡㄹㅗ ...


In [165]:
def extract_all_chars(batch):
  all_text = " ".join(batch["text"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}


In [166]:
char_vocab = remove_spectial_char_data.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=remove_spectial_char_data.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [167]:
char_vocab

Dataset({
    features: ['vocab', 'all_text'],
    num_rows: 1
})

In [168]:
vocab_list = list(set(char_vocab["vocab"][0]))

In [169]:
vocab_list

['ㄺ',
 'ㅁ',
 'ㄹ',
 '<',
 'ㅠ',
 'ㅕ',
 'i',
 '2',
 'ㅘ',
 'ㅈ',
 '/',
 'ㅜ',
 'ㄳ',
 'ㅂ',
 'p',
 'a',
 'e',
 'ㄵ',
 't',
 '*',
 'o',
 'ㅍ',
 'ㄾ',
 'ㅓ',
 'ㅙ',
 'y',
 'm',
 'q',
 'f',
 'ㅏ',
 'n',
 'ㅗ',
 'ㅎ',
 'r',
 'ㅐ',
 'ㅛ',
 'ㅇ',
 'ㅃ',
 '4',
 'ㅢ',
 ' ',
 'c',
 'g',
 '1',
 'b',
 'ㅄ',
 'x',
 'ㅋ',
 'ㅟ',
 'ㅚ',
 'ㄱ',
 'l',
 'ㅌ',
 'ㅑ',
 'ㄶ',
 'ㅉ',
 'ㅔ',
 'ㄷ',
 'ㅊ',
 'k',
 'ㅞ',
 'd',
 'v',
 'ㅝ',
 'ㄼ',
 'ㅀ',
 'ㅖ',
 'u',
 'ㄻ',
 'ㅅ',
 '3',
 'ㄲ',
 '>',
 'ㅣ',
 'h',
 'ㄸ',
 's',
 'j',
 'ㄴ',
 'ㅆ',
 'w',
 'ㅡ',
 'ㅒ']

In [170]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
len(vocab_dict)

83

In [171]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
vocab_dict["<unk>"] = len(vocab_dict)
vocab_dict["<pad>"] = len(vocab_dict)
# vocab_dict["c/"] = len(vocab_dict)
vocab_dict["<b>"] = len(vocab_dict)
vocab_dict["<n>"] = len(vocab_dict)
vocab_dict["<l>"] = len(vocab_dict)
# vocab_dict["*"] = len(vocab_dict)
len(vocab_dict)

88

In [18]:
# import json
# with open('vocab.json', 'w') as vocab_file:
#     json.dump(vocab_dict, vocab_file)

In [172]:
!pip install transformers



In [173]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("../Jaeyoon/vocab_jamos.json",
                                 unk_token="[UNK]",
                                 pad_token="[PAD]",
                                 word_delimiter_token="|")

## Create XLSR-Wav2Vec2 Feature Extractor

In [174]:
import torch

In [175]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=True)

In [176]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                              tokenizer=tokenizer)

In [177]:
# from google.colab import drive
# drive.mount('/content/gdrive/')

In [178]:
# processor.save_pretrained("/content/gdrive/MyDrive/wav2vec2-large-xlsr-ready")

## add audio array

In [179]:
import librosa
import os

In [180]:
# dir_list = os.listdir('/content/drive/MyDrive/Final/wavfile')
dir_list = os.listdir('../Jaeyoon/face/voice/voic/KsponSpeech_01')

dir_list.sort()

In [181]:
def load_audio(batch):
    for dir in dir_list:
        if os.path.isfile('../Jaeyoon/face/voice/voic/KsponSpeech_01/' + dir + '/'+batch['src']):

            batch['array'],_ = librosa.load('../Jaeyoon/face/voice/voic/KsponSpeech_01/' + dir + '/'+batch['src'],sr=16000)
      # print(dir)
#     batch['array'],_ = librosa.load(batch['src'], sr=16000)
    return batch

In [182]:
# wavpath = '/content/drive/MyDrive/Final/wavfile/'
# wavpath_dir = os.listdir(wavpath)
# wavpath_dir.sort()
# wavpath_dir

In [None]:
audio_data = remove_spectial_char_data.map(load_audio)
# all_data = all_data.map(load_audio)

  0%|          | 0/49000 [00:00<?, ?ex/s]

In [149]:
# audio_data_1 = []
# audio_data_2 = []
# audio_data_3 = []
# audio_data_4 = []
# audio_data_5 = []
# audio_data_6 = []
# audio_data_7 = []
# audio_data_8 = []
# audio_data_9 = []
# audio_data_10 = []

In [150]:
# audio_data_1 = audio_data[40000:41000]
# audio_data_2 = audio_data[41000:42000]
# audio_data_3 = audio_data[42000:43000]
# audio_data_4 = audio_data[43000:44000]
# audio_data_5 = audio_data[44000:45000]
# audio_data_6 = audio_data[35000:36000]
# audio_data_7 = audio_data[36000:37000]
# audio_data_8 = audio_data[37000:38000]
# audio_data_9 = audio_data[38000:39000]
# audio_data_10 = audio_data[39000:40000]

In [143]:
audio_data[0:5]

{'Unnamed: 0': [0, 1, 2, 3, 4],
 'src': ['KsponSpeech_001001.wav',
  'KsponSpeech_001002.wav',
  'KsponSpeech_001003.wav',
  'KsponSpeech_001004.wav',
  'KsponSpeech_001005.wav'],
 'text': ['n/ 아 그런가요 ',
  '그냥 별 열심히 하지 않은 거 아니야 이 열정이 없는 거지 연기라는 직업에 대해서 b/ ',
  '바꿀려면 그런 걸 바꿔야지 ',
  '요 요즘 영화 본 거 있니 ',
  '좋은 거라고 생각이 들었는데 b/ 졸업하고 나서 계속 b/ 그거에 매여서 갚아야 되는 모습들을 보면서 b/ '],
 'array': [[6.103515625e-05,
   6.103515625e-05,
   -0.000152587890625,
   -0.000274658203125,
   -6.103515625e-05,
   -0.0001220703125,
   0.0,
   0.00030517578125,
   0.0003662109375,
   0.000701904296875,
   0.0010986328125,
   0.00128173828125,
   0.001373291015625,
   0.0013427734375,
   0.0013427734375,
   0.00146484375,
   0.001556396484375,
   0.0015869140625,
   0.001556396484375,
   0.0013427734375,
   0.00103759765625,
   0.001007080078125,
   0.000823974609375,
   0.0006103515625,
   0.00048828125,
   0.00030517578125,
   0.000244140625,
   0.000213623046875,
   0.000396728515625,
   0.000762939453125,
   0.00109

In [75]:
# audio_total = []
# for i in range(0,49000,1000):
#     auaudio_data + '_' + str(i)+','+ str(i+1000) = audio_data[i,i+1000]
#     pass

AttributeError: 'str' object has no attribute 'append'

In [68]:
# from tqdm import tqdm

# l = len(audio_data)

# ds_list = []
# for i in tqdm(range(0,l,1000)):
#     ds_list.append(audio_data[i:i+1000])
# ds_list.append(audio_data[i+1000:])
# len(ds_list)

KeyboardInterrupt: 

In [151]:
# audio_data_1 = pd.DataFrame(audio_data_1)
# audio_data_2 = pd.DataFrame(audio_data_2)
# audio_data_3 = pd.DataFrame(audio_data_3)
# audio_data_4 = pd.DataFrame(audio_data_4)
# audio_data_5 = pd.DataFrame(audio_data_5)
# audio_data_6 = pd.DataFrame(audio_data_6)
# audio_data_7 = pd.DataFrame(audio_data_7)
# audio_data_8 = pd.DataFrame(audio_data_8)
# audio_data_9 = pd.DataFrame(audio_data_9)
# audio_data_10 = pd.DataFrame(audio_data_10)

In [152]:
# audio_data_2.to_csv('audio_array_2.csv',index = False)

In [153]:
# audio_data_1.to_csv('audio_array_41.csv',index = False)
# audio_data_2.to_csv('audio_array_42.csv',index = False)
# audio_data_3.to_csv('audio_array_43.csv',index = False)
# audio_data_4.to_csv('audio_array_44.csv',index = False)
# audio_data_5.to_csv('audio_array_45.csv',index = False)
# audio_data_6.to_csv('audio_array_36.csv',index = False)
# audio_data_7.to_csv('audio_array_37.csv',index = False)
# audio_data_8.to_csv('audio_array_38.csv',index = False)
# audio_data_9.to_csv('audio_array_39.csv',index = False)
# audio_data_10.to_csv('audio_array_40.csv',index = False)

In [58]:
# import csv
# audio_data.to_csv('audio_array_1.csv',index = False)

Creating CSV from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

10123358

In [None]:
# drive.mount('/content/drive')

In [None]:
audio_data = pd.read_csv("/content/audio_array.csv",index=False)

## Preprocess Data

In [None]:
def prepare_dataset(batch):

    # batched output is "un-batched"
    batch["input_values"] = processor(batch["array"], sampling_rate=16000).input_values[0]
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch

In [None]:
audio_data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,src,text,array
0,0,0,KsponSpeech_001001.wav,n/ 아 그런가요,[ 6.1035156e-05 6.1035156e-05 -1.5258789e-04 ...
1,1,1,KsponSpeech_001002.wav,그냥 별 열심히 하지 않은 거 아니야 이 열정이 없는 거지 연기라는 직업에 대해서 b/,[-0.00039673 -0.00030518 -0.00036621 ... -0.00...
2,2,2,KsponSpeech_001003.wav,바꿀려면 그런 걸 바꿔야지,[-1.2207031e-04 -1.5258789e-04 -1.8310547e-04 ...
3,3,3,KsponSpeech_001004.wav,요 요즘 영화 본 거 있니,[ 0.00021362 0.00018311 0.00012207 ... -0.00...
4,4,4,KsponSpeech_001005.wav,좋은 거라고 생각이 들었는데 b/ 졸업하고 나서 계속 b/ 그거에 매여서 갚아야 되...,[ 9.4604492e-04 7.0190430e-04 6.1035156e-04 ...
...,...,...,...,...,...
48995,8995,995,KsponSpeech_049996.wav,진짜 좋겠다 요즘 맛있는 거 못 먹은 지 너무 오래된 거 같애 l/ 왜 나 요즘 ...,[ 3.6621094e-04 3.9672852e-04 6.4086914e-04 ...
48996,8996,996,KsponSpeech_049997.wav,왜 뭐 달라,[ 0.0005188 0.00045776 0.00039673 ... -0.00...
48997,8997,997,KsponSpeech_049998.wav,맻 명 아는 체하니까 b/,[ 0.00021362 0.00042725 0.00048828 ... -0.00...
48998,8998,998,KsponSpeech_049999.wav,이제* 하늘 이렇게 보는데 어느 날 무지개가 이렇게* 떴는데 갑자기 무지개가 영어로...,[-5.7983398e-04 -7.6293945e-04 -7.6293945e-04 ...


In [None]:
len(audio_data.loc[0,'array'])

96

In [None]:
order_voice = audio_data.map(
    prepare_dataset,
    remove_columns=remove_spectial_char_data.column_names,
    # num_proc=4 #병렬작업
)

AttributeError: ignored

In [None]:
order_voice

NameError: ignored

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,filename,text
0,0,KsponSpeech_001001.wav,<n> 아 그런가요
1,1,KsponSpeech_001002.wav,그냥 별 열심히 하지 않은 거 아니야 이 열정이 없는 거지 연기라는 직업에 대해서 ...
2,2,KsponSpeech_001003.wav,바꿀려면 그런 걸 바꿔야지
3,3,KsponSpeech_001004.wav,요 요즘 영화 본 거 있니
4,4,KsponSpeech_001005.wav,좋은 거라고 생각이 들었는데 <b> 졸업하고 나서 계속 <b> 그거에 매여서 갚아야...


In [None]:
print(len(audio_data[0]['array']))

KeyError: ignored

In [None]:
df = pd.DataFrame(df,columns=['filename','text','array'])
df.head()

Unnamed: 0,filename,text,array
0,KsponSpeech_001001.wav,<n> 아 그런가요,
1,KsponSpeech_001002.wav,그냥 별 열심히 하지 않은 거 아니야 이 열정이 없는 거지 연기라는 직업에 대해서 ...,
2,KsponSpeech_001003.wav,바꿀려면 그런 걸 바꿔야지,
3,KsponSpeech_001004.wav,요 요즘 영화 본 거 있니,
4,KsponSpeech_001005.wav,좋은 거라고 생각이 들었는데 <b> 졸업하고 나서 계속 <b> 그거에 매여서 갚아야...,


In [None]:
df['array'] = audio_data['array']
df.head()

Unnamed: 0,filename,text,array
0,KsponSpeech_001001.wav,<n> 아 그런가요,[ 6.1035156e-05 6.1035156e-05 -1.5258789e-04 ...
1,KsponSpeech_001002.wav,그냥 별 열심히 하지 않은 거 아니야 이 열정이 없는 거지 연기라는 직업에 대해서 ...,[-0.00039673 -0.00030518 -0.00036621 ... -0.00...
2,KsponSpeech_001003.wav,바꿀려면 그런 걸 바꿔야지,[-1.2207031e-04 -1.5258789e-04 -1.8310547e-04 ...
3,KsponSpeech_001004.wav,요 요즘 영화 본 거 있니,[ 0.00021362 0.00018311 0.00012207 ... -0.00...
4,KsponSpeech_001005.wav,좋은 거라고 생각이 들었는데 <b> 졸업하고 나서 계속 <b> 그거에 매여서 갚아야...,[ 9.4604492e-04 7.0190430e-04 6.1035156e-04 ...


# Training

## Set-up Trainer

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.cuda.HalfTensor]]]) -> Dict[str, torch.cuda.HalfTensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [None]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
!pip install jiwer

In [None]:
from datasets import load_dataset, load_metric, Audio

wer_metric = load_metric("wer")

In [None]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

## Import Model

In [None]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53", 
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

XLSR-Wav2Vec2의 첫 번째 구성 요소는 원시 음성 신호에서 음향적으로 의미가 있지만 문맥적으로 독립적인 기능을 추출하는 데 사용되는 CNN 계층 스택으로 구성됩니다.  
모델의 이 부분은 사전 교육 중에 이미 충분히 훈련되었으며 논문에 명시된 바와 같이 더 이상 미세 조정할 필요가 없습니다. 따라서 특징 추출 부분의 모든 파라미터에 대해 require_grad를 False로 설정할 수 있다.

In [None]:
model.freeze_feature_extractor()

메모리를 절약하기 위해 그라데이션 체크포인팅을 활성화

In [None]:
model.gradient_checkpointing_enable()

## TrainingArguments

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  # output_dir="/content/gdrive/MyDrive/wav2vec2-large-xlsr-turkish-demo",
  output_dir="./wav2vec2-large-xlsr-ko-demo",
  group_by_length=True,
  per_device_train_batch_size=8,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=3,
  fp16=True,
  save_steps=100,
  eval_steps=100,
  logging_steps=10,
  learning_rate=3e-4,
  warmup_steps=500,
  save_total_limit=2,
)

In [None]:
from transformers import Trainer
from torch.utils.data import random_split

ds_size = len(order_voice)
train_size = int(ds_size*0.8)
val_size = ds_size - train_size
train_ds, val_ds = random_split(order_voice,[train_size,val_size])

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=processor.feature_extractor,
)

## Model Training

In [None]:
torch.cuda.is_available()

In [None]:
hist = trainer.train()

In [None]:
hist = trainer.train()

In [None]:
hist = trainer.train()

In [None]:
hist = trainer.train()

CTC 손실을 사용하여 더 큰 데이터 세트에서 더 큰 모델을 미세 조정하려면 [여기서](https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition#connectionist-temporal-classification-without-language-model-ctc-wo-lm) 공식 음성 인식 예를 살펴봐야 한다.

## Model predict

In [None]:
model(data_collator([order_voice[0]]))