In [1]:
%%capture

%pip install datasets==1.18.3
%pip install transformers==4.11.3
%pip install librosa
%pip install jiwer

In [2]:
# open the tsv file using pandas

import pandas as pd

df = pd.read_csv('metadata_lab_m2li.tsv', sep='\t', encoding='utf-8')

In [4]:
df.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,variant,locale,segment,clip,duration[ms],total_duration
0,898da828246ce49a58def510072044db74eaae191a47b0...,common_voice_fr_25026726.mp3,"Parmi eux, une petite fille, Rosina, qui le fr...",2,0,thirties,male,Français du Canada,,fr,,common_voice_fr_25026726.mp3,6084,6084
1,898da828246ce49a58def510072044db74eaae191a47b0...,common_voice_fr_25026727.mp3,"Vingt à trente mètres sous le sommet, un abri ...",2,1,thirties,male,Français du Canada,,fr,,common_voice_fr_25026727.mp3,6444,12528
2,898da828246ce49a58def510072044db74eaae191a47b0...,common_voice_fr_25026728.mp3,Le sac et la corde aux comédiens et au cardinal!,2,0,thirties,male,Français du Canada,,fr,,common_voice_fr_25026728.mp3,4644,17172
3,898da828246ce49a58def510072044db74eaae191a47b0...,common_voice_fr_25026730.mp3,Il parle couramment l'anglais et il a une bonn...,2,0,thirties,male,Français du Canada,,fr,,common_voice_fr_25026730.mp3,5940,23112
4,898da828246ce49a58def510072044db74eaae191a47b0...,common_voice_fr_25026731.mp3,Le bilan de compétence est avant tout celui du...,2,0,thirties,male,Français du Canada,,fr,,common_voice_fr_25026731.mp3,4572,27684


In [5]:
# list the audio files in the common_voice_audio folder (only the first 300)

import os

common_voice_audio = os.listdir('common_voice_audio')[:300]

print("audio files count:", len(common_voice_audio))

audio files count: 300


In [6]:
# only the rows with the path corresponding to the extracted files

df = df[df['path'].isin(common_voice_audio)]

df.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,variant,locale,segment,clip,duration[ms],total_duration
6,8b1e91be96d7650e5980595ff9b0ec78d37a0d1d536d45...,common_voice_fr_17315927.mp3,Je trouve à ces amendements un intérêt pédagog...,3,0,thirties,male,Français de France,,fr,,common_voice_fr_17315927.mp3,4224,39756
36,9539722ce060eab153f0adb1d5f9e54945322231f04212...,common_voice_fr_23981049.mp3,L'éducation est l'un des objectifs principaux ...,2,0,twenties,male,,,fr,,common_voice_fr_23981049.mp3,4224,168204
56,a7181395cba9f9970eff97afa54eb435d12eb8234999d5...,common_voice_fr_19608157.mp3,Le titre olympique revient au Sud-Coréen Jin J...,2,1,thirties,male,Français de France,,fr,,common_voice_fr_19608157.mp3,5376,219540
78,affd924b9637cac14b2357d1b192fe2da4d8edf6414a55...,common_voice_fr_18048560.mp3,Travailler jusqu’à deux heures me semblerait s...,2,0,twenties,male,Français de France,,fr,,common_voice_fr_18048560.mp3,5904,323808
79,affd924b9637cac14b2357d1b192fe2da4d8edf6414a55...,common_voice_fr_18048561.mp3,Des mises en réserve pour le sous-objectif soi...,2,0,twenties,male,Français de France,,fr,,common_voice_fr_18048561.mp3,6264,330072


In [11]:
# transcriptions

transcriptions = list(df['sentence'].values)

In [12]:
transcriptions[:5]

['Je trouve à ces amendements un intérêt pédagogique.',
 "L'éducation est l'un des objectifs principaux du zoo.",
 'Le titre olympique revient au Sud-Coréen Jin Jong-oh.',
 'Travailler jusqu’à deux heures me semblerait susceptible de faire consensus.',
 'Des mises en réserve pour le sous-objectif soins de ville peuvent être décidées ou non.']

In [14]:
# remove all of the characters that are not in the alphabet and not in the french alphabet
# lowercase all of the characters

import re

def remove_special_characters(transcription):
    transcription = transcription.lower()
    transcription = re.sub(r'[^a-zA-Zàâäçèéêëîïôœùûüÿ\'\’\s]', '', transcription)
    transcription = re.sub('-', ' ', transcription)
    return transcription

transcriptions = [remove_special_characters(transcription) for transcription in transcriptions]

In [15]:
transcriptions[:5]

['je trouve à ces amendements un intérêt pédagogique',
 "l'éducation est l'un des objectifs principaux du zoo",
 'le titre olympique revient au sudcoréen jin jongoh',
 'travailler jusqu’à deux heures me semblerait susceptible de faire consensus',
 'des mises en réserve pour le sousobjectif soins de ville peuvent être décidées ou non']

In [16]:
# update the dataframe

df['sentence'] = transcriptions

In [17]:
# vocab

vocab_list = list(set(' '.join(transcriptions)))
vocab_dict = {v: k for k, v in enumerate(vocab_list)}

vocab_dict

{'ï': 0,
 'ê': 1,
 'x': 2,
 'b': 3,
 'm': 4,
 'k': 5,
 'ç': 6,
 'f': 7,
 't': 8,
 "'": 9,
 'h': 10,
 'â': 11,
 'è': 12,
 'î': 13,
 'c': 14,
 'i': 15,
 'y': 16,
 'ô': 17,
 'v': 18,
 'w': 19,
 'é': 20,
 'à': 21,
 'o': 22,
 'a': 23,
 'd': 24,
 'r': 25,
 'l': 26,
 'n': 27,
 'j': 28,
 ' ': 29,
 'u': 30,
 's': 31,
 'z': 32,
 'œ': 33,
 '’': 34,
 'g': 35,
 'ü': 36,
 'e': 37,
 'ù': 38,
 'p': 39,
 'û': 40,
 'q': 41}

In [18]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [19]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

44

In [20]:
# save the vocab

import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [21]:
from transformers import Wav2Vec2CTCTokenizer

# load the tokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [22]:
from transformers import Wav2Vec2FeatureExtractor

# load the feature extractor
# the feature extractor is responsible for processing the audio files

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)

In [23]:
from transformers import Wav2Vec2Processor

# load the processor
# the processor is responsible for processing the transcriptions

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [24]:
# audio files paths

audio_paths = list(df['path'].apply(lambda x: f"./common_voice_audio/{x}").values)
transcriptions_paths = list(df['sentence'].values)

audio_paths[:5], transcriptions_paths[:5]


(['./common_voice_audio/common_voice_fr_17315927.mp3',
  './common_voice_audio/common_voice_fr_23981049.mp3',
  './common_voice_audio/common_voice_fr_19608157.mp3',
  './common_voice_audio/common_voice_fr_18048560.mp3',
  './common_voice_audio/common_voice_fr_18048561.mp3'],
 ['je trouve à ces amendements un intérêt pédagogique',
  "l'éducation est l'un des objectifs principaux du zoo",
  'le titre olympique revient au sudcoréen jin jongoh',
  'travailler jusqu’à deux heures me semblerait susceptible de faire consensus',
  'des mises en réserve pour le sousobjectif soins de ville peuvent être décidées ou non'])

In [None]:
# get the arrays of the audio files and the transcriptions using the processor

import torch

def map_to_array(batch):
    speech_array = []
    for path in batch:
        speech_array.append(processor(path, return_tensors="pt").input_values)
    return speech_array

def map_to_transcription(batch):
    return processor(batch, return_tensors="pt").input_values

speech_arrays = map_to_array(audio_paths)



In [25]:
# import torch, torchaudio

# # get the arrays of the audio files

# def speech_file_to_array_fn(batch):
#     speech_array, sampling_rate = torchaudio.load(batch)    # load the audio file
#     resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)   # resample the audio file
#     batch = resampler(speech_array).squeeze().numpy()   # squeeze the audio file
#     return batch
   
# speech_arrays = [speech_file_to_array_fn(path) for path in audio_paths]

In [27]:
# feature extraction
inputs = feature_extractor(speech_arrays, sampling_rate=16_000, return_tensors="pt", padding=True)

# tokenize the transcriptions to get the labels
target = processor.tokenizer(transcriptions_paths, return_tensors="pt", padding=True).input_ids

In [28]:
inputs["input_values"].shape, target.shape

(torch.Size([300, 163200]), torch.Size([300, 120]))

In [29]:
# create the dataset

from datasets import Dataset

common_voice_dataset = Dataset.from_dict({"input_values": inputs["input_values"], "labels": target})

# split the dataset into training and validation datasets

common_voice_dataset = common_voice_dataset.train_test_split(test_size=0.2)


In [30]:
common_voice_dataset

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 240
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 60
    })
})

In [31]:
common_voice_dataset["train"][0]

{'input_values': [0.0002115588285960257,
  0.0002115588285960257,
  0.0002115588285960257,
  0.0002115588285960257,
  0.0002115588285960257,
  0.0002115588285960257,
  0.00021155879949219525,
  0.0002115588722517714,
  0.0002115588285960257,
  0.0002115587703883648,
  0.0002115586248692125,
  0.00021155865397304296,
  0.0002115586685249582,
  0.00021155847935006022,
  0.00021155922149773687,
  0.00021155900321900845,
  0.00021155989088583738,
  0.00021155884314794093,
  0.00021155983267817646,
  0.00021155763533897698,
  0.00021155891590751708,
  0.00021155792637728155,
  0.0002115573443006724,
  0.00021155792637728155,
  0.00021155792637728155,
  0.0002115595416398719,
  0.0002115588285960257,
  0.0002115594397764653,
  0.00021155991998966783,
  0.0002115566749125719,
  0.00021155801368877292,
  0.00021155711147002876,
  0.0002115562674589455,
  0.00021156568254809827,
  0.00021155459398869425,
  0.00021156603179406375,
  0.00021155360445845872,
  0.0002115698589477688,
  0.0002115623

In [32]:
# evaluation metric

from datasets import load_metric

wer_metric = load_metric("wer")

Downloading:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

In [34]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = torch.argmax(pred_logits, dim=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [37]:
# train the model using the common_voice_dataset batch size of 8

from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xslr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

model.freeze_feature_extractor()

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./wav2vec2-large-960h-turkish-demo",
    group_by_length=True,
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    num_train_epochs=30,
    fp16=True,
    save_steps=400,
    eval_steps=400,
    logging_steps=400,
    learning_rate=3e-4,
    warmup_steps=500,
    save_total_limit=2,
    )

from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=processor.data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_dataset["train"],
    eval_dataset=common_voice_dataset["test"],
    tokenizer=processor.feature_extractor,
)

trainer.train()

OSError: facebook/wav2vec2-large-xslr-53 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [35]:
from transformers import Wav2Vec2ForCTC, TrainingArguments, Trainer

# 모델 설정
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(vocab_dict)
)

# 학습 설정
training_args = TrainingArguments(
    output_dir="./wav2vec2-finetuned",
    per_device_train_batch_size=32,
    evaluation_strategy="steps",
    num_train_epochs=10,
    save_steps=500,
    eval_steps=500,
    logging_steps=500,
    learning_rate=1e-4,
    weight_decay=0.005,
    warmup_steps=1000,
)

# 손실 함수 설정
ctc_loss = torch.nn.CTCLoss()

# 옵티마이저 설정
optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate)

# Trainer 객체 생성
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_dataset["train"],
    eval_dataset=common_voice_dataset["test"],
    tokenizer=processor.feature_extractor,
)

# 모델 학습
trainer.train()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/80 [00:00<?, ?it/s]

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG13GFamilyCommandBuffer: 0x314dddf00>
    label = <none> 
    device = <AGXG13GDevice: 0x13cdc2000>
        name = Apple M1 
    commandQueue = <AGXG13GFamilyCommandQueue: 0x16b6ff400>
        label = <none> 
        device = <AGXG13GDevice: 0x13cdc2000>
            name = Apple M1 
    retainedReferences = 1


RuntimeError: MPS backend out of memory (MPS allocated: 8.02 GB, other allocations: 2.38 MB, max allowed: 9.07 GB). Tried to allocate 1.99 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).