In [1]:
import os
import torch
import librosa
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, TrainingArguments, Trainer
from dataCVC import DataCollatorCTCWithPadding
from sklearn.model_selection import train_test_split




In [2]:
# Load the metadata table. The code below relies on its "phrase" (transcript)
# and "file_name" (audio file name) columns.
df = pd.read_csv("./audio_dataset/MedicalSpeechIntent.csv")

# Normalise transcripts into the label alphabet: upper-case, keep only A-Z,
# apostrophe and space. Removing punctuation can leave runs of spaces
# ("PAIN - IN" -> "PAIN  IN") or leading/trailing spaces; those are collapsed
# and trimmed first so they do not become empty words ("||") or a dangling
# delimiter once spaces are swapped for "|".
# NOTE(review): "|" is presumably the tokenizer's word-delimiter token - confirm
# against the vocab of the checkpoint being fine-tuned.
df["text"] = (
    df["phrase"]
    .str.upper()
    .str.replace(r"[^A-Z' ]", "", regex=True)
    .str.replace(r" {2,}", " ", regex=True)
    .str.strip()
    .str.replace(" ", "|", regex=False)
)

# Resolve each file name to a path inside the audio directory.
audio_dir = "./audio_dataset/MedicalSpeechIntent"
df["audio"] = df["file_name"].apply(lambda x: os.path.join(audio_dir, x))

In [3]:
# Two-stage split with a fixed seed: hold out 20% of the rows, then cut that
# hold-out in half to obtain the validation and test frames.
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.2)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Persist every split next to the source CSV (row index is not written).
split_outputs = {
    "./audio_dataset/MedicalSpeechIntent_train.csv": train_df,
    "./audio_dataset/MedicalSpeechIntent_validation.csv": val_df,
    "./audio_dataset/MedicalSpeechIntent_test.csv": test_df,
}
for csv_path, split_df in split_outputs.items():
    split_df.to_csv(csv_path, index=False)

In [20]:
# Checkpoint directory to start from. It is the same directory this notebook
# later saves into, so it must already exist when this cell runs.
# Alternative starting point kept for reference: model_name = "./wav2vec2_local"
model_name = "./wav2vec2_finetuned"

# The processor and the CTC model are both restored from the same directory.
processor = Wav2Vec2Processor.from_pretrained(pretrained_model_name_or_path=model_name)
model = Wav2Vec2ForCTC.from_pretrained(pretrained_model_name_or_path=model_name)

In [21]:
def preprocess(batch):
    """Turn one dataset row into model inputs and CTC label ids.

    The notebook maps this function without ``batched=True``, so ``batch`` is a
    single example. It reads ``batch["audio"]`` (a path handed to
    ``librosa.load``) and ``batch["text"]`` (the transcript to tokenize), and
    relies on the module-level ``processor``.

    Adds ``input_values``, ``attention_mask`` (only when the processor returns
    one) and ``labels`` to ``batch`` and returns the same mapping.
    """
    # Decode the audio file, resampling to 16 kHz; the returned rate is unused.
    waveform, _ = librosa.load(batch["audio"], sr=16000)

    features = processor(
        waveform,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
        return_attention_mask=True,
    )

    # The processor output carries a leading batch axis; keep its only entry.
    batch["input_values"] = features.input_values[0]
    if "attention_mask" in features:
        batch["attention_mask"] = features.attention_mask[0]

    # Encode the transcript into token ids for the CTC loss.
    token_ids = processor.tokenizer(batch["text"]).input_ids
    batch["labels"] = torch.tensor(token_ids)
    return batch

In [22]:
# Wrap each pandas split in a datasets.Dataset (same order as the source frames).
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df, test_df)
)

# Bundle the splits under the keys the training cells look up.
dataset = DatasetDict(
    {"train": train_dataset, "validation": val_dataset, "test": test_dataset}
)

In [23]:
# Every original column is dropped after preprocessing, so only the fields
# written by `preprocess` remain in each split.
# NOTE: this rebinds `dataset`, so the cell is not safe to run twice in a row.
raw_columns = dataset["train"].column_names
dataset = dataset.map(preprocess, num_proc=1, remove_columns=raw_columns)

# Collator used by the Trainer to pad examples into batches.
data_collator = DataCollatorCTCWithPadding(padding=True, processor=processor)

Map:   0%|          | 0/5328 [00:00<?, ? examples/s]

Map:   0%|          | 0/666 [00:00<?, ? examples/s]

Map:   0%|          | 0/667 [00:00<?, ? examples/s]

In [24]:
# Mixed precision is enabled only when a CUDA device is available.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./wav2vec2_finetuned",
    # Optimisation schedule. Per the TrainingArguments documentation a positive
    # max_steps takes precedence over num_train_epochs, so this run is capped
    # at 100 optimiser steps.
    learning_rate=1e-4,
    warmup_steps=50,
    max_steps=100,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    fp16=use_fp16,
    # Bookkeeping cadence (in steps).
    logging_steps=25,
    save_steps=100,
)

# NOTE(review): no evaluation schedule is configured above, so the validation
# split is presumably only consumed by an explicit trainer.evaluate() call.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    processing_class=processor.feature_extractor,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)


In [None]:
# Start a fresh training run with the Trainer configured above; the returned
# value is left as the last expression so the notebook displays it.
# To continue from the most recent checkpoint in the output directory instead:
# trainer.train(resume_from_checkpoint=True)
trainer.train()



Step,Training Loss


In [10]:
# Write the model and the processor into the same directory so that a single
# path can be handed to from_pretrained for both.
save_dir = "./wav2vec2_finetuned"
trainer.save_model(save_dir)
processor.save_pretrained(save_dir)

[]