In [2]:
import os

import torch
from datasets import load_dataset, DownloadMode
from transformers import AutoConfig, Wav2Vec2Processor, TrainingArguments, Trainer
from transformers import AutoModelForAudioClassification

# in-house functions
from common import utils, utils_fine_tune, crate_csv_bea_from_scp
from common.utils_fine_tune import Wav2Vec2ForSpeechClassification

In [3]:
# Experiment selection: task name, target name and size of the data sub-set.
task = 'bea-base-train-flat'
tempo_target = 'no_pause_speech'
size = 5000  # size of the sub-set of the data to use

# Project settings from YAML (the dataset-loading cell reads 'hf_cache_dir' from it).
config = utils.load_config('../config/config_sm.yml')

# Train / dev label CSVs share one naming scheme: <task>/<target>_<split>_<size>.csv
_path_fields = (task, tempo_target, size)
labels_train = '../data/{}/{}_train_{}.csv'.format(*_path_fields)
labels_dev = '../data/{}/{}_dev_{}.csv'.format(*_path_fields)

In [14]:
# Map split names to the label CSVs prepared for this experiment.
data_files = {'train': labels_train, 'validation': labels_dev}

# Load both CSVs through the generic 'csv' builder, reusing the cached copy if present.
bea16k_set = load_dataset(
    'csv',
    data_files=data_files,
    delimiter=',',
    cache_dir=config['hf_cache_dir'],
    download_mode=DownloadMode['REUSE_DATASET_IF_EXISTS'],
)
train_set, val_set = bea16k_set['train'], bea16k_set['validation']

# Sorted unique values of the 'speed' column define the label set.
label_list = sorted(train_set.unique('speed'))
num_labels = len(label_list)

Using custom data configuration default-947ce755c357d31a
Reusing dataset csv (/media/jvel/data/hf_cache/csv/default-947ce755c357d31a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
# Wrap the validation split in a torch Subset restricted to indices 0..999.
# NOTE(review): requires `torch` to be imported in an earlier cell.
subset = torch.utils.data.Subset(val_set, range(1000))

In [18]:
# `.dataset` is the object wrapped by the Subset (here: the full validation split), not the 1000 selected rows.
subset.dataset

Dataset({
    features: ['path', 'name', 'speed'],
    num_rows: 4344
})

In [5]:
# Checkpoint selection and pooling strategy for the classification head.
lang = 'english'
model_name_or_path = "jonatasgrosman/wav2vec2-large-xlsr-53-{}".format(lang)
pooling_mode = "mean"

# Label <-> id lookup tables built from the sorted label list.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

# NOTE(review): this rebinds `config`, which previously held the YAML project
# settings; cells that need config['hf_cache_dir'] must run before this one.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    finetuning_task="wav2vec2_clf",
)
config.pooling_mode = pooling_mode

# The processor's feature extractor dictates the sampling rate the audio must have.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

The target sampling rate: 16000


In [None]:
pp = utils.PreprocessFunction(processor, label_list, target_sampling_rate)

# Both splits go through the same batched, multi-process map.
map_options = {'batched': True, 'batch_size': 100, 'num_proc': 16}

print("Generating the datasets...\n")
train_dataset = train_set.map(pp.preprocess_function, **map_options)
print("Train dataset generated successfully...\n")

eval_dataset = val_set.map(pp.preprocess_function, **map_options)
print("Validation dataset generated successfully...\n")


In [7]:
# Sanity-check one preprocessed training example: model inputs, mask and label.
idx = 0
sample = train_dataset[idx]
print(f"Training input_values: {sample['input_values']}")
print(f"Training attention_mask: {sample['attention_mask']}")
print(f"Training labels: {sample['labels']} - {sample['speed']}")

Training input_values: [-0.2130257487297058, -0.32530438899993896, -0.30089595913887024, -0.2764875590801239, -0.3057776391506195, -0.3643577992916107, -0.3106593191623688, -0.3350677490234375, -0.3545944392681122, -0.31554102897644043, -0.3790028393268585, -0.33994942903518677, -0.37412115931510925, -0.36923947930336, -0.33994942903518677, -0.3790028393268585, -0.3204227089881897, -0.3106593191623688, -0.271605908870697, -0.2471974939107895, -0.2911325991153717, -0.3790028393268585, -0.38876619935035706, -0.4082929193973541, -0.48151808977127075, -0.3106593191623688, -0.26672422885894775, -0.33018606901168823, -0.3643577992916107, -0.43758299946784973, -0.3985295593738556, -0.41805627942085266, -0.40341123938560486, -0.4131745994091034, -0.41805627942085266, -0.3497127592563629, -0.4717547297477722, -0.4131745994091034, -0.4082929193973541, -0.5645066499710083, -0.3985295593738556, -0.32530438899993896, -0.35947611927986145, -0.3643577992916107, -0.33018606901168823, -0.38876619935035

In [8]:
# Collator handed to the Trainer; built from the processor with padding enabled.
data_collator = utils_fine_tune.DataCollatorCTCWithPadding(
    padding=True,
    processor=processor,
)

In [9]:
# Instantiate the classification model from the pre-trained checkpoint using
# the config prepared above, then freeze its feature extractor.
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path, config=config)
model.freeze_feature_extractor()

Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english were not used when initializing Wav2Vec2ForSpeechClassification: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this mode

In [10]:
# Sweep over training lengths; each setting is an independent run that is
# saved to its own output directory.
epochs_list = [1.0, 3.0, 5.0]
for num_train_epochs in epochs_list:
    out_dir = '../runs/{0}_{1}_{2}'.format(task, num_train_epochs, lang)

    # Start every run from the pre-trained checkpoint. Reusing a single `model`
    # object across iterations would make the 3- and 5-epoch runs continue from
    # the weights already fine-tuned by the previous run, so the runs would not
    # be comparable.
    model = Wav2Vec2ForSpeechClassification.from_pretrained(
        model_name_or_path,
        config=config,
    )
    model.freeze_feature_extractor()

    # NOTE(review): with evaluation_strategy="steps" and eval_steps=10 a full
    # pass over eval_dataset runs after every 10 optimizer steps (batch size 1);
    # consider a larger eval/save interval if this is too slow.
    training_args = TrainingArguments(
        output_dir=out_dir,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=1,
        evaluation_strategy="steps",
        num_train_epochs=num_train_epochs,
        fp16=False,
        save_steps=10,
        eval_steps=10,
        logging_steps=10,
        learning_rate=5e-3,
        save_total_limit=2,  # keep only the two most recent checkpoints on disk
    )

    trainer = Trainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=utils.compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=processor.feature_extractor,
    )

    trainer.train()
    # Persist the final weights of this run next to its checkpoints.
    trainer.save_model(out_dir)

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: name, path, speed. If name, path, speed are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5000
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 5000


Step,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: name, path, speed. If name, path, speed are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4344
  Batch size = 1
Saving model checkpoint to ../runs/bea-base-train-flat_1.0_english/checkpoint-10
Configuration saved in ../runs/bea-base-train-flat_1.0_english/checkpoint-10/config.json
Model weights saved in ../runs/bea-base-train-flat_1.0_english/checkpoint-10/pytorch_model.bin
Feature extractor saved in ../runs/bea-base-train-flat_1.0_english/checkpoint-10/preprocessor_config.json
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: name, path, speed. If name, path, speed are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely igno

KeyboardInterrupt: 

In [54]:
# Display the dataset summary (features and row count) before the preprocessing map below.
train_dataset

Dataset({
    features: ['name', 'no_pause_speech', 'length', 'label', 'audio', 'sampling_rate'],
    num_rows: 5000
})

In [55]:
# Run `preprocess_function` over both splits in batched mode, one example per batch.
# NOTE(review): `preprocess_function` and `dev_dataset` are not defined in any
# visible cell, and each split is overwritten by its mapped version.
train_dataset = train_dataset.map(
    preprocess_function,
    batch_size=1,
    batched=True,
)
dev_dataset = dev_dataset.map(
    preprocess_function,
    batch_size=1,
    batched=True,
)

100%|██████████| 5000/5000 [02:04<00:00, 40.05ba/s]
100%|██████████| 4344/4344 [01:20<00:00, 53.75ba/s]


In [57]:
# Number of outputs of the new head. Defined once so the config and the
# classifier layer cannot drift apart.
num_targets = 4  # 37+2/length wav ==> target; 31/len ==> target

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# repository to be executed; confirm it is really needed for this checkpoint.
# NOTE(review): `model_name` is not defined in any visible cell, and `config`
# must still be the YAML settings dict here (it is rebound to a model config
# elsewhere in this notebook).
model = AutoModelForAudioClassification.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=config['hf_cache_dir'],
)

# Drop any label maps inherited from the checkpoint before setting the new label count.
model.config.id2label = None
model.config.label2id = None
model.config.num_labels = num_targets

# Replace the classification head. The input width is taken from the loaded
# config (classifier_proj_size) instead of a hard-coded 256, so the head stays
# valid for checkpoints with a different projection size.
model.classifier = torch.nn.Linear(
    in_features=model.config.classifier_proj_size,
    out_features=num_targets,
    bias=True,
)

loading configuration file https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-hungarian/resolve/main/config.json from cache at /media/jvel/data/hf_cache/393f801e9cc35ac6b859a67ec4d2b4ca2d43522f9bfc9acf29aeec9be945fdd5.eab6c2c6881c3d1bae6bc094160f04e2cfa8224310d68b6a735447f973f9d9db
Model config Wav2Vec2Config {
  "_name_or_path": "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian",
  "activation_dropout": 0.05,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean"

In [65]:
# Freeze the feature extractor so it is not updated during fine-tuning.
model.freeze_feature_extractor()

num_train_epochs = 10
out_dir = '../runs/{0}_{1}_{2}'.format(task, num_train_epochs, tempo_target)
args = TrainingArguments(
    output_dir=out_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match evaluation_strategy for load_best_model_at_end
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=1,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="mse",
    # MSE is an error: lower is better. Without this flag the Trainer treats a
    # custom metric as "greater is better" and would keep the checkpoint with
    # the highest MSE as the "best" model.
    greater_is_better=False,
    push_to_hub=False,
    gradient_checkpointing=True,
    save_total_limit=3,
)

# NOTE(review): `feature_extractor` and `compute_metrics` are not defined in any
# visible cell; `compute_metrics` presumably returns an "mse" entry — confirm.
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [66]:
# Launch fine-tuning with the Trainer configured in the previous cell.
trainer.train()

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForSequenceClassification.forward` and have been ignored: sampling_rate, length, no_pause_speech, audio, name. If sampling_rate, length, no_pause_speech, audio, name are not expected by `Wav2Vec2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5000
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 12500


AttributeError: 'str' object has no attribute 'dtype'

In [40]:
# Debugging leftover: probes whether the first element of `a` has a `.dtype` attribute.
# NOTE(review): `a` is not defined in any visible cell — remove this cell or define `a` before it.
a[0].dtype

AttributeError: 'float' object has no attribute 'dtype'