In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import TrainingArguments, Trainer
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from datasets import load_metric
from ast import literal_eval

In [None]:
# Colab only: mount Google Drive at /content/drive so the data and model
# directories configured in the next cell (under "drive/My Drive/...") exist.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Language identifiers; both are interpolated into the output directory names below.
language_code = 'ar'
language_name = 'arabic'
# NOTE(review): base_model is not referenced by any cell visible in this notebook;
# the model-loading cell hard-codes "facebook/wav2vec2-xls-r-300m" instead.
# Confirm which checkpoint is intended.
base_model = "facebook/wav2vec2-large-xlsr-53"

# Relative path: it resolves against the current working directory, so it only
# reaches the Drive mounted at /content/drive when the notebook runs from /content.
data_dir = "./drive/My Drive/Senior Project/Resources/data"
# NOTE(review): bucket_name is not used by any cell visible here.
bucket_name = "hearsome-sagemaker-datasets"

# NOTE(review): output_models_dir is not used by any cell visible here.
output_models_dir = f"./workspace/output_models/{language_code}/wav2vec2-large-xlsr-{language_name}-demo"
# Directory on Drive that the processor is loaded from and that training
# checkpoints and the final model are written to.
new_output_models_dir = f"./drive/My Drive/Senior Project/Resources/workspace/output_models/{language_code}/wav2vec2-large-xlsr-{language_name}"

In [None]:
# Load the pre-processed train/dev splits from bz2-compressed pickle files.
# NOTE(security): read_pickle unpickles the file, and unpickling can execute
# arbitrary code -- only load files that come from a trusted source.
train = pd.read_pickle(f"{data_dir}/train.pbz2", compression='bz2')
dev = pd.read_pickle(f"{data_dir}/dev.pbz2", compression='bz2')

In [None]:
# Preview the first rows; the columns used downstream are
# input_values, input_length and labels.
train.head()

Unnamed: 0,input_values,input_length,labels
0,"[-0.010540983, -0.006122073, -0.003470727, -0....",64128,"[38, 0, 12, 14, 18, 34, 40, 0, 11, 34, 24, 18,..."
1,"[-0.0019061395, -0.0007876515, -0.0007876515, ...",54096,"[14, 35, 11, 35, 0, 16, 19, 11, 0, 35, 29, 33,..."
2,"[0.0006602038, -0.00043284148, -0.00043284148,...",43056,"[11, 37, 0, 38, 11, 23, 14, 18, 19, 11, 35, 0,..."
3,"[0.0010812458, 0.0025648824, 0.0010812458, 0.0...",61760,"[38, 0, 11, 40, 26, 11, 0, 22, 40, 0, 35, 11, ..."
4,"[0.07599429, 0.06602047, 0.062280282, 0.054799...",55536,"[33, 34, 0, 40, 38, 35, 0, 17, 11, 16, 13, 0, ..."


In [None]:
# Load the previously saved processor from the Drive output directory. Later cells
# use its `tokenizer` (pad token id, vocabulary size) and its `feature_extractor`.
# NOTE(review): this requires the processor files to already exist in that directory.
processor = Wav2Vec2Processor.from_pretrained(new_output_models_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from torch.utils.data import Dataset, DataLoader

class MGB3Dataset(Dataset):
    """Map-style dataset over pre-processed speech examples held in a DataFrame.

    Every item is a dict with the keys "input_values", "input_length" and
    "labels", taken from the DataFrame columns of the same name.

    Args:
        df: DataFrame containing at least the columns "input_values",
            "input_length" and "labels". Any other column is dropped.

    Raises:
        KeyError: if one of the three required columns is missing (raised by
            the pandas column selection).
    """

    def __init__(self, df):
        # Zero-argument super() starts the MRO lookup right after MGB3Dataset.
        # The previous super(Dataset, self) named the parent class instead of
        # this class and therefore started the lookup *after* Dataset.
        super().__init__()
        # Convert the rows to a list of dicts once, so item access is a plain
        # list lookup.
        self.data = df[["input_values", "input_length", "labels"]].to_dict('records')

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the example dict stored at position `index`."""
        return self.data[index]

In [None]:
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the inputs and the labels of a batch.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding if
              only a single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    # Quoted so the annotation is kept as a string instead of being evaluated
    # when the class body runs.
    processor: "Wav2Vec2Processor"
    padding: Union[bool, str] = True

    def __init__(self, processor: "Wav2Vec2Processor", padding: Union[bool, str] = True):
        # `padding` now defaults to True, matching the class-level default and
        # the docstring; callers that pass it explicitly are unaffected.
        self.processor = processor
        self.padding = padding

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch.

        Args:
            features: one dict per example; the keys "input_values" and
                "labels" are read, any other key is ignored.

        Returns:
            The padded input batch produced by ``processor.pad`` with an extra
            "labels" entry in which padded positions are set to -100.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        # Inside this context the processor pads the label ids instead of the
        # audio inputs.
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [None]:
# Collator instance handed to the Trainer; padding=True pads each batch to its
# longest sequence (see the class docstring).
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
!pip install jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Word-error-rate metric; compute_metrics below calls wer_metric.compute(...).
# NOTE(review): newer releases of `datasets` deprecate load_metric in favour of
# the separate `evaluate` package -- confirm against the installed version.
wer_metric = load_metric("wer")

In [None]:
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (logits; the arg-max over the
            last axis gives the predicted token ids) and ``label_ids``
            (reference ids in which -100 marks positions to be ignored).

    Returns:
        dict: ``{"wer": value}`` where ``value`` is whatever
        ``wer_metric.compute`` returns for the decoded predictions and
        references.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 placeholders for the pad token id so the labels can be
    # decoded. np.where builds a new array, so the caller's label_ids are left
    # untouched (the previous version overwrote them in place).
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
# Configuration overrides applied on top of the pretrained checkpoint.
_ctc_overrides = {
    # every dropout variant and layerdrop switched off
    "attention_dropout": 0.0,
    "hidden_dropout": 0.0,
    "feat_proj_dropout": 0.0,
    "layerdrop": 0.0,
    "mask_time_prob": 0.05,
    "ctc_loss_reduction": "mean",
    # the CTC head is sized and padded according to the loaded processor's tokenizer
    "pad_token_id": processor.tokenizer.pad_token_id,
    "vocab_size": len(processor.tokenizer),
}

# NOTE(review): the checkpoint name is hard-coded here; the `base_model`
# variable from the configuration cell holds a different name
# ("facebook/wav2vec2-large-xlsr-53") and is not used by this call.
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xls-r-300m", **_ctc_overrides)

Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['project_q.weight', 'quantizer.weight_proj.bias', 'project_hid.weight', 'project_hid.bias', 'quantizer.codevectors', 'quantizer.weight_proj.weight', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it 

In [None]:
# NOTE(review): this cell repeats the previous model-loading cell verbatim.
# Running both loads the checkpoint a second time and rebinds `model` to a
# freshly created instance; consider removing one of the two cells.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m", 
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.0,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['project_q.weight', 'quantizer.weight_proj.bias', 'project_hid.weight', 'project_hid.bias', 'quantizer.codevectors', 'quantizer.weight_proj.weight', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it 

In [None]:
# Freeze the model's feature encoder before fine-tuning.
# NOTE(review): presumably this excludes the encoder's parameters from gradient
# updates -- confirm against the installed transformers version.
model.freeze_feature_encoder()

In [None]:
# Training configuration handed to the Trainer below.
training_args = TrainingArguments(
  # Checkpoints are written under this directory (a path on the mounted Drive).
  output_dir=new_output_models_dir,
  group_by_length=True,
  # 16 per device x 2 accumulation steps; the recorded run reports a total
  # train batch size of 32.
  per_device_train_batch_size=16,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=30,
  gradient_checkpointing=True,
  # The recorded run reports "Using cuda_amp half precision backend".
  fp16=True,
  # NOTE(review): checkpoints are saved every 600 steps but evaluation runs
  # every 400 steps, so a saved checkpoint only coincides with an evaluated
  # step at common multiples -- confirm this is intended.
  save_steps=600,
  eval_steps=400,
  logging_steps=400,
  learning_rate=3e-4,
  warmup_steps=500,
  save_total_limit=2,
)

In [None]:
# Wrap the train/dev DataFrames in MGB3Dataset so the Trainer can index them.
train_dataset = MGB3Dataset(train)
dev_dataset = MGB3Dataset(dev)

In [None]:
# Assemble the Trainer from the model, collator, arguments, metric function and
# the two datasets defined above.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    # The feature extractor is passed in the `tokenizer` slot; the recorded run
    # shows its preprocessor_config.json being saved alongside each checkpoint.
    tokenizer=processor.feature_extractor,
)

Using cuda_amp half precision backend


In [None]:
# Start fine-tuning.
# NOTE: the output recorded below ends with a KeyboardInterrupt -- the run was
# stopped manually (last logged step: 2400 of 5760 optimization steps).
trainer.train()

***** Running training *****
  Num examples = 6140
  Num Epochs = 30
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 5760
The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.


Step,Training Loss,Validation Loss,Wer
400,4.2378,1.281393,0.791274
800,0.9609,0.672955,0.453432
1200,0.5692,0.707748,0.500111
1600,0.3922,0.600045,0.320324
2000,0.2957,0.609008,0.291828
2400,0.2515,0.623151,0.317441


***** Running Evaluation *****
  Num examples = 1310
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
Saving model checkpoint to ./drive/My Drive/Senior Project/Resources/workspace/output_models/ar/wav2vec2-large-xlsr-arabic/checkpoint-600
Configuration saved in ./drive/My Drive/Senior Project/Resources/workspace/output_models/ar/wav2vec2-large-xlsr-arabic/checkpoint-600/config.json
Model weights saved in ./drive/My Drive/Senior Project/Resources/workspace/output_models/ar/wav2vec2-large-xlsr-arabic/checkpoint-600/pytorch_model.bin
Feature extractor saved in ./drive/My Drive/Senior Project/Resources/workspace/output_models/ar/wav2vec2-large-xlsr-arabic/checkpoint-600/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1310
  Batch size = 8
The following col

KeyboardInterrupt: ignored

In [None]:
# Save the trainer's current model to the Drive output directory. The recorded
# log lists config.json, pytorch_model.bin and preprocessor_config.json.
# NOTE(review): the preceding training run was interrupted, so this presumably
# stores the weights as they were at the interruption -- confirm before reuse.
trainer.save_model(new_output_models_dir)

Saving model checkpoint to ./drive/My Drive/Senior Project/Resources/workspace/output_models/ar/wav2vec2-large-xlsr-arabic
Configuration saved in ./drive/My Drive/Senior Project/Resources/workspace/output_models/ar/wav2vec2-large-xlsr-arabic/config.json
Model weights saved in ./drive/My Drive/Senior Project/Resources/workspace/output_models/ar/wav2vec2-large-xlsr-arabic/pytorch_model.bin
Feature extractor saved in ./drive/My Drive/Senior Project/Resources/workspace/output_models/ar/wav2vec2-large-xlsr-arabic/preprocessor_config.json
