In [1]:
# 15/11/2024

# !pip install huggingface huggingface_hub
# from huggingface_hub import HfApi
# from transformers import Wav2Vec2Config, Wav2Vec2Model

!export HF_DATASETS_CACHE="/home2/havt/emotion_intensity/cache"
CACHE_DIR = "/home2/havt/emotion_intensity/cache"

In [55]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, Wav2Vec2ForAudioFrameClassification, Wav2Vec2PreTrainedModel, Wav2Vec2Model, Wav2Vec2Config
import torch
from torch import nn
# load model and tokenizer
# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", cache_dir=CACHE_DIR)
# model = Wav2Vec2ForAudioFrameClassification.from_pretrained("facebook/wav2vec2-base-960h", cache_dir=CACHE_DIR)

In [3]:
from typing import Optional, Union

_HIDDEN_STATES_START_POSITION = 2

class Wav2Vec2ForEmotionIntensityAndClassification(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        if hasattr(config, "add_adapter") and config.add_adapter:
            raise ValueError(
                "Audio frame classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)"
            )
        self.wav2vec2 = Wav2Vec2Model(config)
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.num_labels = config.num_labels

        self.init_weights()

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.wav2vec2.feature_extractor._freeze_parameters()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.wav2vec2.parameters():
            param.requires_grad = False

    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # last_hidden_state # 2, 127, 768
        # extract_features  # 2, 127, 512
        # hidden_states  # [13] x 2, 27, 768

        if self.config.use_weighted_layer_sum:
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        logits = self.classifier(hidden_states) # 2, 127, 5

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            # flatten=bs*127,n_emo; flatten=bs*1,n_emo
            loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))

        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return output

        return {
            "loss": loss,
            "logits": logits,
            "hidden_states": outputs.hidden_states,
            "attentions": outputs.attentions,
        }

In [63]:
processor = Wav2Vec2Processor.from_pretrained("/home2/havt/tmp/wav2vec2-base-960h")
config = Wav2Vec2Config.from_pretrained("/home2/havt/tmp/wav2vec2-base-960h")
config.num_labels=5
config.use_weighted_layer_sum=True
config.ignore_mismatched_sizes=True
model = Wav2Vec2ForEmotionIntensityAndClassification(config)
# model = Wav2Vec2ForEmotionIntensityAndClassification.from_pretrained("/home2/havt/tmp/wav2vec2-base-960h", num_labels=5, use_weighted_layer_sum=True, ignore_mismatched_sizes=True)
# model = Wav2Vec2ForAudioFrameClassification.from_pretrained("/home2/havt/tmp/wav2vec2-base-960h", num_labels=5, use_weighted_layer_sum=True, ignore_mismatched_sizes=True)
# model = Wav2Vec2FeatureExtractor.from_pretrained("/home2/havt/tmp/wav2vec2-base-960h")
# model = Wav2Vec2Model.from_pretrained("/home2/havt/tmp/wav2vec2-base-960h")

In [6]:
import os
from datasets import Dataset
import pandas as pd


# files = os.listdir("/home2/havt/emotion_intensity/esd_dataset_processed")
# df = pd.DataFrame(files)
# df.to_csv("/tmp/esd_preprocessed_files.csv", header=["fname"], index=None)

In [5]:
# data = []
# for base_dir, dirs, files in os.walk("/home2/havt/emotion_intensity/esd_dataset/Emotion Speech Dataset"):
#     for fname in files:
#         if fname.endswith('.wav'):
#             fpath = os.path.join(base_dir, fname)
#             f_base_name = fname.replace(".wav", "")
#             data.append({"wav_path": fpath, "idx": f_base_name})

In [6]:
# wav_df = pd.DataFrame(data)
# wav_df.head()

In [7]:
# mel_df = pd.read_csv("/tmp/esd_preprocessed_files.csv")
# mel_df['idx'] = mel_df.apply(lambda r: r['fname'].split(".")[0][4:15], axis=1)
# mel_df.head()

In [8]:
# df = pd.merge(mel_df, wav_df, on='idx', how='inner')
# df.to_csv("/tmp/esd_mel_wav_processed.csv", header=["fname", "idx", "wav_path"], index=None)

In [16]:
ds = Dataset.from_csv("/tmp/esd_mel_wav_processed.csv", cache_dir=CACHE_DIR)
ds = ds.select(range(0, 1000))

In [17]:
import pickle as pk


base_dir = "/home2/havt/emotion_intensity/esd_dataset_processed"
def read_mel(row):
    fname = row['fname']
    mel_file = os.path.join(base_dir, fname)
    with open(mel_file, "rb") as f:
        data = pk.load(f)
        row['mel'] = data
    return row

def extract_fname(row):
    fname = row['fname']
    fname = fname.split('.')[0]
    fname = fname.split('_')
    _, set_idx, sample_idx, emotion = fname
    row['set_idx'] = set_idx
    row['sample_idx'] = sample_idx
    row['emotion'] = emotion
    return row

emotion_map = {
    "neutral": 0,
    "happy": 1,
    "sad": 2,
    "angry": 3,
    "surprise": 4,
}

def emotion_to_id(row):
    emotion = row['emotion']
    emotion_id = emotion_map[emotion]
    row['emotion_id'] = emotion_id
    return row

# import librosa
from scipy.io import wavfile
def read_wav(row):
    audio_path = row['wav_path']
    # wav, sr = librosa.load(audio_path)
    _, wav = wavfile.read(audio_path)
    row['wav'] = wav
    return row


# ds = ds.map(read_mel)
ds = ds.map(extract_fname)
ds = ds.map(emotion_to_id)
ds = ds.map(read_wav)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map: 100%|██████████| 1000/1000 [00:00<00:00, 7169.84 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 7157.67 examples/s]
Map: 100%|██████████| 1000/1000 [00:01<00:00, 923.21 examples/s]


In [18]:
ds = ds.with_format('pt')

In [19]:
# print(ds.column_names)
ds = ds.remove_columns(['fname', 'set_idx', 'sample_idx', 'emotion', 'wav_path'])
ds[0]

{'idx': '0003_000898',
 'emotion_id': tensor(1),
 'wav': tensor([32, 30, 30,  ..., -4, -7, -7])}

In [20]:
# ds = ds.remove_columns(['mel'])
ds = ds.rename_column('wav', 'input_values')
ds = ds.rename_column('emotion_id', 'labels')

In [21]:
ds[0]

{'idx': '0003_000898',
 'labels': tensor(1),
 'input_values': tensor([32, 30, 30,  ..., -4, -7, -7])}

In [69]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union



def get_mask_from_lengths(lengths, max_len=None, device=None):
    batch_size = lengths.shape[0]
    if max_len is None:
        max_len = torch.max(lengths).item()

    if device == "cpu":
        ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1)
    elif device is not None:
        ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).cuda()
    elif device is None:
        ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1)
    mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)

    return mask


class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        # input_features = [{"input_values": feature["input_values"].type(torch.FloatTensor)} for feature in features]
        input_features = [{"input_values": self.processor(audio=feature["input_values"], return_tensors="pt", padding="longest", sampling_rate=16000).input_values[0]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        
        input_feature_lens = [len(v['input_values']) for v in input_features]
        attention_mask = get_mask_from_lengths(torch.Tensor(input_feature_lens), max_len=max(input_feature_lens))
        
        # print(input_features)
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels
        batch["attention_mask"] = attention_mask

        return batch
    

In [70]:
collator = DataCollatorCTCWithPadding()
collator.processor=processor
collator.padding=True
loader = torch.utils.data.DataLoader(
    ds,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    collate_fn=collator,
)
for batch in loader:
    print(batch)
    break

{'input_values': tensor([[ 6.9086e-04, -4.1661e-05, -1.0297e-02,  ...,  6.5510e-03,
          6.5510e-03,  7.2835e-03],
        [ 1.1239e-02,  1.0571e-02,  1.1239e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]]), 'labels': tensor([4, 1]), 'attention_mask': tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ...,  True,  True,  True]])}


In [71]:
# model.freeze_feature_extractor()
model = model.to('cpu')

In [72]:
# model.config.use_weighted_layer_sum = True

In [74]:
n_batchs = 10

for epoch in range(1, n_batchs+1):
    for batch in loader:
        # print(batch)
        # print(type(batch))
        # out = model(batch.input_values, sampling_rate=16000)
        out = model(**batch)
        # out = model(**batch)
        print(out)
        # print(torch.Tensor(out['input_values']))
        break

TypeError: stack(): argument 'tensors' (position 1) must be tuple of Tensors, not Tensor

In [96]:
model(batch.input_values)

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


{'input_values': [array([[ 2.9083561e-02,  2.5848836e-02,  2.2614107e-02, ...,
         3.2293976e-09,  3.2293976e-09,  3.2293976e-09],
       [-5.5489864e-02, -5.3176068e-02, -4.8548486e-02, ...,
         3.9375644e-02,  3.0120473e-02,  3.2434266e-02]], dtype=float32)]}

In [19]:
# !pip install accelerate -U

In [20]:
# from transformers import TrainingArguments

# training_args = TrainingArguments(
#   output_dir="./wav2vec_out",
#   group_by_length=True,
#   per_device_train_batch_size=32,
#   evaluation_strategy="steps",
#   num_train_epochs=30,
#   fp16=True,
#   gradient_checkpointing=True, 
#   save_steps=500,
#   eval_steps=500,
#   logging_steps=500,
#   learning_rate=1e-4,
#   weight_decay=0.005,
#   warmup_steps=1000,
#   save_total_limit=2,
# )
ds[0]

{'idx': '0003_000898',
 'labels': tensor(1),
 'input_values': tensor([32, 30, 30,  ..., -4, -7, -7])}

In [114]:
# input_values = processor(ds[0]['labels'], return_tensors="pt", padding="longest", sampling_rate=16000).input_values + 10
ds[0]['labels']

tensor(1)

In [83]:
# tokenize
input_values = processor(ds[0]["input_values"], return_tensors="pt", padding="longest", sampling_rate=16000).input_values + 10
input_values
# # input_values.shape
output = model(input_values)
# # # logits = output.logits
# # # output['input_values'][0].shape
# # output['logits'].shape
output

TokenClassifierOutput(loss=None, logits=tensor([[[nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, nan],
         [nan, 

...

In [22]:
# from datasets import load_dataset


# # load dummy dataset and read soundfiles
# ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation", cache_dir=CACHE_DIR)

# # # tokenize
# # input_values = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest").input_values  # Batch size 1

# # # retrieve logits
# # logits = model(input_values).logits

# # # take argmax and decode
# # predicted_ids = torch.argmax(logits, dim=-1)
# # transcription = processor.batch_decode(predicted_ids)

In [23]:
# from datasets import load_dataset, load_from_disk

# ds = load_from_disk("/home2/havt/datasets/patrickvonplaten=librispeech_asr_dummy")
# ds[0]

In [24]:
# tokenize
input_values = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest", sampling_rate=16000).input_values 
input_values.shape

KeyError: 'audio'

In [None]:
output = model(input_values)
# logits = output.logits
# output['input_values'][0].shape
output['logits'].shape

torch.Size([1, 232, 2])