<a href="https://colab.research.google.com/github/hoa92ng/Homework/blob/main/Making_the_Most_of_your_Colab_Subscription.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from torch import nn
from transformers import Wav2Vec2Model, AutoFeatureExtractor, get_scheduler
from datasets import Dataset, DatasetDict, Audio, concatenate_datasets
from model.pure_model_final import Wave_Network
from torch.utils.data import DataLoader
import evaluate
accuracy = evaluate.load("accuracy")
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from datasets import load_from_disk, load_dataset
import matplotlib.pyplot as plt
from torchmetrics import F1Score, Precision, Recall, Accuracy
from tqdm.auto import tqdm
from data.audio_augmentation import random_augementation
from data.custom_dataset import Custom_Audio_Dataset

# Keyword name -> integer class id (ids follow the order of the names below).
# NOTE(review): the evaluation cells in this file map 'silence' -> 10 and
# 'unknown' -> 11; confirm which order matches the dataset on disk.
dict_label = dict(zip(
    ('yes', 'no', 'up', 'down', 'left', 'right',
     'on', 'off', 'stop', 'go', 'unknown', 'silence'),
    range(12),
))

def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions in ``eval_pred``.

    Args:
        eval_pred: object exposing ``predictions`` (scores, arg-maxed over
            axis 1) and ``label_ids`` (reference class ids).

    Returns:
        Whatever the module-level ``accuracy`` metric's ``compute`` returns.
    """
    scores = eval_pred.predictions
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(references=eval_pred.label_ids, predictions=predicted_ids)

def preprocess_function(examples):
    """Run the global ``feature_extractor`` over a batch of decoded audio.

    Args:
        examples: batch mapping whose ``"audio"`` entry is a sequence of
            items with an ``'array'`` waveform.

    Returns:
        The feature extractor output; every waveform is padded or truncated
        to exactly 16,000 samples.
    """
    waveforms = [clip['array'] for clip in examples["audio"]]
    extractor_options = dict(
        sampling_rate=feature_extractor.sampling_rate,
        padding="max_length",
        max_length=16_000,
        truncation=True,
    )
    return feature_extractor(waveforms, **extractor_options)

def agumentation_function(examples, i):
    """Replace the example's waveform with an augmented version, in place.

    Args:
        examples: single example whose ``"audio"`` entry holds an ``'array'``.
        i: value forwarded unchanged to ``random_augementation``.

    Returns:
        The same ``examples`` object, with ``examples["audio"]['array']``
        replaced by the augmentation result.
    """
    augmented = random_augementation(examples["audio"]['array'], i)
    examples["audio"]['array'] = augmented
    return examples

def edit_label_2(seq):
    """Attach the binary ``'nomaly_label'`` flag to an example, in place.

    The flag is 0 when ``seq['label']`` equals 11 and 1 for every other label.

    Returns:
        The same ``seq`` mapping that was passed in.
    """
    seq['nomaly_label'] = 0 if seq['label'] == 11 else 1
    return seq

def collate_fn(batch):
    """Collate a list of samples into one zero-padded batch.

    Args:
        batch: non-empty list of mappings with ``'input_values'``, ``'label'``
            and an anomaly flag stored under ``'nomaly_label'`` (the key the
            training loop in this file reads) or the legacy spelling
            ``'anomaly_label'``.

    Returns:
        dict with ``'input_values'`` (zero-padded along the first dimension to
        the longest sample, then stacked), ``'label'``, and the anomaly flags
        under both ``'nomaly_label'`` and ``'anomaly_label'``.

    Raises:
        ValueError: if ``batch`` is empty.
        KeyError: if a sample carries neither anomaly-flag key.
    """
    if not batch:
        raise ValueError("collate_fn received an empty batch")

    def _anomaly_flag(sample):
        # Accept both spellings so either producer of samples works.
        if 'nomaly_label' in sample:
            return sample['nomaly_label']
        return sample['anomaly_label']

    # pad_sequence builds new tensors, so the caller's samples stay untouched.
    padded_inputs = pad_sequence(
        [torch.as_tensor(sample['input_values']) for sample in batch],
        batch_first=True,
    )
    # as_tensor lets plain Python ints be stacked as well as tensors.
    labels = torch.stack([torch.as_tensor(sample['label']) for sample in batch])
    flags = torch.stack([torch.as_tensor(_anomaly_flag(sample)) for sample in batch])

    return {
        'input_values': padded_inputs,
        'label': labels,
        'nomaly_label': flags,
        # Kept for callers that still read the original key.
        'anomaly_label': flags,
    }

def show_data_count(input_dataset):
    """Print how many rows each value of the ``'label'`` column has.

    Args:
        input_dataset: object exposing ``to_pandas()`` that returns a
            DataFrame with a ``'label'`` column.
    """
    # The dataset's label column is assumed to be named 'label'.
    frame = input_dataset.to_pandas()
    # Show the per-label counts.
    print(frame['label'].value_counts())


if __name__ == "__main__":
    # Training script: a frozen Wav2Vec2 backbone produces hidden states that
    # Wave_Network consumes to learn keyword classification and anomaly
    # scoring jointly.
    model_path = './model_1'
    weight_cls_state_dict = './w2vec/models/w2vec_model_all_in_one_cls_last.pth'
    epoch_num = 50
    batch_size = 32
    device = 'cuda'

    label2id = {'yes': '0', 'no': '1', 'up': '2', 'down': '3', 'left': '4', 'right': '5', 'on': '6', 'off': '7', 'stop': '8', 'go': '9', 'unknown': '10', 'silence': '11'}
    id2label = {'0':'yes', '1':'no', '2':'up', '3':'down', '4':'left', '5':'right', '6':'on', '7':'off', '8':'stop', '9':'go', '10':'unknown', '11':'silence'}

    feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)
    model_backbone = Wav2Vec2Model.from_pretrained(model_path).to(device=device)

    # The backbone is only used as a fixed feature extractor.
    for param in model_backbone.parameters():
        param.requires_grad = False

    train_data = Custom_Audio_Dataset(r'D:\1.Project\3.Machine_Learning\Voice\anomaly\data\dataset\superbs\train_updated_balance', feature_extractor=feature_extractor)
    valid_data = Custom_Audio_Dataset(r'D:\1.Project\3.Machine_Learning\Voice\anomaly\data\dataset\superbs\test', feature_extractor=feature_extractor)
    train_data.show_data_count()
    valid_data.show_data_count()

    train_dataloader = DataLoader(train_data, batch_size, shuffle=True)
    test_dataloader = DataLoader(valid_data, batch_size)

    # One label id is left out of the classifier head: samples flagged as
    # anomalies (nomaly_label == 0) never reach the classification loss.
    num_class = len(label2id) - 1
    model_cls = Wave_Network(num_classes=num_class).to(device=device)
    # model_cls.load_state_dict(torch.load(weight_cls_state_dict))

    criterion_1 = nn.CrossEntropyLoss()
    criterion_2 = nn.BCEWithLogitsLoss()
    optim_cls = torch.optim.AdamW(model_cls.parameters(), lr=0.001)
    acc_score_classification = Accuracy('multiclass', num_classes=num_class).to(device='cuda')
    acc_score_anomaly = Accuracy('binary').to(device='cuda')

    num_training_steps = epoch_num * len(train_dataloader)
    lr_scheduler_cls = get_scheduler(
        name="linear", optimizer=optim_cls, num_warmup_steps=0, num_training_steps=num_training_steps
    )
    progress_bar = tqdm(range(num_training_steps), position=0, leave=True)

    # Best validation loss seen so far. It must live outside the epoch loop:
    # resetting it every epoch would make every epoch look like a new best.
    save_loss = float('inf')

    for epoch in range(epoch_num):
        valid_loss = 0.
        loss_item_1 = 0.
        loss_item_2 = 0.
        model_cls.train()
        acc_score_classification.reset()
        acc_score_anomaly.reset()
        for i, data in enumerate(train_dataloader):
            optim_cls.zero_grad()
            with torch.no_grad():
                data['input_values'] = model_backbone(data['input_values'].cuda()).last_hidden_state
            inputs, labels = data['input_values'].cuda(), data['nomaly_label'].cuda()
            normal_inputs = data['input_values'][data['nomaly_label']==1].cuda()
            normal_labels = data['label'][data['nomaly_label']==1].cuda()

            # A batch without any normal sample has nothing to classify.
            if len(normal_inputs) == 0: continue

            # In training mode the model returns anomaly scores for the batch
            # followed by scores for noise-perturbed copies of the normal
            # samples; the copies are labelled 0 (fake).
            original_project_label = torch.unsqueeze(data['nomaly_label'], dim=-1)
            false_project_label = torch.zeros(size=(normal_inputs.shape[0], 1))
            projection_labels = torch.concat((original_project_label, false_project_label)).cuda()

            o_classification, o_nomaly = model_cls(inputs, labels)
            loss_1 = criterion_1(o_classification, normal_labels)
            acc_score_classification(o_classification.detach(), normal_labels.detach())

            p_origin_score = o_nomaly[:len(original_project_label)]
            p_fake_score = o_nomaly[len(original_project_label):]
            p_true_score = p_origin_score[labels==1]
            p_fake_score = torch.cat([p_fake_score, p_origin_score[labels==0]])

            true_labels = torch.ones(size=(p_true_score.shape[0], 1)).cuda()
            fake_labels = torch.zeros(size=(p_fake_score.shape[0], 1)).cuda()
            # The true-sample term is weighted x2 against the fake-sample term.
            loss_2 = criterion_2(p_true_score, true_labels) * 2 + criterion_2(p_fake_score, fake_labels)
            acc_score_anomaly(o_nomaly.detach(), projection_labels.detach())

            loss_item_1 += loss_1.item()
            loss_item_2 += loss_2.item()

            loss = 5 * loss_1 + loss_2
            loss.backward()
            optim_cls.step()
            lr_scheduler_cls.step()

            progress_bar.update(1)
            if i % 500 == 499:  # Report the running averages every 500 batches
                tqdm.write(f'[Epoch {epoch + 1}, Batch {i + 1}] loss_cls: {loss_item_1/500:.3f} | loss_anomaly: {loss_item_2/500:.3f} | acc_cls_score: {acc_score_classification.compute():.3f} | acc_anomaly_score: {acc_score_anomaly.compute():.3f}')
                loss_item_1 = 0.0
                loss_item_2 = 0.0

        model_cls.eval()
        acc_score_classification.reset()
        acc_score_anomaly.reset()
        with torch.no_grad():
            for i, data in enumerate(test_dataloader):
                data['input_values'] = model_backbone(data['input_values'].cuda()).last_hidden_state
                # data['input_values'] = model_backbone(data['input_values'].cuda()).extract_features
                inputs, labels = data['input_values'].cuda(), data['nomaly_label'].cuda()
                normal_inputs = data['input_values'][data['nomaly_label']==1].cuda()
                normal_labels = data['label'][data['nomaly_label']==1].cuda()
                if len(normal_inputs) == 0: continue
                original_project_label = torch.unsqueeze(data['nomaly_label'], dim=-1).cuda().float()
                o_classification, o_nomaly = model_cls(inputs, labels, is_train=False)
                loss_1 = criterion_1(o_classification, normal_labels)
                loss_2 = criterion_2(o_nomaly, original_project_label)
                loss = loss_1 + loss_2
                valid_loss += loss.item()
                acc_score_classification(o_classification.detach(), normal_labels.detach())
                acc_score_anomaly(o_nomaly.detach(), original_project_label.detach())
            valid_loss = valid_loss/len(test_dataloader)
            # Keep the checkpoint with the lowest validation loss across all epochs.
            if valid_loss < save_loss:
                torch.save(model_cls.state_dict(), './w2vec/models/w2vec_model_all_in_one_cls_best.pth')
                save_loss = valid_loss
            torch.save(model_cls.state_dict(), './w2vec/models/w2vec_model_all_in_one_cls_last.pth')
            tqdm.write(f"Epoch [{epoch+1}/{epoch_num}], Valid Loss: {valid_loss:.3f} | acc_cls_score: {acc_score_classification.compute():.3f} | acc_anomaly_score: {acc_score_anomaly.compute():.3f}")

    torch.save(model_cls.state_dict(), './w2vec/models/w2vec_model_all_in_one_cls_final.pth')


In [None]:
from torch import nn
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from transformers import AutoFeatureExtractor, Wav2Vec2Model
from datasets import Dataset, DatasetDict, Audio
from torch.utils.data import DataLoader
import evaluate
accuracy = evaluate.load("accuracy")
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from datasets import load_from_disk, Audio
import matplotlib.pyplot as plt
from model.pure_model_final import Wave_Network

# Keyword name -> integer class id; the id is the position in the list below.
dict_label = {
    keyword: class_id
    for class_id, keyword in enumerate([
        'yes', 'no', 'up', 'down', 'left', 'right',
        'on', 'off', 'stop', 'go', 'silence', 'unknown',
    ])
}

def compute_metrics(eval_pred):
    """Score arg-max predictions against the reference labels.

    ``eval_pred.predictions`` is arg-maxed over axis 1 and compared with
    ``eval_pred.label_ids`` by the module-level ``accuracy`` metric.
    """
    best_class = np.argmax(eval_pred.predictions, axis=1)
    references = eval_pred.label_ids
    return accuracy.compute(predictions=best_class, references=references)

def preprocess_function(examples):
    """Extract model inputs from a batch of decoded audio.

    Collects the ``'array'`` waveform of every item in ``examples["audio"]``
    and passes them to the global ``feature_extractor`` with ``padding=True``.

    Returns:
        The feature extractor output for the batch.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip['array'])
    rate = feature_extractor.sampling_rate
    return feature_extractor(waveforms, sampling_rate=rate, padding=True)

def collate_fn(batch):
    """Collate samples into one batch, zero-padding ``input_values``.

    Args:
        batch: non-empty list of mappings with ``'input_values'``,
            ``'label'`` and ``'nomaly_label'`` (tensors or plain numbers).

    Returns:
        dict with ``'input_values'`` (padded along the first dimension to the
        longest sample, then stacked), ``'label'`` and ``'nomaly_label'``.

    Raises:
        ValueError: if ``batch`` is empty.
    """
    if not batch:
        raise ValueError("collate_fn received an empty batch")

    # pad_sequence allocates a fresh tensor, so the samples passed in by the
    # caller are not modified.
    sequences = [torch.as_tensor(sample['input_values']) for sample in batch]
    return {
        'input_values': pad_sequence(sequences, batch_first=True),
        # as_tensor lets plain Python ints be stacked as well as tensors.
        'label': torch.stack([torch.as_tensor(sample['label']) for sample in batch]),
        'nomaly_label': torch.stack([torch.as_tensor(sample['nomaly_label']) for sample in batch]),
    }

def edit_label_2(seq):
    """Add ``'nomaly_label'`` to ``seq``: 0 for label 11, 1 for any other label.

    The mapping is modified in place and also returned.
    """
    is_anomaly = seq['label'] == 11
    seq['nomaly_label'] = 0 if is_anomaly else 1
    return seq

if __name__ == "__main__":
    # Evaluation script: scores the saved Wave_Network checkpoint one sample at
    # a time and reports a combined anomaly + classification accuracy.
    model_path = './model_1'
    device = 'cuda'
    weight_state_dict_cls = './w2vec/models/w2vec_model_all_in_one_cls_best.pth'
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)
    model_backbone = Wav2Vec2Model.from_pretrained(model_path).to(device=device)

    model_cls = Wave_Network(num_classes=11).to(device=device)
    # The loop below reads scalars with .item(), so it relies on batch_size == 1.
    batch_size = 1

    label2id = {'yes': '0', 'no': '1', 'up': '2', 'down': '3', 'left': '4', 'right': '5', 'on': '6', 'off': '7', 'stop': '8', 'go': '9', 'silence': '10', 'unknown': '11'}
    id2label = {'0':'yes', '1':'no', '2':'up', '3':'down', '4':'left', '5':'right', '6':'on', '7':'off', '8':'stop', '9':'go', '10':'silence', '11':'unknown'}

    # dataset = load_from_disk(r'D:\1.Project\3.Machine_Learning\Voice\anomaly\data\dataset\superbs')
    # test_dataset = dataset['test']
    test_dataset = load_from_disk(r'D:\1.Project\3.Machine_Learning\Voice\anomaly\data\v0.01\all\test')

    test_dataset = test_dataset.map(preprocess_function, remove_columns='audio', batched=True)
    # The loop below reads 'nomaly_label'; derive it from 'label' when the
    # dataset stored on disk does not already carry that column.
    if 'nomaly_label' not in test_dataset.column_names:
        test_dataset = test_dataset.map(edit_label_2)
    test_dataset = test_dataset.remove_columns(['file', 'is_unknown', 'speaker_id', 'utterance_id'])
    print(test_dataset)
    test_dataset_loader = DataLoader(test_dataset.with_format('torch'), batch_size, shuffle=False)
    model_cls.load_state_dict(torch.load(weight_state_dict_cls))

    model_cls.eval()

    acc = 0
    with torch.no_grad():
        for i, data in enumerate(test_dataset_loader):
            input_values = model_backbone(data['input_values'].cuda()).last_hidden_state
            o_classification, o_nomaly = model_cls(input_values, is_train=False)
            anomaly_arg = torch.sigmoid(o_nomaly)
            cls_arg = torch.argmax(o_classification, dim=-1)

            # Pull each scalar off the device once instead of once per comparison.
            anomaly_score = anomaly_arg.item()
            predicted_class = cls_arg.item()
            true_label = data['label'].item()
            anomaly_target = data['nomaly_label'].item()

            if anomaly_score > 0.5 and anomaly_target == 1:
                # Judged normal and really normal: the keyword must also match.
                correct = true_label != 11 and predicted_class == true_label
            elif anomaly_score <= 0.5 and anomaly_target == 0:
                # Judged anomalous and really anomalous.
                correct = True
            else:
                correct = False

            if correct:
                acc += 1
            else:
                print(i, anomaly_arg, data['nomaly_label'], cls_arg, data['label'])
        print(f'acc: {acc/len(test_dataset)}')


In [None]:
from torch import nn
from transformers import Wav2Vec2Model, AutoModelForAudioClassification
from transformers import AutoFeatureExtractor
from datasets import Dataset, DatasetDict
import torch
import torch.nn.functional as F
from model.attention_cbam import CBAM_1D


class Projection(torch.nn.Module):
    """Stack of 1-D convolution stages followed by a fixed-width pooling.

    Every stage doubles the channel count with a kernel-3, same-padded
    ``Conv1d`` and then applies ``BatchNorm1d``, ``ReLU``, ``CBAM_1D`` and a
    ``MaxPool1d(2, 2)``. ``forward`` flattens the result per sample and
    adaptively average-pools it to ``out_planes`` values.

    Args:
        in_planes: number of input channels of the first convolution.
        out_planes: length of the vector returned for each sample.
        n_layers: number of stages.
    """

    def __init__(self, in_planes=49, out_planes=1024, n_layers=3):
        super().__init__()
        self.out_planes = out_planes
        self.projector = torch.nn.Sequential()
        for stage in range(n_layers):
            # Stage k maps in_planes * 2**k channels to in_planes * 2**(k + 1).
            channels_in = in_planes * 2 ** stage
            channels_out = in_planes * 2 ** (stage + 1)
            stage_layers = (
                ("cv1d", torch.nn.Conv1d(channels_in, channels_out, 3, padding='same')),
                ("bn", torch.nn.BatchNorm1d(channels_out)),
                ("relu", torch.nn.ReLU()),
                ("cbam", CBAM_1D(channels_out, 2)),
                ("max_pool", torch.nn.MaxPool1d(2, 2)),
            )
            for suffix, layer in stage_layers:
                self.projector.add_module(f"{stage}{suffix}", layer)

    def forward(self, x):
        """Project ``x`` and return ``out_planes`` pooled values per sample."""
        features = self.projector(x)
        flattened = features.view(features.shape[0], -1)
        return F.adaptive_avg_pool1d(flattened, self.out_planes)

class Discriminator(torch.nn.Module):
    """MLP that maps a feature vector to a single logit.

    The body holds ``n_layers - 1`` blocks named ``block1``, ``block2``, ...,
    each ``Dropout(0.2) -> Linear -> BatchNorm1d -> ReLU``; the tail is a
    ``Linear`` layer with one output.

    Args:
        in_planes: size of the input feature vector.
        n_layers: total layer count, including the tail.
        hidden: fixed width for every block; when ``None`` the width starts at
            ``in_planes`` and is divided by 1.5 (floored) at each block.
    """

    def __init__(self, in_planes, n_layers=3, hidden=None):
        super().__init__()

        width = hidden if hidden is not None else in_planes
        self.body = torch.nn.Sequential()
        for idx in range(n_layers - 1):
            fan_in = width if idx else in_planes
            if hidden is None:
                width = int(width // 1.5)
            else:
                width = hidden
            block = torch.nn.Sequential(
                nn.Dropout(0.2),
                torch.nn.Linear(fan_in, width),
                torch.nn.BatchNorm1d(width),
                torch.nn.ReLU(),
            )
            self.body.add_module('block%d' % (idx + 1), block)
        self.tail = torch.nn.Linear(width, 1)

    def forward(self, x):
        """Return one logit per row of ``x``."""
        return self.tail(self.body(x))


class Wave_Network(nn.Module):
    """Joint keyword classifier and anomaly discriminator on projected features.

    A shared ``Projection`` feeds two heads: a classification head
    (``linear1`` + ``classifier``) that only sees samples flagged as normal,
    and a ``Discriminator`` that scores every sample after ``sub_projector``.

    Args:
        num_classes: number of outputs of the classification head.
        std: standard deviation of the Gaussian noise used to synthesise
            "fake" samples in training mode.
        use_cbam: stored on the instance; not read anywhere in this class.
    """

    def __init__(self, num_classes=12, std=0.04, use_cbam=True):
        super().__init__()
        self.std = std
        self.use_cbam = use_cbam
        self.projector = Projection(49, 768, 3)
        self.sub_projector = nn.Sequential(
            nn.Conv1d(1, 1, 3, padding='same'),
            nn.Conv1d(1, 1, 3, padding='same'),
            nn.Conv1d(1, 1, 3, padding='same'),
        )
        self.discriminator = Discriminator(768, 2)

        self.linear1 = nn.Sequential(nn.LazyLinear(out_features=768, bias=True),
                                     nn.BatchNorm1d(768),
                                     nn.Dropout(0.2),
                                     nn.ReLU())
        self.classifier = nn.LazyLinear(out_features=num_classes, bias=True)

    def forward(self, x, nomaly_label=None, is_train=True):
        """Return ``(class_logits, anomaly_logits)`` for a batch.

        Args:
            x: input passed to the projector.
            nomaly_label: optional per-sample flags; rows where the flag is 1
                are treated as normal. When omitted, every row is.
            is_train: when True, noise-perturbed copies of the normal rows are
                appended after the original rows before discrimination.

        Returns:
            ``class_logits`` has one row per normal sample. ``anomaly_logits``
            has one row per input sample, followed in training mode by one row
            per noisy copy.
        """
        input_x_projector = self.projector(x)

        if nomaly_label is not None:
            input_x_normal = input_x_projector[nomaly_label==1]
        else:
            input_x_normal = input_x_projector

        x_hidden_cls = self.linear1(input_x_normal)
        x_classification = self.classifier(x_hidden_cls)

        # The 1-channel convolutions need an explicit channel axis.
        input_x_sub_projector = torch.unsqueeze(input_x_projector, 1)
        input_x_sub_projector = self.sub_projector(input_x_sub_projector)
        input_x_sub_projector = torch.squeeze(input_x_sub_projector, dim=1)

        if nomaly_label is not None:
            input_x_sub_normal = input_x_sub_projector[nomaly_label==1]
        else:
            input_x_sub_normal = input_x_sub_projector
        if is_train:
            # Sample the noise on the features' own device and dtype so the
            # model also runs on CPU or on a non-default GPU.
            noise = torch.normal(
                mean=0,
                std=self.std,
                size=input_x_sub_normal.shape,
                device=input_x_sub_normal.device,
                dtype=input_x_sub_normal.dtype,
            )
            x_noise = input_x_sub_normal + noise
            x_noise = torch.concat((input_x_sub_projector, x_noise))
            x_anomaly = self.discriminator(x_noise)
        else:
            x_anomaly = self.discriminator(input_x_sub_projector)
        return x_classification, x_anomaly


# from torchsummary import summary
# model_cls = Wave_Network_Classification_Update(num_classes=11)
# summary(model_cls)


In [None]:
from datasets import Dataset, load_from_disk, Audio, VerificationMode, load_dataset
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, AddShortNoises, RoomSimulator, Gain
import random
import numpy as np
import torch

class Custom_Audio_Dataset(torch.utils.data.Dataset):
    """Map-style dataset of fixed-length audio inputs with labels.

    Loads a Hugging Face dataset from disk, runs the feature extractor over
    its ``audio`` column (padding/truncating every clip to 16,000 samples) and
    adds a binary ``nomaly_label`` column (0 when ``label == 11``, else 1).
    Items can optionally be augmented with one randomly chosen transform.

    The base class is the PyTorch map-style ``Dataset`` rather than the
    Hugging Face ``Dataset``: the latter is never initialised here, and its
    inherited batched accessors would bypass the per-sample ``__getitem__``.
    """

    def __init__(self, dataset_path, feature_extractor, use_transform=False, sr=16_000):
        """
        Arguments:
            dataset_path: directory passed to ``load_from_disk``.
            feature_extractor: callable extractor exposing ``sampling_rate``.
            use_transform: when True, ``__getitem__`` may augment the sample.
            sr: sample rate handed to the augmentation transforms.
        """
        self.feature_extractor = feature_extractor
        self.sample_rate = sr
        self.dataset = load_from_disk(dataset_path)
        self.dataset = self.dataset.map(self._preprocess_function, remove_columns='audio', batched=True)
        self.dataset = self.dataset.map(self._edit_label_2)
        self.transforms = [
            AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
            TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
            PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
            Shift(p=0.5),
            # RoomSimulator(),
            Gain(min_gain_db=0.5, max_gain_db=1.5),
        ]
        self.use_transform = use_transform

    def _preprocess_function(self, examples):
        """Convert a batch of decoded audio into feature-extractor outputs."""
        audio_arrays = [x['array'] for x in examples["audio"]]
        inputs = self.feature_extractor(
            audio_arrays, sampling_rate=self.feature_extractor.sampling_rate, padding="max_length", max_length=16_000, truncation=True,
        )
        return inputs

    def _edit_label_2(self, seq):
        """Add ``nomaly_label`` to one example: 0 for label 11, otherwise 1."""
        seq['nomaly_label'] = 0 if seq['label'] == 11 else 1
        return seq

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``label``, ``nomaly_label`` and ``input_values`` for one row."""
        row = self.dataset[idx]
        values = row['input_values']
        if self.use_transform:
            # randint is inclusive on both ends: the extra index
            # len(self.transforms) means "leave this sample unaugmented".
            choice = random.randint(0, len(self.transforms))
            values = np.array(values, dtype=np.float32)
            if choice != len(self.transforms):
                values = self.transforms[choice](values, sample_rate=self.sample_rate)
        return {
            'label': row['label'],
            'nomaly_label': row['nomaly_label'],
            'input_values': torch.Tensor(values),
        }

    def show_data_count(self):
        """Print how many rows each value of the ``label`` column has."""
        df = self.dataset.to_pandas()
        # The label column of the dataset is assumed to be named 'label'.
        label_counts = df['label'].value_counts()
        # Show the result.
        print(label_counts)

In [None]:
from torch import nn
from transformers import get_scheduler, WhisperModel, WhisperFeatureExtractor
from model.pure_model_whisper import Wave_Network_Classification_Update, Wave_Network_Anomaly_Detection
from torch.utils.data import DataLoader
import evaluate
accuracy = evaluate.load("accuracy")
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from datasets import load_from_disk, load_dataset
import matplotlib.pyplot as plt
from torchmetrics import F1Score, Precision, Recall, Accuracy
from tqdm.auto import tqdm
from data.audio_augmentation import random_augementation
from data.custom_dataset_whisper import Custom_Audio_Dataset

# Keyword name -> integer class id.
# NOTE(review): the evaluation cells in this file map 'silence' -> 10 and
# 'unknown' -> 11; confirm which order matches the dataset on disk.
dict_label = dict(
    yes=0, no=1, up=2, down=3, left=4, right=5,
    on=6, off=7, stop=8, go=9, unknown=10, silence=11,
)

def compute_metrics(eval_pred):
    """Accuracy of the highest-scoring class per row of ``eval_pred``.

    Reads ``eval_pred.predictions`` (arg-maxed over axis 1) and
    ``eval_pred.label_ids``, then delegates to the module-level ``accuracy``
    metric.
    """
    raw_scores = eval_pred.predictions
    return accuracy.compute(
        predictions=np.argmax(raw_scores, axis=1),
        references=eval_pred.label_ids,
    )

def preprocess_function(examples):
    """Feed a batch of decoded audio to the global ``feature_extractor``.

    The ``'array'`` waveform of each item in ``examples["audio"]`` is passed
    on with ``padding="max_length"``, ``max_length=16_000`` and truncation
    enabled.

    Returns:
        The feature extractor output for the batch.
    """
    waveforms = [entry['array'] for entry in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        padding="max_length",
        max_length=16_000,
        truncation=True,
    )

def agumentation_function(examples, i):
    """Augment one example's waveform in place and return the example.

    ``examples["audio"]['array']`` is replaced by the result of
    ``random_augementation(array, i)``.
    """
    new_waveform = random_augementation(examples["audio"]['array'], i)
    examples["audio"]['array'] = new_waveform
    return examples

def edit_label_2(seq):
    """Write the anomaly flag of an example into ``seq['nomaly_label']``.

    Label 11 gets flag 0; every other label gets flag 1. ``seq`` is modified
    in place and returned.
    """
    flag = 1
    if seq['label'] == 11:
        flag = 0
    seq['nomaly_label'] = flag
    return seq

def collate_fn(batch):
    """Collate a list of samples into one zero-padded batch.

    Args:
        batch: non-empty list of mappings with ``'input_features'``,
            ``'label'`` and an anomaly flag stored under ``'nomaly_label'``
            (the key the training loop in this file reads) or the legacy
            spelling ``'anomaly_label'``.

    Returns:
        dict with ``'input_features'`` (zero-padded along the first dimension
        to the longest sample, then stacked), ``'label'``, and the anomaly
        flags under both ``'nomaly_label'`` and ``'anomaly_label'``.

    Raises:
        ValueError: if ``batch`` is empty.
        KeyError: if a sample carries neither anomaly-flag key.
    """
    if not batch:
        raise ValueError("collate_fn received an empty batch")

    # pad_sequence handles features of any rank and builds a new tensor, so
    # the caller's samples are left unmodified.
    features = pad_sequence(
        [torch.as_tensor(sample['input_features']) for sample in batch],
        batch_first=True,
    )
    labels = torch.stack([torch.as_tensor(sample['label']) for sample in batch])
    # Accept both spellings of the anomaly-flag key.
    flags = torch.stack([
        torch.as_tensor(
            sample['nomaly_label'] if 'nomaly_label' in sample else sample['anomaly_label']
        )
        for sample in batch
    ])

    return {
        'input_features': features,
        'label': labels,
        'nomaly_label': flags,
        # Kept for callers that still read the original key.
        'anomaly_label': flags,
    }

def show_data_count(input_dataset):
    """Print the number of rows per value of the ``'label'`` column.

    ``input_dataset`` must expose ``to_pandas()`` returning a DataFrame that
    has a ``'label'`` column (the column name is assumed, not checked).
    """
    per_label = input_dataset.to_pandas()['label'].value_counts()
    print(per_label)


if __name__ == "__main__":
    # Training script: a frozen Whisper encoder produces hidden states for two
    # separately optimised heads, a keyword classifier and an anomaly detector.
    model_path = './whisper_tiny/raw_tiny'
    weight_state_dict_cls = './w2vec/models/whisper_model_cls_last - Copy.pth'
    weight_state_dict_anomaly = './w2vec/models/whisper_model_anomaly_last - Copy.pth'
    epoch_num = 25
    batch_size = 32
    device = 'cuda'

    label2id = {'yes': '0', 'no': '1', 'up': '2', 'down': '3', 'left': '4', 'right': '5', 'on': '6', 'off': '7', 'stop': '8', 'go': '9', 'unknown': '10', 'silence': '11'}
    id2label = {'0':'yes', '1':'no', '2':'up', '3':'down', '4':'left', '5':'right', '6':'on', '7':'off', '8':'stop', '9':'go', '10':'unknown', '11':'silence'}

    feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path, sampling_rate=16_000, chunk_length=1)
    model_backbone = WhisperModel.from_pretrained(model_path).encoder.to(device=device)
    # model_backbone.post_init()

    # The encoder is only used as a fixed feature extractor.
    for param in model_backbone.parameters():
        param.requires_grad = False

    train_data = Custom_Audio_Dataset(r'D:\1.Project\3.Machine_Learning\Voice\anomaly\data\dataset\superbs\train_updated_balance', feature_extractor=feature_extractor)
    valid_data = Custom_Audio_Dataset(r'D:\1.Project\3.Machine_Learning\Voice\anomaly\data\dataset\superbs\test', feature_extractor=feature_extractor)
    train_data.show_data_count()
    valid_data.show_data_count()

    train_dataloader = DataLoader(train_data, batch_size, shuffle=True)
    test_dataloader = DataLoader(valid_data, batch_size)

    # One label id is left out of the classifier head: samples flagged as
    # anomalies (nomaly_label == 0) never reach the classification loss.
    num_class = len(label2id) - 1
    model_cls = Wave_Network_Classification_Update(num_classes=num_class).to(device=device)
    model_anomaly = Wave_Network_Anomaly_Detection().to(device=device)
    # Training resumes from previously saved weights.
    model_cls.load_state_dict(torch.load(weight_state_dict_cls))
    model_anomaly.load_state_dict(torch.load(weight_state_dict_anomaly))

    criterion_1 = nn.CrossEntropyLoss()
    criterion_2 = nn.BCEWithLogitsLoss()
    optim_cls = torch.optim.AdamW(model_cls.parameters(), lr=0.0001)
    optim_anomaly = torch.optim.AdamW(model_anomaly.parameters(), lr=0.0001)
    acc_score_classification = Accuracy('multiclass', num_classes=num_class).to(device='cuda')
    acc_score_anomaly = Accuracy('binary').to(device='cuda')

    num_training_steps = epoch_num * len(train_dataloader)
    lr_scheduler_cls = get_scheduler(
        name="linear", optimizer=optim_cls, num_warmup_steps=0, num_training_steps=num_training_steps
    )
    lr_scheduler_anomaly = get_scheduler(
        name="linear", optimizer=optim_anomaly, num_warmup_steps=0, num_training_steps=num_training_steps
    )
    progress_bar = tqdm(range(num_training_steps), position=0, leave=True)

    # Best validation loss seen so far. It must live outside the epoch loop:
    # resetting it every epoch would make every epoch look like a new best.
    save_loss = float('inf')

    for epoch in range(epoch_num):
        valid_loss = 0.
        loss_item_1 = 0.
        loss_item_2 = 0.
        model_cls.train()
        model_anomaly.train()
        acc_score_classification.reset()
        acc_score_anomaly.reset()
        for i, data in enumerate(train_dataloader):
            optim_cls.zero_grad()
            optim_anomaly.zero_grad()
            with torch.no_grad():
                data['input_features'] = model_backbone(data['input_features'].cuda()).last_hidden_state
            inputs, labels = data['input_features'].cuda(), data['nomaly_label'].cuda()
            normal_inputs = data['input_features'][data['nomaly_label']==1].cuda()
            normal_labels = data['label'][data['nomaly_label']==1].cuda()

            # A batch without any normal sample has nothing to classify.
            if len(normal_inputs) == 0: continue

            # The slicing below treats the first len(batch) anomaly scores as
            # the original samples and the remainder as generated fakes, which
            # are labelled 0.
            original_project_label = torch.unsqueeze(data['nomaly_label'], dim=-1)
            false_project_label = torch.zeros(size=(normal_inputs.shape[0], 1))
            projection_labels = torch.concat((original_project_label, false_project_label)).cuda()

            o_classification = model_cls(inputs, labels)
            o_nomaly = model_anomaly(inputs, labels)
            loss_1 = criterion_1(o_classification, normal_labels)

            loss_1.backward()
            optim_cls.step()
            lr_scheduler_cls.step()
            acc_score_classification(o_classification.detach(), normal_labels.detach())

            p_origin_score = o_nomaly[:len(original_project_label)]
            p_fake_score = o_nomaly[len(original_project_label):]
            p_true_score = p_origin_score[labels==1]
            p_fake_score = torch.cat([p_fake_score, p_origin_score[labels==0]])

            true_labels = torch.ones(size=(p_true_score.shape[0], 1)).cuda()
            fake_labels = torch.zeros(size=(p_fake_score.shape[0], 1)).cuda()
            # The true-sample term is weighted x2 against the fake-sample term.
            loss_2 = criterion_2(p_true_score, true_labels) * 2 + criterion_2(p_fake_score, fake_labels)
            # loss_2 = criterion_2(o_nomaly, projection_labels)
            loss_2.backward()
            optim_anomaly.step()
            lr_scheduler_anomaly.step()
            acc_score_anomaly(o_nomaly.detach(), projection_labels.detach())

            loss_item_1 += loss_1.item()
            loss_item_2 += loss_2.item()
            progress_bar.update(1)
            if i % 500 == 499:  # Report the running averages every 500 batches
                tqdm.write(f'[Epoch {epoch + 1}, Batch {i + 1}] loss_cls: {loss_item_1/500:.3f} | loss_anomaly: {loss_item_2/500:.3f} | acc_cls_score: {acc_score_classification.compute():.3f} | acc_anomaly_score: {acc_score_anomaly.compute():.3f}')
                loss_item_1 = 0.0
                loss_item_2 = 0.0

        model_cls.eval()
        model_anomaly.eval()
        acc_score_classification.reset()
        acc_score_anomaly.reset()
        with torch.no_grad():
            for i, data in enumerate(test_dataloader):
                data['input_features'] = model_backbone(data['input_features'].cuda()).last_hidden_state
                inputs, labels = data['input_features'].cuda(), data['nomaly_label'].cuda()
                normal_inputs = data['input_features'][data['nomaly_label']==1].cuda()
                normal_labels = data['label'][data['nomaly_label']==1].cuda()
                if len(normal_inputs) == 0: continue
                original_project_label = torch.unsqueeze(data['nomaly_label'], dim=-1).cuda().float()
                o_classification = model_cls(normal_inputs, is_train=False)
                o_nomaly = model_anomaly(inputs, labels, is_train=False)
                loss_1 = criterion_1(o_classification, normal_labels)
                loss_2 = criterion_2(o_nomaly, original_project_label)
                loss = loss_1 + loss_2
                valid_loss += loss.item()
                acc_score_classification(o_classification.detach(), normal_labels.detach())
                acc_score_anomaly(o_nomaly.detach(), original_project_label.detach())
            valid_loss = valid_loss/len(test_dataloader)
            # Keep the checkpoints with the lowest validation loss across all epochs.
            if valid_loss < save_loss:
                torch.save(model_cls.state_dict(), './w2vec/models/whisper_model_cls_best.pth')
                torch.save(model_anomaly.state_dict(), './w2vec/models/whisper_model_anomaly_best.pth')
                save_loss = valid_loss
            torch.save(model_cls.state_dict(), './w2vec/models/whisper_model_cls_last.pth')
            torch.save(model_anomaly.state_dict(), './w2vec/models/whisper_model_anomaly_last.pth')
            tqdm.write(f"Epoch [{epoch+1}/{epoch_num}], Valid Loss: {valid_loss:.3f} | acc_cls_score: {acc_score_classification.compute():.3f} | acc_anomaly_score: {acc_score_anomaly.compute():.3f}")

    torch.save(model_cls.state_dict(), './w2vec/models/whisper_model_cls_final.pth')
    torch.save(model_anomaly.state_dict(), './w2vec/models/whisper_model_anomaly_final.pth')

In [None]:
from torch import nn
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from transformers import WhisperModel, WhisperFeatureExtractor
from torch.utils.data import DataLoader
import evaluate
accuracy = evaluate.load("accuracy")
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from datasets import load_from_disk, Audio
import matplotlib.pyplot as plt
from model.pure_model_whisper import Wave_Network_Classification_Update, Wave_Network_Anomaly_Detection

# Keyword name -> integer class id, numbered in the order listed below.
# 'unknown' ends up with id 11, the value edit_label_2 checks for.
dict_label = {
    name: class_id
    for class_id, name in enumerate((
        'yes', 'no', 'up', 'down', 'left', 'right',
        'on', 'off', 'stop', 'go', 'silence', 'unknown',
    ))
}

def compute_metrics(eval_pred):
    """Score arg-max predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, reduced
            with arg-max over axis 1) and ``label_ids`` (reference labels).

    Returns:
        Whatever ``accuracy.compute`` returns for these predictions.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    reference_ids = eval_pred.label_ids
    return accuracy.compute(predictions=predicted_ids, references=reference_ids)

def preprocess_function(examples):
    """Run the module-level ``feature_extractor`` over a batch of audio clips.

    Args:
        examples: Mapping whose "audio" entry is an iterable of items that
            each expose the raw waveform under the 'array' key.

    Returns:
        The feature extractor's output for the batch, requested with
        ``padding=True`` at the extractor's own sampling rate.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip['array'])
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        padding=True,
    )

def collate_fn(batch):
    """Collate variable-length samples into a single zero-padded batch.

    Args:
        batch: Sequence of sample dicts. Each must hold tensors under
            'input_features' (padded along its first dimension), 'label'
            and 'nomaly_label'.

    Returns:
        dict with the same three keys. 'input_features' is stacked to
        ``(len(batch), max_len, ...)``; the two label entries are stacked
        along a new leading dimension.

    The sample dicts in ``batch`` are not modified: padding is done on a new
    tensor instead of overwriting each sample's 'input_features'.
    """
    features = [sample['input_features'] for sample in batch]
    return {
        # pad_sequence right-pads every sample with zeros up to the longest
        # one and keeps the dtype/device of the inputs.
        'input_features': pad_sequence(features, batch_first=True),
        'label': torch.stack([sample['label'] for sample in batch]),
        'nomaly_label': torch.stack([sample['nomaly_label'] for sample in batch]),
    }

def edit_label_2(seq):
    """Add the binary 'nomaly_label' field to one sample, in place.

    A sample whose 'label' equals 11 receives 'nomaly_label' 0; every other
    sample receives 1.

    Args:
        seq: Mutable mapping with a 'label' entry.

    Returns:
        The same mapping, now carrying 'nomaly_label'.
    """
    seq['nomaly_label'] = 0 if seq['label'] == 11 else 1
    return seq

if __name__ == "__main__":
    # Evaluation entry point: loads the Whisper encoder plus the trained
    # classification and anomaly heads, runs them over the test split one
    # sample at a time, prints every misjudged sample and finally the accuracy.
    model_path = './whisper_tiny/raw_tiny'
    # Single switch for where everything runs; all tensors and checkpoints
    # below follow this value instead of hard-coded .cuda() calls.
    device = 'cuda'
    weight_state_dict_cls = './w2vec/models/whisper_model_cls_last - Copy (2).pth'
    weight_state_dict_anomaly = './w2vec/models/whisper_model_anomaly_last - Copy (2).pth'
    feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path, sampling_rate=16_000, chunk_length=1)
    model_backbone = WhisperModel.from_pretrained(model_path).encoder.to(device=device)

    model_cls = Wave_Network_Classification_Update(num_classes=11).to(device=device)
    model_anomaly = Wave_Network_Anomaly_Detection().to(device=device)
    epoch_num = 30
    # Must stay 1: the scoring below reads each prediction with .item().
    batch_size = 1

    label2id = {'yes': '0', 'no': '1', 'up': '2', 'down': '3', 'left': '4', 'right': '5', 'on': '6', 'off': '7', 'stop': '8', 'go': '9', 'silence': '10', 'unknown': '11'}
    id2label = {'0':'yes', '1':'no', '2':'up', '3':'down', '4':'left', '5':'right', '6':'on', '7':'off', '8':'stop', '9':'go', '10':'silence', '11':'unknown'}

    test_dataset = load_from_disk(r'D:\1.Project\3.Machine_Learning\Voice\anomaly\data\v0.02\test\test')
    # test_dataset = dataset['test']

    test_dataset = test_dataset.map(edit_label_2)
    print(test_dataset)
    test_dataset_loader = DataLoader(test_dataset.with_format('torch'), batch_size, shuffle=False)
    num_class = len(label2id) - 1
    # map_location lets the checkpoints load onto whatever `device` is set to.
    model_cls.load_state_dict(torch.load(weight_state_dict_cls, map_location=device))
    model_anomaly.load_state_dict(torch.load(weight_state_dict_anomaly, map_location=device))

    loss_item = 0.
    model_backbone.eval()
    model_cls.eval()
    model_anomaly.eval()
    acc = 0
    with torch.no_grad():
        for i, data in enumerate(test_dataset_loader):
            temp = [x.numpy() for x in data["audio"]['array']]
            features = feature_extractor(temp, sampling_rate=16_000, return_tensors="pt")['input_features']
            data['input_features'] = model_backbone(features.to(device)).last_hidden_state
            inputs = data['input_features']
            o_classification = model_cls(inputs, is_train=False)
            o_nomaly = model_anomaly(inputs, is_train=False)
            anomaly_arg = torch.sigmoid(o_nomaly)
            cls_arg = torch.argmax(o_classification, dim=-1)

            # Pull the four scalars out once instead of repeating .cpu().item().
            anomaly_score = anomaly_arg.item()
            predicted_class = cls_arg.item()
            true_label = data['label'].item()
            true_nomaly = data['nomaly_label'].item()

            if anomaly_score > 0.5 and true_nomaly == 1:
                # Judged "normal" and really normal: the keyword must match too.
                is_correct = true_label != 11 and predicted_class == true_label
            elif anomaly_score <= 0.5 and true_nomaly == 0:
                # Correctly rejected; the class prediction is irrelevant here.
                is_correct = True
            else:
                is_correct = False

            if is_correct:
                acc += 1
            else:
                print(i, anomaly_arg, data['nomaly_label'], cls_arg, data['label'])
        print(f'acc: {acc/len(test_dataset)}')


In [None]:
from torch import nn
from transformers import Wav2Vec2Model, AutoModelForAudioClassification
from transformers import AutoFeatureExtractor
from datasets import Dataset, DatasetDict
import torch
import torch.nn.functional as F
from model.attention_cbam import CBAM_1D

def init_weight(m):
    """Re-initialise the parameters of one module in place.

    Written to be handed to ``Module.apply``. Only three module types are
    touched; anything else (including the 1-D conv / batch-norm variants) is
    left exactly as it was:

    * ``Linear``      - weight drawn with Xavier-normal, bias untouched
    * ``BatchNorm2d`` - weight ~ N(1.0, 0.02), bias set to 0
    * ``Conv2d``      - weight ~ N(0.0, 0.02), bias untouched

    Args:
        m: The module to initialise.
    """
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.xavier_normal_(m.weight)
        return
    if isinstance(m, torch.nn.BatchNorm2d):
        torch.nn.init.normal_(m.weight, mean=1.0, std=0.02)
        torch.nn.init.zeros_(m.bias)
        return
    if isinstance(m, torch.nn.Conv2d):
        torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)


class Projection(torch.nn.Module):
    """Conv1d/BatchNorm1d stack followed by CBAM and adaptive average pooling.

    Args:
        in_planes: Channel count of the input; every Conv1d keeps this count.
        out_planes: Length of the pooled vector returned per sample. Defaults
            to ``in_planes`` when omitted.
        n_layers: Number of Conv1d + BatchNorm1d pairs.
        layer_type: A LeakyReLU(0.2) is appended after each pair only when
            ``layer_type == n_layers - 1``.
    """

    def __init__(self, in_planes, out_planes=None, n_layers=1, layer_type=0):
        super().__init__()
        # Resolve the default BEFORE storing it: forward() pools to
        # self.out_planes, which must never be None.
        if out_planes is None:
            out_planes = in_planes
        self.out_planes = out_planes
        self.layers = torch.nn.Sequential()
        # NOTE(review): the first CBAM_1D argument is fixed at 1500 rather than
        # following in_planes - confirm this is intended for other widths.
        self.cbam = CBAM_1D(1500, 2)
        for i in range(n_layers):
            self.layers.add_module(f"{i}cv1d",
                                   torch.nn.Conv1d(in_planes, in_planes, 3, padding='same'))
            self.layers.add_module(f"{i}bn",
                                   torch.nn.BatchNorm1d(in_planes))

            # NOTE(review): this compares layer_type, not the loop index i, so
            # the activation is either added after every pair or after none.
            if layer_type == n_layers - 1:
                self.layers.add_module(f"{i}relu",
                                       torch.nn.LeakyReLU(.2))
        self.apply(init_weight)

    def forward(self, x):
        """Return a ``(batch, out_planes)`` tensor for input ``x``.

        The conv stack and CBAM are applied, everything after the first
        dimension is flattened, and the result is adaptively average-pooled
        down to ``out_planes`` values per sample.
        """
        x = self.layers(x)
        x = self.cbam(x)
        # reshape (not view) so a non-contiguous CBAM output cannot fail here.
        x = F.adaptive_avg_pool1d(x.reshape(x.shape[0], -1), self.out_planes)
        return x

class Discriminator(torch.nn.Module):
    """MLP that maps a feature vector to a single score.

    The body holds ``n_layers - 1`` blocks of
    Dropout(0.2) -> LazyLinear -> BatchNorm1d -> LeakyReLU(0.2); the tail is
    a Linear layer with one output.

    Args:
        in_planes: Starting width. It only seeds the hidden widths, because
            the LazyLinear layers infer their input size on first use.
        n_layers: Total layer count including the tail.
        hidden: Fixed width for every block. When None, the width is
            floor-divided by 1.5 from one block to the next.
    """

    def __init__(self, in_planes, n_layers=3, hidden=None):
        super().__init__()

        width = in_planes if hidden is None else hidden
        self.body = torch.nn.Sequential()
        for block_no in range(1, n_layers):
            if hidden is None:
                width = int(width // 1.5)
            else:
                width = hidden
            block = torch.nn.Sequential(
                nn.Dropout(0.2),
                torch.nn.LazyLinear(width),
                torch.nn.BatchNorm1d(width),
                torch.nn.LeakyReLU(0.2),
            )
            self.body.add_module(f'block{block_no}', block)
        # The tail consumes the width of the last block (or the starting
        # width when there are no blocks at all).
        self.tail = torch.nn.Linear(width, 1)

    def forward(self, x):
        """Return the raw (pre-sigmoid) score, one value per input row."""
        return self.tail(self.body(x))

class Discriminator_Conv(torch.nn.Module):
    """MLP scorer with explicitly sized Linear layers and a bias-free tail.

    Despite the name, no convolution is used: the body holds
    ``n_layers - 1`` blocks of Linear -> BatchNorm1d -> LeakyReLU(0.2) and
    the tail is ``Linear(width, 1, bias=False)``. All layers are passed
    through ``init_weight`` after construction.

    Args:
        in_planes: Size of the input feature vector.
        n_layers: Total layer count including the tail.
        hidden: Fixed width for every block. When None, the width is
            floor-divided by 1.5 from one block to the next.
    """

    def __init__(self, in_planes, n_layers=3, hidden=None):
        # Zero-argument super(): the previous call named the unrelated
        # Discriminator class, which raises TypeError on instantiation.
        super().__init__()

        _hidden = in_planes if hidden is None else hidden
        self.body = torch.nn.Sequential()
        for i in range(n_layers-1):
            # Input width is the raw feature size for the first block and the
            # previous block's output width afterwards.
            _in = in_planes if i == 0 else _hidden
            _hidden = int(_hidden // 1.5) if hidden is None else hidden
            self.body.add_module('block%d'%(i+1),
                                 torch.nn.Sequential(
                                     torch.nn.Linear(_in, _hidden),
                                     torch.nn.BatchNorm1d(_hidden),
                                     torch.nn.LeakyReLU(0.2)
                                 ))
        self.tail = torch.nn.Linear(_hidden, 1, bias=False)
        self.apply(init_weight)

    def forward(self,x):
        """Return the raw score, one value per input row."""
        x = self.body(x)
        x = self.tail(x)
        return x

class Wave_Network_Classification(nn.Module):
    """Keyword classifier with a squeeze-and-excitation style gate over time.

    Each frame is projected to 256 features; a per-frame gate in (0, 1) is
    computed from the frame means and applied; the gated frames are averaged
    and classified.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=12):
        super().__init__()

        self.projector = nn.Sequential(nn.LazyLinear(256))

        # Gate: frame means -> 16 -> 49 -> sigmoid.
        self.squeeze_exhibition_in = nn.LazyLinear(16)
        self.relu = nn.ReLU()
        self.squeeze_exhibition_out = nn.LazyLinear(49)
        self.sigmoid = nn.Sigmoid()

        self.classifier = nn.Sequential(nn.LazyLinear(num_classes))

    def forward(self, x, nomaly_label=None, is_train=True):
        """Return class logits of shape ``(rows, num_classes)``.

        Args:
            x: Input of shape ``(n, frames, features)``. The gate has 49
                entries, so ``frames`` has to broadcast against 49.
            nomaly_label: Optional per-sample tensor; when given, only rows
                where it equals 1 are classified.
            is_train: Accepted for interface compatibility; the computation
                is the same for both values.
        """
        selected = x if nomaly_label is None else x[nomaly_label == 1]

        hidden = self.projector(selected)           # (n, frames, 256)
        gate = hidden.mean(dim=-1).unsqueeze(1)     # (n, 1, frames)
        gate = self.squeeze_exhibition_in(gate)
        gate = self.relu(gate)
        gate = self.squeeze_exhibition_out(gate)
        gate = self.sigmoid(gate)                   # (n, 1, 49)

        # Put frames last so the gate broadcasts over the 256 features,
        # then restore the original layout.
        gated = (hidden.transpose(-1, -2) * gate).transpose(-1, -2)
        pooled = gated.mean(dim=1)                  # (n, 256)

        return self.classifier(pooled)

class Wave_Network_Classification_Update(nn.Module):
    """Keyword classifier: two conv/CBAM stages followed by a 3-layer MLP head.

    Each stage is Conv1d -> BatchNorm1d -> ReLU -> CBAM_1D -> MaxPool1d(2, 2)
    and doubles the channel count (1500 -> 3000 -> 6000). The stage output is
    flattened, adaptively pooled to 2048 values, and passed through two
    LazyLinear/BatchNorm1d/Dropout/ReLU blocks (1024, 512) and a final
    LazyLinear classifier.

    Args:
        num_classes: Number of output logits.
        use_cbam: Stored on the instance; the CBAM stages inside the
            projector are built regardless of its value.
    """

    def __init__(self, num_classes=12, use_cbam=True):
        super().__init__()

        self.use_cbam = use_cbam
        self.projector = torch.nn.Sequential()
        input_size = 1500
        for i in range(2):
            # One formula covers both stages: 2 ** 0 == 1 for the first one.
            in_channels = input_size * 2 ** i
            out_channels = input_size * 2 ** (i + 1)
            self.projector.add_module(f"{i}cv1d",
                                      torch.nn.Conv1d(in_channels, out_channels, 3, padding='same'))
            self.projector.add_module(f"{i}bn",
                                      torch.nn.BatchNorm1d(out_channels))
            # ReLU's only constructor argument is `inplace`; the former
            # `ReLU(.2)` therefore meant "in-place ReLU", which is spelled out
            # here. NOTE(review): if a 0.2 negative slope was intended, this
            # should be LeakyReLU(.2) - that would change model behaviour.
            self.projector.add_module(f"{i}relu", torch.nn.ReLU(inplace=True))
            self.projector.add_module(f"{i}cbam", CBAM_1D(out_channels, 2))
            self.projector.add_module(f"{i}maxpool", torch.nn.MaxPool1d(2, 2))

        self.linear1 = nn.Sequential(nn.LazyLinear(out_features=1024, bias=True),
                                     nn.BatchNorm1d(1024),
                                     nn.Dropout(0.2),
                                     nn.ReLU())
        self.linear2 = nn.Sequential(nn.LazyLinear(out_features=512, bias=True),
                                     nn.BatchNorm1d(512),
                                     nn.Dropout(0.2),
                                     nn.ReLU())
        self.classifier = nn.LazyLinear(out_features=num_classes, bias=True)

    def forward(self, x, nomaly_label=None, is_train=True):
        """Return class logits of shape ``(rows, num_classes)``.

        Args:
            x: Input with 1500 channels in dimension 1 (what the first
                Conv1d expects).
            nomaly_label: Optional per-sample tensor; when given, only rows
                where it equals 1 are classified.
            is_train: Accepted for interface compatibility; the computation
                is the same for both values.
        """
        input_x = x if nomaly_label is None else x[nomaly_label == 1]

        input_x = self.projector(input_x)
        # Flatten channels x time, then pool to a fixed 2048-long vector so
        # the head does not depend on the input length.
        x_hidden_state = F.adaptive_avg_pool1d(input_x.view(input_x.shape[0], -1), output_size=2048)
        x_hidden_state = self.linear1(x_hidden_state)
        x_hidden_state = self.linear2(x_hidden_state)

        return self.classifier(x_hidden_state)

class Wave_Network_Anomaly_Detection(nn.Module):
    """Anomaly scorer: ``Projection(1500, 1024, 2)`` then ``Discriminator(1024)``.

    During training, Gaussian-perturbed copies of the normal samples'
    projections are appended to the batch before scoring.

    Args:
        std: Standard deviation of the Gaussian noise added in training mode.
    """

    def __init__(self, std=0.05):
        super().__init__()
        self.discriminator = Discriminator(1024)
        self.projection = Projection(1500, 1024, 2)
        self.std = std

    def forward(self, x, nomaly_label=None, is_train=True):
        """Score the input.

        Args:
            x: Input passed straight to the projection module.
            nomaly_label: Per-sample tensor; rows equal to 1 are the ones
                that get a noisy copy. Required when ``is_train`` is True.
            is_train: When False, only the projected batch is scored.

        Returns:
            Raw discriminator scores. In evaluation mode there is one row per
            input sample. In training mode the rows for the batch come first,
            followed by one row per noisy copy.

        Raises:
            ValueError: If ``is_train`` is True and ``nomaly_label`` is None.
        """
        x_projector = self.projection(x)
        if not is_train:
            return self.discriminator(x_projector)

        if nomaly_label is None:
            # Indexing with None would silently add a dimension and fail
            # later in concat with an unrelated shape error.
            raise ValueError("nomaly_label is required when is_train=True")

        normal_x_projector = x_projector[nomaly_label == 1]
        # Draw the noise on the same device/dtype as the features instead of
        # forcing CUDA, so the module also trains on CPU or other devices.
        noise = torch.randn_like(normal_x_projector) * self.std
        x_noise = torch.concat((x_projector, normal_x_projector + noise))
        return self.discriminator(x_noise)

# from torchsummary import summary
# model_cls = Wave_Network_Classification_Update(num_classes=11)
# summary(model_cls)


In [None]:
from datasets import Dataset, load_from_disk, Audio, VerificationMode, load_dataset
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, AddShortNoises, RoomSimulator, Gain
import random
import numpy as np
import torch

class Custom_Audio_Dataset(Dataset):
    """Audio dataset read from disk with optional random waveform augmentation.

    On construction the dataset is loaded with ``load_from_disk`` and every
    record gets a binary 'nomaly_label' (0 when 'label' is 11, else 1).
    Items are returned as dicts with 'label', 'nomaly_label' and the
    feature-extractor output under 'input_features'.
    """

    def __init__(self, dataset_path, feature_extractor, use_transform=True, sr=16_000):
        """
        Arguments:
            dataset_path: Location handed to ``load_from_disk``.
            feature_extractor: Callable turning waveforms into model inputs;
                must expose a ``sampling_rate`` attribute.
            use_transform: Whether ``__getitem__`` may augment the waveform.
            sr: Sample rate passed to the augmentation transforms.
        """
        self.feature_extractor = feature_extractor
        self.sample_rate = sr
        self.use_transform = use_transform
        self.dataset = load_from_disk(dataset_path).map(self._edit_label_2)
        # Candidate augmentations; __getitem__ applies at most one of them.
        self.transforms = [
            AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
            TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
            PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
            Shift(p=0.5),
            Gain(min_gain_db=0.5, max_gain_db=1.5),
        ]

    def _preprocess_function(self, examples):
        """Extract fixed-length features (padded/truncated to 16 000 samples)
        for a batch whose "audio" entries each expose an 'array'."""
        waveforms = [clip['array'] for clip in examples["audio"]]
        return self.feature_extractor(
            waveforms,
            sampling_rate=self.feature_extractor.sampling_rate,
            padding="max_length",
            max_length=16_000,
            truncation=True,
        )

    def _edit_label_2(self, seq):
        """Set 'nomaly_label' to 0 for label 11 and to 1 otherwise; return seq."""
        seq['nomaly_label'] = 0 if seq['label'] == 11 else 1
        return seq

    def __len__(self):
        """Number of records in the underlying dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return features and labels for ``idx``.

        NOTE(review): the "audio" entry is iterated as a sequence of clips
        that each have an 'array' - presumably ``idx`` selects several rows
        (slice / index list); confirm against the caller.
        """
        record = self.dataset[idx]

        waveforms = [clip['array'] for clip in record["audio"]]
        if self.use_transform:
            # randint is inclusive on both ends: the extra value
            # len(self.transforms) means "no augmentation this time".
            choice = random.randint(0, len(self.transforms))
            if choice < len(self.transforms):
                augment = self.transforms[choice]
                waveforms = [augment(wave, sample_rate=self.sample_rate) for wave in waveforms]
        extracted = self.feature_extractor(
            waveforms,
            sampling_rate=self.feature_extractor.sampling_rate,
            return_tensors="pt",
        )
        return {
            'label': record['label'],
            'nomaly_label': record['nomaly_label'],
            'input_features': extracted['input_features'],
        }

    def show_data_count(self):
        """Print how many records carry each value of the 'label' column."""
        frame = self.dataset.to_pandas()
        # The label column of the dataset is named 'label'.
        per_label = frame['label'].value_counts()
        # Show the result.
        print(per_label)