<a href="https://colab.research.google.com/github/hoa92ng/Homework/blob/main/Making_the_Most_of_your_Colab_Subscription.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from torch import nn
from transformers import WavLMModel, Wav2Vec2FeatureExtractor, get_scheduler
from datasets import Dataset, DatasetDict, Audio, concatenate_datasets
from model.pure_model_final import Wave_Network
from torch.utils.data import DataLoader
import evaluate
accuracy = evaluate.load("accuracy")
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from datasets import load_from_disk, load_dataset
import matplotlib.pyplot as plt
from torchmetrics import F1Score, Precision, Recall, Accuracy
from tqdm.auto import tqdm
from data.audio_augmentation import random_augementation
from data.custom_dataset import Custom_Audio_Dataset

# Keyword -> integer class id.
# 'silence' is 10 and 'unknown' is 11 so that this table agrees with
# edit_label_2 in this file, which marks label 11 as the anomaly class
# (nomaly_label == 0), and with the evaluation script's mapping.
# NOTE(review): confirm against the label ids stored in the on-disk dataset.
dict_label = {
    'yes': 0,
    'no': 1,
    'up': 2,
    'down': 3,
    'left': 4,
    'right': 5,
    'on': 6,
    'off': 7,
    'stop': 8,
    'go': 9,
    'silence': 10,
    'unknown': 11,
}

def compute_metrics(eval_pred):
    """Score a set of predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: object exposing ``predictions`` (per-class scores, reduced
            with an argmax over axis 1) and ``label_ids`` (reference labels).

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        against ``eval_pred.label_ids``.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    reference_ids = eval_pred.label_ids
    return accuracy.compute(predictions=predicted_ids, references=reference_ids)

def preprocess_function(examples):
    """Run the global ``feature_extractor`` over a batch of audio examples.

    Args:
        examples: batch mapping whose ``"audio"`` entry is a sequence of
            items, each holding the raw waveform under ``'array'``.

    Returns:
        The feature extractor's output. Every waveform is padded or truncated
        to exactly 16_000 samples.
    """
    waveforms = []
    for audio in examples["audio"]:
        waveforms.append(audio['array'])

    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        padding="max_length",
        max_length=16_000,
        truncation=True,
    )

def agumentation_function(examples, i):
    """Replace an example's waveform with its augmented version.

    Args:
        examples: a single example whose ``"audio"`` entry holds the waveform
            under ``'array'``.
        i: second argument forwarded unchanged to ``random_augementation``.

    Returns:
        The same ``examples`` object, with ``examples["audio"]['array']``
        overwritten by the result of ``random_augementation``.
    """
    augmented = random_augementation(examples["audio"]['array'], i)
    examples["audio"]['array'] = augmented
    return examples

def edit_label_2(seq):
    """Attach the binary ``'nomaly_label'`` field to one example.

    Label 11 gets ``nomaly_label = 0``; every other label gets 1.

    Args:
        seq: mapping with a ``'label'`` entry; it is modified in place.

    Returns:
        The same ``seq`` object, now carrying ``'nomaly_label'``.
    """
    seq['nomaly_label'] = 0 if seq['label'] == 11 else 1
    return seq

def collate_fn(batch):
    """Collate a list of samples into one zero-padded batch.

    The binary target is read and emitted under the key ``'nomaly_label'``,
    which is the key produced by ``edit_label_2`` and consumed by the training
    loop in this file (the previous ``'anomaly_label'`` spelling matched
    neither). The input samples are left untouched; padding is done on a new
    tensor.

    Args:
        batch: list of dicts, each with tensor entries ``'input_values'``
            (variable length along the first dimension), ``'label'`` and
            ``'nomaly_label'``.

    Returns:
        dict with
            ``'input_values'``: samples right-padded with zeros to the longest
                sample in the batch and stacked along a new leading dimension,
            ``'label'``: stacked labels,
            ``'nomaly_label'``: stacked binary labels.
    """
    # pad_sequence pads with 0 by default and keeps the samples' dtype/device.
    padded_inputs = pad_sequence(
        [sample['input_values'] for sample in batch], batch_first=True
    )
    return {
        'input_values': padded_inputs,
        'label': torch.stack([sample['label'] for sample in batch]),
        'nomaly_label': torch.stack([sample['nomaly_label'] for sample in batch]),
    }

def show_data_count(input_dataset):
    """Print how many rows the dataset holds for each label value.

    Args:
        input_dataset: object exposing ``to_pandas()``; the resulting frame is
            expected to have a column named ``'label'``.
    """
    # Assumes the label column of the dataset is called 'label'.
    frame = input_dataset.to_pandas()
    # Display the per-label counts.
    print(frame['label'].value_counts())


if __name__ == "__main__":
    # Training entry point.
    # A frozen WavLM backbone turns raw audio into hidden states; Wave_Network
    # is trained on top of them with two losses: cross-entropy on the keyword
    # classes (normal samples only) and BCE-with-logits on the anomaly score.
    model_path = r'D:\1.Project\3.Machine_Learning\Voice\model_wavlm\wavlm_base'
    weight_cls_state_dict = './w2vec/models/w2vec_model_all_in_one_cls_last.pth'
    epoch_num = 50
    batch_size = 32
    device = 'cuda'

    # Only the number of entries is used below (num_class). The ordering puts
    # 'unknown' at id 11 because edit_label_2 treats label 11 as the anomaly
    # class. NOTE(review): confirm against the ids stored in the dataset.
    label2id = {'yes': '0', 'no': '1', 'up': '2', 'down': '3', 'left': '4', 'right': '5', 'on': '6', 'off': '7', 'stop': '8', 'go': '9', 'silence': '10', 'unknown': '11'}
    id2label = {'0':'yes', '1':'no', '2':'up', '3':'down', '4':'left', '5':'right', '6':'on', '7':'off', '8':'stop', '9':'go', '10':'silence', '11':'unknown'}

    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_path)
    model_backbone = WavLMModel.from_pretrained(model_path).to(device=device)

    # The backbone is used as a fixed feature extractor: no gradients.
    for param in model_backbone.parameters():
        param.requires_grad = False

    model_backbone.feature_extractor._freeze_parameters()

    train_data = Custom_Audio_Dataset(r'D:\1.Project\3.Machine_Learning\Voice\anomaly\data\dataset\superbs\train_updated_balance', feature_extractor=feature_extractor)
    valid_data = Custom_Audio_Dataset(r'D:\1.Project\3.Machine_Learning\Voice\anomaly\data\dataset\superbs\test', feature_extractor=feature_extractor)
    # train_data = Custom_Audio_Dataset(r'D:\1.Project\3.Machine_Learning\Voice\anomaly\data\v0.01\all\train', feature_extractor=feature_extractor)
    # valid_data = Custom_Audio_Dataset(r'D:\1.Project\3.Machine_Learning\Voice\anomaly\data\v0.01\all\test', feature_extractor=feature_extractor)
    train_data.show_data_count()
    valid_data.show_data_count()

    train_dataloader = DataLoader(train_data, batch_size, shuffle=True)
    test_dataloader = DataLoader(valid_data, batch_size)

    # The classifier head predicts every label except the anomaly one.
    num_class = len(label2id) - 1
    model_cls = Wave_Network(num_classes=num_class, std=0.04, channel=49, size=768).to(device=device)
    # Uncomment to resume from a previous checkpoint:
    # model_cls.load_state_dict(torch.load(weight_cls_state_dict))

    criterion_1 = nn.CrossEntropyLoss()
    criterion_2 = nn.BCEWithLogitsLoss()
    optim_cls = torch.optim.AdamW(model_cls.parameters(), lr=0.001)
    acc_score_classification = Accuracy('multiclass', num_classes=num_class).to(device=device)
    acc_score_anomaly = Accuracy('binary').to(device=device)

    # Linear decay of the learning rate over all optimisation steps, no warmup.
    num_training_steps = epoch_num * len(train_dataloader)
    lr_scheduler_cls = get_scheduler(
        name="linear", optimizer=optim_cls, num_warmup_steps=0, num_training_steps=num_training_steps
    )
    progress_bar = tqdm(range(num_training_steps), position=0, leave=True)

    # Lowest validation loss seen so far. It has to live outside the epoch
    # loop: resetting it every epoch made every epoch look like a new best.
    save_loss = float('inf')

    for epoch in range(epoch_num):
        valid_loss = 0.
        loss_item_1 = 0.
        loss_item_2 = 0.
        model_cls.train()
        acc_score_classification.reset()
        acc_score_anomaly.reset()
        for i, data in enumerate(train_dataloader):
            optim_cls.zero_grad()
            with torch.no_grad():
                data['input_values'] = model_backbone(data['input_values'].cuda()).last_hidden_state
            inputs, labels = data['input_values'].cuda(), data['nomaly_label'].cuda()
            normal_inputs = data['input_values'][data['nomaly_label']==1].cuda()
            normal_labels = data['label'][data['nomaly_label']==1].cuda()

            # A batch without any normal sample has nothing to classify.
            if len(normal_inputs) == 0: continue

            # Targets for the anomaly head: the real per-sample flags followed
            # by one zero per normal sample.
            # NOTE(review): presumably Wave_Network appends one synthetic
            # "fake" sample per normal input in train mode -- confirm in
            # model.pure_model_final.
            original_project_label = torch.unsqueeze(data['nomaly_label'], dim=-1)
            false_project_label = torch.zeros(size=(normal_inputs.shape[0], 1))
            projection_labels = torch.concat((original_project_label, false_project_label)).cuda()

            o_classification, o_nomaly = model_cls(inputs, labels)
            loss_1 = criterion_1(o_classification, normal_labels)
            acc_score_classification(o_classification.detach(), normal_labels.detach())

            # Split the anomaly scores into the rows belonging to the real
            # batch and the extra rows, then regroup them by target (1 / 0).
            p_origin_score = o_nomaly[:len(original_project_label)]
            p_fake_score = o_nomaly[len(original_project_label):]
            p_true_score = p_origin_score[labels==1]
            p_fake_score = torch.cat([p_fake_score, p_origin_score[labels==0]])

            true_labels = torch.ones(size=(p_true_score.shape[0], 1)).cuda()
            fake_labels = torch.zeros(size=(p_fake_score.shape[0], 1)).cuda()
            # Only used by the alternative weighting kept below for reference.
            weight_balance = len(fake_labels) / (len(true_labels) + 1e-10)
            # loss_2 = criterion_2(p_true_score, true_labels) * weight_balance + criterion_2(p_fake_score, fake_labels)
            loss_2 = criterion_2(p_true_score, true_labels) * 2 + criterion_2(p_fake_score, fake_labels)
            acc_score_anomaly(o_nomaly.detach(), projection_labels.detach())

            loss_item_1 += loss_1.item()
            loss_item_2 += loss_2.item()

            # The classification loss is weighted 5x against the anomaly loss.
            loss = 5 * loss_1 + loss_2
            loss.backward()
            optim_cls.step()
            lr_scheduler_cls.step()

            progress_bar.update(1)
            if i % 500 == 499:  # Print every 500 batches
                tqdm.write(f'[Epoch {epoch + 1}, Batch {i + 1}] loss_cls: {loss_item_1/500:.3f} | loss_anomaly: {loss_item_2/500:.3f} | acc_cls_score: {acc_score_classification.compute():.3f} | acc_anomaly_score: {acc_score_anomaly.compute():.3f}')
                loss_item_1 = 0.0
                loss_item_2 = 0.0

        model_cls.eval()
        acc_score_classification.reset()
        acc_score_anomaly.reset()
        with torch.no_grad():
            for i, data in enumerate(test_dataloader):
                data['input_values'] = model_backbone(data['input_values'].cuda()).last_hidden_state
                # data['input_values'] = model_backbone(data['input_values'].cuda()).extract_features
                inputs, labels = data['input_values'].cuda(), data['nomaly_label'].cuda()
                normal_inputs = data['input_values'][data['nomaly_label']==1].cuda()
                normal_labels = data['label'][data['nomaly_label']==1].cuda()
                if len(normal_inputs) == 0: continue
                original_project_label = torch.unsqueeze(data['nomaly_label'], dim=-1).cuda().float()
                o_classification, o_nomaly = model_cls(inputs, labels, is_train=False)
                loss_1 = criterion_1(o_classification, normal_labels)
                loss_2 = criterion_2(o_nomaly, original_project_label)
                loss = loss_1 + loss_2
                valid_loss += loss.item()
                acc_score_classification(o_classification.detach(), normal_labels.detach())
                acc_score_anomaly(o_nomaly.detach(), original_project_label.detach())
            valid_loss = valid_loss/len(test_dataloader)
            # Keep a separate copy of the weights with the lowest validation
            # loss so far; the "last" checkpoint is refreshed every epoch.
            if valid_loss < save_loss:
                torch.save(model_cls.state_dict(), './w2vec/models/wavlm_model_all_in_one_cls_best.pth')
                save_loss = valid_loss
            torch.save(model_cls.state_dict(), './w2vec/models/wavlm_model_all_in_one_cls_last.pth')
            tqdm.write(f"Epoch [{epoch+1}/{epoch_num}], Valid Loss: {valid_loss:.3f} | acc_cls_score: {acc_score_classification.compute():.3f} | acc_anomaly_score: {acc_score_anomaly.compute():.3f}")

    torch.save(model_cls.state_dict(), './w2vec/models/wavlm_model_all_in_one_cls_final.pth')


In [None]:
from torch import nn
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from transformers import AutoFeatureExtractor, WavLMModel
from datasets import Dataset, DatasetDict, Audio
from torch.utils.data import DataLoader
import evaluate
accuracy = evaluate.load("accuracy")
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from datasets import load_from_disk, Audio
import matplotlib.pyplot as plt
from model.pure_model_final import Wave_Network


# Keyword -> integer class id, assigned in list order (0 .. 11).
dict_label = {
    keyword: class_id
    for class_id, keyword in enumerate([
        'yes',
        'no',
        'up',
        'down',
        'left',
        'right',
        'on',
        'off',
        'stop',
        'go',
        'silence',
        'unknown',
    ])
}

def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions.

    Args:
        eval_pred: object with ``predictions`` (scores, arg-maxed over axis 1)
            and ``label_ids`` (reference labels).

    Returns:
        The result of the module-level ``accuracy`` metric's ``compute``.
    """
    top_class = np.argmax(eval_pred.predictions, axis=1)
    targets = eval_pred.label_ids
    return accuracy.compute(predictions=top_class, references=targets)

def preprocess_function(examples):
    """Feed a batch of raw waveforms through the global ``feature_extractor``.

    Args:
        examples: batch mapping whose ``"audio"`` entry is a sequence of
            items, each holding the waveform under ``'array'``.

    Returns:
        The feature extractor's output, computed with ``padding=True``.
    """
    waveforms = []
    for audio in examples["audio"]:
        waveforms.append(audio['array'])

    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        padding=True,
    )

def collate_fn(batch):
    """Collate a list of samples into one zero-padded batch.

    Unlike the previous implementation, the samples passed in are not
    modified: padding happens on a freshly allocated tensor.

    Args:
        batch: list of dicts, each with tensor entries ``'input_values'``
            (variable length along the first dimension), ``'label'`` and
            ``'nomaly_label'``.

    Returns:
        dict with
            ``'input_values'``: samples right-padded with zeros to the longest
                sample in the batch and stacked along a new leading dimension,
            ``'label'``: stacked labels,
            ``'nomaly_label'``: stacked binary labels.
    """
    # pad_sequence pads with 0 by default and keeps the samples' dtype/device.
    padded_inputs = pad_sequence(
        [sample['input_values'] for sample in batch], batch_first=True
    )
    return {
        'input_values': padded_inputs,
        'label': torch.stack([sample['label'] for sample in batch]),
        'nomaly_label': torch.stack([sample['nomaly_label'] for sample in batch]),
    }

def edit_label_2(seq):
    """Derive the binary ``'nomaly_label'`` of an example from its label.

    Args:
        seq: mapping with a ``'label'`` entry; updated in place.

    Returns:
        The same ``seq`` object; ``'nomaly_label'`` is 0 when the label is 11
        and 1 for any other label.
    """
    is_label_11 = seq['label'] == 11
    seq['nomaly_label'] = 0 if is_label_11 else 1
    return seq

if __name__ == "__main__":
    # Evaluation entry point: runs the trained Wave_Network head on top of the
    # WavLM backbone over the test split, one sample at a time, and prints the
    # fraction of samples handled correctly (every miss is printed as well).
    model_path = r'D:\1.Project\3.Machine_Learning\Voice\model_wavlm\wavlm_base'
    device = 'cuda'
    weight_state_dict_cls = './w2vec/models/wavlm_model_all_in_one_cls_last.pth'

    label2id = {'yes': '0', 'no': '1', 'up': '2', 'down': '3', 'left': '4', 'right': '5', 'on': '6', 'off': '7', 'stop': '8', 'go': '9', 'silence': '10', 'unknown': '11'}
    id2label = {'0':'yes', '1':'no', '2':'up', '3':'down', '4':'left', '5':'right', '6':'on', '7':'off', '8':'stop', '9':'go', '10':'silence', '11':'unknown'}
    # The classifier head covers every label except the last one (11).
    num_class = len(label2id) - 1

    feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)
    model_backbone = WavLMModel.from_pretrained(model_path).to(device=device)

    model_cls = Wave_Network(num_classes=num_class, std=0.04, channel=49, size=768).to(device=device)
    # The scoring below calls .item() on per-batch tensors, so it relies on
    # batches of exactly one sample.
    batch_size = 1

    dataset = load_from_disk(r'D:\1.Project\3.Machine_Learning\Voice\anomaly\data\dataset\superbs')
    test_dataset = dataset['test']
    # test_dataset = load_from_disk(r'D:\1.Project\3.Machine_Learning\Voice\anomaly\data\v0.01\all\test')

    test_dataset = test_dataset.map(preprocess_function, remove_columns='audio', batched=True)
    # The loop below reads 'nomaly_label'; derive it from 'label' here so the
    # column is guaranteed to exist (it is simply recomputed if already there).
    test_dataset = test_dataset.map(edit_label_2)
    # test_dataset = test_dataset.remove_columns(['file', 'is_unknown', 'speaker_id', 'utterance_id'])
    print(test_dataset)
    test_dataset_loader = DataLoader(test_dataset.with_format('torch'), batch_size, shuffle=False)
    model_cls.load_state_dict(torch.load(weight_state_dict_cls))

    model_cls.eval()

    acc = 0
    with torch.no_grad():
        for i, data in enumerate(test_dataset_loader):
            input_values = model_backbone(data['input_values'].cuda()).last_hidden_state
            # NOTE(review): the training script calls model_cls(inputs, labels,
            # is_train=False); here no labels are passed -- confirm that
            # Wave_Network.forward makes them optional.
            o_classification, o_nomaly = model_cls(input_values, is_train=False)
            anomaly_arg = torch.sigmoid(o_nomaly)
            cls_arg = torch.argmax(o_classification, dim=-1)

            # Pull the four scalars out once instead of at every comparison.
            anomaly_prob = anomaly_arg.item()
            predicted_label = cls_arg.item()
            true_nomaly = data['nomaly_label'].item()
            true_label = data['label'].item()

            if anomaly_prob > 0.5 and true_nomaly == 1:
                # Scored as normal and is normal: the keyword must match too.
                is_correct = true_label != 11 and predicted_label == true_label
            elif anomaly_prob <= 0.5 and true_nomaly == 0:
                # Scored as anomaly and is an anomaly.
                is_correct = True
            else:
                is_correct = False

            if is_correct:
                acc += 1
            else:
                print(i, anomaly_arg, data['nomaly_label'], cls_arg, data['label'])
        print(f'acc: {acc/len(test_dataset)}')


In [None]:
from datasets import Dataset, load_from_disk, Audio, VerificationMode, load_dataset
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, AddShortNoises, RoomSimulator, Gain
import random
import numpy as np
import torch
# The ten words listed first in every label list.
WORDS = "yes no up down left right on off stop go".split()

# Twenty further words, grouped under "unknown" for the V1 label list.
UNKNOWN_WORDS_V1 = (
    "zero one two three four five six seven eight nine "
    "bed bird cat dog happy house marvin sheila tree wow"
).split()

# V2 extends the V1 "unknown" list with five more words.
UNKNOWN_WORDS_V2 = [
    *UNKNOWN_WORDS_V1,
    "backward",
    "forward",
    "follow",
    "learn",
    "visual",
]

SILENCE = "_silence_"  # background noise
# Full label lists: target words, then unknown words, then the silence label.
LABELS_V1 = [*WORDS, *UNKNOWN_WORDS_V1, SILENCE]
LABELS_V2 = [*WORDS, *UNKNOWN_WORDS_V2, SILENCE]

class Custom_Audio_Dataset(Dataset):
    """Audio dataset loaded from disk, preprocessed once, optionally augmented.

    On construction the dataset at ``dataset_path`` is loaded with
    ``load_from_disk``, every waveform is run through ``feature_extractor``
    (padded/truncated to ``max_length`` samples) and each row receives a
    binary ``'nomaly_label'`` (0 for label 11, 1 otherwise). Indexing returns
    a dict with ``'label'``, ``'nomaly_label'`` and ``'input_values'``.

    NOTE(review): ``Dataset`` is the name imported from ``datasets`` at the
    top of this module, not ``torch.utils.data.Dataset``, and its ``__init__``
    is never called -- confirm this base class is intended.
    """

    def __init__(self, dataset_path, feature_extractor, use_transform=False, sr=16_000, max_length=16_000):
        """
        Arguments:
            dataset_path: directory passed to ``load_from_disk``.
            feature_extractor: callable applied to the raw waveforms; must
                expose ``sampling_rate``.
            use_transform: when True, ``__getitem__`` may apply one random
                augmentation to the sample.
            sr: sample rate handed to the augmentation transforms.
            max_length: number of samples every waveform is padded or
                truncated to during preprocessing.
        """
        self.feature_extractor = feature_extractor
        self.sample_rate = sr
        # Must be set before the map() call below, which reads it.
        self.max_length = max_length
        self.dataset = load_from_disk(dataset_path)
        self.dataset = self.dataset.map(self._preprocess_function, remove_columns='audio', batched=True)
        self.dataset = self.dataset.map(self._edit_label_2)
        # Pool of augmentations; __getitem__ picks at most one per access.
        self.transforms = [
            AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
            TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
            PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
            Shift(p=0.5),
            # RoomSimulator(),
            Gain(min_gain_db=0.5, max_gain_db=1.5),
        ]
        self.use_transform = use_transform

    def _preprocess_function(self, examples):
        """Run the feature extractor over a batch of examples.

        Reads the waveform of every item in ``examples["audio"]`` from its
        ``'array'`` entry and returns the feature extractor's output, padded
        or truncated to ``self.max_length`` samples.
        """
        audio_arrays = [x['array'] for x in examples["audio"]]
        inputs = self.feature_extractor(
            audio_arrays,
            sampling_rate=self.feature_extractor.sampling_rate,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        )
        return inputs

    def _edit_label_2(self, seq):
        """Add ``'nomaly_label'`` to one row: 0 for label 11, 1 otherwise."""
        if seq['label'] == 11:
            seq['nomaly_label'] = 0
        else:
            seq['nomaly_label'] = 1
        return seq

    def __len__(self):
        """Number of rows in the underlying dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return row ``idx`` as a dict with a float32 ``'input_values'`` tensor.

        When ``use_transform`` is set, one index is drawn uniformly from
        ``0 .. len(self.transforms)`` inclusive; the extra last value means
        "leave the sample unchanged".
        """
        data = self.dataset[idx]
        # audiomentations operates on float32 arrays; converting once here
        # avoids handing it a float64 array built from a Python list.
        samples = np.asarray(data['input_values'], dtype=np.float32)
        if self.use_transform:
            r_idx = random.randint(0, len(self.transforms))
            if r_idx != len(self.transforms):
                samples = self.transforms[r_idx](samples, sample_rate=self.sample_rate)
        return {
            'label': data['label'],
            'nomaly_label': data['nomaly_label'],
            'input_values': torch.Tensor(samples),
        }

    def show_data_count(self):
        """Print how many rows the dataset holds for each label value."""
        df = self.dataset.to_pandas()
        # Assumes the label column of the dataset is called 'label'.
        label_counts = df['label'].value_counts()
        # Display the result.
        print(label_counts)