<a href="https://colab.research.google.com/github/hoa92ng/Homework/blob/main/Making_the_Most_of_your_Colab_Subscription.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from torch import nn
from transformers import Wav2Vec2Model, AutoFeatureExtractor
from datasets import Dataset, DatasetDict, Audio
from model.pure_model import Wave_Network_Classification, Wave_Network_Anomaly_Detection
from torch.utils.data import DataLoader
import evaluate
accuracy = evaluate.load("accuracy")
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from datasets import load_from_disk, Audio, VerificationMode, load_dataset
import matplotlib.pyplot as plt
from torchmetrics import F1Score, Precision, Recall, Accuracy

dict_label = {'yes':0,
              'no':1,
              'up':2,
              'down':3,
              'left':4,
              'right':5,
              'on':6,
              'off':7,
              'stop':8,
              'go':9,
              'unknown':10,
              'silence':11}

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)

def edit_label(examples):
    # valid_dataset.features['label'].names = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'unknown', '_silence_']
    for i, x in enumerate(examples['file']):
        a = examples['label'][i]
        if examples['is_unknown'][i]: examples['label'][i] = dict_label['unknown']
        elif id2label[str(examples['label'][i])] == '_silence_':
            examples['label'][i] = dict_label['silence']
    return examples

def preprocess_function(examples):
    audio_arrays = [x['array'] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, padding=True, max_length=16_000, truncation=True,
    )
    # inputs['input_values'] = model_backbone(torch.from_numpy(np.array(inputs['input_values'])).cuda()).last_hidden_state.cpu()
    return inputs

def edit_label_2(seq):
    if seq['label'] == 11:
        seq['anomaly_label'] = 0
    else:
        seq['anomaly_label'] = 1
    return seq

def collate_fn(batch):
    return_batch = {}
    # Find the max length of sequences in the batch
    max_len = max([len(x['input_values']) for x in batch])

    # Pad sequences to the max length
    for x in batch:
        x['input_values'] = torch.cat([x['input_values'], torch.zeros(max_len - len(x['input_values']))])

    return_batch['input_values'] = torch.stack([x['input_values'] for x in batch])
    return_batch['label'] = torch.stack([x['label'] for x in batch])
    return_batch['anomaly_label'] = torch.stack([x['anomaly_label'] for x in batch])

    # Stack the padded sequences into a single tensor
    return return_batch

if __name__ == "__main__":
    model_path = './model_wav2vec_base'
    weight_cls_state_dict = r'D:\1.Project\3.Machine_Learning\Voice\W2Vec\models\w2vec_model_cls_best.pth'
    weight_anomaly_state_dict = r'D:\1.Project\3.Machine_Learning\Voice\W2Vec\models\w2vec_model_anomaly_best.pth'
    epoch_num = 50
    batch_size = 32
    device = 'cuda'

    label2id = {'yes': '0', 'no': '1', 'up': '2', 'down': '3', 'left': '4', 'right': '5', 'on': '6', 'off': '7', 'stop': '8', 'go': '9', 'unknown': '10', 'silence': '11'}
    id2label = {'0':'yes', '1':'no', '2':'up', '3':'down', '4':'left', '5':'right', '6':'on', '7':'off', '8':'stop', '9':'go', '10':'unknown', '11':'silence'}

    feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)
    model_backbone = Wav2Vec2Model.from_pretrained(model_path).to(device=device)
    model_backbone.post_init()

    for param in model_backbone.parameters():
            param.requires_grad = False
    model_backbone.feature_extractor._freeze_parameters()
    # train_data = load_from_disk('./anomaly/data/dataset/train')
    # valid_data = load_from_disk('./anomaly/data/dataset/validation')
    # test_dataset = load_from_disk('./anomaly/data/dataset/test')
    dataset = load_dataset("superb", "ks", trust_remote_code=True, verification_mode=VerificationMode.NO_CHECKS)
    train_data = dataset['train']
    valid_data = dataset['validation']
    test_dataset = dataset['test']

    train_data = train_data.map(preprocess_function, remove_columns='audio', batched=True)
    valid_data = valid_data.map(preprocess_function, remove_columns='audio', batched=True)
    train_data = train_data.map(edit_label_2)
    valid_data = valid_data.map(edit_label_2)
    print(train_data)

    train_dataloader = DataLoader(train_data.with_format('torch'), batch_size, shuffle=True, collate_fn=collate_fn)
    test_dataloader = DataLoader(valid_data.with_format('torch'), batch_size, shuffle=True, collate_fn=collate_fn)

    num_class = len(label2id) - 1
    model_cls = Wave_Network_Classification(num_classes=num_class).to(device=device)
    model_anomaly = Wave_Network_Anomaly_Detection().to(device=device)
    # model_cls.load_state_dict(torch.load(weight_cls_state_dict))
    # model_anomaly.load_state_dict(torch.load(weight_anomaly_state_dict))

    criterion_1 = nn.CrossEntropyLoss()
    criterion_2 = nn.BCEWithLogitsLoss()
    # optimizer = torch.optim.Adam(model.get_train_params(), lr=0.001)
    optim_cls = torch.optim.AdamW(model_cls.parameters(), lr=0.0001)
    optim_anomaly = torch.optim.Adam(model_anomaly.parameters(), lr=0.001)
    f1_score_classification = F1Score('multiclass', num_classes=num_class).to(device='cuda')
    f1_score_anomaly = F1Score('binary', num_classes=2).to(device='cuda')

    for epoch in range(epoch_num):
        loss_item = 0.
        valid_loss = 0.
        running_loss = 0
        save_loss = 999999
        loss_item_1 = 0.
        loss_item_2 = 0.
        model_cls.train()
        model_anomaly.train()
        f1_score_classification.reset()
        f1_score_anomaly.reset()
        for i, data in enumerate(train_dataloader):
            optim_cls.zero_grad()
            optim_anomaly.zero_grad()
            with torch.no_grad():
                # data['input_values'] = model_backbone(data['input_values'].cuda()).extract_features
                data['input_values'] = model_backbone(data['input_values'].cuda()).last_hidden_state
            inputs, labels = data['input_values'].cuda(), data['anomaly_label'].cuda()
            normal_inputs = data['input_values'][data['anomaly_label']==1].cuda()
            anormal_inputs = data['input_values'][data['anomaly_label']==0].cuda()
            normal_labels = data['label'][data['anomaly_label']==1].cuda()

            if len(normal_inputs) == 0: continue

            original_project_label = torch.unsqueeze(data['anomaly_label'], dim=-1)
            false_project_label = torch.zeros(size=(normal_inputs.shape[0], 1))
            projection_labels = torch.concat((original_project_label, false_project_label)).cuda()
            o_classification = model_cls(inputs, labels)
            o_nomaly = model_anomaly(inputs, labels)
            loss_1 = criterion_1(o_classification, normal_labels)
            loss_2 = criterion_2(o_nomaly, projection_labels)
            loss = loss_1 + loss_2
            f1_score_cls_metric = f1_score_classification(o_classification, normal_labels)
            f1_score_anomaly_metric = f1_score_anomaly(o_nomaly, projection_labels)

            loss_1.backward()
            optim_cls.step()

            loss_2.backward()
            optim_anomaly.step()

            running_loss += loss.item()
            loss_item_1 += loss_1.item()
            loss_item_2 += loss_2.item()
            if i % 10 == 9:  # Print every 10 batches
                print(f'[Epoch {epoch + 1}, Batch {i + 1}] running_loss: {running_loss/10:.3f} |  loss_cls: {loss_item_1/10:.3f} \
                      | loss_anomaly: {loss_item_2/10:.3f} | f1_cls_score: {f1_score_classification.compute():.3f} | f1_anomaly_score: {f1_score_anomaly.compute():.3f}')
                running_loss = 0.0
                loss_item_1 = 0.0
                loss_item_2 = 0.0

        model_cls.eval()
        model_anomaly.eval()
        f1_score_classification.reset()
        f1_score_anomaly.reset()
        with torch.no_grad():
            for i, data in enumerate(test_dataloader):
                data['input_values'] = model_backbone(data['input_values'].cuda()).last_hidden_state
                # data['input_values'] = model_backbone(data['input_values'].cuda()).extract_features
                inputs, labels = data['input_values'].cuda(), data['anomaly_label'].cuda()
                normal_inputs = data['input_values'][data['anomaly_label']==1].cuda()
                anormal_inputs = data['input_values'][data['anomaly_label']==0].cuda()
                normal_labels = data['label'][data['anomaly_label']==1].cuda()
                if len(normal_inputs) == 0: continue
                # true_project_label = torch.ones(size=(normal_inputs.shape[0], 1))
                original_project_label = torch.unsqueeze(data['anomaly_label'], dim=-1)
                false_project_label = torch.zeros(size=(normal_inputs.shape[0], 1))
                projection_labels = torch.concat((original_project_label, false_project_label)).cuda()
                o_classification = model_cls(inputs, labels)
                o_nomaly = model_anomaly(inputs, labels)
                loss_1 = criterion_1(o_classification, normal_labels)
                loss_2 = criterion_2(o_nomaly, projection_labels)
                loss = loss_1 + loss_2
                valid_loss += loss.item()
                f1_score_cls_metric = f1_score_classification(o_classification, normal_labels)
                f1_score_anomaly_metric = f1_score_anomaly(o_nomaly, projection_labels)
            valid_loss = valid_loss/len(test_dataloader)
            if epoch == 0: valid_loss = save_loss
            else:
                if valid_loss < save_loss:
                    torch.save(model_cls.state_dict(), f'./w2vec/models/w2vec_model_cls_best.pth')
                    torch.save(model_anomaly.state_dict(), f'./w2vec/models/w2vec_model_anomaly_best.pth')
                    save_loss = valid_loss
            print(f"Epoch [{epoch+1}/{epoch_num}], Valid Loss: {valid_loss:.3f} | f1_cls_score: {f1_score_classification.compute():.3f} | f1_anomaly_score: {f1_score_anomaly.compute():.3f}")

    torch.save(model_cls.state_dict(), f'./w2vec/models/w2vec_model_cls_final.pth')
    torch.save(model_anomaly.state_dict(), f'./w2vec/models/w2vec_model_anomaly_final.pth')


Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [None]:
from torch import nn
from transformers import Wav2Vec2Model, AutoModelForAudioClassification
from transformers import AutoFeatureExtractor
from datasets import Dataset, DatasetDict
import torch
import torch.nn.functional as F


def init_weight(m):
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.xavier_normal_(m.weight)
    if isinstance(m, torch.nn.BatchNorm2d):
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)
    elif isinstance(m, torch.nn.Conv2d):
        m.weight.data.normal_(0.0, 0.02)


class Projection(torch.nn.Module):

    def __init__(self, in_planes, out_planes=None, n_layers=1, layer_type=0):
        super(Projection, self).__init__()
        self.out_planes = out_planes
        if out_planes is None:
            out_planes = in_planes
        self.layers = torch.nn.Sequential()
        _in = None
        _out = None
        for i in range(n_layers):
            # self.layers.add_module(f"{i}fc",
            #                        torch.nn.Linear(_in, _out))
            self.layers.add_module(f"{i}cv1d",
                                   torch.nn.Conv1d(in_planes, in_planes, 3, padding='same'))
            self.layers.add_module(f"{i}bn",
                                    torch.nn.BatchNorm1d(in_planes))

            if layer_type == n_layers - 1:
                self.layers.add_module(f"{i}relu",
                                            torch.nn.LeakyReLU(.2))
            # if i < n_layers - 1:
            #     # if layer_type > 0:
            #     #     self.layers.add_module(f"{i}bn",
            #     #                            torch.nn.BatchNorm1d(_out))
            #     if layer_type > 1:
            #         self.layers.add_module(f"{i}relu",
            #                                torch.nn.LeakyReLU(.2))
        self.apply(init_weight)

    def forward(self, x):
        # x = .1 * self.layers(x) + x
        x = self.layers(x)
        # x = x.reshape(x.shape[0],-1)
        # x = F.adaptive_avg_pool1d(x, self.out_planes)
        x = x.mean(dim=1)
        return x

class Discriminator(torch.nn.Module):
    def __init__(self, in_planes, n_layers=3, hidden=None):
        super(Discriminator, self).__init__()

        _hidden = in_planes if hidden is None else hidden
        self.body = torch.nn.Sequential()
        for i in range(n_layers-1):
            _in = in_planes if i == 0 else _hidden
            _hidden = int(_hidden // 1.5) if hidden is None else hidden
            self.body.add_module('block%d'%(i+1),
                                 torch.nn.Sequential(
                                    #  nn.Dropout(0.2),
                                     torch.nn.Linear(_in, _hidden),
                                     torch.nn.BatchNorm1d(_hidden),
                                     torch.nn.LeakyReLU(0.2)
                                 ))
        self.tail = torch.nn.Linear(_hidden, 1)
        self.apply(init_weight)

    def forward(self,x):
        x = self.body(x)
        x = self.tail(x)
        return x

class Discriminator_Conv(torch.nn.Module):
    def __init__(self, in_planes, n_layers=3, hidden=None):
        super(Discriminator, self).__init__()

        _hidden = in_planes if hidden is None else hidden
        self.body = torch.nn.Sequential()
        for i in range(n_layers-1):
            _in = in_planes if i == 0 else _hidden
            _hidden = int(_hidden // 1.5) if hidden is None else hidden
            self.body.add_module('block%d'%(i+1),
                                 torch.nn.Sequential(
                                    #  nn.Dropout(0.2),
                                     torch.nn.Linear(_in, _hidden),
                                     torch.nn.BatchNorm1d(_hidden),
                                     torch.nn.LeakyReLU(0.2)
                                 ))
        self.tail = torch.nn.Linear(_hidden, 1, bias=False)
        self.apply(init_weight)

    def forward(self,x):
        x = self.body(x)
        x = self.tail(x)
        return x

class Wave_Network(nn.Module):
    def __init__(self, num_classes=12, model_path='', device='cuda'):
        super().__init__()
        self.device = device
        self.backbone = Wav2Vec2Model.from_pretrained(model_path).to(device=device)

        for param in self.backbone.parameters():
            param.requires_grad = False
        self.backbone.feature_extractor._freeze_parameters()

        self.discriminator = Discriminator(768).to(device=device)
        self.projection = Projection(768, 768, 2).to(device=device)
        self.projector = nn.Sequential(
            nn.Dropout(0.3),
            nn.LazyLinear(256).to(device=device)
        )
        self.classifier = nn.Sequential(
            nn.Dropout(0.3),
            # nn.Flatten(),
            # nn.LazyLinear(256).to(device=device),
            nn.LazyLinear(num_classes).to(device=device)
        )

    def get_train_params(self, is_classification=True):
        if is_classification:
            return [
                {'params': self.projector.parameters()},
                {'params': self.classifier.parameters()},
            ]
        else:
            return [
                {'params': self.projection.parameters()},
                {'params': self.discriminator.parameters()},
            ]

    def forward(self, x, anomaly_label=None, std=0.05, is_train=True):
        x = self.backbone(x).last_hidden_state
        x_reshape = x.reshape(x.shape[0],-1)
        x_hidden= F.adaptive_avg_pool1d(x_reshape, 768)
        x_hidden_state = self.projector(x)
        x_hidden_state = x_hidden_state.mean(dim=1)

        if is_train:
            normal_hidden_state = x_hidden_state[anomaly_label==1]
            x_classification = self.classifier(normal_hidden_state)

            x_projector = self.projection(x_hidden)
            # add noise
            # if is_train:
            normal_x_projector = x_projector[anomaly_label==1]
            noise = torch.normal(mean=0, std=std, size=normal_x_projector.shape).to(device=self.device)
            x_noise = normal_x_projector + noise
            x_noise = torch.concat((x_projector, x_noise))
            # else:
            #     x_noise = x_projector
            x_anomaly = self.discriminator(x_noise)
        else:
            x_classification = self.classifier(x_hidden_state)
            x_projector = self.projection(x_hidden)
            x_anomaly = self.discriminator(x_projector)
        return x_anomaly, x_classification

    def save(self, file_path):
        # Custom logic before saving
        print(f"Saving model to {file_path}")

        # For example, saving both the model's state dict and some metadata
        torch.save({
            'model_state_dict_backbone': self.backbone.state_dict(),
            'model_state_dict_discriminator': self.discriminator.state_dict(),
            'model_state_dict_projection': self.projection.state_dict(),
            'model_state_dict_classifier': self.classifier.state_dict(),
            'custom_metadata': {
                'info': 'This is a custom saved model',
                'epoch': 10,
                'loss': 0.1234
            }
        }, file_path)

        # Custom logic after saving
        print("Model saved successfully!")

    def load(self, file_path):
        # Custom logic before saving
        print(f"Loading model from {file_path}")

        # For example, saving both the model's state dict and some metadata
        temp_model = torch.load(file_path)
        self.backbone.load_state_dict(temp_model['model_state_dict_backbone'])
        self.discriminator.load_state_dict(temp_model['model_state_dict_discriminator'])
        self.projection.load_state_dict(temp_model['model_state_dict_projection'])
        self.classifier.load_state_dict(temp_model['model_state_dict_classifier'])
        # Custom logic after saving
        print("Model loaded successfully!")

class Wave_Network_Classification(nn.Module):
    def __init__(self, num_classes=12):
        super().__init__()
        self.projector = nn.Sequential(
            # nn.Dropout(0.3),
            # nn.Conv1d(49, 1, 3, padding='same'),
            nn.LazyLinear(256)
        )
        self.classifier = nn.Sequential(
            # nn.Dropout(0.3),
            nn.LazyLinear(num_classes)
        )

    def forward(self, x, anomaly_label=None, is_train=True):
        x_hidden_state = self.projector(x)
        # x_hidden_state = torch.squeeze(x_hidden_state)
        x_hidden_state = x_hidden_state.mean(dim=1)

        if is_train:
            normal_hidden_state = x_hidden_state[anomaly_label==1]
            x_classification = self.classifier(normal_hidden_state)
        else:
            x_classification = self.classifier(x_hidden_state)
        return x_classification

class Wave_Network_Anomaly_Detection(nn.Module):
    def __init__(self, std=0.05):
        super().__init__()
        self.discriminator = Discriminator(768)
        self.projection = Projection(49, 768, 3)
        self.std = std

    def forward(self, x, anomaly_label=None, is_train=True):
        # x_reshape = x.reshape(x.shape[0],-1)
        # x_hidden= F.adaptive_avg_pool1d(x_reshape, 768)
        # x_hidden = x.mean(dim=1)
        x_hidden = x
        if is_train:
            x_projector = self.projection(x_hidden)
            # add noise
            # if is_train:
            normal_x_projector = x_projector[anomaly_label==1]
            noise = torch.normal(mean=0, std=self.std, size=normal_x_projector.shape).cuda()
            x_noise = normal_x_projector + noise
            x_noise = torch.concat((x_projector, x_noise))
            # else:
            #     x_noise = x_projector
            x_anomaly = self.discriminator(x_noise)
        else:
            x_projector = self.projection(x_hidden)
            x_anomaly = self.discriminator(x_projector)
        return x_anomaly
# _network =  Wav2Vec2Model.from_pretrained(pretrained_model_name_or_path='./model_1')
# print(_network)
# for name, param in _network.named_parameters():
#     print(f"layer: {name} | Shape: {param.shape}")

# model = AutoModelForAudioClassification.from_pretrained(
#         pretrained_model_name_or_path='./model_1', num_labels=10, ignore_mismatched_sizes=True)
# for name, param in model.named_parameters():
#     print(f"layer: {name} | Shape: {param.shape}")

In [3]:
from datasets import load_dataset, VerificationMode
import datasets
print(datasets.__version__)

2.21.0


In [4]:
dataset = load_dataset("superb", "ks", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/29.9k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/57.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.49G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/71.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51094 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6798 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3081 [00:00<?, ? examples/s]

In [6]:
print(load_dataset.cache_dir)

AttributeError: 'function' object has no attribute 'cache_dir'