In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip -q install timm
!pip -q install torchlibrosa
!pip -q install audiomentations
!pip -q install transformers
!pip -q install fairseq

# Utils

In [None]:
import random, glob
import numpy as np, pandas as pd
import soundfile as sf
import time
from tqdm import tqdm
from albumentations.pytorch.functional import img_to_tensor
from transformers import Wav2Vec2Tokenizer,Wav2Vec2Model
import fairseq
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.nn.modules.dropout import Dropout
from torch.nn.modules.linear import Linear
from torch.nn.modules.pooling import AdaptiveAvgPool2d, AdaptiveMaxPool2d


class CustomAudioDataset(Dataset):
    """Dataset of fixed-length waveforms with one-hot targets.

    Each item is read with ``soundfile`` from the path stored in the
    ``Filename`` column of ``df``, randomly cropped or zero-padded to
    ``effective_length`` samples and returned as float32.

    Args:
        df: DataFrame with a ``Filename`` column and, when ``train`` is true,
            an ``encoded_labels`` column of integer class indices.
        period: Stored on the instance; not used by this class.
        transforms: Optional callable invoked as
            ``transforms(samples=y, sample_rate=sr)``.
        train: When true, labels are read from ``df`` and one-hot encoded;
            otherwise every target is all zeros.
        effective_length: Number of samples every returned waveform has.
        num_classes: Length of the one-hot target vector.
    """

    def __init__(self, df, period=10, transforms=None, train=True,
                 effective_length=480000, num_classes=7):
        self.period = period
        self.transforms = transforms
        self.train = train
        self.effective_length = effective_length
        self.num_classes = num_classes

        self.recording_ids = df["Filename"].values
        if train:
            self.labels = df["encoded_labels"].values

    def __len__(self):
        """Return the number of recordings."""
        return len(self.recording_ids)

    def __getitem__(self, idx):
        """Load recording ``idx`` and return a dict with ``waveform``, ``target`` and ``id``."""
        recording_id = self.recording_ids[idx]

        # NOTE(review): the padding below assumes a 1-D (mono) array - confirm
        # that no multi-channel files are present in the dataset.
        y, sr = sf.read(recording_id)
        len_y = len(y)
        effective_length = self.effective_length

        # Ensuring all waveforms are of effective_length
        if len_y < effective_length:
            # Place the short clip at a random offset inside a silent buffer.
            new_y = np.zeros(effective_length, dtype=y.dtype)
            start = np.random.randint(effective_length - len_y)
            new_y[start:start + len_y] = y
            y = new_y
        elif len_y > effective_length:
            # Take a random crop of the long clip.
            start = np.random.randint(len_y - effective_length)
            y = y[start:start + effective_length]

        # Cast in every case: a clip that is already exactly effective_length
        # long skips both branches above and would otherwise keep the dtype
        # returned by sf.read and fail the dtype assertion below.
        y = y.astype(np.float32)

        if self.transforms:
            y = self.transforms(samples=y, sample_rate=sr)

        label = np.zeros(self.num_classes, dtype='f')
        if self.train:
            label[self.labels[idx]] = 1

        # Assertions to ensure that data shapes and types are as expected
        assert y.dtype == np.float32, f"Unexpected dtype: {y.dtype}, expected np.float32"
        assert y.shape[0] == effective_length, f"Unexpected shape: {y.shape}, expected {effective_length}"
        assert label.dtype == 'f', f"Unexpected dtype: {label.dtype}, expected 'f'"
        assert label.shape[0] == self.num_classes, f"Unexpected shape: {label.shape}, expected {self.num_classes}"

        return {
            "waveform": torch.tensor(y, dtype=torch.float),
            "target": torch.tensor(label, dtype=torch.float),
            "id": recording_id
        }

class TestDataset(Dataset):
    """Inference dataset that splits each recording into fixed-length chunks.

    Each item is read from ``{data_path}/{recording_id}.flac`` and cut into
    consecutive windows of ``effective_length`` samples; the final window is
    zero-padded so that all windows can be stacked into one array.

    Args:
        df: DataFrame with a ``recording_id`` column.
        period: Stored on the instance; not used by this class.
        transforms: Optional callable invoked per chunk as
            ``transforms(samples=chunk, sample_rate=sr)``.
        data_path: Directory containing the ``.flac`` files.
        train: Stored on the instance; not used by this class.
        effective_length: Number of samples per chunk.
        num_classes: Length of the (all-zero) placeholder target.
    """

    def __init__(self, df, period=10, transforms=None, data_path="train", train=True,
                 effective_length=480000, num_classes=7):
        self.period = period
        self.transforms = transforms
        self.data_path = data_path
        self.train = train
        self.effective_length = effective_length
        self.num_classes = num_classes

        self.recording_ids = df["recording_id"].values

    def __len__(self):
        """Return the number of recordings."""
        return len(self.recording_ids)

    def __getitem__(self, idx):
        """Return a dict with the stacked chunks, a zero target and the recording id."""
        recording_id = self.recording_ids[idx]

        # NOTE(review): the padding below assumes a 1-D (mono) array - confirm.
        y, sr = sf.read(f"{self.data_path}/{recording_id}.flac")

        len_y = len(y)
        effective_length = self.effective_length

        chunks = []
        # max(len_y, 1) makes an empty file yield one silent chunk instead of
        # handing np.stack an empty list.
        for start in range(0, max(len_y, 1), effective_length):
            chunk = y[start:start + effective_length]

            # The trailing chunk is shorter unless len_y is an exact multiple
            # of effective_length; np.stack needs equal shapes, so pad it.
            if len(chunk) < effective_length:
                padded = np.zeros(effective_length, dtype=y.dtype)
                padded[:len(chunk)] = chunk
                chunk = padded

            if self.transforms:
                chunk = self.transforms(samples=chunk, sample_rate=sr)

            chunks.append(chunk)

        y = np.stack(chunks)

        # No ground truth at inference time: the target is a zero placeholder.
        label = np.zeros(self.num_classes, dtype='f')

        return {
            "waveform" : y,
            "target" : torch.tensor(label, dtype=torch.float),
            "id" : recording_id
        }

In [None]:
class AudioClassifier(nn.Module):
    """Linear classification head on top of a wav2vec 2.0 backbone.

    Args:
        path: fairseq checkpoint path when ``hfmodel`` is false, otherwise the
            name or path passed to ``Wav2Vec2Model.from_pretrained``.
        classes_num: Number of output logits.
        hfmodel: Selects the Hugging Face backbone instead of the fairseq one.
    """

    # Kernel (and implicit stride) of the pooling applied to the last axis of
    # the Hugging Face hidden states in forward(); it sets the head's input width.
    _HF_POOL_KERNEL = 3

    def __init__(self,path,classes_num,hfmodel=False):
        super().__init__()

        if not hfmodel:
            self.model, self.cfg,self.task = fairseq.checkpoint_utils.load_model_ensemble_and_task([path])
            self.model = self.model[0]
            # presumably 512 is the channel count of the fairseq feature
            # extractor output - TODO confirm against the checkpoint used.
            self.fc = Linear(512, classes_num)
        else:
            self.model = Wav2Vec2Model.from_pretrained(path)
            # Derive the head width from the backbone instead of hard-coding
            # 256, which only fits hidden_size == 768 (768 // 3 == 256).
            hidden_size = getattr(getattr(self.model, "config", None), "hidden_size", 768)
            self.fc = Linear(hidden_size // self._HF_POOL_KERNEL, classes_num)

        # Kept for state/attribute compatibility; forward() uses F.dropout.
        self.dropout = Dropout(0.3)
        self.hfmodel=hfmodel


    def forward(self, input, spec_aug=False, mixup_lambda=None):
        """Return class logits for a batch of inputs.

        ``spec_aug`` and ``mixup_lambda`` are accepted but not used.
        """

        if not self.hfmodel:
            wav2feature = self.model.feature_extractor(input)
            # Pool over the last axis with stride 1, then sum over axis 2.
            x1 = F.max_pool1d(wav2feature, kernel_size=3, stride=1)
            x2 = F.avg_pool1d(wav2feature, kernel_size=3, stride=1)
            x = x1 + x2
            x = F.dropout(x, p=0.5, training=self.training)
            x = self.fc(torch.sum(x,axis=2))

        else:
            wav2feature = self.model(input).last_hidden_state
            # Pooling acts on the last axis of last_hidden_state, shrinking it
            # by the kernel size; the result is then summed over axis 1.
            x1 = F.max_pool1d(wav2feature, kernel_size=self._HF_POOL_KERNEL)
            x2 = F.avg_pool1d(wav2feature, kernel_size=self._HF_POOL_KERNEL)
            x = x1 + x2
            x = F.dropout(x, p=0.5, training=self.training)
            x = self.fc(torch.sum(x,axis=1))


        return x

In [None]:
from sklearn.metrics import roc_auc_score

class AverageMeter(object):
    """Tracks the latest value and the weighted running mean of a scalar."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the last value, mean, weighted sum and weight total back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count


class MetricMeter(object):
    """Accumulates per-class targets and scores and reports the mean ROC AUC.

    Args:
        num_classes: Number of label columns tracked.
    """

    def __init__(self, num_classes=7):
        self.reset(num_classes)

    def reset(self, num_classes=7):
        """Drop everything accumulated so far and set the number of classes."""
        self.num_classes = num_classes
        self.y_true = [[] for _ in range(num_classes)]
        self.y_pred = [[] for _ in range(num_classes)]

    def normalize_predictions(self, y_pred):
        """Scale each row of ``y_pred`` so that it sums to one."""
        row_sums = np.sum(y_pred, axis=1, keepdims=True)
        # A row whose scores all underflowed to zero would divide by zero;
        # dividing by one leaves that row as zeros instead of NaN.
        row_sums[row_sums == 0] = 1
        return y_pred / row_sums

    def update(self, y_true, y_pred):
        """Append one batch of targets and raw logits.

        ``y_pred`` goes through a sigmoid and row normalisation before being
        stored. Every batch is kept: whether a class has both labels present
        is decided over the whole accumulation in ``avg``, not per batch.
        Filtering per batch would throw away most batches when the batch size
        is small and bias the resulting AUC.
        """
        y_true = y_true.cpu().detach().numpy()
        y_pred = self.normalize_predictions(torch.sigmoid(y_pred).cpu().detach().numpy())

        for i in range(self.num_classes):
            self.y_true[i].extend(y_true[:, i].tolist())
            self.y_pred[i].extend(y_pred[:, i].tolist())

    @property
    def avg(self):
        """Return ``{"average_auc": value}`` averaged over the computable classes.

        A class contributes only if its accumulated targets contain more than
        one distinct value. When no class qualifies the value is 0.0.
        """
        auc_scores = []
        for i in range(self.num_classes):
            if len(np.unique(self.y_true[i])) > 1:
                auc = roc_auc_score(self.y_true[i], self.y_pred[i])
                auc_scores.append(auc)

        if len(auc_scores) == 0:
            average_auc = 0.0  # Handle the case where no valid ROC AUC can be calculated
            print("Warning: No valid AUC can be calculated.")
        else:
            average_auc = sum(auc_scores) / len(auc_scores)

        return {
            "average_auc": average_auc
        }

In [None]:
def train_epoch(config, model, loader, criterion, optimizer, scheduler, epoch,tokenizer):
    """Run one training epoch and return ``(scores.avg, losses.avg)``.

    Args:
        config: Dict read for ``device`` and ``step_scheduler``.
        model: Module called on the tokenised batch to produce logits.
        loader: Iterable of batches with ``waveform`` and ``target`` entries.
        criterion: Loss called as ``criterion(output, target)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional scheduler, stepped per batch when
            ``config['step_scheduler']`` is truthy.
        epoch: Epoch number, used only in the progress-bar label.
        tokenizer: Callable turning a list of waveforms into an object with
            an ``input_values`` tensor.
    """
    losses = AverageMeter()
    scores = MetricMeter()

    model.train()

    # The context manager closes the progress bar even if a step raises.
    with tqdm(loader) as t:
        for sample in t:
            optimizer.zero_grad()

            waveform_list=sample['waveform'].tolist()
            input_values =tokenizer(waveform_list, return_tensors = "pt").input_values

            input = input_values.to(config['device'])
            target = sample['target'].to(config['device'])
            output = model(input)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            if scheduler and config['step_scheduler']:
                scheduler.step()

            bs = input.size(0)
            scores.update(target, output)
            losses.update(loss.item(), bs)

            # Same "Loss:" label format as the validation progress bar.
            t.set_description(f"Train E:{epoch} - Loss:{losses.avg:0.4f}")
    return scores.avg, losses.avg

def valid_epoch(config, model, loader, criterion, epoch,tokenizer):
    """Evaluate ``model`` on ``loader`` without gradients.

    Returns the pair ``(scores.avg, losses.avg)`` where the scores come from a
    ``MetricMeter`` and the loss from an ``AverageMeter`` weighted by batch size.
    ``epoch`` is used only in the progress-bar label.
    """
    loss_meter = AverageMeter()
    auc_meter = MetricMeter()

    model.eval()

    with torch.no_grad():
        progress = tqdm(loader)
        for batch in progress:
            # Tokenise the raw waveforms, then move inputs and labels to the device.
            waves = batch['waveform'].tolist()
            features = tokenizer(waves, return_tensors = "pt").input_values.to(config['device'])
            labels = batch['target'].to(config['device'])

            logits = model(features)
            batch_loss = criterion(logits, labels)

            batch_size = features.size(0)
            auc_meter.update(labels, logits)
            loss_meter.update(batch_loss.item(), batch_size)
            progress.set_description(f"Valid E:{epoch} - Loss:{loss_meter.avg:0.4f}")
    progress.close()
    return auc_meter.avg, loss_meter.avg

def test_epoch(config, model, loader,tokenizer):

    model.eval()
    pred_list = []
    id_list = []
    with torch.no_grad():
        t = tqdm(loader)
        for i, sample in enumerate(t):
            waveform_list=sample['waveform'].tolist()
            input_values =tokenizer(waveform_list, return_tensors = "pt").input_values

            input = input_values.to(config['device'])
            id = sample["id"]
            output = torch.sigmoid(model(input)).cpu().detach().numpy().tolist()
            pred_list.extend(output)
            id_list.extend(id)

    return pred_list, id_list

In [None]:
import os
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss
from transformers import get_linear_schedule_with_warmup

def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Recorded for child processes; the hash seed of the already-running
    # interpreter is fixed at start-up and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible CUDA device, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels from run to run; turn it off.
    torch.backends.cudnn.benchmark = False


def main(fold,config):
    """Train and validate one cross-validation fold.

    Rows of ``config['train_csv']`` whose ``fold`` column equals ``fold`` form
    the validation split; all other rows form the training split. The
    checkpoint with the best validation AUC is written to
    ``<output_dir>/<exp_name>/fold-<fold>.bin`` and a per-epoch log is
    appended next to it.

    Args:
        fold: Index of the fold held out for validation.
        config: Settings dict. ``fold`` and ``save_path`` are written into it.

    Returns:
        The model, with the best checkpoint loaded when one exists.
    """
    seed_everything(config['seed'])

    config['fold'] = fold
    config['save_path'] = os.path.join(config['output_dir'],config['exp_name'])
    os.makedirs(config['save_path'], exist_ok=True)
    checkpoint_path = os.path.join(config['save_path'],'fold-'+str(config['fold'])+'.bin')
    log_path = os.path.join(config['save_path'], 'log_'+config['exp_name']+'.txt')

    train_df = pd.read_csv(config['train_csv'])

    if config['DEBUG']:
        # Cap the sample size so a CSV with fewer than 200 rows does not raise.
        train_df = train_df.sample(min(200, len(train_df)))

    train_fold = train_df[train_df.fold != fold]
    valid_fold = train_df[train_df.fold == fold]

    train_dataset = CustomAudioDataset(
        df = train_fold,
        period=config['period'],
        transforms=None,
        train=True
    )

    # train=True makes the dataset load labels. With train=False every
    # validation target is all-zero, so the validation loss and AUC that
    # drive checkpoint selection would carry no information.
    valid_dataset = CustomAudioDataset(
        df = valid_fold,
        period=config['period'],
        transforms=None,
        train=True
    )


    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        shuffle=True,
        drop_last=True,
        num_workers=config['num_workers']
    )

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config['batch_size'],
        shuffle=False,
        drop_last=False,
        num_workers=config['num_workers']
    )

    tokenizer = Wav2Vec2Tokenizer.from_pretrained(config['hf_path'])

    model = AudioClassifier(config['hf_path'],7,True)

    model = model.to(config['device'])

    if config['pretrain_weights']:
        # strict=False tolerates missing or unexpected keys in the checkpoint.
        model.load_state_dict(torch.load(config['pretrain_weights'], map_location=config['device']), strict=False)
        model = model.to(config['device'])

    criterion = BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
    # Linear warm-up over the first 10% of all optimisation steps.
    num_train_steps = int(len(train_loader) * config['epochs'])
    num_warmup_steps = int(0.1 * config['epochs'] * len(train_loader))
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)

    best_auc = -np.inf
    early_stop_count = 0

    for epoch in range(config['start_epoch'], config['epochs']):
        train_avg, train_loss = train_epoch(config, model, train_loader, criterion, optimizer, scheduler, epoch,tokenizer)
        valid_avg, valid_loss = valid_epoch(config, model, valid_loader, criterion, epoch,tokenizer)

        if config['epoch_scheduler']:
            scheduler.step()

        content = f"""
                {time.ctime()} \n
                Fold:{config['fold']}, Epoch:{epoch}, lr:{optimizer.param_groups[0]['lr']:.7}\n
                Train Loss:{train_loss:0.4f} - Auc:{train_avg['average_auc']:0.4f}\n
                Valid Loss:{valid_loss:0.4f} - Auc:{valid_avg['average_auc']:0.4f}\n
        """
        print(content)

        with open(log_path, 'a') as appender:
            appender.write(content+'\n')

        if valid_avg['average_auc'] > best_auc:
            print(f"########## >>>>>>>> Model Improved From {best_auc} ----> {valid_avg['average_auc']}")
            torch.save(model.state_dict(), checkpoint_path)
            best_auc = valid_avg['average_auc']
            early_stop_count = 0
        else:
            early_stop_count += 1

        if config['early_stop'] == early_stop_count:
            print("\n we reached early stoping count :", early_stop_count)
            break

    # The checkpoint is absent when no epoch ran (start_epoch >= epochs);
    # in that case keep the current weights instead of crashing.
    if os.path.exists(checkpoint_path):
        model.load_state_dict(torch.load(checkpoint_path, map_location=config['device']))
    model = model.to(config['device'])

    return model

if __name__ == "__main__":

    # Settings shared by every fold; main() receives its own copy each time
    # because it writes 'fold' and 'save_path' into the dict it is given.
    base_config = {
        "DEBUG": True,
        "wandb": False,
        "exp_name": "W2V_V1",
        "network": "AudioClassifier",
        "pretrain_weights": None,
        "lr": 1e-3,
        "step_scheduler": True,
        "epoch_scheduler": False,
        "period": 10,
        "seed": 42,
        "start_epoch": 0,
        "epochs": 10,
        "batch_size": 2,
        "num_workers": 2,
        "early_stop": 10,
        "device": ('cuda' if torch.cuda.is_available() else 'cpu'),
        "train_csv": "/content/drive/MyDrive/DS5500/train.csv",
        "output_dir": "weights",
        "hf_path": "facebook/wav2vec2-base-960h",
    }

    # 5-fold cross-validation: one training run per held-out fold.
    for fold in range(5):
        main(fold, dict(base_config))

        print(f"***********Fold:{fold} done*******")
        print("\n")