In [None]:
# Print the NVIDIA driver/GPU status so the notebook output records which accelerator (if any) is attached.
!nvidia-smi

In [None]:
import os
if not os.path.isfile('master.zip'):
    !wget -q https://github.com/karoldvl/ESC-50/archive/master.zip
    !unzip -qo master.zip
    !pip install -qq wandb git+https://github.com/kan-bayashi/ParallelWaveGAN
os.environ['WANDB_API_KEY'] = ''
import wandb
wandb.login()

In [None]:
import torch
from torch import nn
import torchaudio

import pandas as pd
from sklearn import preprocessing
from tqdm.auto import tqdm

batch_size = 16
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

data_dir = 'ESC-50-master/audio'
data = pd.read_csv('ESC-50-master/meta/esc50.csv')

# Map category names to integer class ids, fitted on the complete metadata table.
le = preprocessing.LabelEncoder().fit(data.category)


def _paths_and_labels(rows):
    """Return (audio file paths, encoded labels) for the given metadata rows."""
    paths = [os.path.join(data_dir, name) for name in rows.filename.values]
    return paths, le.transform(rows.category)


# Fold 5 is held out for evaluation; every other fold is used for training.
held_out = data.fold == 5
X_train, y_train = _paths_and_labels(data[~held_out])
X_test, y_test = _paths_and_labels(data[held_out])

In [None]:
class ESC50Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing audio file paths with their labels.

    Indexing returns ``(waveform, label)`` where the waveform is produced by
    :meth:`load_wave` and the label is ``labels[index]`` unchanged.
    """

    def __init__(self, files, labels):
        # ``files`` and ``labels`` are indexed in parallel by ``__getitem__``.
        self.files, self.labels = files, labels

    def __len__(self):
        """Number of items, taken from the file list."""
        return len(self.files)

    def __getitem__(self, index):
        """Load the waveform for ``index`` and return it with its label."""
        waveform = self.load_wave(self.files[index])
        return waveform, self.labels[index]

    def load_wave(self, file):
        """Read ``file`` with torchaudio and keep only the first channel.

        The selected channel gets a new leading axis of size 1.
        """
        loaded = torchaudio.load(file)
        first_channel = loaded[0][0]
        return first_channel.unsqueeze(0)

# Both loaders share the batch size and worker count; only the training loader
# shuffles and drops an incomplete final batch.
_loader_kwargs = dict(batch_size=batch_size, num_workers=2)
train_loader = torch.utils.data.DataLoader(
    ESC50Dataset(X_train, y_train), shuffle=True, drop_last=True, **_loader_kwargs)
test_loader = torch.utils.data.DataLoader(
    ESC50Dataset(X_test, y_test), shuffle=False, drop_last=False, **_loader_kwargs)

In [None]:
# Sanity check: print the shape of the inputs in the first training batch, then stop.
for waveforms, _labels in train_loader:
    print(waveforms.shape)
    break

In [None]:
def sum_params(model):
    """Return the total element count of ``model``'s parameters that require gradients."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad == False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

from parallel_wavegan.models.hifigan import HiFiGANScaleDiscriminator

class HiFiGANClassifier(torch.nn.Module):
    """Classifier built on a HiFi-GAN scale discriminator.

    The discriminator is created with ``out_channels=n_out``; ``forward`` takes the
    last element of what it returns and averages that over its final axis.
    """

    def __init__(self, n_out):
        super().__init__()
        # downsample_scales from https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/ljspeech/voc1/conf/hifigan.v1.yaml#L71
        scales = [4, 4, 4, 4, 1]
        self.conv = HiFiGANScaleDiscriminator(out_channels=n_out, downsample_scales=scales)

    def forward(self, x):
        # NOTE(review): presumably the discriminator returns its per-layer outputs with the
        # final projection last, shaped (batch, n_out, time) — confirm against the library.
        outputs = self.conv(x)
        return outputs[-1].mean(dim=-1)


class ConvNextBlock(nn.Module):
    def __init__(self, channels, kernel=49, dropout=0.0):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(channels, channels, kernel, 1, padding='same', groups=channels),
            nn.GroupNorm(1, channels),
            nn.Conv1d(channels, channels*4, 1),
            nn.GELU(),
            nn.Conv1d(channels*4, channels, 1),
            nn.Dropout(dropout),
            nn.Dropout1d(dropout))
    def forward(self, x):
        shortcut = x
        x = self.conv(x)
        return shortcut + x

class ConvNextDownBlock(nn.Module):
    """Downsampling stage: a stride-4 convolution that doubles the channels, then three ConvNextBlocks.

    Args:
        channels_in: number of input channels; the stage outputs ``2 * channels_in``.
        dropout: dropout probability forwarded to every ConvNextBlock.
    """

    def __init__(self, channels_in, dropout=0.0):
        super().__init__()
        channels_out = 2 * channels_in
        # Kernel 4 with stride 4: non-overlapping windows.
        self.down = nn.Sequential(
            nn.Conv1d(channels_in, channels_out, kernel_size=4, stride=4),
            nn.GroupNorm(1, channels_out),
        )
        self.conv = nn.Sequential(
            *(ConvNextBlock(channels_out, dropout=dropout) for _ in range(3))
        )

    def forward(self, x):
        """Downsample ``x`` and pass it through the stacked blocks."""
        return self.conv(self.down(x))

class ConvNextlassifier(nn.Module):
    """ConvNeXt-style 1-D classifier operating on single-channel input.

    Architecture, in order:
      * a stem convolution (1 -> ``channels``, kernel 16, stride 16) followed by GroupNorm,
      * ``num`` ConvNextDownBlock stages, stage ``i`` receiving ``channels * 2**i`` channels,
      * a 1x1 convolution from ``channels * 2**num`` channels to ``n_out``,
      * a mean over the last axis.

    Args:
        n_out: number of output channels of the final convolution (class count).
        num: number of downsampling stages.
        channels: channel count produced by the stem.
        dropout: dropout probability forwarded to every stage.

    The class name is misspelled; it is kept for backward compatibility and the
    correctly spelled ``ConvNextClassifier`` alias is defined right after the class.
    """

    def __init__(self, n_out, num=4, channels=64, dropout=0.0):
        super().__init__()
        self.down = nn.Sequential(nn.Conv1d(1, channels, 16, 16), nn.GroupNorm(1, channels))
        self.conv = nn.Sequential(*[ConvNextDownBlock(channels*2**i, dropout) for i in range(num)])
        self.out = nn.Conv1d(channels*2**num, n_out, 1, 1)

    def forward(self, x):
        """Return the per-class scores averaged over the last axis."""
        x = self.down(x)
        x = self.conv(x)
        # The trailing squeeze(-1) only changes the shape when the averaged axis
        # leaves a final dimension of size 1 (i.e. n_out == 1 for batched input).
        x = self.out(x).mean(-1).squeeze(-1)
        return x


# Correctly spelled alias; the original name stays so existing references keep working.
ConvNextClassifier = ConvNextlassifier

# One output per category known to the label encoder.
n_classes = len(le.classes_)
model = HiFiGANClassifier(n_out=n_classes).to(device)
#model = ConvNextlassifier(n_out=n_classes, dropout=0.1).to(device)
criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

In [None]:
# Report the trainable-parameter count, then let the notebook render the module tree.
n_trainable = sum_params(model)
print(n_trainable)
model

In [None]:
def train(model, optimizer, criterion, iterator):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: module called as ``model(x)``; put into training mode here.
        optimizer: optimizer whose gradients are zeroed and stepped once per batch.
        criterion: callable ``criterion(y_pred, y)`` returning a scalar loss tensor.
        iterator: iterable of ``(x, y)`` batches.

    Returns:
        ``(mean_batch_loss, accuracy)``. The loss is the mean of the per-batch losses;
        the accuracy is correct predictions divided by the number of samples actually
        seen, so it does not depend on the notebook-level ``batch_size`` or on every
        batch having the same size.

    Raises:
        ZeroDivisionError: if ``iterator`` yields no batches.

    Each batch's loss is also shown on the progress bar and sent to ``wandb.log``.
    Batches are moved to the notebook-level ``device``.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_samples = 0
    n_batches = 0

    # Wrapping the iterable lets tqdm advance the bar itself, and the context
    # manager closes it even if a step raises.
    with tqdm(iterator, leave=False) as progress_bar:
        for x, y in progress_bar:
            x = x.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            y_pred = model(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            # Read the scalar once; it is reused for the running sum and the logs.
            batch_loss = loss.item()
            loss_sum += batch_loss
            n_correct += (y_pred.argmax(-1) == y).sum().item()
            n_samples += y.size(0)
            n_batches += 1

            logs = {"loss": batch_loss}
            progress_bar.set_postfix(**logs)
            wandb.log(logs)

    return loss_sum / n_batches, n_correct / n_samples

def evaluate(model, criterion, iterator):
    """Score ``model`` on ``iterator`` without computing gradients.

    Args:
        model: module called as ``model(x)``; put into eval mode here.
        criterion: callable ``criterion(y_pred, y)`` returning a scalar loss tensor.
        iterator: iterable of ``(x, y)`` batches.

    Returns:
        ``(mean_batch_loss, accuracy)``. The loss is the mean of the per-batch losses;
        the accuracy is correct predictions divided by the number of samples actually
        seen, so a short final batch (``drop_last=False``) or a loader with a different
        batch size is handled correctly.

    Raises:
        ZeroDivisionError: if ``iterator`` yields no batches.

    Batches are moved to the notebook-level ``device``.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_samples = 0
    n_batches = 0

    with torch.no_grad():
        for x, y in iterator:
            x = x.to(device)
            y = y.to(device)
            y_pred = model(x)
            loss_sum += criterion(y_pred, y).item()
            n_correct += (y_pred.argmax(-1) == y).sum().item()
            # Count real samples instead of assuming every batch is full.
            n_samples += y.size(0)
            n_batches += 1

    return loss_sum / n_batches, n_correct / n_samples

In [None]:
wandb.init(project='esc50', entity='had', name='HiFiGANClassifier')
wandb.watch(model)
N_EPOCHS = 10000

best_valid_loss = float('inf')

# With this many epochs the run is normally stopped by hand, so the final report and
# wandb.finish() live in `finally` to make sure they still execute on interruption.
try:
    for _ in tqdm(range(N_EPOCHS)):
        # Despite the names, both values are (loss, accuracy) tuples.
        train_loss = train(model, optimizer, criterion, train_loader)
        valid_loss = evaluate(model, criterion, test_loader)
        print(train_loss, valid_loss)
        wandb.log({"eval_loss": valid_loss[0], "eval_acc": valid_loss[1], "train_acc": train_loss[1]})
        # Track the lowest evaluation loss seen so far.
        best_valid_loss = min(best_valid_loss, valid_loss[0])
finally:
    print(best_valid_loss)
    wandb.finish()