In [1]:

import numpy as np
import matplotlib.pyplot as plt
import time
import torch
import torch.utils.data as data
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.functional import interpolate 
from torch.autograd import Variable
import imp
import torchaudio
import torchvision as tv
import matplotlib.pyplot as plt
from IPython.display import Audio
import librosa.display
import os, random
import pandas as pd
# import mir_utils as miru
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import librosa
# import pytorch_utils
# import pytorch_models
#reload these libraries because I change them often-ish
# imp.reload(pytorch_utils)
# imp.reload(miru)
# imp.reload(pytorch_models)
from scipy.signal import resample
from sklearn import preprocessing
# Label encoder for the "major" class labels.
# NOTE(review): no .fit() call is visible in this notebook before
# le_major.transform() is used in the last cell — confirm it is fitted elsewhere.
le_major = preprocessing.LabelEncoder()

# Nominal sample rate shared by the helpers and the dataset below.
SR = 44100
# functions
# Short alias for torchaudio's functional spectrogram.
spec = torchaudio.functional.spectrogram

def getMeanLength(x):
    """Print the mean clip length of ``x`` plus the label of its first row.

    The length of each row's ``"audio"`` entry is measured with ``len``.
    Three values are printed: the mean length divided by the global ``SR``,
    the mean length itself, and ``x["label"]`` of the first row.
    Nothing is returned.
    """
    sample_counts = x.apply(lambda row: len(row["audio"]), axis=1)
    mean_samples = sample_counts.mean()
    first_label = x["label"].iloc[0]
    print(mean_samples / SR, mean_samples, first_label)

# Table of audio clips; later cells read its "path", "maj" and "min" columns.
audio_df = pd.read_csv("csvs/audio_df.csv")
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [2]:
# Binarise the "maj" label: 1 for "drums", 0 for every other class.
# The dtype guard makes re-running this cell a no-op. Without it, a second
# run compares the already-converted 0/1 integers against "drums" and
# silently resets every label to 0.
if not pd.api.types.is_integer_dtype(audio_df["maj"]):
    audio_df["maj"] = (audio_df["maj"] == "drums").astype(int)

In [3]:
%%time
#define a dataset
# Resample all sounds to 22050 Hz and keep the first half second (samples 0:11025).
class audioDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding fixed-length clips described by a DataFrame.

    Each clip is loaded at half the nominal sample rate (``SR // 2``) and then
    cropped or zero-padded to exactly ``SR // 4`` samples, i.e. its first half
    second.

    Args:
        audio_df: table with one row per clip; the columns ``"path"``,
            ``"maj"`` and ``"min"`` are read.
        SR: nominal sample rate. The loading rate and the clip length are both
            derived from it.
        transform: stored on the instance but not applied; kept so existing
            call sites that pass it keep working.
    """

    def __init__(self,audio_df,SR=44100,transform=None):
        self.audio_df = audio_df
        # Keep the rate on the instance so __getitem__ no longer depends on a
        # module-level SR that may differ from the one passed in here.
        self.SR = SR
        self.target_sr = SR // 2
        self.minLength = SR//4
        self.transform = transform

    def __len__(self):
        """Return the number of clips (rows of the backing DataFrame)."""
        return len(self.audio_df)

    def __getitem__(self, idx):
        """Return one clip as a dict.

        Keys: ``"signal"`` (float64 array of ``minLength`` samples),
        ``"major"``, ``"minor"``, ``"path"`` and ``"sr"``.

        A clip that cannot be loaded is replaced by silence (all zeros) and a
        warning is issued, so one bad file does not abort a whole epoch.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        row = self.audio_df.iloc[idx]
        path = row["path"]

        # Start from silence: short or unreadable clips stay zero-padded.
        signal = np.zeros(self.minLength)
        try:
            # Load straight at the target rate. This replaces a load at
            # librosa's default rate followed by a positional
            # librosa.resample call, which newer librosa releases reject
            # because orig_sr/target_sr are keyword-only there.
            loaded, _ = librosa.load(path, sr=self.target_sr)
        except Exception as err:  # best-effort, but let KeyboardInterrupt through
            import warnings

            warnings.warn(f"could not load {path!r} ({err!r}); using silence instead")
        else:
            keep = min(len(loaded), self.minLength)
            signal[:keep] = loaded[:keep]

        # NOTE(review): "sr" reports the nominal rate although the samples are
        # at SR // 2. Kept as-is for compatibility; confirm this is intended.
        sound={"signal":signal,"major":row["maj"],"minor":row["min"],"path":path,"sr":self.SR}
        return sound
  

CPU times: user 32 µs, sys: 14 µs, total: 46 µs
Wall time: 49.8 µs


In [4]:

# Fixed seed so the row shuffle and the train/validation split are the same
# after every kernel restart.
SPLIT_SEED = 42

# Shuffle the rows once, then hold out 10% of them for validation.
# sample() already returns a new frame, so no explicit copy is needed.
adf = audio_df.sample(frac=1, random_state=SPLIT_SEED)
train,val = train_test_split(adf, test_size=0.1, random_state=SPLIT_SEED)

# Reshuffle the training clips every epoch; validation order stays fixed.
train_loader = DataLoader(audioDataset(train,SR), batch_size=64,shuffle=True, num_workers=8)
val_loader = DataLoader(audioDataset(val,SR), batch_size=128,shuffle=False, num_workers=8)

# Pull a single batch; a later cell uses it to check the model's output shape.
sample_iterator = iter(train_loader)
d = next(sample_iterator)

In [5]:
import math
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    Even feature columns hold sines and odd columns hold cosines of
    ``position * div_term``. The table is built once for ``max_len`` positions
    and registered as the buffer ``pe`` with shape ``(max_len, 1, d_model)``.

    Args:
        d_model: size of the last (feature) dimension of the input.
        dropout: dropout probability applied after the encodings are added.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one cosine column fewer than sine
        # columns; slicing keeps the assignment shape-compatible instead of
        # raising. For an even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:len(x)])``; dim 0 of ``x`` is the sequence position."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [6]:

import torchmetrics
from torchmetrics.functional import auc
from torch.nn import functional as F
from torch import nn
import optuna
from optuna.integration import PyTorchLightningPruningCallback
import pytorch_lightning as pl
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,LearningRateMonitor
import torchaudio
class Transformer_DVN(LightningModule):
    """Binary audio classifier: spectrogram -> Transformer encoder -> pooled linear head.

    The waveform is turned into a spectrogram with ``d_model`` frequency bins,
    and each time frame is treated as one token. After positional encoding and
    the Transformer encoder, the sequence is reduced over time by adaptive
    average and max pooling. The pooled features are mapped to a single logit.

    Args:
        attention_dropout: dropout used inside every encoder layer.
        d_model: token width, which is also the number of spectrogram bins.
        heads: number of attention heads (must divide ``d_model``).
        encoding_layers: number of stacked encoder layers.
        pool_dim: output length of each adaptive pooling over time.
        pct_start: fraction of the one-cycle schedule spent increasing the lr.
        max_lr: peak learning rate of the one-cycle schedule.
        max_momentum: upper momentum bound of the one-cycle schedule.
        epochs: number of epochs the one-cycle schedule is sized for.
    """

    def __init__(self,attention_dropout=0.3,d_model=100,heads=25,encoding_layers=16,pool_dim=2,pct_start=0.05,max_lr=1e-4,max_momentum=0.95,epochs = 50):
        super().__init__()
        self.attention_dropout=attention_dropout
        self.d_model = d_model
        self.heads = heads
        self.encoding_layers = encoding_layers
        self.pool_dim = pool_dim
        self.pct_start = pct_start
        self.max_lr = max_lr
        self.max_momentum = max_momentum
        self.epochs = epochs
        # n_fft = 2 * d_model - 1 gives n_fft // 2 + 1 == d_model one-sided
        # frequency bins, so every spectrogram frame is a d_model-wide token.
        self.spectrogram_func = torchaudio.transforms.Spectrogram(n_fft = int(self.d_model*2)-1, hop_length = 200, power = 0.2, normalized = True)

        self.pos_encoder = PositionalEncoding(self.d_model, 0.1)
        # nn.TransformerEncoder stacks copies of this layer. The attribute is
        # kept so the parameter names in existing checkpoints do not change.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=self.d_model, nhead=self.heads,
                                                        dropout = self.attention_dropout,)

        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=self.encoding_layers)
        self.adaptiveavgpool = nn.AdaptiveAvgPool1d(self.pool_dim)
        self.adaptivemaxpool = nn.AdaptiveMaxPool1d(self.pool_dim)

        # Input width: d_model channels * 2 poolings * pool_dim positions.
        # NOTE(review): three Linear layers with no activation in between
        # compose to a single affine map; confirm whether non-linearities
        # were intended. Left unchanged to keep checkpoints loadable.
        self.decoder = nn.Sequential(
          nn.Linear(self.d_model*2*self.pool_dim,64),
          nn.Linear(64,32),
          nn.Linear(32,1),
        )

    def forward(self, x):
        """Return one raw logit per clip, shape ``(batch, 1)``.

        ``x`` is a ``(batch, samples)`` batch of waveforms; it is cast to
        float32 before the spectrogram is computed.
        """
        x = x.float()
        # (batch, freq, frames) -> (frames, batch, freq): sequence-first layout.
        tokens = self.spectrogram_func(x).permute(2, 0, 1)
        tokens = self.pos_encoder(tokens)
        # (frames, batch, freq) -> (batch, freq, frames) so pooling runs over time.
        encoded = self.transformer_encoder(tokens).permute(1, 2, 0)
        pooled = torch.cat((self.adaptiveavgpool(encoded), self.adaptivemaxpool(encoded)), dim=1)
        return self.decoder(pooled.flatten(start_dim=1))

    def step(self, batch, batch_idx):
        """Shared train/validation step.

        Returns ``(loss, logs)`` where ``logs`` holds the BCE-with-logits loss
        and the batch accuracy.
        """
        x, y = batch["signal"].float(),batch["major"].float().reshape(-1,1)
        # Reuse forward() instead of repeating its body here.
        out = self(x)
        loss = F.binary_cross_entropy_with_logits(out, y,)
        # `out` holds logits, so the 0.5-probability decision boundary sits at
        # logit 0. Thresholding here avoids handing raw logits to a metric
        # that expects probabilities.
        accuracy = ((out >= 0) == (y >= 0.5)).float().mean()
        return loss, {"loss": loss,"accuracy":accuracy}

    def training_step(self, batch, batch_idx):
        """Run one training batch and log epoch-level ``train_*`` metrics."""
        loss, logs = self.step(batch, batch_idx)
        self.log_dict({f"train_{k}": v for k, v in logs.items()}, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Run one validation batch and log epoch-level ``val_*`` metrics."""
        loss, logs = self.step(batch, batch_idx)
        self.log_dict({f"val_{k}": v for k, v in logs.items()}, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return Adam together with a per-step one-cycle learning-rate schedule."""
        # The lr given here is only a placeholder: OneCycleLR overrides it and
        # drives the rate from max_lr / div_factor up to max_lr and back down.
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-8)
        # The schedule covers exactly self.epochs epochs. OneCycleLR raises a
        # ValueError when stepped beyond that, so the Trainer's max_epochs
        # must not exceed self.epochs.
        # NOTE(review): self.train_dataloader() relies on the Trainer exposing
        # the fitted loader on the module; confirm for the installed
        # Lightning version.
        lr_scheduler = {'scheduler': torch.optim.lr_scheduler.OneCycleLR(
                                        optimizer,
                                        pct_start = self.pct_start,
                                        max_lr=self.max_lr,
                                        steps_per_epoch=int(len(self.train_dataloader())),
                                        epochs=self.epochs,
                                        anneal_strategy="cos",
                                        final_div_factor = 1000,
                                        max_momentum=self.max_momentum,
                                    ),
                        'name': 'learning_rate',
                        'interval':'step',
                        'frequency': 1}
        return [optimizer],[lr_scheduler]



# Build the classifier with this notebook's hyperparameters.
model = Transformer_DVN(
    attention_dropout=0.3,
    d_model=120,
    heads=30,
    encoding_layers=12,
    pool_dim=2,
)

# Shape check on the sample batch: expect one logit per clip.
model(d["signal"]).shape

torch.Size([64, 1])

In [None]:
# Keep the three checkpoints with the highest validation accuracy.
checkpoint_callback =  ModelCheckpoint(
    monitor='val_accuracy',
    dirpath='models/transformer',
    filename='DVN{epoch:2d}-{val_accuracy:.3f}-{val_loss:.3f}',
    save_top_k=3,
    mode='max',
)
# Record the learning rate at every optimizer step.
lr_monitor = LearningRateMonitor(logging_interval='step')

# max_epochs is tied to model.epochs. The OneCycleLR schedule built in
# configure_optimizers spans exactly model.epochs epochs and raises a
# ValueError once it is stepped past its total step count. A hard-coded
# larger value (previously 100 against a 50-epoch schedule) would abort
# training mid-run.
trainer = pl.Trainer(gpus=1,precision=16,callbacks=[checkpoint_callback,lr_monitor],log_every_n_steps=1,max_epochs=model.epochs,stochastic_weight_avg=True,)
trainer.fit(model,train_loader,val_loader,)

In [None]:
# Hand-written training loop for the earlier 1-D convolutional model.
# NOTE(review): cnet, optimizer, loss_func and test_loader are not defined in
# any visible cell, and le_major is never fitted in the visible cells. This
# cell cannot run after "Restart & Run All" until those are restored.
SCALE_FACTOR = 0.5
num_windows = 5
window_shift = 300
smallest_loss,smallest_vloss = 1000,1000
step = 0


def _downsample(signal_batch):
    """Rescale a batch of waveforms in time by SCALE_FACTOR; returns (batch, 1, samples')."""
    n_clips = len(signal_batch)
    return interpolate(signal_batch.reshape([n_clips,1,-1]),scale_factor = SCALE_FACTOR,recompute_scale_factor=False).reshape([n_clips,1,-1])


for epoch in range(40):
    # The loop variable is `batch`, not `data`, so that the
    # `torch.utils.data as data` import alias is not overwritten.
    for i, batch in enumerate(train_loader, 0):
        signal = _downsample(batch["signal"])

        optimizer.zero_grad()
        outputs = cnet(signal,)
        y = torch.tensor(le_major.transform(batch["major"])).to(device)
        loss = loss_func(outputs,y.long())
        loss.backward()
        optimizer.step()
        # Loss of the current batch only (it was reset on every batch before, too).
        train_loss = loss.item()
        if i%5==0:
            # Validate in eval mode and without touching the optimizer.
            cnet.eval()
            val_loss = 0.0
            with torch.no_grad():
                for datav in test_loader:
                    # Same (batch, 1, samples') layout as the training input.
                    voutputs = cnet(_downsample(datav["signal"]),)
                    vy = torch.tensor(le_major.transform(datav["major"])).to(device)
                    # Accumulate a plain float rather than a tensor.
                    val_loss += loss_func(voutputs,vy.long()).item()
            cnet.train()
            print('[%d, %d] val loss: %.5f, loss: %.5f'%(epoch + 1, i , val_loss,train_loss))
            if val_loss < smallest_vloss:
                torch.save({
                'epoch': epoch,
                # Summed validation loss, i.e. the value the checkpoint was
                # selected on. The loss of the last validation batch was
                # stored here before.
                'vloss': val_loss,
                'model_state_dict': cnet.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                }, "models/1d_conv//%.3f_%.4f_.checkpoint"%(val_loss,train_loss,))
                smallest_vloss = val_loss
                smallest_loss = train_loss