In [1]:
from audio_dataset import MelSpectrogramDataset,denormalize

from pathlib import Path

import os

import numpy as np
import pandas as pd

import librosa
import librosa.display

import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
import torchvision.models as torch_models
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

from fastai.vision.all import *

  return torch._C._cuda_getDeviceCount() > 0


In [2]:
# Project layout: `audio_data` is handed to the spectrogram datasets,
# `data` holds the CSV split files read in the next cell.
root = '/home/jiajia2011us/heart_sz'
audio_data = f'{root}/audio_data'
data = f'{root}/data'

In [3]:
# Training rows come straight from train.csv.
df_train = pd.read_csv(f'{data}/train.csv')

# valid_test.csv carries a `valid` flag column; only rows where it equals True
# are kept for validation.
df_valid_test = pd.read_csv(f'{data}/valid_test.csv')
df_valid = df_valid_test.loc[df_valid_test['valid'] == True]

In [4]:
def get_train_transform():
    """Build the albumentations pipeline applied to training images.

    Only normalisation and tensor conversion are active. Horizontal/vertical
    flips and a 512x512 resize were tried here and are currently disabled.
    """
    steps = [
        A.Normalize(p=1.0),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps, p=1.0)

def get_valid_transform():
    """Build the albumentations pipeline applied to validation images.

    Normalisation followed by tensor conversion; a 512x512 resize was tried
    here and is currently disabled.
    """
    steps = [
        A.Normalize(p=1.0),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps, p=1.0)

In [5]:
# Each split gets its own image transform pipeline; both read audio from `audio_data`.
train_tfms = get_train_transform()
valid_tfms = get_valid_transform()

train_ds = MelSpectrogramDataset(df_train, audio_data, img_tfms=train_tfms)
valid_ds = MelSpectrogramDataset(df_valid, audio_data, img_tfms=valid_tfms)

In [39]:
# Dataset sizes as (train, valid).
tuple(len(ds) for ds in (train_ds, valid_ds))

(394, 149)

In [6]:
#Mish - "Mish: A Self Regularized Non-Monotonic Neural Activation Function"
#https://arxiv.org/abs/1908.08681v1
#implemented for PyTorch / FastAI by lessw2020 
#github: https://github.com/lessw2020/mish
class Mish(nn.Module):
    """Mish activation: ``x * tanh(softplus(x))``, applied element-wise."""

    def forward(self, x):
        # Kept as one expression with no temporaries; the original author
        # measured roughly 1 s/epoch saved on a V100 compared with using a temp.
        return torch.mul(x, torch.tanh(F.softplus(x)))

In [34]:
class Model_Head(nn.Module):
    '''
    Output head applied to the backbone feature map.

    Layer order: Mish -> ConvLayer(ni, ni) without activation ->
    AdaptiveConcatPool2d -> Flatten -> LinBnDrop(ni*2 -> 512, Mish) ->
    LinBnDrop(512 -> nc).
    '''
    def __init__(self, ni, nc, ps=0.25):
        '''
        ni : input filter size
        nc : output class size
        ps : dropout rate of the hidden linear block (the final block uses ps*2)
        '''
        super().__init__()
        self.head = nn.Sequential(
            Mish(),
            ConvLayer(ni, ni, act_cls=None),
            AdaptiveConcatPool2d(),
            Flatten(),
            # the pooled features are consumed as ni*2 inputs
            LinBnDrop(ni * 2, 512, p=ps, act=Mish()),
            LinBnDrop(512, nc, p=ps * 2),
        )

    def forward(self, xb):
        return self.head(xb)
    
class Resnet_audio(nn.Module):
    '''
    Shared convolutional body with three parallel output heads (tsh, t3, t4).

    arch       : backbone constructor, called as arch(pretrained=pretrained)
    nc         : output sizes of the tsh, t3 and t4 heads, in that order
                 (default 1,1,1 : one regression value per head)
    pretrained : forwarded to arch
    '''
    def __init__(self,arch,nc=[1,1,1],pretrained=True):
        super().__init__()

        # Body = every child of the backbone except its last two.
        backbone_layers = list(arch(pretrained=pretrained).children())
        self.body = nn.Sequential(*backbone_layers[:-2])

        # All heads consume the same feature map, so they share one input width.
        n_feat = num_features_model(self.body)
        self.head_tsh = Model_Head(n_feat, nc[0])
        self.head_t3 = Model_Head(n_feat, nc[1])
        self.head_t4 = Model_Head(n_feat, nc[2])

    def forward(self,x):
        '''Return a (tsh, t3, t4) tuple of head outputs for input batch x.'''
        feats = self.body(x)
        out_tsh = self.head_tsh(feats)
        out_t3 = self.head_t3(feats)
        out_t4 = self.head_t4(feats)
        return (out_tsh, out_t3, out_t4)
    
def to_mish(model):
    '''Recursively replace every nn.ReLU child of `model` with Mish, in place.'''
    for child_name, child in model.named_children():
        if not isinstance(child, nn.ReLU):
            # not a ReLU itself: descend and look for ReLUs further down
            to_mish(child)
            continue
        setattr(model, child_name, Mish())

In [35]:
# ResNet-34 backbone with the default three single-output heads.
model = Resnet_audio(arch=torch_models.resnet34)

In [66]:
class Loss_multi_head(nn.Module):
    '''
    Weighted sum of the per-head MSE losses for the tsh, t3 and t4 outputs.

    weights : three scalars multiplying the tsh, t3 and t4 losses, in that order
    '''
    def __init__(self,weights=[1,1,1]):
        super().__init__()
        # Copy the argument: storing the default list itself would let an edit
        # to one instance's weights leak into every later default instance.
        self.weights = list(weights)

    def forward(self,preds,target):
        '''
        preds  : (tsh, t3, t4) tuple of head outputs
        target : sequence whose first element unpacks into the
                 (tsh, t3, t4) target tensors
        returns the scalar weighted loss
        '''
        outp_tsh,outp_t3,outp_t4 = preds
        outp_tsh,outp_t3,outp_t4 = outp_tsh.float(),outp_t3.float(),outp_t4.float()
        targ_tsh,targ_t3,targ_t4 = target[0]
        # squeeze(-1) drops only the trailing singleton output dim. A bare
        # squeeze() would also drop the batch dim for a batch of one sample,
        # making mse_loss warn about mismatched sizes and broadcast.
        return (
            self.weights[0] * F.mse_loss(outp_tsh.squeeze(-1),targ_tsh)
            + self.weights[1] * F.mse_loss(outp_t3.squeeze(-1),targ_t3)
            + self.weights[2] * F.mse_loss(outp_t4.squeeze(-1),targ_t4)
        )
        

In [None]:
def audio_mse(inp,targ,weights=(1,1,1)):
    '''
    Weighted sum of the per-head MSE values, as a plain function.

    inp     : (tsh, t3, t4) tuple of head outputs
    targ    : sequence whose first element unpacks into the
              (tsh, t3, t4) target tensors
    weights : three scalars multiplying the tsh, t3 and t4 terms
    '''
    outp_tsh,outp_t3,outp_t4 = inp
    outp_tsh,outp_t3,outp_t4 = outp_tsh.float(),outp_t3.float(),outp_t4.float()
    targ_tsh,targ_t3,targ_t4 = targ[0]
    # The weights are a parameter because a free function has no `self` to read
    # them from. squeeze(-1) drops only the trailing singleton output dim, so a
    # batch of one sample keeps its batch dimension.
    return (
        weights[0] * F.mse_loss(outp_tsh.squeeze(-1),targ_tsh)
        + weights[1] * F.mse_loss(outp_t3.squeeze(-1),targ_t3)
        + weights[2] * F.mse_loss(outp_t4.squeeze(-1),targ_t4)
    )

In [67]:
# Equal weighting of the tsh, t3 and t4 terms.
loss_func = Loss_multi_head(weights=[1, 1, 1])

In [50]:
# Settings shared by both plain-PyTorch loaders.
loader_kwargs = dict(batch_size=32, num_workers=4, pin_memory=False)

# Training: random order, incomplete final batch dropped.
train_loader = torch.utils.data.DataLoader(
    train_ds,
    sampler=RandomSampler(train_ds),
    drop_last=True,
    **loader_kwargs,
)

# Validation: fixed sequential order, every sample kept.
val_loader = torch.utils.data.DataLoader(
    valid_ds,
    sampler=SequentialSampler(valid_ds),
    shuffle=False,
    **loader_kwargs,
)

In [70]:
from fastai.data.core import DataLoaders

In [74]:
# fastai DataLoaders built from the two datasets, batch size 32.
dls = DataLoaders.from_dsets(
    train_ds,
    valid_ds,
    bs=32,
)

In [75]:
# Grab a single batch from `dls` to sanity-check shapes before training.
b = dls.one_batch()

In [76]:
# b[0] is the input batch fed to the model; b[1] is the target container passed to the loss.
b[0].shape,len(b[1])

(torch.Size([32, 3, 128, 235]), 2)

In [77]:
# Forward pass on the sample inputs; the model returns a (tsh, t3, t4) tuple of head outputs.
preds = model(b[0])

In [81]:
# Loss on the sample batch. b[1] is passed whole because the loss reads the
# three targets from its first element.
loss_func(preds,b[1])

tensor(175.6771, dtype=torch.float64, grad_fn=<AddBackward0>)

In [84]:
def multi_head_mse(preds, target):
    """Report the weighted multi-head MSE (same value as `loss_func`) as a metric."""
    return loss_func(preds, target)

# fastai derives a metric's column name from the callable's `__name__`. An
# nn.Module instance has none, so passing `loss_func` itself as a metric fails
# once training starts. A named wrapper function avoids that.
learn = Learner(dls,model,loss_func=loss_func,metrics=multi_head_mse)