In [None]:
!git clone https://github.com/kovacsdotgergo/szakdolgozat.git
%cd szakdolgozat
!pip install wget torch torchvision torchaudio matplotlib pandas numpy timm==0.4.5

In [None]:
#TODO: tmp for branch
# Switches the cloned repository to the feature/cnn branch. `git branch` is
# run before and after the checkout so the cell output shows which branch was
# active and which one is active afterwards.
# NOTE(review): marked temporary by the TODO above -- presumably to be removed
# once feature/cnn is merged; confirm.
!git branch
!git checkout feature/cnn
!git branch

In [None]:
import utils
# utils.setup_env() is a project helper (not shown in this notebook); it
# returns three values, unpacked here:
#   esc_path       - passed to esc_dataset.ESCdataset as its first argument
#   save_path      - string-concatenated with a file name in the commented-out
#                    model save/load code
#   workspace_path - not used elsewhere in this notebook
# NOTE(review): presumably all three are filesystem paths -- confirm in utils.
esc_path, save_path, workspace_path = utils.setup_env()

In [None]:
import torch
import torch.nn as nn
#128x512(436) input mel spect
class conv2d_v1(nn.Module):
  """Six-block 2D CNN classifier for single-channel spectrogram batches.

  Every block is Conv2d(3x3, stride 1, padding 1) -> ReLU -> MaxPool2d(2), so
  each block keeps the spatial size in the convolution and halves it in the
  pooling. Blocks 3, 4 and 5 are additionally followed by Dropout. The
  flattened features go through a two-layer MLP head.

  The head expects 2*8*512 flattened features, i.e. 16 spatial positions left
  after the six poolings (for example a 128x512 input).

  Args:
    num_classes: number of output units of the last linear layer
      (default 50, the previously hard-coded value).
    dropout: drop probability used by every Dropout layer (default 0.5,
      the nn.Dropout default that was used implicitly before).

  The layers are kept in one flat nn.Sequential in the original order, so the
  parameter names in the state_dict (conv_layers.0, conv_layers.3, ...,
  mlp.0, mlp.3) are unchanged and previously saved weights still load.
  """

  def __init__(self, num_classes=50, dropout=0.5):
    super().__init__()
    # (in_channels, out_channels, dropout after pooling) for each block;
    # the trailing comment is the spatial size after the block's pooling
    # for a 128*512 input
    block_specs = [
        (1, 16, False),     #64*256
        (16, 32, False),    #32*128
        (32, 64, True),     #16*64
        (64, 128, True),    #8*32
        (128, 256, True),   #4*16
        (256, 512, False),  #2*8
    ]
    layers = []
    for in_channels, out_channels, with_dropout in block_specs:
      layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=(3, 3),
                              stride=(1, 1), padding=(1, 1)))
      layers.append(nn.ReLU())
      layers.append(nn.MaxPool2d(2))
      if with_dropout:
        layers.append(nn.Dropout(dropout))
    self.conv_layers = nn.Sequential(*layers)
    #flatten all dimensions except batch
    self.flatten = nn.Flatten(1)
    #fully connected layers
    self.mlp = nn.Sequential(
        nn.Linear(2*8*512, 1024),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(1024, num_classes),
    )

  def forward(self, x):
    """Returns the raw (un-normalized) class scores for a batch.

    Args:
      x: tensor of shape (batch, height, width) without a channel axis;
        the channel axis is inserted here.

    Returns:
      Tensor of shape (batch, num_classes).
    """
    #adding the channel dimension expected by Conv2d: (B, H, W) -> (B, 1, H, W)
    x = torch.unsqueeze(x, 1)
    x = self.conv_layers(x)
    x = self.flatten(x)
    return self.mlp(x)

In [None]:
import torch
import torch.nn as nn
import esc_dataset
import trainer
import numpy as np

## Configuration
# Seed of the train/val/test partition. Without a fixed seed the partition
# changes on every kernel restart, so a checkpoint loaded from disk would be
# tested on samples it may have been trained on.
SPLIT_SEED = 0
BATCHSIZE = 16

have_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if have_cuda else 'cpu')

## Model
audio_model = conv2d_v1()
audio_model = torch.nn.DataParallel(audio_model, device_ids=[0])
audio_model = audio_model.to(device)

## Dataset
# NOTE(review): resample_rate=22500 looks like a typo for the common 22050 Hz;
# left unchanged because it affects the computed features -- confirm.
dataset = esc_dataset.ESCdataset(esc_path, n_fft=1024, hop_length=256,
                     n_mels=128, augment=False,  log_mel=True,
                     use_kaldi=True, target_len=512, resample_rate=22500)

#dividing the dataset randomly, 80% train, 10% validation, 10% test
numtrain = int(0.8*len(dataset))
numval = (len(dataset) - numtrain) // 2
numtest = len(dataset) - numtrain - numval
#a dedicated generator makes the partition reproducible without touching the
#global RNG used for weight initialisation and shuffling
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
split_dataset = torch.utils.data.random_split(dataset, [numtrain, numval, numtest],
                                              generator=split_generator)
#using augment on the training data
#NOTE(review): random_split returns Subset wrappers around the same `dataset`
#object, so the line below would only set an attribute on the wrapper and
#would presumably not switch on augmentation in ESCdataset -- confirm before
#enabling it
#split_dataset[0].augment = True

## DataLoader
trainloader = torch.utils.data.DataLoader(split_dataset[0], batch_size=BATCHSIZE,
                         shuffle=True, num_workers=2)
#NOTE(review): shuffling the validation/test loaders is kept as it was; it is
#unnecessary for metrics computed over the full split -- confirm how Trainer
#consumes them before switching to shuffle=False
valloader = torch.utils.data.DataLoader(split_dataset[1], batch_size=BATCHSIZE, shuffle=True)
testloader = torch.utils.data.DataLoader(split_dataset[2], batch_size=BATCHSIZE, shuffle=True)

## Trainer
#the name `trainer` is rebound here from the imported module to the Trainer
#instance; re-running the cell still works because the `import trainer` at the
#top of this cell binds the module again first
#the loss is passed as a class, not as an instance
trainer = trainer.Trainer(audio_model, have_cuda, criterion=nn.CrossEntropyLoss)

## Inference
#sanity check with the untrained model on the first sample of the full dataset
spect, label = dataset[0]
predicted_index = trainer.inference(spect, ret_index=True).item()
print(f'trainer inference: {dataset.get_class_name(predicted_index)}, '
    f'true label: {dataset.get_class_name(label)}')

## Training
#learning-rate sweep over 1e-2, 1e-3, 1e-4, 1e-5, 1e-6
lrs = np.logspace(-2, -6, num=5)
params = trainer.hyperparameter_plotting(lrs, trainloader, valloader, train_epochs=5)
# save_name = 'tmp.pth'
# trainer.train(trainloader, valloader, optimizer=torch.optim.AdamW, train_epochs=1,
#               val_interval=25, lr=5e-06, save_best_model=True, save_path=save_path + save_name)
# trainer.plot_train_proc('30 epoch training')

# ## Test
# trainer.load_model(save_path + save_name)
# print(f'test accuracy: {trainer.test(testloader)}')

# ## Inference after training
# print(f'Trainer inference after training: {dataset.get_class_name(trainer.inference(spect, ret_index=True).item())}, '
#     f'true label: {dataset.get_class_name(label)}')