In [None]:
import librosa
import argparse
import json
import os
import time
import shutil
import pandas as pd
import numpy as np
import pickle as pkl 
from PIL import Image
import lmdb
import random

import torch
import torch.nn as nn
from torch.utils.data import *
import torchaudio
import torchvision
import torchvision.models as models
from torchsummary import summary

from tqdm import tqdm
from tensorboardX import SummaryWriter

import wandb

In [None]:
# Authenticate this session with Weights & Biases before wandb.init / wandb.log are used below.
wandb.login()

In [None]:
# `%cd` with no argument switches the kernel's working directory to the user's home
# directory; every relative `DataSci251_FinalProject/...` path below is resolved from there.
%cd

### Preprocessing: 3-channel log-mel spectrograms (one channel per window/hop size)

In [None]:
def extract_spectrogram(values, clip, entries):
    """Append one feature record to ``values`` for every metadata row of a clip.

    Three log-mel spectrograms are computed from ``clip`` at a 44.1 kHz sampling
    rate (window/hop sizes of 25/10, 50/25 and 100/50 ms, ``n_fft=4410``,
    128 mel bins). Each one is log-scaled, resized to 128 x 250 and the three
    are stacked into a ``(3, 128, 250)`` array.

    Args:
        values: list that is extended in place with the new records.
        clip: 1-D waveform samples (anything ``torch.Tensor`` accepts).
        entries: list of metadata dicts for this clip; only ``"target"`` is read.

    Returns:
        None. Each appended record is a dict with keys ``"audio"`` (waveform as
        a float32 numpy array), ``"values"`` (the spectrogram stack) and
        ``"target"``.

    Note:
        The spectrograms depend only on ``clip``, so they are computed once and
        the same arrays are referenced by every record created for this clip.
    """
    entries = list(entries)
    if not entries:
        # Nothing to emit, so skip the (expensive) spectrogram computation.
        return

    sampling_rate = 44100
    window_sizes = [25, 50, 100]  # milliseconds
    hop_sizes = [10, 25, 50]      # milliseconds
    eps = 1e-6                    # keeps log() finite on silent frames

    waveform = torch.Tensor(clip)
    resize = torchvision.transforms.Resize((128, 250))

    specs = []
    for window_ms, hop_ms in zip(window_sizes, hop_sizes):
        # Convert millisecond settings into sample counts.
        window_length = int(round(window_ms * sampling_rate / 1000))
        hop_length = int(round(hop_ms * sampling_rate / 1000))

        spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sampling_rate,
            n_fft=4410,
            win_length=window_length,
            hop_length=hop_length,
            n_mels=128,
        )(waveform)
        spec = np.log(spec.numpy() + eps)
        # Round-trip through a PIL image so every resolution ends up 128 x 250.
        specs.append(np.asarray(resize(Image.fromarray(spec))))

    audio = waveform.numpy()
    features = np.array(specs)
    for data in entries:
        values.append({
            "audio": audio,
            "values": features,
            "target": data["target"],
        })

def extract_features(audios, data_dir='DataSci251_FinalProject/DataSet/ESC-50-master/audio'):
    """Load every distinct audio file listed in ``audios`` and build its features.

    Args:
        audios: DataFrame with at least a ``filename`` column and the columns
            read by ``extract_spectrogram`` (``target``).
        data_dir: directory containing the audio files. Defaults to the
            ESC-50 audio folder used throughout this notebook.

    Returns:
        list of feature records (see ``extract_spectrogram``), ordered by the
        first appearance of each filename in ``audios``.
    """
    sampling_rate = 44100

    values = []
    # One pass over the frame; sort=False keeps first-appearance order, which is
    # the order the previous per-filename lookup produced.
    for audio, rows in audios.groupby("filename", sort=False):
        clip, sr = librosa.load(os.path.join(data_dir, audio), sr=sampling_rate)
        entries = rows.to_dict(orient="records")
        extract_spectrogram(values, clip, entries)
        print("Finished audio {}".format(audio))
    return values

In [None]:
audios = pd.read_csv('DataSci251_FinalProject/DataSet/ESC-50-master/meta/esc50.csv', skipinitialspace=True)
num_folds = 5

# NOTE: there is no path separator between `store_dir` and the file name below, so the
# pickles are written next to this directory as "store_spectograms_2training128mel1.pkl"
# etc. The training cell builds its paths the same way, so the two stay in sync; change
# both together if the files should live inside the directory.
store_dir = 'DataSci251_FinalProject/DataSet/ESC-50-master/store_spectograms_2'

# Extract features once per fold and reuse them for every split, instead of
# re-decoding and re-transforming each clip for every split it takes part in.
fold_ids = list(audios["fold"].unique())
features_by_fold = {
    fold: extract_features(audios.loc[audios["fold"] == fold])
    for fold in fold_ids
}

for i in range(1, num_folds+1):
    # Training split = every fold except the held-out one. Records are grouped by
    # fold rather than in CSV order; the training DataLoader shuffles, so order is irrelevant.
    training_values = [
        entry
        for fold in fold_ids if fold != i
        for entry in features_by_fold[fold]
    ]
    with open("{}training128mel{}.pkl".format(store_dir, i),"wb") as handler:
        pkl.dump(training_values, handler, protocol=pkl.HIGHEST_PROTOCOL)

    validation_values = features_by_fold.get(i, [])
    with open("{}validation128mel{}.pkl".format(store_dir, i),"wb") as handler:
        pkl.dump(validation_values, handler, protocol=pkl.HIGHEST_PROTOCOL)

---

In [None]:
class MelSpectrogram(object):
    """Waveform -> stacked log-mel spectrogram transform with optional augmentation.

    In ``"train"`` mode the waveform is first pitch-shifted by a random integer
    number of steps in [-2, 2] and time-stretched by a random rate in
    [0.9, 1.2); in any other mode it is used unchanged. Three log-mel
    spectrograms (one per window/hop pair) are then resized to
    ``128 x self.length`` and returned as a float tensor.
    """

    def __init__(self, bins, mode, dataset):
        # `dataset` is accepted for call-site compatibility but not used.
        self.window_length = [25, 50, 100]  # milliseconds
        self.hop_length = [10, 25, 50]      # milliseconds
        self.fft = 4410
        self.melbins = bins
        self.mode = mode
        self.sr = 44100
        self.length = 250

    def _augment(self, audio):
        """Return a randomly pitch-shifted and time-stretched copy of ``audio``."""
        steps_range, rate_range = (-2, 2), (0.9, 1.2)
        # Draw order (pitch first, then stretch) is kept so seeded runs are unchanged.
        n_steps = np.random.randint(steps_range[0], steps_range[1] + 1)
        rate = np.random.random() * (rate_range[1] - rate_range[0]) + rate_range[0]
        shifted = librosa.effects.pitch_shift(y = audio, sr = self.sr, n_steps = n_steps)
        return librosa.effects.time_stretch(y = shifted, rate = rate)

    def _log_mel(self, waveform, window_ms, hop_ms):
        """Compute one log-mel spectrogram resized to ``128 x self.length``."""
        to_samples = lambda ms: int(round(ms*self.sr/1000))
        mel = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.sr,
            n_fft=self.fft,
            win_length=to_samples(window_ms),
            hop_length=to_samples(hop_ms),
            n_mels=self.melbins,
        )(waveform)
        log_mel = np.log(mel.numpy() + 1e-6)
        resized = torchvision.transforms.Resize((128, self.length))(Image.fromarray(log_mel))
        return np.asarray(resized)

    def __call__(self, value):
        audio = self._augment(value) if self.mode=="train" else value
        waveform = torch.Tensor(audio)

        channels = [
            self._log_mel(waveform, window_ms, hop_ms)
            for window_ms, hop_ms in zip(self.window_length, self.hop_length)
        ]
        stacked = np.array(channels).reshape(-1, 128, self.length)
        return torch.Tensor(stacked)

class AudioDataset(Dataset):
    """Dataset over pickled feature records, optionally doubled with augmentation.

    Each record is a dict with ``"audio"`` (raw waveform), ``"values"``
    (precomputed spectrogram stack) and ``"target"`` (class index).

    When ``transforms`` is given and its ``mode`` is ``"train"``, the dataset
    reports twice as many items: indices ``[0, n)`` return the precomputed
    spectrograms and indices ``[n, 2n)`` return ``transforms(audio)`` for
    record ``idx - n``.

    Args:
        pkl_dir: path of the pickle file holding the list of records.
        dataset_name: accepted for call-site compatibility; not used.
        transforms: optional callable applied to the raw waveform; expected to
            expose a ``mode`` attribute.
    """

    def __init__(self, pkl_dir, dataset_name, transforms=None):
        self.transforms = transforms
        self.length = 250
        # SECURITY: pickle.load executes arbitrary code from the file; only load
        # pickles produced by this notebook's own preprocessing step.
        with open(pkl_dir, "rb") as f:
            self.data = pkl.load(f)

    def _is_augmenting(self):
        """True when the second, augmented half of the index range is active."""
        return self.transforms is not None and getattr(self.transforms, "mode", None) == "train"

    def __len__(self):
        if self._is_augmenting():
            return 2*len(self.data)
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(values, target)``; ``target`` is a LongTensor of shape (1,)."""
        n_records = len(self.data)
        if idx >= n_records:
            # Indices past the stored records map onto on-the-fly transformed audio.
            if self.transforms is None:
                raise IndexError(
                    "index {} out of range for dataset of size {}".format(idx, n_records)
                )
            entry = self.data[idx - n_records]
            values = self.transforms(entry["audio"])
        else:
            entry = self.data[idx]
            values = torch.Tensor(entry["values"].reshape(-1, 128, self.length))
        target = torch.LongTensor([entry["target"]])
        return (values, target)

def fetch_dataloader(pkl_dir, dataset_name, batch_size, num_workers, mode):
    """Build a DataLoader over the pickled records at ``pkl_dir``.

    Args:
        pkl_dir: path of the pickle file to load.
        dataset_name: forwarded to ``MelSpectrogram`` and ``AudioDataset``.
        batch_size: number of samples per batch.
        num_workers: number of DataLoader worker processes.
        mode: ``"train"`` enables augmentation (and dataset doubling) in the
            transform/dataset; any other value disables it.

    Returns:
        torch.utils.data.DataLoader yielding ``(values, target)`` batches.
    """
    transforms = MelSpectrogram(128, mode, dataset_name)
    dataset = AudioDataset(pkl_dir, dataset_name, transforms=transforms)
    # Only training batches need a random order; evaluation is order-independent.
    dataloader = DataLoader(
        dataset,
        shuffle=(mode == "train"),
        batch_size=batch_size,
        num_workers=num_workers,
    )
    return dataloader

In [None]:
import torch.nn.functional as F
from torch.nn import init

# Audio Classification Model

class AudioClassifier(nn.Module):
    """Four strided conv blocks (Conv -> ReLU -> BatchNorm) plus a linear head.

    Input is a 3-channel image-like tensor; each block halves the spatial size
    (stride 2) while the channels grow 3 -> 8 -> 16 -> 32 -> 64. The result is
    average-pooled to 1 x 1 and mapped to 50 class logits.
    """

    def __init__(self):
        super().__init__()

        # (block number, in channels, out channels, kernel size, padding)
        block_specs = [
            (1, 3, 8, 5, 2),
            (2, 8, 16, 3, 1),
            (3, 16, 32, 3, 1),
            (4, 32, 64, 3, 1),
        ]

        stages = []
        for number, c_in, c_out, kernel, pad in block_specs:
            conv = nn.Conv2d(c_in, c_out, kernel_size=(kernel, kernel), stride=(2, 2), padding=(pad, pad))
            relu = nn.ReLU()
            bn = nn.BatchNorm2d(c_out)
            # Kaiming-normal weights (negative slope 0.1) and zero bias for the conv.
            init.kaiming_normal_(conv.weight, a=0.1)
            conv.bias.data.zero_()

            # Register under the numbered attribute names (conv1, relu1, bn1, ...).
            setattr(self, "conv{}".format(number), conv)
            setattr(self, "relu{}".format(number), relu)
            setattr(self, "bn{}".format(number), bn)
            stages.extend([conv, relu, bn])

        # Classifier head: global average pool followed by a 64 -> 50 linear layer.
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=64, out_features=50)

        # All conv blocks chained in order.
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Return class logits of shape ``(batch, 50)`` for input ``x``."""
        features = self.conv(x)
        pooled = self.ap(features)
        # Collapse the pooled 1 x 1 maps into a flat feature vector per sample.
        flat = pooled.view(pooled.shape[0], -1)
        return self.lin(flat)

In [None]:
# parameters
dataset_name = "ESC"   # used in the TensorBoard scalar tags
# No trailing separator: file names are appended directly to this prefix when the
# fold pickles are loaded ("...store_spectograms_2training128mel1.pkl").
data_dir = "DataSci251_FinalProject/DataSet/ESC-50-master/store_spectograms_2"
dataaug = True         # train with the augmented (doubled) dataset
pretrained = True      # not read anywhere in the cells shown here
scheduler = True       # truthy -> a StepLR scheduler is created in the training loop
model = "resnet"       # placeholder; the training loop rebinds `model` to an AudioClassifier
batch_size = 32
num_workers = 8
epochs = 70
lr = 1e-4
weight_decay = 1e-3
num_folds = 5
checkpoint_dir = "DataSci251_FinalProject/DataSet/ESC-50-master/checkpoint_dir_cnn"

In [None]:
# Start the Weights & Biases run: `project` names where the run is logged and
# `config` records the hyperparameters attached to it.
run = wandb.init(
    project="audio_densenet",
    config=dict(
        model='cnn',
        batch_size=batch_size,
        learning_rate=lr,
        epochs=epochs,
    ),
)

In [None]:
class RunningAverage():
    """Accumulates values and reports their arithmetic mean.

    Call ``update(value)`` once per observation and call the instance itself to
    read the current mean.
    """

    def __init__(self):
        self.total = 0
        self.steps = 0

    def update(self, loss):
        """Add one observation to the running total."""
        self.total += loss
        self.steps += 1

    def __call__(self):
        """Return the mean of all observations, or 0.0 if there are none yet."""
        if self.steps == 0:
            # Avoid ZeroDivisionError when nothing has been recorded (e.g. empty loader).
            return 0.0
        return (self.total/float(self.steps))

In [None]:
def save_checkpoint(state, is_best, split, checkpoint):
    """Save ``state`` as the latest checkpoint for ``split``.

    Args:
        state: object passed to ``torch.save`` (here a dict of epoch, model and
            optimizer state).
        is_best: when true, the saved file is also copied to
            ``model_best_{split}.pth.tar``.
        split: identifier embedded in the file names (the fold number).
        checkpoint: directory to write into; created, including any missing
            parent directories, if it does not exist.
    """
    filename = os.path.join(checkpoint, 'last{}.pth.tar'.format(split))
    if not os.path.exists(checkpoint):
        print("Checkpoint Directory does not exist")
    # makedirs (unlike mkdir) copes with missing parents and is a no-op when the
    # directory already exists.
    os.makedirs(checkpoint, exist_ok=True)
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, os.path.join(checkpoint, "model_best_{}.pth.tar".format(split)))

In [None]:
def train(model, device, data_loader, optimizer, loss_fn):
    """Run one optimisation pass over ``data_loader``.

    Each batch is moved to ``device``, the loss is back-propagated and the
    optimizer stepped. A tqdm bar shows the running mean loss.

    Returns:
        Mean of the per-batch loss values over the whole pass.
    """
    model.train()
    running_loss = RunningAverage()

    progress = tqdm(total=len(data_loader))
    with progress:
        for batch in data_loader:
            features = batch[0].to(device)
            # Targets arrive as (batch, 1); the loss expects a 1-D class vector.
            labels = batch[1].squeeze(1).to(device)

            loss = loss_fn(model(features), labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss.update(loss.item())
            progress.set_postfix(loss='{:05.3f}'.format(running_loss()))
            progress.update()

    return running_loss()

In [None]:
def evaluate(model, device, test_loader):
    """Return the classification accuracy (in percent) of ``model`` on ``test_loader``.

    Args:
        model: module mapping an input batch to per-class scores.
        device: device the batches are moved to before the forward pass.
        test_loader: iterable of ``(inputs, target)`` batches, with ``target``
            shaped ``(batch, 1)``.

    Returns:
        ``100 * correct / total`` as a float, or 0.0 when the loader yields no
        samples.
    """
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for data in test_loader:
            inputs = data[0].to(device)
            target = data[1].squeeze(1).to(device)

            outputs = model(inputs)

            # Predicted class = index of the highest score along the class axis.
            _, predicted = torch.max(outputs, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    if total == 0:
        # Empty loader: report 0 rather than dividing by zero.
        return 0.0
    return (100*correct/total)

In [None]:
def train_and_evaluate(model, device, train_loader, val_loader, optimizer, loss_fn, writer, epochs, checkpoint_dir, split, scheduler=None):
    """Train for ``epochs`` epochs, validating, logging and checkpointing after each.

    After every epoch the validation accuracy and mean training loss are
    printed, sent to wandb and written to ``writer``. Whenever the accuracy
    improves, a TorchScript export of the model is saved as
    ``myModel_export.pt`` in ``checkpoint_dir``; a regular checkpoint is saved
    every epoch through ``save_checkpoint``.

    Args:
        model: network to train (already on ``device``).
        device: device used for training and evaluation.
        train_loader / val_loader: loaders for the two splits.
        optimizer: optimizer bound to ``model``'s parameters.
        loss_fn: loss callable taking ``(outputs, target)``.
        writer: TensorBoard-style writer; closed before returning.
        epochs: number of epochs to run.
        checkpoint_dir: directory for checkpoints and the scripted export.
        split: fold identifier used in checkpoint names and scalar tags.
        scheduler: optional LR scheduler stepped once per epoch.

    Notes:
        * Scalar tags read the notebook-level ``dataset_name``.
        * The ``valLoss`` tag carries validation *accuracy*; the tag name is
          kept so existing TensorBoard runs stay comparable.
        * The export file name does not include ``split``, so each fold
          overwrites the previous fold's export.
    """
    best_acc = 0.0
    checkpoint_dir = "{}".format(checkpoint_dir)
    # The scripted export is written before save_checkpoint gets a chance to
    # create the directory, so make sure it exists up front.
    os.makedirs(checkpoint_dir, exist_ok=True)
    export_path = os.path.join(checkpoint_dir, 'myModel_export.pt')

    try:
        for epoch in range(epochs):
            avg_loss = train(model, device, train_loader, optimizer, loss_fn)

            acc = evaluate(model, device, val_loader)
            print("Epoch {}/{} Loss:{} Valid Acc:{}".format(epoch, epochs, avg_loss, acc))

            wandb.log({"accuracy": acc, "loss": avg_loss})

            is_best = (acc > best_acc)
            if scheduler:
                scheduler.step()
            if is_best:
                best_acc = acc
                # Script and save a CPU copy of the weights, then move the model
                # back to the training device (not a hard-coded 'cuda').
                model_scripted = torch.jit.script(model.to('cpu'))
                model_scripted.save(export_path)
                model = model.to(device)

            save_checkpoint({"epoch": epoch + 1,
                             "model": model.state_dict(),
                             "optimizer": optimizer.state_dict()}, is_best, split, checkpoint_dir)
            writer.add_scalar("data{}/trainingLoss{}".format(dataset_name, split), avg_loss, epoch)
            writer.add_scalar("data{}/valLoss{}".format(dataset_name, split), acc, epoch)
    finally:
        # Flush and release the writer even if an epoch fails part-way.
        writer.close()

In [None]:
## need config path
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if not dataaug:
    # Only the augmented pipeline is wired up; fail with a clear message instead of
    # continuing with undefined (or stale) data loaders.
    raise NotImplementedError("Training without data augmentation (dataaug=False) is not implemented")

# k-fold cross-validation: a fresh model, optimizer and LR scheduler per fold.
for i in range(1, num_folds+1):
    # File names are appended directly to `data_dir`, matching how the pickles were written.
    train_loader = fetch_dataloader( "{}training128mel{}.pkl".format(data_dir, i), "ESC", batch_size, num_workers, 'train')
    val_loader = fetch_dataloader("{}validation128mel{}.pkl".format(data_dir, i), "ESC", batch_size, num_workers, 'validation')

    writer = SummaryWriter(comment="ESC")
    model = AudioClassifier().to(device)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Keep the `scheduler` config flag intact across folds by using a separate
    # name for the per-fold scheduler object.
    if scheduler:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 30, gamma=0.1)
    else:
        lr_scheduler = None

    train_and_evaluate(model, device, train_loader, val_loader, optimizer, loss_fn, writer,epochs, checkpoint_dir, i, lr_scheduler)