<a href="https://colab.research.google.com/github/isaacgoff/DL_Project_2022/blob/master/Isaac_Deep_Learning_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read.
# Access must be granted manually when Colab prompts for it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Models Class

In [2]:
from torch import nn
from torch.nn.functional import softmax
import torchvision.models as models

class Models():
    """Factory for the classifier architectures available in this project.

    Supported model names (also stored in ``model_list``):
      * ``'Basic_4_Layer_CNN'`` - the small CNN defined in this file
      * ``'Alex_Net'``          - torchvision AlexNet
      * ``'VGG_16'``            - torchvision VGG16
      * ``'Res_Net_18'``        - torchvision ResNet18

    The torchvision networks are built without pretrained weights. Their
    first convolution is replaced by a single-input-channel one and their
    last linear layer by one with ``num_output_classes`` outputs.
    """

    def __init__(self, model_name: str):
        """Remember the requested model name.

        Raises:
            ValueError: if ``model_name`` is not one of ``model_list``.
        """
        self.model_list = ['Basic_4_Layer_CNN', 'Alex_Net', 'VGG_16', 'Res_Net_18']
        self.input_model = model_name
        self.num_output_classes = 11
        if self.input_model not in self.model_list:
            raise ValueError(f'Model list does not contain model "{model_name}"')

    def choose_model(self):
        """Build and return a fresh, untrained instance of the selected model.

        Raises:
            ValueError: if ``input_model`` no longer names a supported model
                (e.g. it was reassigned after construction).
        """
        if self.input_model == 'Basic_4_Layer_CNN':
            return Basic_4_Layer_CNN()

        if self.input_model == 'Alex_Net':
            model = models.alexnet(False, False)
            model.classifier[6] = nn.Linear(in_features=4096, out_features=self.num_output_classes, bias=True)
            model.features[0] = nn.Conv2d(1, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
            return model

        if self.input_model == 'VGG_16':
            model = models.vgg16(False, False)
            model.classifier[6] = nn.Linear(in_features=4096, out_features=self.num_output_classes, bias=True)
            # Keep VGG's own 3x3 / stride-1 stem and only change the channel
            # count. An AlexNet-style 11x11 / stride-4 stem reduces a 128 x 128
            # input to 31 x 31, which the five 2x2 max-pools of VGG16 then
            # shrink to nothing (31 -> 15 -> 7 -> 3 -> 1 -> 0).
            model.features[0] = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            return model

        if self.input_model == 'Res_Net_18':
            model = models.resnet18(False, False)
            model.fc = nn.Linear(in_features=512, out_features=self.num_output_classes, bias=True)
            model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
            return model

        raise ValueError(f'Model list does not contain model "{self.input_model}"')


class Basic_4_Layer_CNN(nn.Module):
    """Small CNN: four conv/ReLU/avg-pool stages followed by three linear layers.

    The per-layer size comments assume a single-channel 128 x 128 input; the
    first linear layer (30 * 8 * 8 inputs) requires that spatial size.

    Args:
        num_classes: number of output classes (defaults to 11).
    """

    def __init__(self, num_classes: int = 11):
        super().__init__()
        self.net = nn.Sequential(                                       # Dimension starts with 1 of 128 x 128
            # larger kernel CNN layers
            nn.Conv2d(1, 6, kernel_size=5, padding=2), nn.ReLU(),       # Dimension becomes 6 of 128 x 128
            nn.AvgPool2d(kernel_size=2, stride=2),                      # Dimension now 6 of 64 x 64
            nn.Conv2d(6, 16, kernel_size=5, padding=2), nn.ReLU(),      # Dimension now 16 of 64 x 64
            nn.AvgPool2d(kernel_size=2, stride=2),                      # Dimension now 16 of 32 x 32
            # smaller kernel CNN layers
            nn.Conv2d(16, 24, kernel_size=3, padding=1), nn.ReLU(),     # Dimension now 24 of 32 x 32
            nn.AvgPool2d(kernel_size=2, stride=2),                      # Dimension now 24 of 16 x 16
            nn.Conv2d(24, 30, kernel_size=3, padding=1), nn.ReLU(),     # Dimension now 30 of 16 x 16
            nn.AvgPool2d(kernel_size=2, stride=2),                      # Dimension now 30 of 8 x 8
            # fully connected layers
            nn.Flatten(),
            nn.Linear(30 * 8 * 8, 200), nn.ReLU(),
            nn.Linear(200, 100), nn.ReLU(),
            nn.Linear(100, num_classes)                                 # One output per class
        )

    def forward(self, x):
        """Return raw class scores (logits) of shape (batch, num_classes).

        No softmax is applied here: ``nn.CrossEntropyLoss`` performs its own
        log-softmax, so normalising in the model would squash the scores
        twice. Logits are also what the torchvision classifiers return.
        ``argmax`` over logits gives the same class as over probabilities;
        apply ``softmax(logits, dim=1)`` at the call site if probabilities
        are needed.
        """
        return self.net(x)

Create Dataset Module

In [4]:
import os
import torch
import librosa
import numpy as np
import json
from torch.utils.data import Dataset
import torch.nn.functional as F


def create_dataset(audio_input_path, json_path):
    """Build an ``AudioSpectogramDataset`` from a folder of audio clips.

    Every file in ``audio_input_path`` is loaded, converted to a 128-band mel
    spectrogram in dB, and paired with the one-hot encoded
    ``instrument_family`` found in the JSON metadata under the file's name
    without extension.

    Args:
        audio_input_path: directory containing the audio files.
        json_path: path to the JSON metadata file keyed by sample name.

    Returns:
        AudioSpectogramDataset holding the stacked spectrograms and the
        float32 one-hot labels (11 classes).

    Raises:
        ValueError: if ``audio_input_path`` contains no files.
        KeyError: if a file has no matching metadata entry.
    """
    # Context manager guarantees the handle is closed even if parsing fails
    with open(json_path, 'rb') as metadata_file:
        metadata = json.load(metadata_file)

    # os.listdir order is arbitrary; sort so the dataset order is reproducible
    sample_list = sorted(os.listdir(audio_input_path))
    if not sample_list:
        raise ValueError(f'No audio files found in "{audio_input_path}"')

    data = []
    labels = []
    # Loop through files and store spectrogram and instrument family for each sample
    for file_name in sample_list:
        sample_key = os.path.splitext(file_name)[0]
        labels.append(metadata[sample_key]['instrument_family'])
        # load the waveform y at its native sampling rate sr
        # (os.path.join works with or without a trailing slash on the folder)
        y, sr = librosa.load(os.path.join(audio_input_path, file_name), sr=None)
        # convert to 2 dimensional spectrogram format
        # NOTE(review): hop_length=502 presumably yields 128 frames per clip so
        # the result is 128 x 128 - confirm against the clip length/sample rate
        spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000, hop_length=502)
        # Convert raw power to dB, relative to each clip's own maximum
        S_dB = librosa.power_to_db(spectrogram, ref=np.max)
        data.append(S_dB)

    # np.stack requires every spectrogram to have the same shape
    data_tensor = torch.tensor(np.stack(data))
    # Float one-hot targets, one row per sample
    labels = F.one_hot(torch.tensor(labels, dtype=torch.long), num_classes=11).type(torch.float32)
    return AudioSpectogramDataset(data_tensor, labels)


class AudioSpectogramDataset(Dataset):
    """In-memory dataset pairing each entry of ``data`` with its label.

    ``data`` and ``labels`` are stored as given and indexed in parallel along
    their first axis.
    """

    def __init__(self, data, labels):
        self.labels = labels
        self.data = data

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of ``data``."""
        sample_count = self.data.shape[0]
        return sample_count

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        sample = self.data[index]
        target = self.labels[index]
        return sample, target


Training Module

In [None]:
import argparse
import torch
from torch import nn
from datetime import datetime
from torch.utils.data import DataLoader
# from create_dataset import create_dataset
# from Models import Models
# from BasicCNN import BasicCNN


def _count_correct(predictions, one_hot_labels):
    """Return how many rows of ``predictions`` peak at the labelled class."""
    return (predictions.argmax(dim=1) == one_hot_labels.argmax(dim=1)).sum().item()


def _train_one_epoch(net, dataloader, loss_fn, optimizer, device):
    """Run one optimisation pass; return (mean batch loss, accuracy)."""
    net.train()
    total_loss = 0.0
    correct = 0
    seen = 0
    for img_batch, label_batch in dataloader:
        # Insert the channel axis: (batch, H, W) -> (batch, 1, H, W)
        img_batch = img_batch.to(device).unsqueeze(1)
        label_batch = label_batch.to(device)

        optimizer.zero_grad()
        predicted_labels = net(img_batch)
        batch_loss = loss_fn(predicted_labels, label_batch)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        correct += _count_correct(predicted_labels.detach(), label_batch)
        seen += len(label_batch)
    return total_loss / len(dataloader), correct / seen


def _evaluate(net, dataloader, device):
    """Return the accuracy of ``net`` over ``dataloader`` without updating it."""
    net.eval()
    correct = 0
    seen = 0
    with torch.no_grad():
        for img_batch, label_batch in dataloader:
            img_batch = img_batch.to(device).unsqueeze(1)
            label_batch = label_batch.to(device)
            correct += _count_correct(net(img_batch), label_batch)
            seen += len(label_batch)
    return correct / seen


def main():
    """Train the selected classifier and report per-epoch metrics.

    Command-line options choose the train/validation sub-folders, the model
    name, batch size, learning rate, number of epochs and how often (in
    epochs) a status line is printed. Data is read from fixed locations under
    the mounted Google Drive.
    """
    parser = argparse.ArgumentParser(description='Train an instrument family classifier on audio spectrograms')
    parser.add_argument('-f')  # Required for argument parser to work in Colab
    parser.add_argument('--train_folder', type=str, default='small_audio/')
    parser.add_argument('--val_folder', type=str, default='small_audio/')
    parser.add_argument('--model', type=str, default='Basic_4_Layer_CNN')
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--lr', type=float, default=.01)
    parser.add_argument('--num_epochs', type=int, default=20)
    parser.add_argument('--status_interval', type=int, default=1)
    args = parser.parse_args()

    drive_path = '/content/drive/MyDrive/DL_data/'
    json_path_tng = f'{drive_path}nsynth-train/examples.json'
    json_path_val = f'{drive_path}nsynth-valid/examples.json'
    audio_input_path_tng = f'{drive_path}nsynth-train/{args.train_folder}'
    audio_input_path_val = f'{drive_path}nsynth-valid/{args.val_folder}'

    # Select GPU for runtime if available
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(torch.cuda.get_device_name(device))
    else:
        device = torch.device("cpu")
        print('No GPU selected')

    start = datetime.now()
    # Create datasets
    tng_dataset = create_dataset(audio_input_path_tng, json_path_tng)
    val_dataset = create_dataset(audio_input_path_val, json_path_val)

    # Create Data Loaders. Training batches are reshuffled every epoch so SGD
    # does not see the samples in the same fixed order each time.
    tng_dataloader = DataLoader(tng_dataset, batch_size=args.batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)

    print(f'\nDatasets created in {datetime.now()-start}')

    # Load the model requested on the command line (default: Basic_4_Layer_CNN)
    net = Models(args.model).choose_model().to(device)

    def init_weights(m):
        # Xavier-initialise the weights of every linear and convolutional layer
        if isinstance(m, (nn.Linear, nn.Conv2d)):
            nn.init.xavier_uniform_(m.weight)

    net.apply(init_weights)

    optimizer = torch.optim.SGD(net.parameters(), lr=args.lr)
    loss = nn.CrossEntropyLoss()

    for epoch in range(args.num_epochs):
        epoch_tng_loss, epoch_tng_score = _train_one_epoch(net, tng_dataloader, loss, optimizer, device)
        epoch_val_score = _evaluate(net, val_dataloader, device)

        if epoch % args.status_interval == 0:
            print(f'\nepoch {epoch} completed: Training Loss = {epoch_tng_loss} //'
                  f' Training Score = {epoch_tng_score} // Validation Score = {epoch_val_score}')

    end = datetime.now()
    print(f'\nelapsed time: {end - start}')


# Start training only when this code is run directly, not when it is imported.
if __name__ == '__main__':
    main()


Tesla V100-SXM2-16GB
