In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
from audioDataLoader import audioDataloader
from tqdm import tqdm

In [2]:
#define the device to use
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

# ResNet

In [3]:
resnet18 = models.resnet18()
num_classes = 1
resnet18.fc = nn.Sequential(nn.Linear(resnet18.fc.in_features, 1), nn.Sigmoid())# change to binary classification 
resnet18.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False) # change input layer to greyscale (for the spectrogram )
loss_function = nn.BCELoss()
optimizer = optim.SGD(resnet18.parameters(), lr=0.1, weight_decay=0.0001, momentum=0.9)
num_epochs = 2
resnet18.to(device)

ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [4]:
batch_size = 15
train_data = audioDataloader(index_file="/data/khood/GitHub/MLAudio/dataset/train/trainIndex.csv")
valid_data = audioDataloader(index_file="/data/khood/GitHub/MLAudio/dataset/valid/validIndex.csv")
test_data = audioDataloader(index_file="/data/khood/GitHub/MLAudio/dataset/test/testIndex.csv")
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [5]:
inputs, labels = next(iter(train_loader))

In [None]:
torch.save(resnet18.state_dict(), "./resnet18.pt")

In [6]:
losses = []
accuracies = []
for epoch in range(num_epochs):
    print(f"Epoch {epoch}")
    resnet18.train()
    with tqdm(train_loader, unit="batch") as tepoch:
        for inputs, labels in train_loader:
            inputs, labels = torch.unsqueeze(inputs, 1).to(device), torch.unsqueeze(labels, 1).type(torch.float32).to(device)
            optimizer.zero_grad()
            outputs = resnet18(inputs)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()
            predicted = torch.round(outputs)
            total = labels.size(0)
            correct = (predicted == labels).sum().item()
            losses.append(loss.item())
            accuracy = correct / total
            accuracies.append(accuracy)
            tepoch.set_postfix(loss=loss.item(), accuracy=100. * accuracy)

  0%|          | 0/2400 [00:53<?, ?batch/s, accuracy=40, loss=0.839]  


KeyboardInterrupt: 

In [None]:
# Evaluate on the test set
resnet18.eval()
correct, total = 0, 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = resnet18(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy * 100:.2f}%')