In [1]:
%load_ext autoreload
%autoreload 2
from transformer import SpeechTransformer
import os
import torch
import numpy as np
from torch.utils.data import DataLoader
from custom_dataset import SpectrogramDataset, BinaryDataset, create_sampler
from training_pipeline import repeat_training, set_seed, worker_init_fn, plot_results
from collections import Counter
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

## Parameters

In [2]:
# --- Reproducibility ---------------------------------------------------------
SEED = 42
set_seed(SEED)

# --- Training schedule (forwarded to repeat_training) ------------------------
repetitions = 4
lr = 0.001
epochs = 100
tolerance = 10

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# --- Regularisation / sampling -----------------------------------------------
alpha = 1                  # second argument of create_sampler
dropout = 0.2              # handed to SpeechTransformer
weight_decay = 0.0         # handed to repeat_training
augmented_fraction = 0.3   # handed to the training datasets
label_smoothing = 0.0      # handed to repeat_training

# --- DataLoader settings -----------------------------------------------------
batch_size = 1024
n_workers = 4
# Both options below are only switched on when worker processes are used.
persistent_workers = n_workers > 0
prefetch_factor = 2 if persistent_workers else None

# --- Transformer architecture ------------------------------------------------
d_model = 128
nhead = 8
num_layers = 2

## 10 classes + unknown

In [3]:
data_path = "data/train/audio_transformed"

# All three splits are read from the same directory; augmentation is requested
# for the training split only.
train_dataset = SpectrogramDataset(
    data_path,
    set_type=SpectrogramDataset.TRAIN,
    augmentation=True,
    augmented_fraction=augmented_fraction,
)
val_dataset = SpectrogramDataset(data_path, set_type=SpectrogramDataset.VAL)
test_dataset = SpectrogramDataset(data_path, set_type=SpectrogramDataset.TEST)

# Training batches are drawn through the sampler returned by create_sampler,
# so DataLoader-level shuffling stays off for every loader in this cell.
sampler = create_sampler(train_dataset, alpha)

# Keyword arguments that are identical for the train, validation and test loaders.
shared_loader_kwargs = dict(
    batch_size=batch_size,
    shuffle=False,
    num_workers=n_workers,
    pin_memory=True,
    prefetch_factor=prefetch_factor,
    persistent_workers=persistent_workers,
)

# Only the training loader gets the sampler and the worker init hook.
train_loader = DataLoader(
    train_dataset,
    sampler=sampler,
    worker_init_fn=worker_init_fn,
    **shared_loader_kwargs,
)
val_loader = DataLoader(val_dataset, **shared_loader_kwargs)
test_loader = DataLoader(test_dataset, **shared_loader_kwargs)

In [4]:
def init_transformer_all_classes():
    """Construct a new SpeechTransformer with 11 output classes.

    Takes no arguments: dropout, d_model, nhead and num_layers are read from
    the notebook-level settings at call time. Used as the model factory
    passed to repeat_training.
    """
    return SpeechTransformer(
        num_classes=11,
        d_model=d_model,
        nhead=nhead,
        num_layers=num_layers,
        dropout=dropout,
    )

# Re-seed immediately before training (presumably so the run does not depend
# on random numbers consumed by earlier cells -- confirm against set_seed).
set_seed(SEED)

# Output locations for the 11-class experiment.
model_dir = "output/models/all_classes/final/transformer"
history_dir = "output/history/all_classes/final/transformer"

# Create both directories up front; already existing ones are accepted.
for out_dir in (model_dir, history_dir):
    os.makedirs(out_dir, exist_ok=True)

model_path = f"{model_dir}/transformer.pth"
history_path = f"{history_dir}/transformer.pkl"

repeat_training(
    repetitions,
    init_transformer_all_classes,
    lr,
    model_path,
    history_path,
    epochs,
    train_loader,
    val_loader,
    test_loader,
    device,
    tolerance=tolerance,
    weight_decay=weight_decay,
    label_smoothing=label_smoothing,
)

training iteration: 1 of 4
starting training...
epoch: 1, training loss: 0.0023684205818131467, training accuracy: 12.300344503601629, training balanced accuracy: 12.341898996972397
epoch: 1, validation loss: 0.002460873733727011, validation accuracy: 6.031185642836128, validation balanced accuracy: 14.403250880315799
model saved

epoch: 2, training loss: 0.0022331252064109034, training accuracy: 16.913952395865955, training balanced accuracy: 16.99120099099927
epoch: 2, validation loss: 0.0024803178946598025, validation accuracy: 6.619593998234775, validation balanced accuracy: 15.962233851717633

epoch: 3, training loss: 0.0021875340280359604, training accuracy: 19.090588787973694, training balanced accuracy: 18.9900295332741
epoch: 3, validation loss: 0.0024680024071559027, validation accuracy: 7.56104736687261, validation balanced accuracy: 18.349682728408354

epoch: 4, training loss: 0.002116839475218047, training accuracy: 22.179376761666145, training balanced accuracy: 22.246514

## 10 classes

In [5]:
data_path = "data/train/audio_transformed"

# Same directory as before, but every split is built with use_unknown=False.
# Augmentation is requested for the training split only.
train_dataset = SpectrogramDataset(
    data_path,
    set_type=SpectrogramDataset.TRAIN,
    augmentation=True,
    augmented_fraction=augmented_fraction,
    use_unknown=False,
)
val_dataset = SpectrogramDataset(
    data_path,
    set_type=SpectrogramDataset.VAL,
    use_unknown=False,
)
test_dataset = SpectrogramDataset(
    data_path,
    set_type=SpectrogramDataset.TEST,
    use_unknown=False,
)

# Keyword arguments that are identical for the train, validation and test loaders.
shared_loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=n_workers,
    pin_memory=True,
    prefetch_factor=prefetch_factor,
    persistent_workers=persistent_workers,
)

# No sampler here (no oversampling): the training loader shuffles instead.
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    worker_init_fn=worker_init_fn,
    **shared_loader_kwargs,
)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **shared_loader_kwargs)

In [6]:
def init_transformer_without_unknown():
    """Construct a new SpeechTransformer with 10 output classes.

    Takes no arguments: dropout, d_model, nhead and num_layers are read from
    the notebook-level settings at call time. Used as the model factory
    passed to repeat_training.
    """
    return SpeechTransformer(
        num_classes=10,
        d_model=d_model,
        nhead=nhead,
        num_layers=num_layers,
        dropout=dropout,
    )

# Re-seed immediately before training (presumably so the run does not depend
# on random numbers consumed by earlier cells -- confirm against set_seed).
set_seed(SEED)

# Output locations for the 10-class experiment.
model_dir = "output/models/without_unknown/final/transformer"
history_dir = "output/history/without_unknown/final/transformer"

# Create both directories up front; already existing ones are accepted.
for out_dir in (model_dir, history_dir):
    os.makedirs(out_dir, exist_ok=True)

model_path = f"{model_dir}/transformer.pth"
history_path = f"{history_dir}/transformer.pkl"

repeat_training(
    repetitions,
    init_transformer_without_unknown,
    lr,
    model_path,
    history_path,
    epochs,
    train_loader,
    val_loader,
    test_loader,
    device,
    tolerance=tolerance,
    weight_decay=weight_decay,
    label_smoothing=label_smoothing,
)

training iteration: 1 of 4
starting training...
epoch: 1, training loss: 0.0024412060802532563, training accuracy: 12.207357859531772, training balanced accuracy: 12.17675817332767
epoch: 1, validation loss: 0.00260835892940428, validation accuracy: 15.17268141249515, validation balanced accuracy: 15.141481157295708
model saved

epoch: 2, training loss: 0.0023184005259078533, training accuracy: 14.667170137015859, training balanced accuracy: 14.66692897128683
epoch: 2, validation loss: 0.0025666944269158833, validation accuracy: 15.289095847885138, validation balanced accuracy: 15.315499703525667
model saved

epoch: 3, training loss: 0.002273918985870358, training accuracy: 17.062250512460892, training balanced accuracy: 16.999232757238794
epoch: 3, validation loss: 0.0025268237536204244, validation accuracy: 18.23826154443151, validation balanced accuracy: 18.377727822057853
model saved

epoch: 4, training loss: 0.0022305496328791983, training accuracy: 18.853166468874743, training ba

## Binary case

In [7]:
data_path = "data/train/audio_transformed"

# BinaryDataset is selected with the split constants defined on
# SpectrogramDataset; augmentation is requested for the training split only.
train_dataset = BinaryDataset(
    data_path,
    set_type=SpectrogramDataset.TRAIN,
    augmentation=True,
    augmented_fraction=augmented_fraction,
)
val_dataset = BinaryDataset(data_path, set_type=SpectrogramDataset.VAL)
test_dataset = BinaryDataset(data_path, set_type=SpectrogramDataset.TEST)

# Training batches are drawn through the sampler returned by create_sampler,
# so DataLoader-level shuffling stays off for every loader in this cell.
sampler = create_sampler(train_dataset, alpha)

# Keyword arguments that are identical for the train, validation and test loaders.
shared_loader_kwargs = dict(
    batch_size=batch_size,
    shuffle=False,
    num_workers=n_workers,
    pin_memory=True,
    prefetch_factor=prefetch_factor,
    persistent_workers=persistent_workers,
)

# Only the training loader gets the sampler and the worker init hook.
train_loader = DataLoader(
    train_dataset,
    sampler=sampler,
    worker_init_fn=worker_init_fn,
    **shared_loader_kwargs,
)
val_loader = DataLoader(val_dataset, **shared_loader_kwargs)
test_loader = DataLoader(test_dataset, **shared_loader_kwargs)

In [8]:
def init_transformer_binary():
    """Construct a new SpeechTransformer with 2 output classes.

    Takes no arguments: dropout, d_model, nhead and num_layers are read from
    the notebook-level settings at call time. Used as the model factory
    passed to repeat_training.
    """
    return SpeechTransformer(
        num_classes=2,
        d_model=d_model,
        nhead=nhead,
        num_layers=num_layers,
        dropout=dropout,
    )

# Re-seed immediately before training (presumably so the run does not depend
# on random numbers consumed by earlier cells -- confirm against set_seed).
set_seed(SEED)

# Output locations for the binary experiment.
model_dir = "output/models/binary/final/transformer"
history_dir = "output/history/binary/final/transformer"

# Create both directories up front; already existing ones are accepted.
for out_dir in (model_dir, history_dir):
    os.makedirs(out_dir, exist_ok=True)

model_path = f"{model_dir}/transformer.pth"
history_path = f"{history_dir}/transformer.pkl"

repeat_training(
    repetitions,
    init_transformer_binary,
    lr,
    model_path,
    history_path,
    epochs,
    train_loader,
    val_loader,
    test_loader,
    device,
    tolerance=tolerance,
    weight_decay=weight_decay,
    label_smoothing=label_smoothing,
)

training iteration: 1 of 4
starting training...
epoch: 1, training loss: 0.0007414877020356949, training accuracy: 49.931490761039775, training balanced accuracy: 49.93352883675464
epoch: 1, validation loss: 0.0007198838652425319, validation accuracy: 39.541041482789055, validation balanced accuracy: 50.3400178551778
model saved

epoch: 2, training loss: 0.0006782827946111545, training accuracy: 50.96108675227059, training balanced accuracy: 50.958237933730054
epoch: 2, validation loss: 0.0007091610527768069, validation accuracy: 56.92850838481906, validation balanced accuracy: 51.62322430753268
model saved

epoch: 3, training loss: 0.0006776414193257321, training accuracy: 51.297760726589416, training balanced accuracy: 51.30186725982142
epoch: 3, validation loss: 0.0007162271316839198, validation accuracy: 50.94145336863784, validation balanced accuracy: 52.84758920624992

epoch: 4, training loss: 0.000677845904278673, training accuracy: 51.507203257124964, training balanced accuracy