In [1]:
import argparse
import os
import pickle
import sys
from collections import OrderedDict
import time
import torch
import shutil
sys.path.append('/home/jake/repositories/vocalsound/src')
import dataloaders
import models
from traintest import train, validate
import ast
from torch.utils.data import WeightedRandomSampler
import numpy as np

In [2]:
# Ask PyTorch's CUDA caching allocator to release memory it holds but is not
# using, e.g. left over from an earlier run in the same kernel.
torch.cuda.empty_cache()

In [3]:
# Log which process and host this run belongs to, and when it started.
print(
    "I am process %s, running on %s: starting (%s)"
    % (os.getpid(), os.uname().nodename, time.asctime())
)

I am process 4220, running on jake-workstation: starting (Sat Jul 16 17:44:44 2022)


In [4]:
import datetime
from utilities import *
from torch.optim.swa_utils import AveragedModel, SWALR

In [5]:
def train(audio_model, train_loader, test_loader,
          exp_dir,lr,weight_decay,n_epochs,n_print_steps,save_model):
    """Train ``audio_model`` for ``n_epochs`` epochs, validating after each one.

    This notebook-level definition replaces the ``train`` imported from
    ``traintest`` in the first cell.

    Args:
        audio_model: Model to optimise; wrapped in ``torch.nn.DataParallel``
            when it is not already wrapped.
        train_loader: Iterable yielding ``(audio_input, labels)`` batches.
            Labels are turned into class indices with ``argmax`` over dim 1
            (presumably one-hot / soft label vectors -- confirm against the
            dataset).
        test_loader: Loader handed to ``validate`` at the end of every epoch.
        exp_dir: Experiment directory. Receives ``progress.pkl``,
            ``result.csv``, ``stats_<epoch>.pickle`` and the ``models/``
            checkpoints.
        lr: Learning rate for Adam.
        weight_decay: Weight decay for Adam.
        n_epochs: Number of epochs to run (epochs are numbered from 1).
        n_print_steps: Print running training statistics every this many
            global steps.
        save_model: When truthy, additionally save a checkpoint per epoch.

    Returns:
        None. Returns early (without finishing the epoch) if the running
        training-loss average is NaN at a print step.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    torch.set_grad_enabled(True)

    # Running statistics that are actually reported below.
    per_sample_time = AverageMeter()
    per_sample_data_time = AverageMeter()
    per_sample_dnn_time = AverageMeter()
    loss_meter = AverageMeter()

    progress = []
    best_epoch, best_mAP, best_acc = 0, -np.inf, -np.inf
    global_step, epoch = 0, 0
    start_time = time.time()

    def _save_progress():
        # Reads the enclosing function's current epoch/step/best values at call time.
        progress.append([epoch, global_step, best_epoch, best_mAP,
                time.time() - start_time])
        with open("%s/progress.pkl" % exp_dir, "wb") as f:
            pickle.dump(progress, f)

    # Checkpoints are written below; make sure the target directory exists.
    os.makedirs("%s/models" % exp_dir, exist_ok=True)

    # torch.nn is referenced explicitly rather than through the star import.
    if not isinstance(audio_model, torch.nn.DataParallel):
        audio_model = torch.nn.DataParallel(audio_model)

    audio_model = audio_model.to(device)
    # Set up the optimizer
    trainables = [p for p in audio_model.parameters() if p.requires_grad]
    print('Total parameter number is : {:.3f} million'.format(sum(p.numel() for p in audio_model.parameters()) / 1000000))
    print('Total trainable parameter number is : {:.3f} million'.format(sum(p.numel() for p in trainables) / 1000000))

    optimizer = torch.optim.Adam(trainables, lr, weight_decay=weight_decay, betas=(0.95, 0.999))

    print('now use new scheduler')
    # NOTE: gamma=1.0 multiplies the learning rate by 1 at every milestone, so
    # the learning rate stays constant; kept so the schedule is easy to change.
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, list(range(10, 60)), gamma=1.0)

    # The loss module is stateless, so build it once instead of once per step.
    loss_fn = torch.nn.CrossEntropyLoss()

    epoch += 1

    print("current #steps=%s, #epochs=%s" % (global_step, epoch))
    print("start training...")

    # One row per epoch; columns are filled in at the end of each epoch.
    result = np.zeros([n_epochs, 10])
    audio_model.train()
    while epoch < n_epochs + 1:
        begin_time = time.time()
        end_time = time.time()
        # validate() switches the model to eval mode, so switch back every epoch.
        audio_model.train()
        print(datetime.datetime.now())

        for i, (audio_input, labels) in enumerate(train_loader):
            B = audio_input.size(0)
            audio_input = audio_input.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            # Time spent waiting for / transferring the batch.
            per_sample_data_time.update((time.time() - end_time) / B)
            dnn_start_time = time.time()

            audio_output = audio_model(audio_input)
            loss = loss_fn(audio_output, torch.argmax(labels.long(), dim=1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # record loss and timings
            loss_meter.update(loss.item(), B)
            per_sample_time.update((time.time() - end_time) / B)
            per_sample_dnn_time.update((time.time() - dnn_start_time) / B)

            if global_step % n_print_steps == 0 and global_step != 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                  'Per Sample Total Time {per_sample_time.avg:.5f}\t'
                  'Per Sample Data Time {per_sample_data_time.avg:.5f}\t'
                  'Per Sample DNN Time {per_sample_dnn_time.avg:.5f}\t'
                  'Train Loss {loss_meter.val:.4f}\t'.format(
                   epoch, i, len(train_loader), per_sample_time=per_sample_time, per_sample_data_time=per_sample_data_time,
                      per_sample_dnn_time=per_sample_dnn_time, loss_meter=loss_meter), flush=True)
                if np.isnan(loss_meter.avg):
                    print("training diverged...")
                    return

            end_time = time.time()
            global_step += 1

        print('start validation')
        stats, valid_loss = validate(audio_model, test_loader, exp_dir, epoch)
        print('validation finished')

        # stats is indexed per entry with 'AP', 'auc', 'acc', 'precisions', 'recalls'.
        mAP = np.mean([stat['AP'] for stat in stats])
        mAUC = np.mean([stat['auc'] for stat in stats])
        acc = np.mean([stat['acc'] for stat in stats])

        # Precision / recall taken at the middle entry of each curve.
        middle_ps = [stat['precisions'][len(stat['precisions']) // 2] for stat in stats]
        middle_rs = [stat['recalls'][len(stat['recalls']) // 2] for stat in stats]
        average_precision = np.mean(middle_ps)
        average_recall = np.mean(middle_rs)
        epoch_d_prime = d_prime(mAUC)

        print("---------------------Epoch {:d} Results---------------------".format(epoch))
        print("ACC: {:.6f}".format(acc))
        print("mAP: {:.6f}".format(mAP))
        print("AUC: {:.6f}".format(mAUC))
        print("Avg Precision: {:.6f}".format(average_precision))
        print("Avg Recall: {:.6f}".format(average_recall))
        print("d_prime: {:.6f}".format(epoch_d_prime))
        print("train_loss: {:.6f}".format(loss_meter.avg))
        print("valid_loss: {:.6f}".format(valid_loss))

        # Columns 7 and 8 were the "cumulative" mAP / accuracy; no cumulative
        # model exists here, so they repeat mAP / acc to keep the file layout.
        result[epoch-1, :] = [mAP, acc, average_precision, average_recall, epoch_d_prime, loss_meter.avg, valid_loss, mAP, acc, optimizer.param_groups[0]['lr']]

        np.savetxt(exp_dir + '/result.csv', result, delimiter=',')

        # Track the best mAP so progress.pkl records real values.
        if mAP > best_mAP:
            best_mAP = mAP
            best_epoch = epoch

        # Model selection is by validation accuracy.
        if acc > best_acc:
            best_acc = acc
            torch.save(audio_model.state_dict(), "%s/models/best_audio_model.pth" % (exp_dir))

        if save_model:
            torch.save(audio_model.state_dict(), "%s/models/audio_model.%d.pth" % (exp_dir, epoch))

        scheduler.step()

        print('Epoch-{0} lr: {1}'.format(epoch, optimizer.param_groups[0]['lr']))

        with open(exp_dir + '/stats_' + str(epoch) +'.pickle', 'wb') as handle:
            pickle.dump(stats, handle, protocol=pickle.HIGHEST_PROTOCOL)
        _save_progress()

        finish_time = time.time()
        print('epoch {:d} training time: {:.3f}'.format(epoch, finish_time-begin_time))

        epoch += 1

        # Meters are per-epoch: start the next epoch from a clean slate.
        per_sample_time.reset()
        per_sample_data_time.reset()
        loss_meter.reset()
        per_sample_dnn_time.reset()

In [6]:
def validate(audio_model, val_loader, exp_dir, epoch):
    """Evaluate ``audio_model`` on ``val_loader`` and save its predictions.

    This notebook-level definition replaces the ``validate`` imported from
    ``traintest`` in the first cell.

    Args:
        audio_model: Model to evaluate; wrapped in ``torch.nn.DataParallel``
            when it is not already wrapped. It is left in eval mode.
        val_loader: Iterable yielding ``(audio_input, labels)`` batches.
            Labels are turned into class indices with ``argmax`` over dim 1.
        exp_dir: Experiment directory; predictions go to
            ``<exp_dir>/predictions``.
        epoch: Tag used only in the predictions file name via ``str(epoch)``;
            callers pass either an epoch number or a data-set name.

    Returns:
        ``(stats, loss)`` where ``stats`` is whatever ``calculate_stats``
        returns for the concatenated outputs and targets, and ``loss`` is the
        mean cross-entropy per sample as a Python float.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # torch.nn is referenced explicitly rather than through the star import.
    if not isinstance(audio_model, torch.nn.DataParallel):
        audio_model = torch.nn.DataParallel(audio_model)
    audio_model = audio_model.to(device)
    audio_model.eval()

    # Default reduction: each call returns the mean loss over the batch.
    loss_fn = torch.nn.CrossEntropyLoss()

    A_predictions = []
    A_targets = []
    loss_sum = 0.0
    n_samples = 0
    with torch.no_grad():
        for audio_input, labels in val_loader:
            audio_input = audio_input.to(device)

            # compute output
            audio_output = audio_model(audio_input)

            A_predictions.append(audio_output.to('cpu').detach())
            A_targets.append(labels)

            # compute the loss
            labels = labels.to(device)
            loss = loss_fn(audio_output, torch.argmax(labels.long(), dim=1))

            # Weight each batch mean by its size so a smaller final batch
            # does not skew the overall per-sample mean.
            batch = audio_input.size(0)
            loss_sum += loss.item() * batch
            n_samples += batch

        audio_output = torch.cat(A_predictions)
        target = torch.cat(A_targets)
        loss = loss_sum / n_samples
        stats = calculate_stats(audio_output, target)

        # save the prediction here
        pred_dir = exp_dir + '/predictions'
        # target.csv is written only when the directory is first created, so it
        # holds the targets of the first loader evaluated for this experiment.
        first_call = not os.path.exists(pred_dir)
        os.makedirs(pred_dir, exist_ok=True)
        if first_call:
            np.savetxt(pred_dir + '/target.csv', target, delimiter=',')
        np.savetxt(pred_dir + '/predictions_' + str(epoch) + '.csv', audio_output, delimiter=',')

    return stats, loss

In [7]:
# ---- model ----
model = 'eff_mean'
model_size = 0
imagenet_pretrain = False

# ---- optimisation ----
lr = 1e-4
weight_decay = 5e-7
n_epochs = 30
batch_size = 64

# ---- augmentation (forwarded to the dataset through audio_conf) ----
freqm = 48
timem = 192
mixup = 0

# ---- data ----
n_class = 6
data_dir = '/home/jake/repositories/VocalSound-project/data'
data_train = f'{data_dir}/datafiles/tr.json'
data_val = f'{data_dir}/datafiles/val.json'
label_csv = f'{data_dir}/class_labels_indices_vs.csv'
num_workers = 8

# ---- experiment output / logging ----
exp_dir = f'/home/jake/repositories/VocalSound-project/exp/vocalsound-{model}-{lr}'
save_model = True
n_print_steps = 100

# NOTE(review): this is only a Python variable; it does not set the
# CUDA_CACHE_DISABLE environment variable. Confirm whether that was intended.
CUDA_CACHE_DISABLE = 1

In [8]:
# Dataset configuration for training; augmentation settings come from the config cell.
audio_conf = {
    'num_mel_bins': 128,
    'target_length': 512,
    'freqm': freqm,
    'timem': timem,
    'mixup': mixup,
    'mode': 'train',
}

In [9]:
# Training batches are drawn with plain shuffling (no WeightedRandomSampler).
print('balanced sampler is not used')
train_loader = torch.utils.data.DataLoader(
    dataset=dataloaders.VSDataset(
        data_train,
        label_csv=label_csv,
        audio_conf=audio_conf,
        raw_wav_mode=False,
        specaug=True,
    ),
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
)

balanced sampler is not used


In [10]:
# Dataset configuration for validation / evaluation: test mode, mixup set to 0,
# and no 'freqm' / 'timem' entries.
val_audio_conf = {
    'num_mel_bins': 128,
    'target_length': 512,
    'mixup': 0,
    'mode': 'test',
}

In [11]:
# Validation batches: fixed order, fixed batch size of 200.
val_loader = torch.utils.data.DataLoader(
    dataset=dataloaders.VSDataset(
        data_val,
        label_csv=label_csv,
        audio_conf=val_audio_conf,
        raw_wav_mode=False,
    ),
    batch_size=200,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=True,
)

In [12]:
# Only the 'eff_mean' architecture is supported in this notebook.
if model != 'eff_mean':
    raise ValueError('Model Unrecognized')
audio_model = models.EffNetMean(label_dim=n_class, level=model_size, pretrain=imagenet_pretrain)


not using imagenet pretrained network


In [13]:
# start training
# WARNING: an existing experiment directory at exp_dir is deleted first, so
# everything a previous run stored under the same exp_dir is lost.
if os.path.exists(exp_dir):
    print(f"Deleting existing experiment directory {exp_dir}")
    shutil.rmtree(exp_dir)
print(f"\nCreating experiment directory: {exp_dir}")
# models/ is where the training loop writes its checkpoints; creating it also
# creates exp_dir itself.
os.makedirs(f"{exp_dir}/models")
# Disabled: no `args` object is defined in the visible notebook cells.
#with open("%s/args.pkl" % exp_dir, "wb") as f:
#    pickle.dump(args, f)

Deleting existing experiment directory /home/jake/repositories/VocalSound-project/exp/vocalsound-eff_mean-0.0001

Creating experiment directory: /home/jake/repositories/VocalSound-project/exp/vocalsound-eff_mean-0.0001


In [14]:
# Kick off training with the settings from the config cell.
print('Now starting training for {:d} epochs'.format(n_epochs))
train(
    audio_model,
    train_loader,
    val_loader,
    exp_dir=exp_dir,
    lr=lr,
    weight_decay=weight_decay,
    n_epochs=n_epochs,
    n_print_steps=n_print_steps,
    save_model=save_model,
)

Now starting training for 30 epochs
cuda
Total parameter number is : 4.015 million
Total trainable parameter number is : 4.015 million
now use new scheduler
current #steps=0, #epochs=1
start training...
2022-07-16 17:44:45.169260
Epoch: [1][100/243]	Per Sample Total Time 0.00278	Per Sample Data Time 0.00008	Per Sample DNN Time 0.00269	Train Loss 1.6997	
Epoch: [1][200/243]	Per Sample Total Time 0.00262	Per Sample Data Time 0.00004	Per Sample DNN Time 0.00258	Train Loss 1.6541	
start validation
validation finished
---------------------Epoch 1 Results---------------------
ACC: 0.167116
mAP: 0.196946
AUC: 0.552512
Avg Precision: 0.184412
Avg Recall: 0.542327
d_prime: 0.186691
train_loss: 1.738725
valid_loss: 1.796750
Epoch-1 lr: 0.0001
epoch 1 training time: 43.053
2022-07-16 17:45:28.222703
Epoch: [2][57/243]	Per Sample Total Time 0.00261	Per Sample Data Time 0.00013	Per Sample DNN Time 0.00248	Train Loss 1.6913	
Epoch: [2][157/243]	Per Sample Total Time 0.00251	Per Sample Data Time 0.00

In [15]:
# test on the test set and sub-test set, model selected on the validation set
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sd = torch.load(exp_dir + '/models/best_audio_model.pth', map_location=device)
# The checkpoint was saved from a DataParallel-wrapped model, so the model must
# be wrapped before loading. Wrap only once: re-running this cell would
# otherwise nest DataParallel wrappers and the state-dict keys would no longer
# match.
if not isinstance(audio_model, torch.nn.DataParallel):
    audio_model = torch.nn.DataParallel(audio_model)
audio_model.load_state_dict(sd)

<All keys matched successfully>

In [16]:
# Accuracies collected by the cells below: the validation accuracy is appended
# first, then one value per evaluation set.
all_res = []

In [17]:
# Re-score the selected checkpoint on the validation set as a sanity check.
stats, _ = validate(audio_model, val_loader, exp_dir, 'valid_set')
val_mAUC = np.mean([per_entry['auc'] for per_entry in stats])
# Accuracy is read from the first stats entry; per the original note it is
# NOT the mean of class-wise accuracy.
val_acc = stats[0]['acc']
all_res.append(val_acc)
print('---------------evaluate on the validation set---------------')
print("Accuracy: {:.6f}".format(val_acc))

---------------evaluate on the validation set---------------
Accuracy: 0.899730


In [18]:
# Evaluation sets (file names relative to the datafiles directory) and the
# display name of each one; the two lists are index-aligned.
data_eval_list = [
    'te.json',
    'subtest/te_age1.json',
    'subtest/te_age2.json',
    'subtest/te_age3.json',
    'subtest/te_female.json',
    'subtest/te_male.json',
]
eval_name_list = [
    'all_test',
    'test age 18-25',
    'test age 26-48',
    'test age 49-80',
    'test female',
    'test male',
]

In [19]:
# Evaluation files live next to the validation json. A separate name is used
# so the `data_dir` setting from the config cell is not overwritten.
eval_dir = os.path.dirname(data_val)
for eval_file, eval_name in zip(data_eval_list, eval_name_list):
    cur_eval = eval_dir + '/' + eval_file
    eval_loader = torch.utils.data.DataLoader(
        dataloaders.VSDataset(cur_eval, label_csv=label_csv, audio_conf=val_audio_conf),
        batch_size=batch_size*2, shuffle=False, num_workers=num_workers, pin_memory=True)
    # The display name doubles as the tag of the saved predictions file.
    stats, _ = validate(audio_model, eval_loader, exp_dir, eval_name)
    eval_acc = stats[0]['acc']
    all_res.append(eval_acc)
    print('---------------evaluate on {:s}---------------'.format(eval_name))
    print("Accuracy: {:.6f}".format(eval_acc))

---------------evaluate on all_test---------------
Accuracy: 0.909217
---------------evaluate on test age 18-25---------------
Accuracy: 0.920404
---------------evaluate on test age 26-48---------------
Accuracy: 0.904296
---------------evaluate on test age 49-80---------------
Accuracy: 0.913793
---------------evaluate on test female---------------
Accuracy: 0.917840
---------------evaluate on test male---------------
Accuracy: 0.901017


In [20]:
# Store all accuracies as a single row: validation first, then each evaluation
# set. reshape(1, -1) keeps this cell re-runnable once all_res is already 2-D.
all_res = np.asarray(all_res).reshape(1, -1)
# Use a comma delimiter so the data row matches the comma-joined header and the
# other CSV files written by this notebook. np.savetxt writes the header as a
# '# '-prefixed comment line.
np.savetxt(exp_dir + '/all_eval_result.csv', all_res, delimiter=',',
           header=','.join(['validation'] + eval_name_list))