In [1]:
%cd /home/jaeheonshim/music-vibes

/home/jaeheonshim/music-vibes


In [None]:
import torch
import numpy as np
import pandas as pd
from torch import nn
from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.nn.functional as F
from torch.utils.data import random_split, DataLoader, Subset
import librosa
from tqdm import tqdm
from torchvggish import vggish_input

from vibenet import labels
from vibenet.dataset import FMAVGGishDataset
from vibenet.models.teacher import VGGishLSTM
from vibenet.utils import Mixup, do_mixup

In [4]:
dataset = FMAVGGishDataset('data/preprocessed/vggish')
# dataset = Subset(dataset, list(range(50)))

N = len(dataset)
print(N)
train_size = int(N * 0.8)
train_ds, test_ds = random_split(dataset, [train_size, N - train_size], generator=torch.Generator().manual_seed(42))
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)

13122


In [5]:
device = 'cuda'

model = VGGishLSTM()
model = model.to(device)

NUM_EPOCHS = 200

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS, eta_min=1e-5)
mixup = Mixup(0.2)

In [None]:
best_loss = 1

for epoch in range(NUM_EPOCHS):
    scheduler.step()

    print(f"Epoch {epoch+1}:")
    model.train()
    
    train_losses = []

    with tqdm(train_dl, desc='Training') as pbar:
        for data, label in pbar:
            data, label = data.to(device).float(), label.to(device).float()
            optimizer.zero_grad()

            if data.shape[0] % 2 != 0:
                data = data.expand(2, -1, -1, -1)
                label = label.expand(2, -1)

            mixup_lambda = torch.from_numpy(mixup.get_lambda(data.shape[0]).astype(np.float32)).to(device)
            label = do_mixup(label, mixup_lambda)

            with torch.autocast(device_type="cuda"):
                pred = model(data, mixup_lambda)
                loss = F.l1_loss(pred, label)

            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())
            train_loss = np.mean(train_losses)

            pbar.set_postfix({'loss': f"{train_loss:.4f}", 'lr': f"{scheduler.get_last_lr()[0]}"})

    model.eval()

    eval_losses = []
    label_maes = []

    with tqdm(test_dl, desc='Validation') as pbar:
        with torch.inference_mode():
            for data, label in pbar:
                data, label = data.to(device).float(), label.to(device).float()

                pred = model(data)
                loss = F.l1_loss(pred, label)

                eval_losses.append(loss.item())
                eval_loss = np.mean(eval_losses)

                mae = torch.mean(torch.abs(pred - label), dim=0) # for per-metric MAE
                label_maes.append(mae.cpu().numpy())

                pbar.set_postfix({'loss': f"{eval_loss:.4f}"})
    
    eval_loss = np.mean(eval_losses)
    print("Validation loss:", eval_loss)

    label_mae = np.mean(label_maes, axis=0)
    print("MAE per label:")
    for i, label in enumerate(labels):
        print(f"\t{label:<20}{label_mae[i]:.4f}")

    if eval_loss < best_loss:
        best_loss = eval_loss
        torch.save(model.state_dict(), 'best_model.pt')
        print('Saved new best model')



Epoch 1:


  return collate([torch.as_tensor(b) for b in batch], collate_fn_map=collate_fn_map)
Training: 100%|██████████| 165/165 [00:25<00:00,  6.40it/s, loss=0.1921, lr=0.000999938933078422]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.54it/s, loss=0.1751]


Validation loss: 0.1751357778197243
MAE per label:
	acousticness        0.2671
	danceability        0.1322
	energy              0.1565
	instrumentalness    0.2750
	liveness            0.1037
	speechiness         0.0630
	valence             0.2284
Saved new best model
Epoch 2:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.19it/s, loss=0.1655, lr=0.000999755747381037]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.05it/s, loss=0.1659]


Validation loss: 0.16592283937193097
MAE per label:
	acousticness        0.2597
	danceability        0.1224
	energy              0.1605
	instrumentalness    0.2479
	liveness            0.0963
	speechiness         0.0617
	valence             0.2129
Saved new best model
Epoch 3:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.32it/s, loss=0.1569, lr=0.000999450488106175]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.02it/s, loss=0.1565]


Validation loss: 0.15648973271960304
MAE per label:
	acousticness        0.2520
	danceability        0.1159
	energy              0.1470
	instrumentalness    0.2237
	liveness            0.0963
	speechiness         0.0584
	valence             0.2020
Saved new best model
Epoch 4:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.27it/s, loss=0.1503, lr=0.0009990232305719944]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.75it/s, loss=0.1512]


Validation loss: 0.15122402211030325
MAE per label:
	acousticness        0.2373
	danceability        0.1103
	energy              0.1382
	instrumentalness    0.2205
	liveness            0.0948
	speechiness         0.0575
	valence             0.1999
Saved new best model
Epoch 5:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.15it/s, loss=0.1475, lr=0.0009984740801978983]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.74it/s, loss=0.1512]


Validation loss: 0.15116329313743682
MAE per label:
	acousticness        0.2357
	danceability        0.1106
	energy              0.1411
	instrumentalness    0.2189
	liveness            0.0971
	speechiness         0.0571
	valence             0.1975
Saved new best model
Epoch 6:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.28it/s, loss=0.1461, lr=0.0009978031724785245]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.89it/s, loss=0.1504]


Validation loss: 0.1503907892675627
MAE per label:
	acousticness        0.2405
	danceability        0.1123
	energy              0.1397
	instrumentalness    0.2104
	liveness            0.0953
	speechiness         0.0549
	valence             0.1996
Saved new best model
Epoch 7:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.23it/s, loss=0.1452, lr=0.000997010672950314]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.16it/s, loss=0.1464]


Validation loss: 0.14644881515275865
MAE per label:
	acousticness        0.2277
	danceability        0.1088
	energy              0.1310
	instrumentalness    0.2101
	liveness            0.0945
	speechiness         0.0571
	valence             0.1958
Saved new best model
Epoch 8:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.66it/s, loss=0.1443, lr=0.0009960967771506664]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.18it/s, loss=0.1437]


Validation loss: 0.14371332277854285
MAE per label:
	acousticness        0.2172
	danceability        0.1053
	energy              0.1369
	instrumentalness    0.2091
	liveness            0.0940
	speechiness         0.0541
	valence             0.1894
Saved new best model
Epoch 9:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.25it/s, loss=0.1415, lr=0.0009950617105696957]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.06it/s, loss=0.1448]


Validation loss: 0.1447856461717969
MAE per label:
	acousticness        0.2217
	danceability        0.1070
	energy              0.1324
	instrumentalness    0.2083
	liveness            0.0946
	speechiness         0.0576
	valence             0.1919
Epoch 10:


Training: 100%|██████████| 165/165 [00:11<00:00, 13.99it/s, loss=0.1412, lr=0.0009939057285945929]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.82it/s, loss=0.1420]


Validation loss: 0.14199090216841018
MAE per label:
	acousticness        0.2119
	danceability        0.1073
	energy              0.1295
	instrumentalness    0.2069
	liveness            0.0956
	speechiness         0.0558
	valence             0.1869
Saved new best model
Epoch 11:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.16it/s, loss=0.1397, lr=0.0009926291164466126]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.92it/s, loss=0.1423]


Validation loss: 0.14226612200339636
MAE per label:
	acousticness        0.2099
	danceability        0.1071
	energy              0.1311
	instrumentalness    0.2128
	liveness            0.0938
	speechiness         0.0539
	valence             0.1873
Epoch 12:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.11it/s, loss=0.1382, lr=0.0009912321891107005]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.00it/s, loss=0.1403]


Validation loss: 0.1403114398320516
MAE per label:
	acousticness        0.2102
	danceability        0.1053
	energy              0.1283
	instrumentalness    0.2071
	liveness            0.0939
	speechiness         0.0549
	valence             0.1826
Saved new best model
Epoch 13:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.15it/s, loss=0.1393, lr=0.0009897152912577737]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.01it/s, loss=0.1394]


Validation loss: 0.13944441789672488
MAE per label:
	acousticness        0.2070
	danceability        0.1070
	energy              0.1266
	instrumentalness    0.2037
	liveness            0.0936
	speechiness         0.0538
	valence             0.1845
Saved new best model
Epoch 14:


Training: 100%|██████████| 165/165 [00:11<00:00, 13.99it/s, loss=0.1382, lr=0.0009880787971596797]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.10it/s, loss=0.1420]


Validation loss: 0.14200138584488914
MAE per label:
	acousticness        0.2149
	danceability        0.1070
	energy              0.1283
	instrumentalness    0.2122
	liveness            0.0939
	speechiness         0.0533
	valence             0.1844
Epoch 15:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.33it/s, loss=0.1377, lr=0.0009863231105968495]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.08it/s, loss=0.1392]


Validation loss: 0.1391877168346019
MAE per label:
	acousticness        0.2091
	danceability        0.1079
	energy              0.1244
	instrumentalness    0.2016
	liveness            0.0933
	speechiness         0.0527
	valence             0.1854
Saved new best model
Epoch 16:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.46it/s, loss=0.1368, lr=0.000984448664758672]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.95it/s, loss=0.1391]


Validation loss: 0.13911519518920354
MAE per label:
	acousticness        0.2076
	danceability        0.1048
	energy              0.1253
	instrumentalness    0.2034
	liveness            0.0939
	speechiness         0.0525
	valence             0.1863
Saved new best model
Epoch 17:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.32it/s, loss=0.1367, lr=0.0009824559221366096]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.10it/s, loss=0.1381]


Validation loss: 0.13812380461465745
MAE per label:
	acousticness        0.2040
	danceability        0.1046
	energy              0.1271
	instrumentalness    0.2022
	liveness            0.0924
	speechiness         0.0539
	valence             0.1827
Saved new best model
Epoch 18:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.32it/s, loss=0.1350, lr=0.0009803453744100864]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.05it/s, loss=0.1400]


Validation loss: 0.13995802260580517
MAE per label:
	acousticness        0.2059
	danceability        0.1104
	energy              0.1264
	instrumentalness    0.2068
	liveness            0.0936
	speechiness         0.0527
	valence             0.1839
Epoch 19:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.24it/s, loss=0.1351, lr=0.0009781175423251728]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.99it/s, loss=0.1410]


Validation loss: 0.14096161616700037
MAE per label:
	acousticness        0.2183
	danceability        0.1063
	energy              0.1256
	instrumentalness    0.2104
	liveness            0.0929
	speechiness         0.0525
	valence             0.1807
Epoch 20:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.29it/s, loss=0.1342, lr=0.0009757729755661004]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.03it/s, loss=0.1384]


Validation loss: 0.1383972785302571
MAE per label:
	acousticness        0.2126
	danceability        0.1068
	energy              0.1233
	instrumentalness    0.2008
	liveness            0.0940
	speechiness         0.0522
	valence             0.1790
Epoch 21:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.57it/s, loss=0.1341, lr=0.0009733122526196342]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.01it/s, loss=0.1375]


Validation loss: 0.1374866980172339
MAE per label:
	acousticness        0.2025
	danceability        0.1052
	energy              0.1234
	instrumentalness    0.2009
	liveness            0.0942
	speechiness         0.0531
	valence             0.1830
Saved new best model
Epoch 22:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.30it/s, loss=0.1322, lr=0.000970735980632341]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.11it/s, loss=0.1365]


Validation loss: 0.13651976769878751
MAE per label:
	acousticness        0.2023
	danceability        0.1035
	energy              0.1256
	instrumentalness    0.1988
	liveness            0.0931
	speechiness         0.0536
	valence             0.1789
Saved new best model
Epoch 23:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.34it/s, loss=0.1330, lr=0.0009680447952607838]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.04it/s, loss=0.1385]


Validation loss: 0.13849144925673804
MAE per label:
	acousticness        0.2083
	danceability        0.1036
	energy              0.1271
	instrumentalness    0.2043
	liveness            0.0932
	speechiness         0.0525
	valence             0.1804
Epoch 24:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.36it/s, loss=0.1326, lr=0.0009652393605146839]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.05it/s, loss=0.1369]


Validation loss: 0.1368947055722986
MAE per label:
	acousticness        0.2086
	danceability        0.1041
	energy              0.1232
	instrumentalness    0.1983
	liveness            0.0931
	speechiness         0.0524
	valence             0.1785
Epoch 25:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.43it/s, loss=0.1319, lr=0.0009623203685930864]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.39it/s, loss=0.1361]


Validation loss: 0.13611660063976333
MAE per label:
	acousticness        0.2047
	danceability        0.1027
	energy              0.1235
	instrumentalness    0.1966
	liveness            0.0931
	speechiness         0.0520
	valence             0.1803
Saved new best model
Epoch 26:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.60it/s, loss=0.1322, lr=0.00095928853971357]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.26it/s, loss=0.1367]


Validation loss: 0.13670263989340692
MAE per label:
	acousticness        0.2021
	danceability        0.1025
	energy              0.1240
	instrumentalness    0.2017
	liveness            0.0935
	speechiness         0.0522
	valence             0.1810
Epoch 27:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.33it/s, loss=0.1309, lr=0.0009561446219345448]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.06it/s, loss=0.1367]


Validation loss: 0.13673764946205275
MAE per label:
	acousticness        0.2060
	danceability        0.1027
	energy              0.1263
	instrumentalness    0.1988
	liveness            0.0927
	speechiness         0.0523
	valence             0.1783
Epoch 28:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.52it/s, loss=0.1335, lr=0.0009528893909706791]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.11it/s, loss=0.1379]


Validation loss: 0.1378722403730665
MAE per label:
	acousticness        0.2057
	danceability        0.1051
	energy              0.1241
	instrumentalness    0.1993
	liveness            0.0929
	speechiness         0.0510
	valence             0.1871
Epoch 29:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.39it/s, loss=0.1341, lr=0.0009495236500015041]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.08it/s, loss=0.1358]


Validation loss: 0.13578155086863608
MAE per label:
	acousticness        0.2044
	danceability        0.1015
	energy              0.1249
	instrumentalness    0.1958
	liveness            0.0931
	speechiness         0.0520
	valence             0.1787
Saved new best model
Epoch 30:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.47it/s, loss=0.1315, lr=0.0009460482294732414]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.08it/s, loss=0.1359]


Validation loss: 0.13592888326162383
MAE per label:
	acousticness        0.2035
	danceability        0.1038
	energy              0.1223
	instrumentalness    0.1975
	liveness            0.0935
	speechiness         0.0525
	valence             0.1784
Epoch 31:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.56it/s, loss=0.1313, lr=0.0009424639868939027]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.12it/s, loss=0.1360]


Validation loss: 0.13599198613138425
MAE per label:
	acousticness        0.1999
	danceability        0.1014
	energy              0.1258
	instrumentalness    0.2018
	liveness            0.0926
	speechiness         0.0532
	valence             0.1771
Epoch 32:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.39it/s, loss=0.1315, lr=0.0009387718066217117]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.08it/s, loss=0.1357]


Validation loss: 0.13569964752310798
MAE per label:
	acousticness        0.2004
	danceability        0.1035
	energy              0.1218
	instrumentalness    0.1985
	liveness            0.0937
	speechiness         0.0515
	valence             0.1805
Saved new best model
Epoch 33:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.80it/s, loss=0.1291, lr=0.000934972599646904]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.99it/s, loss=0.1351]


Validation loss: 0.13512074432912327
MAE per label:
	acousticness        0.1993
	danceability        0.1016
	energy              0.1217
	instrumentalness    0.1970
	liveness            0.0934
	speechiness         0.0525
	valence             0.1804
Saved new best model
Epoch 34:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.22it/s, loss=0.1297, lr=0.0009310673033669515]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.04it/s, loss=0.1344]


Validation loss: 0.1343592797361669
MAE per label:
	acousticness        0.1989
	danceability        0.1021
	energy              0.1217
	instrumentalness    0.1941
	liveness            0.0925
	speechiness         0.0510
	valence             0.1801
Saved new best model
Epoch 35:


Training: 100%|██████████| 165/165 [00:29<00:00,  5.58it/s, loss=0.1302, lr=0.0009270568813552751]
Validation: 100%|██████████| 42/42 [00:20<00:00,  2.02it/s, loss=0.1360]


Validation loss: 0.13602009876852944
MAE per label:
	acousticness        0.2079
	danceability        0.1009
	energy              0.1234
	instrumentalness    0.1945
	liveness            0.0929
	speechiness         0.0510
	valence             0.1817
Epoch 36:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.42it/s, loss=0.1307, lr=0.0009229423231234969]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.10it/s, loss=0.1349]


Validation loss: 0.13485559608255113
MAE per label:
	acousticness        0.1981
	danceability        0.1026
	energy              0.1220
	instrumentalness    0.1978
	liveness            0.0930
	speechiness         0.0525
	valence             0.1781
Epoch 37:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.46it/s, loss=0.1280, lr=0.0009187246438772933]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.04it/s, loss=0.1347]


Validation loss: 0.13468844354862258
MAE per label:
	acousticness        0.2001
	danceability        0.1020
	energy              0.1195
	instrumentalness    0.1961
	liveness            0.0924
	speechiness         0.0510
	valence             0.1817
Epoch 38:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.42it/s, loss=0.1301, lr=0.0009144048842659075]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.09it/s, loss=0.1344]


Validation loss: 0.13436609045380637
MAE per label:
	acousticness        0.1978
	danceability        0.1014
	energy              0.1192
	instrumentalness    0.1989
	liveness            0.0927
	speechiness         0.0510
	valence             0.1796
Epoch 39:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.45it/s, loss=0.1320, lr=0.0009099841101253861]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.85it/s, loss=0.1365]


Validation loss: 0.13653446751691045
MAE per label:
	acousticness        0.2034
	danceability        0.1038
	energy              0.1230
	instrumentalness    0.2015
	liveness            0.0925
	speechiness         0.0528
	valence             0.1787
Epoch 40:


Training: 100%|██████████| 165/165 [00:40<00:00,  4.07it/s, loss=0.1286, lr=0.0009054634122155986]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.10it/s, loss=0.1353]


Validation loss: 0.13532371109440214
MAE per label:
	acousticness        0.1991
	danceability        0.1033
	energy              0.1205
	instrumentalness    0.2023
	liveness            0.0928
	speechiness         0.0524
	valence             0.1769
Epoch 41:


Training: 100%|██████████| 165/165 [00:28<00:00,  5.81it/s, loss=0.1276, lr=0.0009008439059511095]
Validation: 100%|██████████| 42/42 [00:16<00:00,  2.57it/s, loss=0.1338]


Validation loss: 0.13376101443455332
MAE per label:
	acousticness        0.1979
	danceability        0.1002
	energy              0.1208
	instrumentalness    0.1959
	liveness            0.0924
	speechiness         0.0510
	valence             0.1781
Saved new best model
Epoch 42:


Training: 100%|██████████| 165/165 [02:11<00:00,  1.25it/s, loss=0.1281, lr=0.0008961267311259662]
Validation: 100%|██████████| 42/42 [00:31<00:00,  1.31it/s, loss=0.1338]


Validation loss: 0.13378590292164258
MAE per label:
	acousticness        0.1963
	danceability        0.1007
	energy              0.1206
	instrumentalness    0.1961
	liveness            0.0925
	speechiness         0.0518
	valence             0.1785
Epoch 43:


Training: 100%|██████████| 165/165 [00:44<00:00,  3.71it/s, loss=0.1293, lr=0.0008913130516324728]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.59it/s, loss=0.1341]


Validation loss: 0.13408838123792693
MAE per label:
	acousticness        0.1965
	danceability        0.1009
	energy              0.1215
	instrumentalness    0.1985
	liveness            0.0931
	speechiness         0.0513
	valence             0.1768
Epoch 44:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.33it/s, loss=0.1275, lr=0.0008864040551740152]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.01it/s, loss=0.1343]


Validation loss: 0.13427860520425297
MAE per label:
	acousticness        0.1993
	danceability        0.1025
	energy              0.1202
	instrumentalness    0.1938
	liveness            0.0927
	speechiness         0.0506
	valence             0.1808
Epoch 45:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.42it/s, loss=0.1282, lr=0.000881400952972015]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.05it/s, loss=0.1335]


Validation loss: 0.13350777753761836
MAE per label:
	acousticness        0.1973
	danceability        0.1024
	energy              0.1193
	instrumentalness    0.1953
	liveness            0.0927
	speechiness         0.0510
	valence             0.1765
Saved new best model
Epoch 46:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.38it/s, loss=0.1266, lr=0.0008763049794670771]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.05it/s, loss=0.1339]


Validation loss: 0.13391408998341786
MAE per label:
	acousticness        0.1959
	danceability        0.1001
	energy              0.1210
	instrumentalness    0.1998
	liveness            0.0926
	speechiness         0.0505
	valence             0.1776
Epoch 47:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.30it/s, loss=0.1264, lr=0.0008711173920144113]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.04it/s, loss=0.1333]


Validation loss: 0.13334922900512106
MAE per label:
	acousticness        0.1980
	danceability        0.1011
	energy              0.1193
	instrumentalness    0.1926
	liveness            0.0929
	speechiness         0.0509
	valence             0.1785
Saved new best model
Epoch 48:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.41it/s, loss=0.1277, lr=0.0008658394705735983]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.09it/s, loss=0.1343]


Validation loss: 0.13430901687769664
MAE per label:
	acousticness        0.1966
	danceability        0.1038
	energy              0.1222
	instrumentalness    0.1937
	liveness            0.0925
	speechiness         0.0503
	valence             0.1810
Epoch 49:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.55it/s, loss=0.1269, lr=0.0008604725173927781]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.37it/s, loss=0.1325]


Validation loss: 0.13247257790395192
MAE per label:
	acousticness        0.1921
	danceability        0.1007
	energy              0.1213
	instrumentalness    0.1920
	liveness            0.0928
	speechiness         0.0501
	valence             0.1783
Saved new best model
Epoch 50:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.46it/s, loss=0.1263, lr=0.0008550178566873405]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.11it/s, loss=0.1326]


Validation loss: 0.1325515198210875
MAE per label:
	acousticness        0.1949
	danceability        0.1004
	energy              0.1200
	instrumentalness    0.1927
	liveness            0.0926
	speechiness         0.0494
	valence             0.1779
Epoch 51:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.28it/s, loss=0.1255, lr=0.0008494768343131951]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.04it/s, loss=0.1322]


Validation loss: 0.13220453759034476
MAE per label:
	acousticness        0.1949
	danceability        0.0993
	energy              0.1191
	instrumentalness    0.1916
	liveness            0.0930
	speechiness         0.0513
	valence             0.1764
Saved new best model
Epoch 52:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.29it/s, loss=0.1259, lr=0.0008438508174347005]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.13it/s, loss=0.1333]


Validation loss: 0.13329118649874414
MAE per label:
	acousticness        0.1999
	danceability        0.1017
	energy              0.1205
	instrumentalness    0.1936
	liveness            0.0925
	speechiness         0.0504
	valence             0.1744
Epoch 53:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.59it/s, loss=0.1262, lr=0.0008381411941873374]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.04it/s, loss=0.1329]


Validation loss: 0.13287096328678585
MAE per label:
	acousticness        0.1940
	danceability        0.1006
	energy              0.1191
	instrumentalness    0.1964
	liveness            0.0924
	speechiness         0.0505
	valence             0.1772
Epoch 54:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.39it/s, loss=0.1258, lr=0.0008323493733352073]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.10it/s, loss=0.1330]


Validation loss: 0.13304500299550237
MAE per label:
	acousticness        0.1963
	danceability        0.1006
	energy              0.1209
	instrumentalness    0.1944
	liveness            0.0923
	speechiness         0.0499
	valence             0.1770
Epoch 55:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.31it/s, loss=0.1265, lr=0.0008264767839234405]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.06it/s, loss=0.1322]


Validation loss: 0.13215010045539766
MAE per label:
	acousticness        0.1915
	danceability        0.0999
	energy              0.1183
	instrumentalness    0.1951
	liveness            0.0926
	speechiness         0.0486
	valence             0.1790
Saved new best model
Epoch 56:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.18it/s, loss=0.1255, lr=0.000820524874925601]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.01it/s, loss=0.1334]


Validation loss: 0.13339893271525702
MAE per label:
	acousticness        0.1941
	danceability        0.1013
	energy              0.1197
	instrumentalness    0.2002
	liveness            0.0921
	speechiness         0.0495
	valence             0.1768
Epoch 57:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.39it/s, loss=0.1257, lr=0.0008144951148861737]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.11it/s, loss=0.1314]


Validation loss: 0.13142094388604164
MAE per label:
	acousticness        0.1913
	danceability        0.0998
	energy              0.1193
	instrumentalness    0.1916
	liveness            0.0924
	speechiness         0.0495
	valence             0.1760
Saved new best model
Epoch 58:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.44it/s, loss=0.1238, lr=0.0008083889915582231]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.01it/s, loss=0.1332]


Validation loss: 0.13321123733406975
MAE per label:
	acousticness        0.1924
	danceability        0.1002
	energy              0.1191
	instrumentalness    0.2007
	liveness            0.0920
	speechiness         0.0497
	valence             0.1783
Epoch 59:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.39it/s, loss=0.1247, lr=0.0008022080115363123]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.12it/s, loss=0.1335]


Validation loss: 0.13346021994948387
MAE per label:
	acousticness        0.1978
	danceability        0.1011
	energy              0.1181
	instrumentalness    0.1979
	liveness            0.0927
	speechiness         0.0488
	valence             0.1778
Epoch 60:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.46it/s, loss=0.1249, lr=0.0007959536998847739]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.06it/s, loss=0.1324]


Validation loss: 0.13235024291844594
MAE per label:
	acousticness        0.1942
	danceability        0.1011
	energy              0.1195
	instrumentalness    0.1960
	liveness            0.0923
	speechiness         0.0489
	valence             0.1745
Epoch 61:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.40it/s, loss=0.1247, lr=0.0007896275997614226]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.10it/s, loss=0.1320]


Validation loss: 0.1319653947083723
MAE per label:
	acousticness        0.1983
	danceability        0.1017
	energy              0.1176
	instrumentalness    0.1913
	liveness            0.0926
	speechiness         0.0481
	valence             0.1742
Epoch 62:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.29it/s, loss=0.1246, lr=0.0007832312720368044]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.99it/s, loss=0.1326]


Validation loss: 0.13261526290859496
MAE per label:
	acousticness        0.1986
	danceability        0.1017
	energy              0.1182
	instrumentalness    0.1915
	liveness            0.0929
	speechiness         0.0492
	valence             0.1762
Epoch 63:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.31it/s, loss=0.1257, lr=0.0007767662949090749]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.06it/s, loss=0.1319]


Validation loss: 0.13185649560320944
MAE per label:
	acousticness        0.1918
	danceability        0.1003
	energy              0.1195
	instrumentalness    0.1962
	liveness            0.0924
	speechiness         0.0484
	valence             0.1745
Epoch 64:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.57it/s, loss=0.1252, lr=0.000770234263514603]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.07it/s, loss=0.1317]


Validation loss: 0.1317012095380397
MAE per label:
	acousticness        0.1915
	danceability        0.1003
	energy              0.1179
	instrumentalness    0.1918
	liveness            0.0923
	speechiness         0.0497
	valence             0.1784
Epoch 65:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.44it/s, loss=0.1247, lr=0.0007636367895343944]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.12it/s, loss=0.1315]


Validation loss: 0.13150737133054508
MAE per label:
	acousticness        0.1913
	danceability        0.1002
	energy              0.1182
	instrumentalness    0.1936
	liveness            0.0920
	speechiness         0.0491
	valence             0.1761
Epoch 66:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.29it/s, loss=0.1244, lr=0.0007569755007964335]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.20it/s, loss=0.1311]


Validation loss: 0.13108325625459352
MAE per label:
	acousticness        0.1932
	danceability        0.0999
	energy              0.1180
	instrumentalness    0.1894
	liveness            0.0922
	speechiness         0.0493
	valence             0.1756
Saved new best model
Epoch 67:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.25it/s, loss=0.1229, lr=0.0007502520408740415]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.05it/s, loss=0.1313]


Validation loss: 0.1312895848282746
MAE per label:
	acousticness        0.1931
	danceability        0.1006
	energy              0.1174
	instrumentalness    0.1920
	liveness            0.0923
	speechiness         0.0495
	valence             0.1741
Epoch 68:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.38it/s, loss=0.1227, lr=0.0007434680686803488]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.03it/s, loss=0.1310]


Validation loss: 0.13101922294923238
MAE per label:
	acousticness        0.1931
	danceability        0.0996
	energy              0.1166
	instrumentalness    0.1927
	liveness            0.0920
	speechiness         0.0488
	valence             0.1744
Saved new best model
Epoch 69:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.32it/s, loss=0.1227, lr=0.0007366252580589837]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.06it/s, loss=0.1312]


Validation loss: 0.131246609524602
MAE per label:
	acousticness        0.1938
	danceability        0.0988
	energy              0.1185
	instrumentalness    0.1924
	liveness            0.0921
	speechiness         0.0489
	valence             0.1743
Epoch 70:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.31it/s, loss=0.1244, lr=0.0007297252973710754]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.03it/s, loss=0.1325]


Validation loss: 0.13248068484522046
MAE per label:
	acousticness        0.1930
	danceability        0.0995
	energy              0.1194
	instrumentalness    0.1948
	liveness            0.0921
	speechiness         0.0506
	valence             0.1781
Epoch 71:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.56it/s, loss=0.1250, lr=0.0007227698890786777]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.34it/s, loss=0.1322]


Validation loss: 0.1321639326356706
MAE per label:
	acousticness        0.1901
	danceability        0.1008
	energy              0.1174
	instrumentalness    0.1957
	liveness            0.0921
	speechiness         0.0494
	valence             0.1797
Epoch 72:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.30it/s, loss=0.1234, lr=0.0007157607493247108]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.02it/s, loss=0.1316]


Validation loss: 0.131598373431535
MAE per label:
	acousticness        0.1912
	danceability        0.1006
	energy              0.1172
	instrumentalness    0.1966
	liveness            0.0921
	speechiness         0.0492
	valence             0.1743
Epoch 73:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.33it/s, loss=0.1240, lr=0.0007086996075095287]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.05it/s, loss=0.1316]


Validation loss: 0.1316260832051436
MAE per label:
	acousticness        0.1946
	danceability        0.0995
	energy              0.1176
	instrumentalness    0.1918
	liveness            0.0922
	speechiness         0.0494
	valence             0.1762
Epoch 74:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.28it/s, loss=0.1231, lr=0.0007015882058642161]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.02it/s, loss=0.1317]


Validation loss: 0.13169878916371436
MAE per label:
	acousticness        0.1926
	danceability        0.0995
	energy              0.1179
	instrumentalness    0.1954
	liveness            0.0920
	speechiness         0.0485
	valence             0.1761
Epoch 75:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.64it/s, loss=0.1254, lr=0.0006944282990207193]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.10it/s, loss=0.1322]


Validation loss: 0.13219182565808296
MAE per label:
	acousticness        0.1945
	danceability        0.0984
	energy              0.1209
	instrumentalness    0.1920
	liveness            0.0926
	speechiness         0.0486
	valence             0.1783
Epoch 76:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.42it/s, loss=0.1228, lr=0.0006872216535789155]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.06it/s, loss=0.1314]


Validation loss: 0.1313645179782595
MAE per label:
	acousticness        0.1940
	danceability        0.0987
	energy              0.1164
	instrumentalness    0.1953
	liveness            0.0921
	speechiness         0.0485
	valence             0.1747
Epoch 77:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.44it/s, loss=0.1234, lr=0.0006799700476707321]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.09it/s, loss=0.1307]


Validation loss: 0.1306642136048703
MAE per label:
	acousticness        0.1907
	danceability        0.0989
	energy              0.1174
	instrumentalness    0.1922
	liveness            0.0919
	speechiness         0.0491
	valence             0.1745
Saved new best model
Epoch 78:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.21it/s, loss=0.1231, lr=0.0006726752705214191]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.00it/s, loss=0.1317]


Validation loss: 0.1317442006298474
MAE per label:
	acousticness        0.1905
	danceability        0.0990
	energy              0.1179
	instrumentalness    0.2006
	liveness            0.0921
	speechiness         0.0483
	valence             0.1739
Epoch 79:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.29it/s, loss=0.1213, lr=0.0006653391220080838]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.05it/s, loss=0.1304]


Validation loss: 0.13035913468116805
MAE per label:
	acousticness        0.1938
	danceability        0.0975
	energy              0.1164
	instrumentalness    0.1891
	liveness            0.0924
	speechiness         0.0489
	valence             0.1743
Saved new best model
Epoch 80:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.41it/s, loss=0.1225, lr=0.0006579634122155988]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.09it/s, loss=0.1319]


Validation loss: 0.131863284856081
MAE per label:
	acousticness        0.1925
	danceability        0.0990
	energy              0.1171
	instrumentalness    0.2005
	liveness            0.0920
	speechiness         0.0485
	valence             0.1734
Epoch 81:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.82it/s, loss=0.1218, lr=0.0006505499609899903]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.33it/s, loss=0.1308]


Validation loss: 0.13083472989854358
MAE per label:
	acousticness        0.1916
	danceability        0.0987
	energy              0.1179
	instrumentalness    0.1920
	liveness            0.0922
	speechiness         0.0489
	valence             0.1746
Epoch 82:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.45it/s, loss=0.1217, lr=0.0006431005974894184]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.00it/s, loss=0.1309]


Validation loss: 0.13089310697146825
MAE per label:
	acousticness        0.1909
	danceability        0.0970
	energy              0.1183
	instrumentalness    0.1927
	liveness            0.0923
	speechiness         0.0488
	valence             0.1763
Epoch 83:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.38it/s, loss=0.1222, lr=0.0006356171597328593]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.26it/s, loss=0.1311]


Validation loss: 0.13113955426074209
MAE per label:
	acousticness        0.1943
	danceability        0.0981
	energy              0.1186
	instrumentalness    0.1891
	liveness            0.0924
	speechiness         0.0491
	valence             0.1764
Epoch 84:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.65it/s, loss=0.1229, lr=0.0006281014941466028]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.06it/s, loss=0.1307]


Validation loss: 0.13071530063947043
MAE per label:
	acousticness        0.1935
	danceability        0.0992
	energy              0.1179
	instrumentalness    0.1884
	liveness            0.0921
	speechiness         0.0486
	valence             0.1754
Epoch 85:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.46it/s, loss=0.1212, lr=0.000620555455108673]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.78it/s, loss=0.1312]


Validation loss: 0.1312478000209445
MAE per label:
	acousticness        0.1924
	danceability        0.1017
	energy              0.1177
	instrumentalness    0.1926
	liveness            0.0922
	speechiness         0.0481
	valence             0.1742
Epoch 86:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.38it/s, loss=0.1207, lr=0.0006129809044912885]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.21it/s, loss=0.1307]


Validation loss: 0.1307020710692519
MAE per label:
	acousticness        0.1935
	danceability        0.0979
	energy              0.1176
	instrumentalness    0.1908
	liveness            0.0922
	speechiness         0.0486
	valence             0.1743
Epoch 87:


Training:  18%|█▊        | 30/165 [00:04<00:19,  7.07it/s, loss=0.1227, lr=0.0006053797112014735]


KeyboardInterrupt: 

In [6]:
model.load_state_dict(torch.load('best_model.pt'))

y, sr = librosa.load("Dancing Queen.flac", sr=16000)
log_mel = vggish_input.waveform_to_examples(y, sample_rate=16000).to(device).float()
log_mel = log_mel.squeeze(1)
log_mel = log_mel.unsqueeze(0)

model.eval()

with torch.inference_mode():
    with torch.autocast(device_type="cuda"):
        pred = model(log_mel)

pred = pred[0].cpu().numpy()
for i, label in enumerate(labels):
    print(f"{label:<20}{pred[i]:.4f}")

acousticness        0.4377
danceability        0.5791
energy              0.7266
instrumentalness    0.5913
liveness            0.1249
speechiness         0.0463
valence             0.7236
