In [1]:
%cd /home/jaeheonshim/music-vibes

/home/jaeheonshim/music-vibes


In [None]:
import torch
import numpy as np
import pandas as pd
from torch import nn
from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.nn.functional as F
from torch.utils.data import random_split, DataLoader, Subset
import librosa
from vibenet.dataset import FMAVGGishDataset
from vibenet.models import VGGishMLP
from tqdm import tqdm
from torchvggish import vggish_input

In [3]:
labels = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence']

In [4]:
dataset = FMAVGGishDataset('data/preprocessed/vggish')
print(dataset[0][0])
# dataset = Subset(dataset, list(range(50)))

N = len(dataset)
print(N)
train_size = int(N * 0.8)
train_ds, test_ds = random_split(dataset, [train_size, N - train_size], generator=torch.Generator().manual_seed(42))
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)

[[[-2.7525146  -2.683021   -2.6432276  ... -3.9684596  -3.9523442
   -3.869654  ]
  [ 0.2586684   0.8037823   1.1073251  ... -2.2905943  -1.5651563
   -1.6188581 ]
  [ 0.42302436  1.05617     0.926886   ... -0.8017502  -1.0229014
   -1.076192  ]
  ...
  [ 0.88455707  0.4266815   1.2014482  ... -0.1787137  -0.5918389
    0.19124033]
  [ 1.2265248   1.0417923   0.40203607 ... -0.55309474 -0.64335
   -1.0096501 ]
  [ 1.2111201   1.3818171   2.2253103  ... -0.9067962  -1.1063846
   -0.70985943]]

 [[ 0.8623978   1.2762761   2.1428616  ... -0.6347559  -0.6723031
   -0.95313996]
  [ 1.1082301   1.233897    1.2746985  ... -0.9564062  -0.79365534
   -0.35816047]
  [ 1.0627131   1.342058    2.4580252  ... -0.79503554 -0.6214866
   -0.40497547]
  ...
  [ 2.5603573   2.0551615   1.9706583  ...  0.7395899  -0.07511103
   -0.7212197 ]
  [ 2.4735575   1.516179   -0.05098555 ...  0.9755231   0.33867064
    0.08468862]
  [ 0.9329136  -0.07456745  0.80708444 ...  1.183292    1.3899543
    1.3542749 ]]


In [None]:
device = 'cuda'

model = VGGishMLP()
model = model.to(device)

NUM_EPOCHS = 100

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS, eta_min=1e-6)

In [None]:
best_loss = 1

for epoch in range(NUM_EPOCHS):
    scheduler.step()

    print(f"Epoch {epoch+1}:")
    model.train()
    
    train_losses = []

    with tqdm(train_dl, desc='Training') as pbar:
        for data, label in pbar:
            data, label = data.to(device).float(), label.to(device).float()
            optimizer.zero_grad()

            with torch.autocast(device_type="cuda"):
                pred = model(data)
                loss = F.l1_loss(pred, label)

            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())
            train_loss = np.mean(train_losses)

            pbar.set_postfix({'loss': f"{train_loss:.4f}", 'lr': f"{scheduler.get_last_lr()[0]}"})

    model.eval()

    eval_losses = []
    label_maes = []

    with tqdm(test_dl, desc='Validation') as pbar:
        with torch.inference_mode():
            for data, label in pbar:
                data, label = data.to(device).float(), label.to(device).float()

                pred = model(data)
                loss = F.l1_loss(pred, label)

                eval_losses.append(loss.item())
                eval_loss = np.mean(eval_losses)

                mae = torch.mean(torch.abs(pred - label), dim=0) # for per-metric MAE
                label_maes.append(mae.cpu().numpy())

                pbar.set_postfix({'loss': f"{eval_loss:.4f}"})
    
    eval_loss = np.mean(eval_losses)
    print("Validation loss:", eval_loss)

    label_mae = np.mean(label_maes, axis=0)
    print("MAE per label:")
    for i, label in enumerate(labels):
        print(f"\t{label:<20}{label_mae[i]:.4f}")

    if eval_loss < best_loss:
        best_loss = eval_loss
        torch.save(model.state_dict(), 'best_model.pt')
        print('Saved new best model')

Epoch 1:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.38it/s, loss=0.1529, lr=0.0009977832013192385]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.35it/s, loss=0.1456]


Validation loss: 0.14564063648382822
MAE per label:
	acousticness        0.2308
	danceability        0.1007
	energy              0.1258
	instrumentalness    0.2165
	liveness            0.0947
	speechiness         0.0532
	valence             0.1978
Saved new best model
Epoch 2:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.39it/s, loss=0.1482, lr=0.0009960612933065818]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.17it/s, loss=0.1456]


Validation loss: 0.14562499523162842
MAE per label:
	acousticness        0.2261
	danceability        0.1065
	energy              0.1249
	instrumentalness    0.2151
	liveness            0.0966
	speechiness         0.0516
	valence             0.1986
Saved new best model
Epoch 3:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.51it/s, loss=0.1473, lr=0.0009938503261272714]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.07it/s, loss=0.1493]


Validation loss: 0.14930678513788043
MAE per label:
	acousticness        0.2517
	danceability        0.1035
	energy              0.1265
	instrumentalness    0.2162
	liveness            0.1013
	speechiness         0.0529
	valence             0.1930
Epoch 4:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.73it/s, loss=0.1459, lr=0.00099115248173898]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.10it/s, loss=0.1425]


Validation loss: 0.1424621462467171
MAE per label:
	acousticness        0.2196
	danceability        0.0989
	energy              0.1241
	instrumentalness    0.2111
	liveness            0.1001
	speechiness         0.0522
	valence             0.1912
Saved new best model
Epoch 5:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.50it/s, loss=0.1443, lr=0.0009879704225884043]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.08it/s, loss=0.1418]


Validation loss: 0.14179008081555367
MAE per label:
	acousticness        0.2153
	danceability        0.0994
	energy              0.1249
	instrumentalness    0.2180
	liveness            0.0954
	speechiness         0.0504
	valence             0.1891
Saved new best model
Epoch 6:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.54it/s, loss=0.1432, lr=0.0009843072889837512]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.07it/s, loss=0.1417]


Validation loss: 0.14167736151388713
MAE per label:
	acousticness        0.2185
	danceability        0.1064
	energy              0.1242
	instrumentalness    0.2111
	liveness            0.0931
	speechiness         0.0492
	valence             0.1893
Saved new best model
Epoch 7:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.83it/s, loss=0.1426, lr=0.000980166695995633]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.09it/s, loss=0.1414]


Validation loss: 0.1413938914026533
MAE per label:
	acousticness        0.2237
	danceability        0.0973
	energy              0.1301
	instrumentalness    0.2095
	liveness            0.0936
	speechiness         0.0488
	valence             0.1867
Saved new best model
Epoch 8:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.44it/s, loss=0.1434, lr=0.0009755527298894294]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.13it/s, loss=0.1396]


Validation loss: 0.13959051313854398
MAE per label:
	acousticness        0.2124
	danceability        0.1008
	energy              0.1218
	instrumentalness    0.2082
	liveness            0.0953
	speechiness         0.0505
	valence             0.1881
Saved new best model
Epoch 9:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.98it/s, loss=0.1412, lr=0.0009704699440926358]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.39it/s, loss=0.1396]


Validation loss: 0.13955203355068252
MAE per label:
	acousticness        0.2190
	danceability        0.0981
	energy              0.1248
	instrumentalness    0.2070
	liveness            0.0932
	speechiness         0.0487
	valence             0.1861
Saved new best model
Epoch 10:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.82it/s, loss=0.1403, lr=0.0009649233547011817]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.14it/s, loss=0.1379]


Validation loss: 0.1378889608950842
MAE per label:
	acousticness        0.2091
	danceability        0.0972
	energy              0.1208
	instrumentalness    0.2071
	liveness            0.0940
	speechiness         0.0490
	valence             0.1880
Saved new best model
Epoch 11:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.49it/s, loss=0.1401, lr=0.0009589184355291487]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.14it/s, loss=0.1387]


Validation loss: 0.13868351475823493
MAE per label:
	acousticness        0.2107
	danceability        0.0993
	energy              0.1228
	instrumentalness    0.2077
	liveness            0.0945
	speechiness         0.0495
	valence             0.1862
Epoch 12:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.45it/s, loss=0.1401, lr=0.000952461112706777]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.10it/s, loss=0.1395]


Validation loss: 0.1395109079423405
MAE per label:
	acousticness        0.2077
	danceability        0.1000
	energy              0.1303
	instrumentalness    0.2092
	liveness            0.0933
	speechiness         0.0494
	valence             0.1868
Epoch 13:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.37it/s, loss=0.1408, lr=0.00094555775883209]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.93it/s, loss=0.1434]


Validation loss: 0.143352849142892
MAE per label:
	acousticness        0.2211
	danceability        0.0985
	energy              0.1362
	instrumentalness    0.2064
	liveness            0.0940
	speechiness         0.0495
	valence             0.1978
Epoch 14:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.31it/s, loss=0.1397, lr=0.0009382151866819101]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.98it/s, loss=0.1372]


Validation loss: 0.1372495935786338
MAE per label:
	acousticness        0.2104
	danceability        0.0991
	energy              0.1212
	instrumentalness    0.2054
	liveness            0.0926
	speechiness         0.0465
	valence             0.1856
Saved new best model
Epoch 15:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.38it/s, loss=0.1400, lr=0.0009304406424884702]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.12it/s, loss=0.1389]


Validation loss: 0.13888053223490715
MAE per label:
	acousticness        0.2162
	danceability        0.0964
	energy              0.1320
	instrumentalness    0.2055
	liveness            0.0921
	speechiness         0.0468
	valence             0.1833
Epoch 16:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.56it/s, loss=0.1388, lr=0.000922241798788257]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.17it/s, loss=0.1411]


Validation loss: 0.14108668161290033
MAE per label:
	acousticness        0.2200
	danceability        0.0996
	energy              0.1313
	instrumentalness    0.2058
	liveness            0.0925
	speechiness         0.0494
	valence             0.1890
Epoch 17:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.34it/s, loss=0.1386, lr=0.0009136267468501441]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.99it/s, loss=0.1376]


Validation loss: 0.13758148891585215
MAE per label:
	acousticness        0.2109
	danceability        0.1018
	energy              0.1222
	instrumentalness    0.2098
	liveness            0.0918
	speechiness         0.0463
	valence             0.1803
Epoch 18:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.45it/s, loss=0.1379, lr=0.0009046039886902867]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.06it/s, loss=0.1382]


Validation loss: 0.13821153608816011
MAE per label:
	acousticness        0.2119
	danceability        0.0981
	energy              0.1266
	instrumentalness    0.2069
	liveness            0.0927
	speechiness         0.0480
	valence             0.1834
Epoch 19:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.50it/s, loss=0.1373, lr=0.0008951824286816577]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.03it/s, loss=0.1363]


Validation loss: 0.13634321306432998
MAE per label:
	acousticness        0.2100
	danceability        0.0966
	energy              0.1184
	instrumentalness    0.2060
	liveness            0.0920
	speechiness         0.0469
	valence             0.1845
Saved new best model
Epoch 20:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.43it/s, loss=0.1364, lr=0.0008853713647665071]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.32it/s, loss=0.1362]


Validation loss: 0.13621270567888305
MAE per label:
	acousticness        0.2113
	danceability        0.0973
	energy              0.1190
	instrumentalness    0.2054
	liveness            0.0920
	speechiness         0.0472
	valence             0.1814
Saved new best model
Epoch 21:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.43it/s, loss=0.1366, lr=0.0008751804792804149]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.02it/s, loss=0.1364]


Validation loss: 0.13644134820926757
MAE per label:
	acousticness        0.2072
	danceability        0.0985
	energy              0.1178
	instrumentalness    0.2058
	liveness            0.0922
	speechiness         0.0489
	valence             0.1848
Epoch 22:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.48it/s, loss=0.1367, lr=0.0008646198293969954]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.10it/s, loss=0.1362]


Validation loss: 0.13623168000153132
MAE per label:
	acousticness        0.2106
	danceability        0.0966
	energy              0.1176
	instrumentalness    0.2076
	liveness            0.0917
	speechiness         0.0472
	valence             0.1822
Epoch 23:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.45it/s, loss=0.1364, lr=0.0008536998372026807]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.10it/s, loss=0.1367]


Validation loss: 0.13668017869903928
MAE per label:
	acousticness        0.2086
	danceability        0.0985
	energy              0.1174
	instrumentalness    0.2041
	liveness            0.0920
	speechiness         0.0476
	valence             0.1886
Epoch 24:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.35it/s, loss=0.1361, lr=0.0008424312794113804]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.05it/s, loss=0.1369]


Validation loss: 0.13691121641369092
MAE per label:
	acousticness        0.2109
	danceability        0.0949
	energy              0.1178
	instrumentalness    0.2080
	liveness            0.0920
	speechiness         0.0482
	valence             0.1866
Epoch 25:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.62it/s, loss=0.1355, lr=0.0008308252767291644]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.03it/s, loss=0.1366]


Validation loss: 0.1365754740933577
MAE per label:
	acousticness        0.2097
	danceability        0.1027
	energy              0.1179
	instrumentalness    0.2075
	liveness            0.0916
	speechiness         0.0468
	valence             0.1798
Epoch 26:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.51it/s, loss=0.1343, lr=0.0008188932828794708]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.04it/s, loss=0.1356]


Validation loss: 0.13561774346800076
MAE per label:
	acousticness        0.2107
	danceability        0.0960
	energy              0.1186
	instrumentalness    0.2029
	liveness            0.0921
	speechiness         0.0472
	valence             0.1819
Saved new best model
Epoch 27:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.34it/s, loss=0.1347, lr=0.0008066470732996621]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.93it/s, loss=0.1340]


Validation loss: 0.13402032390946433
MAE per label:
	acousticness        0.2058
	danceability        0.0942
	energy              0.1165
	instrumentalness    0.2043
	liveness            0.0913
	speechiness         0.0463
	valence             0.1798
Saved new best model
Epoch 28:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.52it/s, loss=0.1344, lr=0.0007940987335200907]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.07it/s, loss=0.1352]


Validation loss: 0.13518685829781352
MAE per label:
	acousticness        0.2043
	danceability        0.0966
	energy              0.1205
	instrumentalness    0.2052
	liveness            0.0917
	speechiness         0.0460
	valence             0.1820
Epoch 29:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.88it/s, loss=0.1341, lr=0.0007812606472371396]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.35it/s, loss=0.1347]


Validation loss: 0.13469056820585615
MAE per label:
	acousticness        0.2062
	danceability        0.0943
	energy              0.1169
	instrumentalness    0.2036
	liveness            0.0915
	speechiness         0.0464
	valence             0.1839
Epoch 30:


Training: 100%|██████████| 165/165 [00:10<00:00, 15.06it/s, loss=0.1329, lr=0.0007681454840920091]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.40it/s, loss=0.1349]


Validation loss: 0.13489606834593273
MAE per label:
	acousticness        0.2088
	danceability        0.0991
	energy              0.1163
	instrumentalness    0.2035
	liveness            0.0916
	speechiness         0.0453
	valence             0.1796
Epoch 31:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.62it/s, loss=0.1330, lr=0.0007547661871673109]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.38it/s, loss=0.1341]


Validation loss: 0.13409813599927084
MAE per label:
	acousticness        0.2044
	danceability        0.0946
	energy              0.1172
	instrumentalness    0.2017
	liveness            0.0915
	speechiness         0.0458
	valence             0.1835
Epoch 32:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.52it/s, loss=0.1329, lr=0.0007411359602138072]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.11it/s, loss=0.1339]


Validation loss: 0.13391085448009626
MAE per label:
	acousticness        0.2047
	danceability        0.0955
	energy              0.1195
	instrumentalness    0.2018
	liveness            0.0911
	speechiness         0.0454
	valence             0.1793
Saved new best model
Epoch 33:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.22it/s, loss=0.1321, lr=0.000727268254619904]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.11it/s, loss=0.1339]


Validation loss: 0.1338789439981892
MAE per label:
	acousticness        0.2044
	danceability        0.0976
	energy              0.1170
	instrumentalness    0.2011
	liveness            0.0908
	speechiness         0.0458
	valence             0.1805
Saved new best model
Epoch 34:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.50it/s, loss=0.1329, lr=0.0007131767561367542]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.08it/s, loss=0.1336]


Validation loss: 0.13355100137137232
MAE per label:
	acousticness        0.2067
	danceability        0.0961
	energy              0.1153
	instrumentalness    0.2012
	liveness            0.0913
	speechiness         0.0449
	valence             0.1794
Saved new best model
Epoch 35:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.34it/s, loss=0.1329, lr=0.0006988753713720732]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.02it/s, loss=0.1344]


Validation loss: 0.13442678447990192
MAE per label:
	acousticness        0.2130
	danceability        0.0944
	energy              0.1170
	instrumentalness    0.2016
	liveness            0.0909
	speechiness         0.0458
	valence             0.1784
Epoch 36:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.38it/s, loss=0.1318, lr=0.000684378214065997]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.23it/s, loss=0.1337]


Validation loss: 0.1337104987885271
MAE per label:
	acousticness        0.2062
	danceability        0.0963
	energy              0.1152
	instrumentalness    0.2032
	liveness            0.0909
	speechiness         0.0452
	valence             0.1791
Epoch 37:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.47it/s, loss=0.1321, lr=0.0006696995911625234]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.25it/s, loss=0.1339]


Validation loss: 0.13386023026846705
MAE per label:
	acousticness        0.2060
	danceability        0.0949
	energy              0.1213
	instrumentalness    0.2009
	liveness            0.0910
	speechiness         0.0451
	valence             0.1778
Epoch 38:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.36it/s, loss=0.1311, lr=0.0006548539886902865]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.01it/s, loss=0.1323]


Validation loss: 0.13225214626817478
MAE per label:
	acousticness        0.1981
	danceability        0.0963
	energy              0.1157
	instrumentalness    0.1974
	liveness            0.0911
	speechiness         0.0456
	valence             0.1817
Saved new best model
Epoch 39:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.53it/s, loss=0.1309, lr=0.0006398560574665953]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.96it/s, loss=0.1327]


Validation loss: 0.13273236634475843
MAE per label:
	acousticness        0.2031
	danceability        0.0959
	energy              0.1161
	instrumentalness    0.1998
	liveness            0.0910
	speechiness         0.0452
	valence             0.1780
Epoch 40:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.14it/s, loss=0.1319, lr=0.0006247205986388451]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.07it/s, loss=0.1335]


Validation loss: 0.133514071504275
MAE per label:
	acousticness        0.2062
	danceability        0.0952
	energy              0.1149
	instrumentalness    0.2013
	liveness            0.0908
	speechiness         0.0462
	valence             0.1801
Epoch 41:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.64it/s, loss=0.1305, lr=0.0006094625490775733]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.07it/s, loss=0.1351]


Validation loss: 0.13508072193889392
MAE per label:
	acousticness        0.2143
	danceability        0.0944
	energy              0.1169
	instrumentalness    0.2032
	liveness            0.0911
	speechiness         0.0453
	valence             0.1804
Epoch 42:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.70it/s, loss=0.1299, lr=0.0005940969666355698]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.12it/s, loss=0.1321]


Validation loss: 0.13213074757229715
MAE per label:
	acousticness        0.2028
	danceability        0.0942
	energy              0.1152
	instrumentalness    0.1991
	liveness            0.0905
	speechiness         0.0447
	valence             0.1784
Saved new best model
Epoch 43:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.24it/s, loss=0.1299, lr=0.0005786390152875956]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.06it/s, loss=0.1323]


Validation loss: 0.13229512849024364
MAE per label:
	acousticness        0.2022
	danceability        0.0945
	energy              0.1168
	instrumentalness    0.1992
	liveness            0.0907
	speechiness         0.0453
	valence             0.1775
Epoch 44:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.15it/s, loss=0.1297, lr=0.0005631039501653703]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.08it/s, loss=0.1326]


Validation loss: 0.13256071135401726
MAE per label:
	acousticness        0.2036
	danceability        0.0957
	energy              0.1142
	instrumentalness    0.2014
	liveness            0.0906
	speechiness         0.0452
	valence             0.1772
Epoch 45:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.56it/s, loss=0.1299, lr=0.0005475071025025981]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.64it/s, loss=0.1330]


Validation loss: 0.13298188291844867
MAE per label:
	acousticness        0.2025
	danceability        0.0974
	energy              0.1190
	instrumentalness    0.1997
	liveness            0.0905
	speechiness         0.0449
	valence             0.1769
Epoch 46:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.19it/s, loss=0.1295, lr=0.0005318638645048923]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.02it/s, loss=0.1326]


Validation loss: 0.13263997772619837
MAE per label:
	acousticness        0.2045
	danceability        0.0960
	energy              0.1143
	instrumentalness    0.2017
	liveness            0.0905
	speechiness         0.0452
	valence             0.1763
Epoch 47:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.45it/s, loss=0.1290, lr=0.0005161896741595253]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.83it/s, loss=0.1313]


Validation loss: 0.13131160785754523
MAE per label:
	acousticness        0.1992
	danceability        0.0947
	energy              0.1147
	instrumentalness    0.1986
	liveness            0.0904
	speechiness         0.0448
	valence             0.1768
Saved new best model
Epoch 48:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.40it/s, loss=0.1283, lr=0.0005005000000000002]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.09it/s, loss=0.1327]


Validation loss: 0.1327189597345534
MAE per label:
	acousticness        0.2053
	danceability        0.0943
	energy              0.1157
	instrumentalness    0.1990
	liveness            0.0907
	speechiness         0.0449
	valence             0.1792
Epoch 49:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.55it/s, loss=0.1279, lr=0.0004848103258404751]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.60it/s, loss=0.1326]


Validation loss: 0.13261199689337186
MAE per label:
	acousticness        0.2028
	danceability        0.0950
	energy              0.1162
	instrumentalness    0.2028
	liveness            0.0902
	speechiness         0.0448
	valence             0.1764
Epoch 50:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.31it/s, loss=0.1285, lr=0.0004691361354951082]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.14it/s, loss=0.1329]


Validation loss: 0.13294381914394243
MAE per label:
	acousticness        0.2067
	danceability        0.0942
	energy              0.1155
	instrumentalness    0.1998
	liveness            0.0904
	speechiness         0.0445
	valence             0.1795
Epoch 51:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.37it/s, loss=0.1272, lr=0.00045349289749740224]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.04it/s, loss=0.1318]


Validation loss: 0.13180241574134147
MAE per label:
	acousticness        0.2002
	danceability        0.0940
	energy              0.1155
	instrumentalness    0.2004
	liveness            0.0902
	speechiness         0.0449
	valence             0.1774
Epoch 52:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.52it/s, loss=0.1286, lr=0.0004378960498346302]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.10it/s, loss=0.1313]


Validation loss: 0.13129086703771636
MAE per label:
	acousticness        0.2003
	danceability        0.0947
	energy              0.1150
	instrumentalness    0.1983
	liveness            0.0904
	speechiness         0.0446
	valence             0.1757
Saved new best model
Epoch 53:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.52it/s, loss=0.1266, lr=0.0004223609847124048]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.10it/s, loss=0.1323]


Validation loss: 0.13226475956894101
MAE per label:
	acousticness        0.2051
	danceability        0.0929
	energy              0.1144
	instrumentalness    0.1995
	liveness            0.0902
	speechiness         0.0447
	valence             0.1790
Epoch 54:


Training: 100%|██████████| 165/165 [00:14<00:00, 11.20it/s, loss=0.1280, lr=0.0004069030333644307]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.07it/s, loss=0.1328]


Validation loss: 0.13281494609656788
MAE per label:
	acousticness        0.2068
	danceability        0.0945
	energy              0.1156
	instrumentalness    0.2007
	liveness            0.0902
	speechiness         0.0452
	valence             0.1767
Epoch 55:


Training: 100%|██████████| 165/165 [00:13<00:00, 11.89it/s, loss=0.1274, lr=0.0003915374509224273]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.06it/s, loss=0.1315]


Validation loss: 0.13153095827216194
MAE per label:
	acousticness        0.2005
	danceability        0.0944
	energy              0.1167
	instrumentalness    0.1986
	liveness            0.0902
	speechiness         0.0448
	valence             0.1754
Epoch 56:


Training: 100%|██████████| 165/165 [00:16<00:00, 10.16it/s, loss=0.1270, lr=0.0003762794013611551]
Validation: 100%|██████████| 42/42 [00:05<00:00,  7.75it/s, loss=0.1314]


Validation loss: 0.1313964889517852
MAE per label:
	acousticness        0.1988
	danceability        0.0943
	energy              0.1150
	instrumentalness    0.1988
	liveness            0.0902
	speechiness         0.0445
	valence             0.1781
Epoch 57:


Training: 100%|██████████| 165/165 [00:13<00:00, 12.61it/s, loss=0.1267, lr=0.00036114394253340513]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.12it/s, loss=0.1310]


Validation loss: 0.1309751733427956
MAE per label:
	acousticness        0.1990
	danceability        0.0930
	energy              0.1153
	instrumentalness    0.1979
	liveness            0.0901
	speechiness         0.0448
	valence             0.1765
Saved new best model
Epoch 58:


Training: 100%|██████████| 165/165 [00:16<00:00,  9.99it/s, loss=0.1259, lr=0.000346146011309714]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.10it/s, loss=0.1304]


Validation loss: 0.13035389018200694
MAE per label:
	acousticness        0.1959
	danceability        0.0938
	energy              0.1146
	instrumentalness    0.1969
	liveness            0.0901
	speechiness         0.0448
	valence             0.1765
Saved new best model
Epoch 59:


Training: 100%|██████████| 165/165 [00:15<00:00, 10.32it/s, loss=0.1273, lr=0.00033130040883747703]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.03it/s, loss=0.1316]


Validation loss: 0.13159749514999844
MAE per label:
	acousticness        0.2023
	danceability        0.0935
	energy              0.1160
	instrumentalness    0.1975
	liveness            0.0902
	speechiness         0.0448
	valence             0.1767
Epoch 60:


Training: 100%|██████████| 165/165 [00:15<00:00, 10.40it/s, loss=0.1263, lr=0.0003166217859340036]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.10it/s, loss=0.1315]


Validation loss: 0.13147103272023655
MAE per label:
	acousticness        0.2010
	danceability        0.0947
	energy              0.1142
	instrumentalness    0.1978
	liveness            0.0902
	speechiness         0.0447
	valence             0.1777
Epoch 61:


Training: 100%|██████████| 165/165 [00:16<00:00, 10.29it/s, loss=0.1251, lr=0.0003021246286279271]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.35it/s, loss=0.1314]


Validation loss: 0.1314193063548633
MAE per label:
	acousticness        0.1999
	danceability        0.0939
	energy              0.1153
	instrumentalness    0.1981
	liveness            0.0900
	speechiness         0.0448
	valence             0.1781
Epoch 62:


Training: 100%|██████████| 165/165 [00:17<00:00,  9.54it/s, loss=0.1257, lr=0.00028782324386324626]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.07it/s, loss=0.1304]


Validation loss: 0.13040846125001
MAE per label:
	acousticness        0.1964
	danceability        0.0946
	energy              0.1150
	instrumentalness    0.1973
	liveness            0.0898
	speechiness         0.0448
	valence             0.1750
Epoch 63:


Training: 100%|██████████| 165/165 [00:19<00:00,  8.53it/s, loss=0.1251, lr=0.00027373174538009644]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.08it/s, loss=0.1328]


Validation loss: 0.1328300056712968
MAE per label:
	acousticness        0.2085
	danceability        0.0953
	energy              0.1143
	instrumentalness    0.1985
	liveness            0.0902
	speechiness         0.0457
	valence             0.1773
Epoch 64:


Training: 100%|██████████| 165/165 [00:16<00:00,  9.93it/s, loss=0.1249, lr=0.00025986403978619317]
Validation: 100%|██████████| 42/42 [00:05<00:00,  7.73it/s, loss=0.1307]


Validation loss: 0.13066385331608
MAE per label:
	acousticness        0.1960
	danceability        0.0942
	energy              0.1150
	instrumentalness    0.1975
	liveness            0.0900
	speechiness         0.0446
	valence             0.1774
Epoch 65:


Training: 100%|██████████| 165/165 [00:16<00:00, 10.26it/s, loss=0.1256, lr=0.00024623381283268956]
Validation: 100%|██████████| 42/42 [00:05<00:00,  7.09it/s, loss=0.1302]


Validation loss: 0.13018303559649558
MAE per label:
	acousticness        0.1954
	danceability        0.0940
	energy              0.1143
	instrumentalness    0.1971
	liveness            0.0900
	speechiness         0.0448
	valence             0.1758
Saved new best model
Epoch 66:


Training: 100%|██████████| 165/165 [00:17<00:00,  9.34it/s, loss=0.1245, lr=0.00023285451590799108]
Validation: 100%|██████████| 42/42 [00:05<00:00,  7.22it/s, loss=0.1297]


Validation loss: 0.12970548566608203
MAE per label:
	acousticness        0.1944
	danceability        0.0935
	energy              0.1141
	instrumentalness    0.1961
	liveness            0.0900
	speechiness         0.0449
	valence             0.1749
Saved new best model
Epoch 67:


Training: 100%|██████████| 165/165 [00:16<00:00, 10.13it/s, loss=0.1236, lr=0.00021973935276286074]
Validation: 100%|██████████| 42/42 [00:05<00:00,  7.95it/s, loss=0.1303]


Validation loss: 0.1303483224695637
MAE per label:
	acousticness        0.1962
	danceability        0.0942
	energy              0.1150
	instrumentalness    0.1978
	liveness            0.0900
	speechiness         0.0446
	valence             0.1745
Epoch 68:


Training: 100%|██████████| 165/165 [00:15<00:00, 10.39it/s, loss=0.1250, lr=0.00020690126647990976]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.27it/s, loss=0.1314]


Validation loss: 0.13144070442233766
MAE per label:
	acousticness        0.2022
	danceability        0.0942
	energy              0.1142
	instrumentalness    0.1978
	liveness            0.0899
	speechiness         0.0446
	valence             0.1771
Epoch 69:


Training: 100%|██████████| 165/165 [00:16<00:00, 10.19it/s, loss=0.1240, lr=0.00019435292670033822]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.09it/s, loss=0.1308]


Validation loss: 0.13082106411457062
MAE per label:
	acousticness        0.1985
	danceability        0.0951
	energy              0.1146
	instrumentalness    0.1979
	liveness            0.0900
	speechiness         0.0443
	valence             0.1753
Epoch 70:


Training: 100%|██████████| 165/165 [00:16<00:00, 10.15it/s, loss=0.1239, lr=0.0001821067171205295]
Validation: 100%|██████████| 42/42 [00:07<00:00,  5.77it/s, loss=0.1306]


Validation loss: 0.1305857692800817
MAE per label:
	acousticness        0.1989
	danceability        0.0932
	energy              0.1144
	instrumentalness    0.1970
	liveness            0.0899
	speechiness         0.0446
	valence             0.1761
Epoch 71:


Training: 100%|██████████| 165/165 [00:16<00:00,  9.75it/s, loss=0.1239, lr=0.000170174723270836]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.08it/s, loss=0.1307]


Validation loss: 0.13069428379336992
MAE per label:
	acousticness        0.1998
	danceability        0.0936
	energy              0.1138
	instrumentalness    0.1969
	liveness            0.0900
	speechiness         0.0446
	valence             0.1762
Epoch 72:


Training: 100%|██████████| 165/165 [00:17<00:00,  9.57it/s, loss=0.1241, lr=0.00015856872058862001]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.07it/s, loss=0.1304]


Validation loss: 0.13042228243180684
MAE per label:
	acousticness        0.1968
	danceability        0.0935
	energy              0.1143
	instrumentalness    0.1966
	liveness            0.0899
	speechiness         0.0447
	valence             0.1772
Epoch 73:


Training: 100%|██████████| 165/165 [00:19<00:00,  8.46it/s, loss=0.1233, lr=0.00014730016279731957]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.07it/s, loss=0.1300]


Validation loss: 0.13004230565968014
MAE per label:
	acousticness        0.1949
	danceability        0.0938
	energy              0.1148
	instrumentalness    0.1961
	liveness            0.0898
	speechiness         0.0445
	valence             0.1764
Epoch 74:


Training: 100%|██████████| 165/165 [00:15<00:00, 10.37it/s, loss=0.1227, lr=0.00013638017060300508]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.05it/s, loss=0.1303]


Validation loss: 0.1302907294815495
MAE per label:
	acousticness        0.1952
	danceability        0.0936
	energy              0.1159
	instrumentalness    0.1963
	liveness            0.0900
	speechiness         0.0447
	valence             0.1764
Epoch 75:


Training: 100%|██████████| 165/165 [00:17<00:00,  9.38it/s, loss=0.1227, lr=0.00012581952071958545]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.03it/s, loss=0.1298]


Validation loss: 0.12977210237156778
MAE per label:
	acousticness        0.1946
	danceability        0.0935
	energy              0.1155
	instrumentalness    0.1966
	liveness            0.0899
	speechiness         0.0443
	valence             0.1740
Epoch 76:


Training: 100%|██████████| 165/165 [00:18<00:00,  8.76it/s, loss=0.1230, lr=0.00011562863523349334]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.07it/s, loss=0.1295]


Validation loss: 0.12952072013701713
MAE per label:
	acousticness        0.1942
	danceability        0.0938
	energy              0.1136
	instrumentalness    0.1956
	liveness            0.0899
	speechiness         0.0443
	valence             0.1752
Saved new best model
Epoch 77:


Training: 100%|██████████| 165/165 [00:16<00:00,  9.78it/s, loss=0.1224, lr=0.00010581757131834266]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.08it/s, loss=0.1301]


Validation loss: 0.1301116302964233
MAE per label:
	acousticness        0.1964
	danceability        0.0943
	energy              0.1149
	instrumentalness    0.1966
	liveness            0.0898
	speechiness         0.0445
	valence             0.1742
Epoch 78:


Training: 100%|██████████| 165/165 [00:17<00:00,  9.39it/s, loss=0.1218, lr=9.639601130971382e-05]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.10it/s, loss=0.1303]


Validation loss: 0.13028390776543392
MAE per label:
	acousticness        0.1956
	danceability        0.0938
	energy              0.1157
	instrumentalness    0.1965
	liveness            0.0897
	speechiness         0.0443
	valence             0.1763
Epoch 79:


Training: 100%|██████████| 165/165 [00:17<00:00,  9.70it/s, loss=0.1215, lr=8.737325314985643e-05]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.11it/s, loss=0.1293]


Validation loss: 0.1293357422664052
MAE per label:
	acousticness        0.1936
	danceability        0.0934
	energy              0.1140
	instrumentalness    0.1951
	liveness            0.0898
	speechiness         0.0445
	valence             0.1749
Saved new best model
Epoch 80:


Training: 100%|██████████| 165/165 [00:16<00:00, 10.23it/s, loss=0.1218, lr=7.875820121174359e-05]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.03it/s, loss=0.1300]


Validation loss: 0.12998618008125395
MAE per label:
	acousticness        0.1968
	danceability        0.0937
	energy              0.1143
	instrumentalness    0.1954
	liveness            0.0897
	speechiness         0.0443
	valence             0.1757
Epoch 81:


Training: 100%|██████████| 165/165 [00:16<00:00, 10.24it/s, loss=0.1225, lr=7.05593575115301e-05]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.24it/s, loss=0.1293]


Validation loss: 0.12932198423714863
MAE per label:
	acousticness        0.1938
	danceability        0.0933
	energy              0.1148
	instrumentalness    0.1946
	liveness            0.0897
	speechiness         0.0443
	valence             0.1747
Saved new best model
Epoch 82:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.39it/s, loss=0.1209, lr=6.278481331809015e-05]
Validation: 100%|██████████| 42/42 [00:04<00:00,  9.75it/s, loss=0.1293]


Validation loss: 0.12929879838512057
MAE per label:
	acousticness        0.1939
	danceability        0.0934
	energy              0.1136
	instrumentalness    0.1955
	liveness            0.0898
	speechiness         0.0443
	valence             0.1746
Saved new best model
Epoch 83:


Training: 100%|██████████| 165/165 [00:11<00:00, 14.32it/s, loss=0.1210, lr=5.54422411679103e-05]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.07it/s, loss=0.1303]


Validation loss: 0.130271089751096
MAE per label:
	acousticness        0.1978
	danceability        0.0935
	energy              0.1172
	instrumentalness    0.1935
	liveness            0.0897
	speechiness         0.0444
	valence             0.1759
Epoch 84:


Training: 100%|██████████| 165/165 [00:13<00:00, 12.28it/s, loss=0.1215, lr=4.853888729322334e-05]
Validation: 100%|██████████| 42/42 [00:04<00:00, 10.10it/s, loss=0.1300]


Validation loss: 0.1300199619006543
MAE per label:
	acousticness        0.1965
	danceability        0.0933
	energy              0.1150
	instrumentalness    0.1954
	liveness            0.0897
	speechiness         0.0444
	valence             0.1757
Epoch 85:


Training: 100%|██████████| 165/165 [00:13<00:00, 12.01it/s, loss=0.1211, lr=4.2081564470851536e-05]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.12it/s, loss=0.1294]


Validation loss: 0.12937337905168533
MAE per label:
	acousticness        0.1942
	danceability        0.0932
	energy              0.1143
	instrumentalness    0.1949
	liveness            0.0898
	speechiness         0.0444
	valence             0.1749
Epoch 86:


Training: 100%|██████████| 165/165 [00:16<00:00, 10.13it/s, loss=0.1224, lr=3.6076645298818454e-05]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.12it/s, loss=0.1293]


Validation loss: 0.12928109616041183
MAE per label:
	acousticness        0.1928
	danceability        0.0940
	energy              0.1155
	instrumentalness    0.1940
	liveness            0.0897
	speechiness         0.0446
	valence             0.1744
Saved new best model
Epoch 87:


Training: 100%|██████████| 165/165 [00:16<00:00,  9.76it/s, loss=0.1220, lr=3.053005590736439e-05]
Validation: 100%|██████████| 42/42 [00:06<00:00,  6.12it/s, loss=0.1292]


Validation loss: 0.12920704519464857
MAE per label:
	acousticness        0.1934
	danceability        0.0937
	energy              0.1146
	instrumentalness    0.1938
	liveness            0.0898
	speechiness         0.0445
	valence             0.1746


In [None]:
model.load_state_dict(torch.load('best_model.pt'))

y, sr = librosa.load("16 Imagine Dragons - Believer.mp3", sr=16000)
log_mel = vggish_input.waveform_to_examples(y, sample_rate=16000).to(device).float()
log_mel = log_mel.squeeze(1)
log_mel = log_mel.unsqueeze(0)

model.eval()

with torch.inference_mode():
    with torch.autocast(device_type="cuda"):
        pred = model(log_mel)

pred = pred[0].cpu().numpy()
for i, label in enumerate(labels):
    print(f"{label:<20}{pred[i]:.4f}")

acousticness        0.6216
danceability        0.4705
energy              0.7773
instrumentalness    0.0137
liveness            0.1431
speechiness         0.0778
valence             0.6460
