In [67]:
import itertools
import json
import logging
import math
import pathlib
import pickle
import random
import tempfile
from collections import OrderedDict, defaultdict
import random

#import mir_eval
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.stats import mode as scipy_mode
from sklearn.metrics import average_precision_score, r2_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import glob
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [13]:
# Shape of each representation
npy_paths_clmr = sorted(glob.glob('features_our/clmr/*.npy'))
npy_paths_musicnn = sorted(glob.glob('features_our/musicnn/*.npy'))
npy_paths_jukebox_zeropad = sorted(glob.glob('features_our/jukebox/*.npy'))

# Feature representations
X_clmr = np.array([np.load(p) for p in npy_paths_clmr])
X_musicnn = np.array([np.load(p) for p in npy_paths_musicnn])
X_jukebox_zeropad = np.array([np.load(p) for p in npy_paths_jukebox_zeropad])

# Labels
y_clmr = np.array([os.path.split(p)[1].split('-')[0] for p in npy_paths_clmr])
y_musicnn = np.array([os.path.split(p)[1].split('-')[0] for p in npy_paths_musicnn])
y_jukebox_zeropad = np.array([os.path.split(p)[1].split('-')[0] for p in npy_paths_jukebox_zeropad])

In [21]:
DATASET_TO_ATTRS = {
    "dcase": {
        "num_outputs": 10,
        "output_type": "multiclass",
        "labels": """airport, bus, metro, metro_station, park, public_square, shopping_mall, street_pedestrian, street_traffic, tram""".split(
            ", "
        ),
    }}

PAPER_GRID = {
    "data_standardization": [False, True],
    "hidden_layer_sizes": [[], [512]],
    "batch_size": [64, 256],
    "learning_rate": [1e-5, 1e-4, 1e-3],
    "dropout_p": [0.25, 0.5, 0.75],
    "l2_weight_decay": [None, 1e-4, 1e-3],
}

In [68]:
cfg = {
        "dataset": None,
        "representation": None,
        "data_standardization": True,
        "hidden_layer_sizes": [],
        "batch_size": 64,
        "learning_rate": 1e-3,
        "dropout_input": True,
        "dropout_p": 0.5,
        "l2_weight_decay": None,
        "max_num_epochs": 300,
        "early_stopping_metric": "primary",
        "early_stopping": True,
        "early_stopping_eval_frequency": 8,
        "early_stopping_boredom": 256,
        "seed": 0,
    }

In [71]:
from sklearn.model_selection import train_test_split

# Clmr
# In the first step we will split the data in training and remaining dataset
X_train_clmr, X_rem_clmr, y_train_clmr, y_rem_clmr = train_test_split(X_clmr,y_clmr, train_size=0.8)

# Now since we want the valid and test size to be equal (10% each of overall data). 
# we have to define valid_size=0.5 (that is 50% of remaining data)
X_valid_clmr, X_test_clmr, y_valid_clmr, y_test_clmr = train_test_split(X_rem_clmr,y_rem_clmr, test_size=0.5)

# Musicnn
X_train_musicnn, X_rem_musicnn, y_train_musicnn, y_rem_musicnn = train_test_split(X_musicnn,y_musicnn, train_size=0.8)
X_valid_musicnn, X_test_musicnn, y_valid_musicnn, y_test_musicnn = train_test_split(X_rem_musicnn,y_rem_musicnn, test_size=0.5)

# Jukebox
X_train_jukebox_zeropad, X_rem_jukebox_zeropad, y_train_jukebox_zeropad, y_rem_jukebox_zeropad = train_test_split(X_jukebox_zeropad,y_jukebox_zeropad, train_size=0.8)
X_valid_jukebox_zeropad, X_test_jukebox_zeropad, y_valid_jukebox_zeropad, y_test_jukebox_zeropad = train_test_split(X_rem_jukebox_zeropad,y_rem_jukebox_zeropad, test_size=0.5)

In [95]:
split_to_X = defaultdict(lambda: defaultdict(list))
split_to_y = defaultdict(lambda: defaultdict(list))

split_to_X["clmr"]["train"] = X_train_clmr
split_to_X["clmr"]["validation"] = X_valid_clmr
split_to_X["clmr"]["test"] = X_test_clmr
split_to_y["clmr"]["train"] = y_train_clmr
split_to_y["clmr"]["validation"] = y_valid_clmr
split_to_y["clmr"]["test"] = y_test_clmr

split_to_X["musicnn"]["train"] = X_train_musicnn
split_to_X["musicnn"]["validation"] = X_valid_musicnn
split_to_X["musicnn"]["test"] = X_test_musicnn
split_to_y["musicnn"]["train"] = y_train_musicnn
split_to_y["musicnn"]["validation"] = y_valid_musicnn
split_to_y["musicnn"]["test"] = y_test_musicnn

split_to_X["jukebox"]["train"] = X_train_jukebox_zeropad
split_to_X["jukebox"]["validation"] = X_valid_jukebox_zeropad
split_to_X["jukebox"]["test"] = X_test_jukebox_zeropad
split_to_y["jukebox"]["train"] = y_train_jukebox_zeropad
split_to_y["jukebox"]["validation"] = y_valid_jukebox_zeropad
split_to_y["jukebox"]["test"] = y_test_jukebox_zeropad

In [74]:
class SimpleMLP(nn.Module):
    def __init__(
        self,
        num_features,
        hidden_layer_sizes,
        num_outputs,
        dropout_input=True,
        dropout_p=0.5,
    ):
        super().__init__()
        d = num_features
        self.num_layers = 1
        for i, ld in enumerate(hidden_layer_sizes):
            setattr(self, f"hidden_{i}", nn.Linear(d, ld))
            d = ld
        self.output = nn.Linear(d, num_outputs)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x):
        x = self.dropout(x)
        for i in range(self.num_layers):
            x = getattr(self, f"hidden_{i}")(x)
            x = F.relu(x)
            x = self.dropout(x)
        return self.output(x)

In [75]:
# Create optimizer
optimizer = torch.optim.Adam(
    probe.parameters(),
    lr=cfg["learning_rate"],
    weight_decay=0
    if cfg["l2_weight_decay"] is None
    else cfg["l2_weight_decay"],
)

In [80]:
scaler_dict = {}
le_dict = {}

## Clmr

In [76]:
probe = SimpleMLP(
            num_features = split_to_X["clmr"]["train"].shape[1],
            hidden_layer_sizes = [512],
            num_outputs = DATASET_TO_ATTRS["dcase"]["num_outputs"],
            dropout_input=True,
            dropout_p=0.25,
        )

probe.to(device)

SimpleMLP(
  (hidden_0): Linear(in_features=512, out_features=512, bias=True)
  (output): Linear(in_features=512, out_features=10, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [84]:
std_scaler_clmr = StandardScaler()
std_scaler_clmr.fit(split_to_X["clmr"]["train"])

scaler_dict["clmr"] = std_scaler_clmr

In [81]:
from sklearn import preprocessing

le_clmr = preprocessing.LabelEncoder()
le.fit_transform(split_to_y["clmr"]["train"])

le_dict["clmr"] = le_clmr

In [85]:
import wandb as wandb_lib

summarize_frequency = 10

epoch = 0
while True:
    # Check if exceeded max num epochs
    if epoch == cfg["max_num_epochs"]:
        break
    # Create batch
    idxs = random.sample(
        list(range(X_train_clmr.shape[0])),
        min(cfg["batch_size"], X_train_clmr.shape[0]),
    )
    X, y = X_train_clmr[idxs], y_train_clmr[idxs]
    X = std_scaler.transform(X)
    
    le = preprocessing.LabelEncoder()
    y_trans = le.fit_transform(y)
    
    X = torch.tensor(X, dtype=torch.float32, device=device)
    y = torch.tensor(y_trans, device=device)

    # Update
    optimizer.zero_grad()
    loss = F.cross_entropy(input = probe(X), target =y )
    loss.backward()
    optimizer.step()
    epoch += 1
    
    # Summarize
    if epoch % summarize_frequency == 0:
        loss = loss.item()
        print("Epoch {}, loss {}".format(epoch, loss))

Epoch 10, loss 2.308100700378418
Epoch 20, loss 2.3520381450653076
Epoch 30, loss 2.351431369781494
Epoch 40, loss 2.4163386821746826
Epoch 50, loss 2.3564860820770264
Epoch 60, loss 2.3345210552215576
Epoch 70, loss 2.302258253097534
Epoch 80, loss 2.378150463104248
Epoch 90, loss 2.4029898643493652
Epoch 100, loss 2.3859221935272217
Epoch 110, loss 2.4038095474243164
Epoch 120, loss 2.397473096847534
Epoch 130, loss 2.330629587173462
Epoch 140, loss 2.3541345596313477
Epoch 150, loss 2.360898494720459
Epoch 160, loss 2.357152223587036
Epoch 170, loss 2.285766839981079
Epoch 180, loss 2.34651517868042
Epoch 190, loss 2.358009099960327
Epoch 200, loss 2.342772960662842
Epoch 210, loss 2.2927374839782715
Epoch 220, loss 2.400094985961914
Epoch 230, loss 2.3891122341156006
Epoch 240, loss 2.324869155883789
Epoch 250, loss 2.3273675441741943
Epoch 260, loss 2.4599697589874268
Epoch 270, loss 2.3674144744873047
Epoch 280, loss 2.2762556076049805
Epoch 290, loss 2.390181303024292
Epoch 300,

## Musicnn

In [105]:
probe = SimpleMLP(
            num_features = split_to_X["musicnn"]["train"].shape[1],
            hidden_layer_sizes = [512],
            num_outputs = DATASET_TO_ATTRS["dcase"]["num_outputs"],
            dropout_input=True,
            dropout_p=0.25,
        )

probe.to(device)

SimpleMLP(
  (hidden_0): Linear(in_features=4194, out_features=512, bias=True)
  (output): Linear(in_features=512, out_features=10, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [106]:
std_scaler_musicnn = StandardScaler()
std_scaler_musicnn.fit(split_to_X["musicnn"]["train"])

scaler_dict["musicnn"] = std_scaler_musicnn

In [107]:
from sklearn import preprocessing

le_musicnn = preprocessing.LabelEncoder()
le_musicnn.fit_transform(split_to_y["musicnn"]["train"])

le_dict["musicnn"] = le_musicnn

In [108]:
import wandb as wandb_lib

summarize_frequency = 10

epoch = 0
while True:
    # Check if exceeded max num epochs
    if epoch == cfg["max_num_epochs"]:
        break
    # Create batch
    idxs = random.sample(
        list(range(X_train_musicnn.shape[0])),
        min(cfg["batch_size"], X_train_musicnn.shape[0]),
    )
    X, y = X_train_musicnn[idxs], y_train_musicnn[idxs]
    X = std_scaler_musicnn.transform(X)
    
    le = preprocessing.LabelEncoder()
    y_trans = le.fit_transform(y)
    
    X = torch.tensor(X, dtype=torch.float32, device=device)
    y = torch.tensor(y_trans, device=device)

    # Update
    optimizer.zero_grad()
    loss = F.cross_entropy(input = probe(X), target =y )
    loss.backward()
    optimizer.step()
    epoch += 1
    
    # Summarize
    if epoch % summarize_frequency == 0:
        loss = loss.item()
        print("Epoch {}, loss {}".format(epoch, loss))

Epoch 10, loss 2.2825276851654053
Epoch 20, loss 2.278913736343384
Epoch 30, loss 2.3021867275238037
Epoch 40, loss 2.3278188705444336
Epoch 50, loss 2.325681686401367
Epoch 60, loss 2.2612669467926025
Epoch 70, loss 2.282681703567505
Epoch 80, loss 2.290393114089966
Epoch 90, loss 2.265821695327759
Epoch 100, loss 2.271136522293091
Epoch 110, loss 2.299163579940796
Epoch 120, loss 2.330639123916626
Epoch 130, loss 2.285731315612793
Epoch 140, loss 2.2961182594299316
Epoch 150, loss 2.287040948867798
Epoch 160, loss 2.337601661682129
Epoch 170, loss 2.2923672199249268
Epoch 180, loss 2.2931265830993652
Epoch 190, loss 2.3046576976776123
Epoch 200, loss 2.2912700176239014
Epoch 210, loss 2.2954659461975098
Epoch 220, loss 2.274556875228882
Epoch 230, loss 2.291787624359131
Epoch 240, loss 2.278048038482666
Epoch 250, loss 2.2880165576934814
Epoch 260, loss 2.3102943897247314
Epoch 270, loss 2.286292314529419
Epoch 280, loss 2.281397819519043
Epoch 290, loss 2.317507028579712
Epoch 300, 

## Jukebox (0-pad)

In [109]:
probe = SimpleMLP(
            num_features = split_to_X["jukebox"]["train"].shape[1],
            hidden_layer_sizes = [512],
            num_outputs = DATASET_TO_ATTRS["dcase"]["num_outputs"],
            dropout_input=True,
            dropout_p=0.25,
        )

probe.to(device)

SimpleMLP(
  (hidden_0): Linear(in_features=4800, out_features=512, bias=True)
  (output): Linear(in_features=512, out_features=10, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [110]:
std_scaler_jukebox = StandardScaler()
std_scaler_jukebox.fit(split_to_X["jukebox"]["train"])

scaler_dict["jukebox"] = std_scaler_jukebox

In [111]:
from sklearn import preprocessing

le_jukebox = preprocessing.LabelEncoder()
le_jukebox.fit_transform(split_to_y["jukebox"]["train"])

le_dict["jukebox"] = le_jukebox

In [114]:
import wandb as wandb_lib

summarize_frequency = 10

epoch = 0
while True:
    # Check if exceeded max num epochs
    if epoch == cfg["max_num_epochs"]:
        break
    # Create batch
    idxs = random.sample(
        list(range(X_train_jukebox_zeropad.shape[0])),
        min(cfg["batch_size"], X_train_jukebox_zeropad.shape[0]),
    )
    X, y = X_train_jukebox_zeropad[idxs], y_train_jukebox_zeropad[idxs]
    X = std_scaler_jukebox.transform(X)
    
    le = preprocessing.LabelEncoder()
    y_trans = le.fit_transform(y)
    
    X = torch.tensor(X, dtype=torch.float32, device=device)
    y = torch.tensor(y_trans, device=device)

    # Update
    optimizer.zero_grad()
    loss = F.cross_entropy(input = probe(X), target =y )
    loss.backward()
    optimizer.step()
    epoch += 1
    
    # Summarize
    if epoch % summarize_frequency == 0:
        loss = loss.item()
        print("Epoch {}, loss {}".format(epoch, loss))

Epoch 10, loss 2.4042441844940186
Epoch 20, loss 2.3660833835601807
Epoch 30, loss 2.384331703186035
Epoch 40, loss 2.40330171585083
Epoch 50, loss 2.4249908924102783
Epoch 60, loss 2.3597500324249268
Epoch 70, loss 2.292440891265869
Epoch 80, loss 2.4338154792785645
Epoch 90, loss 2.4052882194519043
Epoch 100, loss 2.3852698802948
Epoch 110, loss 2.402383327484131
Epoch 120, loss 2.4262704849243164
Epoch 130, loss 2.3900556564331055
Epoch 140, loss 2.375556230545044
Epoch 150, loss 2.4089126586914062
Epoch 160, loss 2.436285972595215
Epoch 170, loss 2.396090269088745
Epoch 180, loss 2.390254497528076
Epoch 190, loss 2.4125072956085205
Epoch 200, loss 2.4186441898345947
Epoch 210, loss 2.350250244140625
Epoch 220, loss 2.4532456398010254
Epoch 230, loss 2.3498756885528564
Epoch 240, loss 2.368086814880371
Epoch 250, loss 2.3486831188201904
Epoch 260, loss 2.3842172622680664
Epoch 270, loss 2.3629508018493652
Epoch 280, loss 2.4140963554382324
Epoch 290, loss 2.3933892250061035
Epoch 30