In [1]:
# setup
import os
import json
import gc
from tqdm import tqdm
import numpy as np
import soundfile as sf
from glob import glob
import librosa
from sklearn.metrics import precision_score, f1_score, classification_report
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2FeatureExtractor
from transformers import AutoModel
import warnings

from model import ShortChunkCNN_Res

# TODO: change the file path
TEST_FILE_DIR = "./hw1/slakh/test"
TEST_LABEL_PATH = "./hw1/slakh/test_labels.json"
RANDOM_SEED = 0

LABELS = ['Piano', 'Percussion', 'Organ', 'Guitar', 'Bass', 'Strings', 'Voice', 'Wind Instruments', 'Synth']
warnings.filterwarnings("ignore", category=UserWarning)



  from .autonotebook import tqdm as notebook_tqdm




  return self.fget.__get__(instance, owner)()


In [2]:
# DEVICE: GPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-params
EPOCHS = 30
PATIENCE = 10
BATCH_SIZE = 32  # 64
LR = 1e-3  # 1e-5
THRESHOLD = 0.9541 # current best

# for model
N_CHANNELS = 128  # 256


In [3]:
# Load pretrained pre-processor and model
processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v1-330M",trust_remote_code=True)
MERT_model = AutoModel.from_pretrained("m-a-p/MERT-v1-330M", trust_remote_code=True)

# Freeze the pretrained model's parameters
for param in MERT_model.parameters():
    param.requires_grad = False




In [4]:
class AudioDataset(Dataset):
    def __init__(self, wav_directory: str, label_directory: str):
        """
        Args:
            directory (string): Path to the directory with all the .npy files.
        """
        self.directory = wav_directory
        self.files = os.listdir(wav_directory)  # List of all .npy files in the directory
        with open(label_directory, "r") as f:
            self.labels = json.load(f)

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        file_path = os.path.join(self.directory, self.files[idx])
        audio_wave = np.load(file_path)
        label = np.array(self.labels[self.files[idx]], dtype=np.float32)
        return audio_wave, label


test_dataset = AudioDataset(TEST_FILE_DIR, TEST_LABEL_PATH)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


### Test DL model (ShortChunkCNN_Res)

In [5]:
def test(
        test_dataloader: DataLoader, 
        model: ShortChunkCNN_Res,
        processor: Wav2Vec2FeatureExtractor = processor,
        MERT_model: AutoModel = MERT_model,
        threshold: float = THRESHOLD,
        verbose: bool = True
    ):

    # Test start
    MERT_model = MERT_model.to(DEVICE)
    model = model.to(DEVICE)
    model.eval()
    loss_fn = nn.BCELoss()
    test_total_loss = 0
    test_true = torch.tensor([])
    test_pred_p = torch.tensor([])

    with torch.no_grad():
        for test_wavs, test_label in tqdm(test_dataloader, disable=(not verbose)):
            test_wavs = test_wavs.cpu().numpy()
            inputs = processor(test_wavs, sampling_rate=24000, return_tensors="pt")
            inputs = inputs.to(DEVICE)
            test_label = test_label.to(DEVICE)

            # pre-trained model      
            outputs = MERT_model(**inputs)
            pretrained_output = outputs.last_hidden_state # [batch_size, time, 1024 feature_dim]

            output = model(pretrained_output)
            loss = loss_fn(output, test_label)
            test_total_loss += loss.item()

            # Calculate Score
            test_label = test_label.cpu()
            output = output.cpu()
            test_true = torch.cat([test_true, test_label])
            test_pred_p = torch.cat([test_pred_p, output])

            # Delete Var
            del test_wavs, test_label, inputs, outputs, pretrained_output, output
            gc.collect()

    test_pred = (test_pred_p > threshold).float()
    test_score = precision_score(test_true, test_pred, average="macro")
    test_score_f1 = f1_score(test_true, test_pred, average="macro")

    print(f"Macro Precision: {test_score:.4f}")
    print(f"Macro F1-score: {test_score_f1:.4f}")
    if verbose:
        report = classification_report(test_true, test_pred, target_names=LABELS)
        print("Classification Report:\n", report)

    return test_true, test_pred_p


In [6]:
model_path = "DL_model_f1.pt"
model = ShortChunkCNN_Res(n_channels=N_CHANNELS)
model.load_state_dict(torch.load(model_path))

test_true, test_pred_p = test(test_dataloader, model, threshold=THRESHOLD)


100%|██████████| 71/71 [00:58<00:00,  1.21it/s]

Macro Precision: 0.7178
Macro F1-score: 0.6202
Classification Report:
                   precision    recall  f1-score   support

           Piano       0.88      0.96      0.92      1889
      Percussion       0.59      0.16      0.25       243
           Organ       0.56      0.31      0.40       461
          Guitar       0.93      0.80      0.86      1943
            Bass       0.96      0.99      0.97      2076
         Strings       0.72      0.87      0.79      1235
           Voice       0.69      0.30      0.42       485
Wind Instruments       0.55      0.55      0.55       889
           Synth       0.58      0.34      0.43       647

       micro avg       0.82      0.76      0.79      9868
       macro avg       0.72      0.59      0.62      9868
    weighted avg       0.81      0.76      0.77      9868
     samples avg       0.81      0.76      0.77      9868






### For Evaluation

In [7]:
def get_prediction(
        flac_file_path: str,
        model: ShortChunkCNN_Res,
        processor: Wav2Vec2FeatureExtractor = processor,
        MERT_model: AutoModel = MERT_model,
        threshold: float = THRESHOLD,
        save_file: bool = True,
    ):
    name = flac_file_path.split('/')[-1].split('.')[0]
    a, sr = sf.read(flac_file_path)
    n = librosa.resample(a, orig_sr=sr, target_sr=24000)
    n = n[:-(n.shape[0]%120000)]  # remove trailing
    n = n.reshape(((n.shape[0]//120000), 120000))  # reshape into 5 second
    inputs = processor(n, sampling_rate=24000, return_tensors="pt")
    inputs = inputs.to(DEVICE)
    MERT_model = MERT_model.to(DEVICE)
    model = model.to(DEVICE)

    # pre-trained model
    with torch.no_grad():
        outputs = MERT_model(**inputs)
        pretrained_output = outputs.last_hidden_state # [batch_size, time, 1024 feature_dim]
        output = model(pretrained_output)

    output = output.cpu()
    output = (output > threshold).float()
    output = output.numpy().T

    if save_file:
        np.save(f"./hw1/test_track/{name}.npy", output)
        print(f"File {name}.npy successfully saved. dim={output.shape}")

    return output


In [8]:
model_path = "DL_model_f1.pt"
model = ShortChunkCNN_Res(n_channels=N_CHANNELS)
model.load_state_dict(torch.load(model_path))

# TODO: change the file path
audio_path_list = glob(os.path.join("./hw1/test_track", "*.flac"))
for file in audio_path_list:
    o = get_prediction(file, model=model, threshold=THRESHOLD)


File Track01937.npy successfully saved. dim=(9, 40)
File Track01876.npy successfully saved. dim=(9, 51)
File Track02100.npy successfully saved. dim=(9, 45)
File Track02078.npy successfully saved. dim=(9, 43)
File Track02024.npy successfully saved. dim=(9, 49)
