# Model test

Test models that were pretrained on small fma dataset with **Google Colab** (**ANN_genre_classification.upynb**)

In [5]:
from models.HybrydNet_056a import HybrydNet
from models.HybrydMlNet114_043 import HybrydMLNet
import torch

In [6]:
net_coarse = HybrydNet()    # 8 genre model
net_other = HybrydMLNet()   # 114 genre model

In [7]:
net_coarse.load_state_dict(torch.load("models/HybrydNet_056a.pt"))
net_coarse.eval()

net_other.load_state_dict(torch.load("models/HybrydMlNet114_043.pt"))
net_other.eval()

HybrydMLNet(
  (conv1): Conv2d(1, 32, kernel_size=(7, 11), stride=(3, 5), padding=(3, 5))
  (act1): ReLU()
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (do1): Dropout2d(p=0.05, inplace=False)
  (pool1): MaxPool2d(kernel_size=(2, 3), stride=(2, 3), padding=0, dilation=1, ceil_mode=False)
  (conv1ad): Conv2d(32, 128, kernel_size=(5, 7), stride=(3, 3), padding=(2, 5))
  (act1ad): ELU(alpha=1.0)
  (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (do2): Dropout2d(p=0.05, inplace=False)
  (pool1ad): AvgPool2d(kernel_size=(1, 3), stride=(1, 3), padding=0)
  (conv2): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (act2): Tanh()
  (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (do3): Dropout2d(p=0.05, inplace=False)
  (pool2): MaxPool2d(kernel_size=(1, 2), stride=(1, 2), padding=0, dilation=1, ceil_mode=False)
  (conv2ad): Conv2d(256, 512, kern

## Model testing on custom data

Here we need to extract features that can be used as model input. In this case it is concatenated **Mel-spectrograms** with **FRAME_SIZE=2048**, **HOP_LENGTH=512**, **90 Mel bands** and **13 MFCCs and their 1st and 2nd derivatives**. Because model was trained on 29.5 sec data, we need to split our custom track to a number of samples of length 29.5 and drop the last fragment that will be less than 29.5

In [2]:
from collections import Counter

import librosa as lb
import numpy as np

In [13]:
device = torch.device("cuda:0")
net_coarse.to(device)
net_other.to(device)

classes = ['Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Pop', 'Rock']
ml_labels = ['Avant-Garde', 'International', 'Sound Art', 'Novelty', 'Turkish', 'Pop', 'New Age', 'Rock', 'Romany (Gypsy)', 'Electronic', 'Sound Effects', 'Folk', 'Soundtrack', 'Hip-Hop', 'Audio Collage', 'Punk', 'Post-Rock', 'Lo-Fi', 'Compilation', 'Rap', 'Field Recordings', 'Metal', 'Noise', 'Psych-Folk', 'Trip-Hop', 'Breakbeat', 'Krautrock', 'Tango', 'Experimental', 'Dance', 'Electroacoustic', 'Chip Music', 'Ambient Electronic', 'Hip-Hop Beats', 'Loud-Rock', 'Latin America', 'Drone', 'Salsa', 'Free-Folk', 'Noise-Rock', 'Psych-Rock', 'Goth', 'Electro-Punk', 'Indie-Rock', 'Abstract Hip-Hop', 'Industrial', 'No Wave', 'Experimental Pop', 'French', 'Reggae - Dub', 'Drum & Bass', 'Afrobeat', 'Nerdcore', 'Garage', 'Indian', 'New Wave', 'Post-Punk', 'Reggae - Dancehall', 'Sludge', 'African', 'Freak-Folk', 'Progressive', 'Alternative Hip-Hop', 'Death-Metal', 'Middle East', 'Singer-Songwriter', 'Shoegaze', 'Kid-Friendly', 'Synth Pop', 'Spanish', 'Ambient', 'Hardcore', 'Thrash', 'Power-Pop', 'Space-Rock', 'Polka', 'Balkan', 'Unclassifiable', 'Europe', 'Chill-out', 'Bigbeat', 'Surf', 'Black-Metal', 'Christmas', 'Brazilian', 'Asia-Far East', 'South Indian Traditional', 'Celtic', 'British Folk', 'Techno', 'House', 'Glitch', 'Rock Opera', 'Breakcore - Hard', 'Minimal Electronic', 'Sound Poetry', 'Grindcore', 'Jungle', 'Minimalism', 'Instrumental', 'Dubstep', 'North African', 'Sound Collage', 'Klezmer', 'Flamenco', 'Skweee', 'IDM', 'Downtempo', 'Chiptune', 'Cumbia', 'Musique Concrete', 'Latin', 'Improv', 'Holiday']


track_file = "a.mp3"    # Enter path to file which you want to analyze
signal, sr = lb.load(track_file)

song_coarse_preds = []
song_other_preds = []

for i in range(signal.size // 650475):
    mel_spec = lb.feature.melspectrogram(signal[650475*i:650475*(i+1)], sr=sr, n_fft=2048, hop_length=512, n_mels=90)
    log_mel_spec = lb.power_to_db(mel_spec)

    mfcc = lb.feature.mfcc(signal[650475*i:650475*(i+1)], n_mfcc=13)
    mfcc2 = lb.feature.delta(mfcc)
    mfcc3 = lb.feature.delta(mfcc, order=2)
    mfcc = np.concatenate((mfcc, mfcc2, mfcc3))

    with torch.no_grad():
        X = torch.tensor(np.concatenate((mfcc, log_mel_spec))).unsqueeze(0).to(device)
        preds_coarse = list(zip(torch.nn.Softmax()(net_coarse.forward(X))[0].data, classes))
        preds_coarse.sort(reverse=True)
        song_coarse_preds.append(preds_coarse)

        preds_other = list(zip(torch.nn.Softmax()(net_other.forward(X))[0].data, ml_labels))
        preds_other.sort(reverse=True)
        song_other_preds.append(preds_other)
        print(preds_other[0][1], "-", preds_other[1][1], "-", preds_other[2][1], f"({preds_other[0][0]:.3f}:{preds_other[1][0]:.3f}:{preds_other[2][0]:.3f})")

  preds_coarse = list(zip(torch.nn.Softmax()(net_coarse.forward(X))[0].data, classes))
  preds_other = list(zip(torch.nn.Softmax()(net_other.forward(X))[0].data, ml_labels))


Experimental - Instrumental - Ambient (0.317:0.138:0.068)
Experimental - Rock - Noise (0.193:0.157:0.062)
Rock - Punk - Metal (0.461:0.079:0.066)
Rock - Punk - Garage (0.463:0.109:0.055)
Rock - Punk - Garage (0.481:0.092:0.083)
Rock - Punk - Metal (0.496:0.086:0.063)


Top1_genre - Top2_genre - Top3_genre (**corresponding Softmax values**)