In [1]:
import os
import numpy as np
from skmultilearn.dataset import load_dataset
from scipy.sparse import lil_matrix
import pandas as pd
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
from skmultilearn.model_selection import iterative_train_test_split
from collections import Counter

Check the current split. Is it balanced?

In [2]:
# Load the label array stored for every training input file and stack the
# arrays into a single sparse label matrix.
labels_train_dir = '/home/laura/MedleyDB/processed/9_8_132300/train/labels'
files_train = os.listdir('/home/laura/MedleyDB/processed/9_8_132300/train/input')

# Each label file is looked up under the same name as its input file.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only use on trusted files.
matrix_labels = [
    np.load(os.path.join(labels_train_dir, name), allow_pickle=True)
    for name in files_train
]

y_train = lil_matrix(matrix_labels)

In [3]:
# Load the label array stored for every validation input file and stack the
# arrays into a single sparse label matrix.
labels_val_dir = '/home/laura/MedleyDB/processed/9_8_132300/val/labels'
files_val = os.listdir('/home/laura/MedleyDB/processed/9_8_132300/val/input')

# Each label file is looked up under the same name as its input file.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only use on trusted files.
matrix_labels = [
    np.load(os.path.join(labels_val_dir, name), allow_pickle=True)
    for name in files_val
]

y_val = lil_matrix(matrix_labels)

In [4]:
# Count how often each pairwise (order=2) label combination occurs in the
# train and validation label matrices, then show the counts side by side
# (one row per split; combinations missing from a split are shown as 0.0).
train_pairs = get_combination_wise_output_matrix(y_train.toarray(), order=2)
val_pairs = get_combination_wise_output_matrix(y_val.toarray(), order=2)

counts_by_split = {
    'train': Counter(str(pair) for row in train_pairs for pair in row),
    'validation': Counter(str(pair) for row in val_pairs for pair in row),
}
pd.DataFrame(counts_by_split).T.fillna(0.0)

Unnamed: 0,"(4, 7)","(0, 0)","(13, 13)","(4, 4)","(0, 13)","(7, 7)","(0, 7)","(7, 13)","(4, 13)","(0, 4)",...,"(4, 10)","(6, 10)","(0, 10)","(1, 10)","(2, 9)","(2, 10)","(5, 10)","(10, 13)","(5, 11)","(2, 13)"
train,197.0,1104.0,294.0,691.0,165.0,844.0,357.0,210.0,94.0,501.0,...,25.0,16.0,28.0,28.0,18.0,14.0,19.0,10.0,13.0,3.0
validation,49.0,267.0,74.0,173.0,41.0,214.0,89.0,52.0,26.0,125.0,...,8.0,4.0,10.0,10.0,6.0,7.0,7.0,2.0,2.0,1.0


It is not balanced at all! For example, the combination (0, 16) does not appear in the training set.

Using the scikit-multilearn package, we can get a better-distributed split:

In [5]:
labels_dir = '/home/laura/MedleyDB/processed/9_8_132300/labels'
# os.listdir returns entries in arbitrary order; sorting keeps the
# index -> file mapping (and the input handed to the splitter) stable across runs.
files = sorted(os.listdir('/home/laura/MedleyDB/processed/9_8_132300/input'))

# X carries no real features: its single column stores each file's integer
# index so that rows selected by the splitter can be mapped back to file names.
X = lil_matrix((len(files), 1))

int_to_file = {}
matrix_labels = []

for i, file in enumerate(files):
    int_to_file[i] = file
    X[i, 0] = i

    # The label file is looked up under the same name as the input file.
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only use on trusted files.
    label = np.load(os.path.join(labels_dir, file), allow_pickle=True)
    matrix_labels.append(label)

y = lil_matrix(matrix_labels)

In [6]:
# Split X and y with scikit-multilearn's iterative_train_test_split, asking for
# half of the data on the test side (test_size = 0.5).
# The result is unpacked as (X_train, y_train, X_test, y_test), in that order.
# NOTE: this rebinds y_train, which an earlier cell bound to the labels of the
# original train directory.
# NOTE(review): no random seed is set in the visible cells, so the split may
# differ between runs — confirm.
X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size = 0.5)

In [7]:
# Count how often each pairwise (order=2) label combination occurs in the new
# train and test label matrices, then show the counts side by side
# (one row per split; combinations missing from a split are shown as 0.0).
train_combos = get_combination_wise_output_matrix(y_train.toarray(), order=2)
test_combos = get_combination_wise_output_matrix(y_test.toarray(), order=2)

split_counts = {
    'train': Counter(str(combo) for row in train_combos for combo in row),
    'test': Counter(str(combo) for row in test_combos for combo in row),
}
pd.DataFrame(split_counts).T.fillna(0.0)

Unnamed: 0,"(4, 7)","(0, 0)","(13, 13)","(4, 4)","(0, 13)","(7, 7)","(0, 7)","(7, 13)","(4, 13)","(0, 4)",...,"(3, 16)","(13, 14)","(14, 17)","(4, 10)","(6, 10)","(0, 14)","(1, 14)","(4, 14)","(2, 13)","(5, 11)"
train,154,868,230,541,129,661,278,164,82,391,...,19,45,36,21,9,5,5,5,2,10
test,153,848,230,541,129,662,279,163,67,391,...,15,45,36,19,12,9,9,9,4,8


In [8]:
def _files_and_songs(X_split, index_to_file):
    """Map the index column of a split back to file names and their songs.

    Parameters
    ----------
    X_split : matrix whose column 0 holds the integer file index of each row.
    index_to_file : dict mapping that integer index to a file name.

    Returns
    -------
    (files, songs) : two lists. ``files`` has one entry per row of ``X_split``,
        in row order. ``songs`` holds each distinct song identifier once, in
        first-seen order.

    Raises
    ------
    IndexError : if a file name contains no ``'_'`` separator.
    """
    files = [index_to_file[int(X_split[row, 0])] for row in range(X_split.shape[0])]

    # A dict keeps insertion order and gives constant-time membership, so the
    # songs come out de-duplicated in the order they were first encountered.
    songs = {}
    for name in files:
        parts = name.split('_')
        # The song identifier is the first two '_'-separated fields of the name.
        songs.setdefault(parts[0] + '_' + parts[1], None)

    return files, list(songs)


train_files, train_songs = _files_and_songs(X_train, int_to_file)
test_files, test_songs = _files_and_songs(X_test, int_to_file)

In [9]:
# Count the training songs that also occur among the test songs, then print
# that count followed by the number of songs in each split.
common_songs = sum(1 for song in train_songs if song in test_songs)

for value in (common_songs, len(train_songs), len(test_songs)):
    print(value)

115
117
119


We get a better distribution of the label combinations, but the same songs now appear in both sets: 115 of the 117 training songs also occur in the test set.