In [1]:
import os
import numpy as np
from skmultilearn.dataset import load_dataset
from scipy.sparse import lil_matrix
import pandas as pd
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
from skmultilearn.model_selection import iterative_train_test_split
from collections import Counter

Check the current split. Is it balanced?

In [2]:
files_train = os.listdir('/home/laura/MedleyDB/processed/9_8_132300/train/input')
labels_train_dir = '/home/laura/MedleyDB/processed/9_8_132300/train/labels'

matrix_labels = []

for file in files_train:

    label = np.load(os.path.join(labels_train_dir, file), allow_pickle = True)
    matrix_labels.append(label)

y_train = lil_matrix(matrix_labels)

In [3]:
files_val = os.listdir('/home/laura/MedleyDB/processed/9_8_132300/val/input')
labels_val_dir = '/home/laura/MedleyDB/processed/9_8_132300/val/labels'

matrix_labels = []

for file in files_val:

    label = np.load(os.path.join(labels_val_dir, file), allow_pickle = True)
    matrix_labels.append(label)

y_val = lil_matrix(matrix_labels)

In [4]:
pd.DataFrame({
    'train': Counter(str(combination) for row in get_combination_wise_output_matrix(y_train.A, order=2) for combination in row),
    'validation' : Counter(str(combination) for row in get_combination_wise_output_matrix(y_val.A, order=2) for combination in row)
}).T.fillna(0.0)

Unnamed: 0,"(0, 1)","(1, 2)","(0, 0)","(5, 10)","(5, 5)","(10, 10)","(2, 10)","(1, 5)","(1, 1)","(0, 5)",...,"(3, 9)","(11, 11)","(5, 11)","(0, 11)","(1, 11)","(4, 13)","(4, 16)","(11, 17)","(0, 17)","(0, 16)"
train,776.0,402.0,917.0,17.0,478.0,75.0,9.0,348.0,1010.0,300.0,...,4.0,26.0,18.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0
validation,194.0,36.0,407.0,16.0,385.0,51.0,0.0,223.0,368.0,161.0,...,40.0,11.0,0.0,11.0,11.0,149.0,121.0,11.0,11.0,42.0


It is not balanced at all!! e.g. combination (0,16) does not appear in the train set.

Using the scikit multilearn package we can get a better distributed split:

In [2]:
files = os.listdir('/home/laura/MedleyDB/processed/9_8_132300/input')
labels_dir = '/home/laura/MedleyDB/processed/9_8_132300/labels'

X = lil_matrix((len(files), 1))

int_to_file = {}
i = 0

matrix_labels = []

for file in files:
    int_to_file[i] = file
    X[i] = i
    
    label = np.load(os.path.join(labels_dir, file), allow_pickle = True)
    matrix_labels.append(label)
    
    i = i+1

y = lil_matrix(matrix_labels)

In [3]:
X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size = 0.5)

In [4]:
pd.DataFrame({
    'train': Counter(str(combination) for row in get_combination_wise_output_matrix(y_train.A, order=2) for combination in row),
    'test' : Counter(str(combination) for row in get_combination_wise_output_matrix(y_test.A, order=2) for combination in row)
}).T.fillna(0.0)

Unnamed: 0,"(4, 4)","(4, 7)","(0, 0)","(13, 13)","(0, 13)","(7, 7)","(0, 7)","(7, 13)","(4, 13)","(0, 4)",...,"(5, 6)","(3, 6)","(14, 17)","(11, 17)","(11, 11)","(0, 17)","(0, 11)","(1, 11)","(2, 9)","(5, 11)"
train,541.0,153.0,847.0,230.0,129.0,662.0,279.0,164.0,73.0,391.0,...,52.0,46.0,38.0,5.0,20.0,5.0,20.0,20.0,14.0,11.0
test,541.0,154.0,869.0,230.0,129.0,661.0,278.0,163.0,76.0,391.0,...,41.0,38.0,34.0,6.0,17.0,6.0,17.0,17.0,16.0,7.0


In [13]:
train_files = []
train_songs = []

for i in range(X_train.shape[0]):
    file = int_to_file[int(X_train[i,0])]
    train_files.append(file)
    song = file.split('_')[0]+'_'+file.split('_')[0]
    if song not in train_songs:
        train_songs.append(song)
    
test_files = []
test_songs = []

for i in range(X_test.shape[0]):
    file = int_to_file[int(X_test[i,0])]
    test_files.append(file)
    song = file.split('_')[0]+'_'+file.split('_')[0]
    if song not in test_songs:
        test_songs.append(song)

In [18]:
count = 0

for song in train_songs:
    if song in test_songs:
        count += 1

print(count)
print(len(train_songs))
print(len(test_songs))

49
49
49


We get a better distribution of the label combinations, but the different songs are common in both sets...