### Final preprocessing and data creation

In this file the relevant audio data is transformed into the appropriate format, pre-processed according to model requirements, and split into training, validation, and test sets.

In [1]:
# import relevant libraries

# for data manipulation 
import numpy as np

# for audio extraction
import librosa

# for path definitions, opening files
from pathlib import Path
import os
import tarfile
import pickle
import math

# for pre-processing and data creation
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# counter for for loops
from tqdm import tqdm

In [2]:
# install library if necessary
# (the %pip magic installs into the environment of the running kernel)
# %pip install librosa

In [2]:
# folder that holds the audio files (data location in UCloud)
root_path = "./775771"

In [3]:
# read the pickled dictionary that holds the labels for each track ID
# (pickle.load runs whatever the file encodes, so only open a tracks.pkl from a trusted source)
with open('tracks.pkl', 'rb') as tracks_file:
    tracks = pickle.load(tracks_file)

In [4]:
# check amount of tracks in pickle file
# (len() counts the dictionary keys directly; no need to copy all items into a list first)
print(len(tracks))

7087


In [5]:
# count the entries of the audio folder
audio_file_count = len(os.listdir(root_path))
print(audio_file_count)

18486


In [7]:
# create audio representation of each audio file that is in the dictionary

# create list with all audio files in folder
files = os.listdir(root_path)

# create a list to check which audio files have been processed afterwards
# (IDs are stored as strings, exactly as they appear in the file names)
populated_IDS = []

# for loop with counter to append raw audio form to dictionary
for file in tqdm(files):

    # get file ID from audio file (everything before the first dot)
    ID = file.split('.')[0]

    # directory entries whose name does not start with an integer ID
    # (e.g. '.ipynb_checkpoints' or '.DS_Store') cannot be a track: skip them
    # instead of letting int() abort a loading loop that runs for over an hour
    try:
        track_id = int(ID)
    except ValueError:
        continue

    # when audio file is relevant, get raw audio form
    if track_id in tracks:

        # storing the raw audio form using librosa at a sample rate of 22050 Hz
        y, sr = librosa.load(os.path.join(root_path, file), sr=22050)

        # add raw audio form to the dictionary
        tracks[track_id]['raw'] = y

        # remember the ID of every track that received audio data
        populated_IDS.append(ID)

100%|██████████| 18486/18486 [1:32:46<00:00,  3.32it/s]  


In [8]:
# sanity check of the loading step: show the first ten IDs and the total count
first_ten_ids = populated_IDS[:10]
print(first_ten_ids)
print(len(populated_IDS))

['1406057', '489238', '1320483', '325577', '1290879', '1110073', '1166111', '1137782', '1116404', '1199623']
7087


### Trimming the raw audio down to 30 seconds

Repeat the approach three times for start, middle, and end to create the training, validation and test data.

Approach to get the start (30 seconds) of the track:

In [9]:
# cut out first 30s of audio data

# the clip length in samples is derived once from the sample rate and the clip
# duration instead of repeating the literal 661500 in the slice below
SAMPLE_RATE = 22050
CLIP_SECONDS = 30
CLIP_SAMPLES = CLIP_SECONDS * SAMPLE_RATE

print('30 seconds of the audio at a sample rate of 22050 Hz results in', CLIP_SAMPLES, 'elements.')

for key in tqdm(populated_IDS):
    track = tracks[int(key)]
    # a track with fewer than CLIP_SAMPLES samples keeps its full (shorter) length
    track['raw_30s'] = track['raw'][:CLIP_SAMPLES]

100%|██████████| 7087/7087 [00:00<00:00, 584275.82it/s]


Approach to get the middle (30 seconds) of the track:

In [13]:
# cut out the middle 30s of audio data (disabled alternative; it writes the same 'raw_30s' key as the "start" cell)
# NOTE(review): for a track with 661500 samples or fewer, middle-330750 is negative, so the
# slice start is counted from the end of the array and the result is not the middle 30s

# print('30 seconds of the audio at a sample rate of 22050 Hz results in', 30*22050, 'elements.')

# for key in tqdm(populated_IDS):
#     middle = (int(math.ceil(len(tracks[int(key)]['raw']) / 2))) - 1
#     tracks[int(key)]['raw_30s'] = tracks[int(key)]['raw'][middle-330750:middle+330750]    

30 seconds of the audio at a sample rate of 22050 results in 661500 elements.


100%|██████████| 7087/7087 [00:00<00:00, 353436.05it/s]


Approach to get the end (30 seconds) of the track:

In [24]:
# cut out last 30s of audio data (disabled alternative; it writes the same 'raw_30s' key
# as the "start" cell, so only one trimming cell should be active per run)

# print('30 seconds of the audio at a sample rate of 22050 Hz results in', 30*22050, 'elements.')

# for key in tqdm(populated_IDS):
#     tracks[int(key)]['raw_30s'] = tracks[int(key)]['raw'][-661500:]

100%|██████████| 7087/7087 [00:00<00:00, 508354.84it/s]


### Get MFCC representation

In [25]:
# calculate MFCCs with the raw 30s data and store it in the tracks dictionary

for key in tqdm(populated_IDS):

    # compute MFCCs at a sample rate of 22050 with a hop length of 512 samples between frames
    # n_mfcc is passed explicitly (20 is also the librosa default) so that the 20 coefficients
    # documented in this notebook do not depend on a library default
    y_mfcc_30s = librosa.feature.mfcc(y=tracks[int(key)]['raw_30s'], sr=22050, n_mfcc=20, hop_length=512)
    # store in tracks dictionary (rows: coefficients, columns: frames)
    tracks[int(key)]['mfcc_30s'] = y_mfcc_30s

100%|██████████| 7087/7087 [15:17<00:00,  7.73it/s]


In [11]:
# each row holds one mel-frequency cepstral coefficient, each column one frame of the audio signal
example_mfcc = tracks[1385300]['mfcc_30s']

print(example_mfcc[:5])
print(example_mfcc.shape)

[[-418.9718    -403.4135    -421.6354    ... -380.0868    -382.3547
  -306.3769   ]
 [ 189.52945    205.0528     186.94696   ...  224.58728    221.09293
   207.38431  ]
 [ 108.909805   110.85075    108.369675  ...  105.90192    102.03871
    48.217697 ]
 [  36.145638    28.500072    35.76802   ...   11.860008     8.870483
    11.745757 ]
 [   3.7493386   -4.4960294    1.1900263 ...  -18.567358   -18.794258
    -7.999176 ]]
(20, 1292)
(20, 1292)


In [17]:
# the 30s MFCCs should all have the same number of frames
# visual inspection of the first ten tracks

for track_key in populated_IDS[:10]:
    print(track_key + ":")
    mfcc_shape = tracks[int(track_key)]['mfcc_30s'].shape
    print(mfcc_shape)

1406057:
(20, 1292)
489238:
(20, 1292)
1320483:
(20, 1292)
325577:
(20, 1292)
1290879:
(20, 1292)
1110073:
(20, 1292)
1166111:
(20, 1292)
1137782:
(20, 1292)
1116404:
(20, 1292)
1199623:
(20, 1292)


In [26]:
# collect every track whose MFCC matrix does not have 1292 frames

drop_list = []

for key in populated_IDS:
    n_frames = tracks[int(key)]['mfcc_30s'].shape[1]
    if n_frames == 1292:
        continue
    # report the deviating track and mark it for removal
    print(key, n_frames)
    drop_list.append(key)
print(drop_list)

[]


No tracks have a different number of frames!

In [14]:
# drop corrupted files if there are any
# pop() with a default keeps the cell safe to re-run: a key that a previous
# execution already removed is skipped instead of raising KeyError

for key in drop_list:
    tracks.pop(int(key), None)

# check how many tracks are left
print(len(tracks))

7087


In [15]:
# drop from populated list
# the list is rebuilt by filtering, so every occurrence of a dropped ID disappears and
# re-running the cell is harmless (list.remove() raises ValueError once the ID is gone)

dropped_ids = set(drop_list)
populated_IDS = [key for key in populated_IDS if key not in dropped_ids]

print(len(populated_IDS))

7087


### Creation of multi-hot encoding numpy arrays as labels 

In [27]:
# build the multi-hot label matrix from the 'mood/theme' labels of every populated track

# one list of labels per track, in the order of populated_IDS
labels_list = [list(tracks[int(key)]['mood/theme']) for key in populated_IDS]

# fit the binarizer on the labels and encode them in one step
mlb = MultiLabelBinarizer()
y_hot = mlb.fit_transform(labels_list)

# show the label vocabulary, the encoded matrix and its dimensions
print(mlb.classes_)
print(y_hot)
print(y_hot.shape)

['calm' 'energetic' 'happy' 'hopeful' 'inspiring' 'love' 'motivational'
 'relaxing' 'sad' 'uplifting']
[[0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 1 ... 0 0 1]
 [0 1 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(7087, 10)


### Creation of training, validation, and test set

##### Create a numpy array with the MFCCs and transform them into the appropriate shape for our models

In [28]:
# gather the 30s MFCCs of all populated tracks and stack them into one numpy array
# resulting array has dimensions (n_samples, n_coeff, n_frames)

mfccs_list = [tracks[int(key)]['mfcc_30s'] for key in populated_IDS]

mfccs = np.stack(mfccs_list, axis=0)

# check the results: first three samples and the overall shape
print(mfccs[:3])
print(mfccs.shape)

[[[-1.15074341e+02 -9.67872009e+01 -7.89034195e+01 ... -4.51817413e+02
   -4.51817413e+02 -4.51817413e+02]
  [ 7.59934158e+01  7.37453308e+01  6.52228775e+01 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 1.39426613e+01  1.06665401e+01  1.28579235e+01 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  ...
  [-2.59192634e+00 -1.65563512e+00 -4.81208563e-01 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 7.04752445e+00 -9.54183221e-01 -1.05511503e+01 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 6.23311901e+00  1.36929178e+00  8.76857221e-01 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]]

 [[-1.95505722e+02 -2.07821503e+02 -2.00479874e+02 ... -5.74457092e+02
   -5.74457092e+02 -5.74457092e+02]
  [ 1.29257050e+02  1.45445129e+02  1.66857513e+02 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 2.46110287e+01  4.18668461e+00 -2.13272266e+01 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  ...
  [-5.40222931e+00 -7.5

In [29]:
# reorder the axes from (n_samples, n_coeff, n_frames) to (n_samples, n_frames, n_coeff)
# the transpose is applied to a fresh stack of mfccs_list rather than to mfccs itself,
# so running this cell a second time cannot flip the axes back
mfccs = np.stack(mfccs_list, axis=0).transpose(0, 2, 1)
print(mfccs.shape)

(7087, 1292, 20)


##### Scaling the MFCCs 

In [30]:
# scale each MFCC to a range between 0 and 1 across all samples and frames
# to ensure that the model can learn the relevant patterns using the whole data
# NOTE(review): the scaler is fitted on all samples before the train/validation/test split,
# so the min/max of validation and test data influence the training features;
# consider fitting on the training portion only

# reshape the MFCCs to a 2D array for scaling (the last axis, the coefficients, becomes the columns)
mfccs_2d = mfccs.reshape(-1, mfccs.shape[-1])

print(mfccs_2d)
print(mfccs_2d.shape)

# scale the MFCCs; the fitted scaler is kept in a variable so that the identical
# per-column min/max can later be applied to data that is not part of this array
mfcc_scaler = MinMaxScaler()
mfccs_scaled = mfcc_scaler.fit_transform(mfccs_2d)

# reshape the scaled MFCCs back to the original shape
mfccs_scaled = mfccs_scaled.reshape(mfccs.shape)

print(mfccs_scaled)
print(mfccs_scaled.shape)

[[-1.15074341e+02  7.59934158e+01  1.39426613e+01 ... -2.59192634e+00
   7.04752445e+00  6.23311901e+00]
 [-9.67872009e+01  7.37453308e+01  1.06665401e+01 ... -1.65563512e+00
  -9.54183221e-01  1.36929178e+00]
 [-7.89034195e+01  6.52228775e+01  1.28579235e+01 ... -4.81208563e-01
  -1.05511503e+01  8.76857221e-01]
 ...
 [-6.61842468e+02  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-6.61842468e+02  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-6.61842468e+02  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]
(9156404, 20)
[[[0.770476   0.54132587 0.5475009  ... 0.42626196 0.56453186 0.5611431 ]
  [0.78433985 0.5368557  0.54016334 ... 0.43274936 0.5081874  0.53254753]
  [0.79789793 0.51990926 0.54507136 ... 0.44088674 0.44060984 0.5296524 ]
  ...
  [0.5151839  0.39021742 0.5162734  ... 0.44422096 0.51490635 0.52449715]
  [0.5151839  0.39021742 0.5162734  ... 0.444220

In [31]:
# add n_channels = 1 to the MFCCs data, dimensions: (n_samples, n_frames, n_coeff, n_channels)
# the axis is only appended while the array is still 3-dimensional, so re-running
# this cell does not stack up additional trailing axes
if mfccs_scaled.ndim == 3:
    mfccs_scaled = mfccs_scaled[..., np.newaxis]
print(mfccs_scaled.shape)

(7087, 1292, 20, 1)


In [32]:
# divide features and labels into train:validation:test in the ratio 60:20:20

# both splits use the same fixed seed so the partition is reproducible
split_seed = 42

# first hold out 20% of all samples as the test set
X_train_, X_test, y_train_, y_test = train_test_split(
    mfccs_scaled, y_hot, test_size=0.2, random_state=split_seed
)

# then take 25% of the remaining 80% (= 20% of all samples) as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train_, y_train_, test_size=0.25, random_state=split_seed
)

In [22]:
# check results: for every split print its name and its first sample;
# the shape is additionally printed for the two training arrays

split_overview = [
    ("X_train:", X_train, True),
    ("X_val:", X_val, False),
    ("X_test:", X_test, False),
    ("y_train:", y_train, True),
    ("y_val:", y_val, False),
    ("y_test:", y_test, False),
]

for position, (title, array, show_shape) in enumerate(split_overview):
    # blank line between two consecutive sections
    if position > 0:
        print()
    print(title)
    if show_shape:
        print(array.shape)
    print(array[:1])

X_train:
(4251, 1292, 20, 1)
[[[[0.28968054]
   [0.4086325 ]
   [0.57384235]
   ...
   [0.43866572]
   [0.5247755 ]
   [0.50184166]]

  [[0.29401487]
   [0.4190688 ]
   [0.57840246]
   ...
   [0.45471388]
   [0.5327516 ]
   [0.5050449 ]]

  [[0.29551417]
   [0.422373  ]
   [0.5787253 ]
   ...
   [0.45241883]
   [0.53241163]
   [0.5042787 ]]

  ...

  [[0.53238404]
   [0.7719376 ]
   [0.53440934]
   ...
   [0.5317178 ]
   [0.51749355]
   [0.5118062 ]]

  [[0.56447434]
   [0.8038856 ]
   [0.50384766]
   ...
   [0.4929266 ]
   [0.5124929 ]
   [0.51570314]]

  [[0.6165538 ]
   [0.7916075 ]
   [0.5277902 ]
   ...
   [0.4812412 ]
   [0.525434  ]
   [0.55409443]]]]

X_val:
[[[[0.37785777]
   [0.4355316 ]
   [0.5897354 ]
   ...
   [0.4184648 ]
   [0.52810264]
   [0.505432  ]]

  [[0.38803676]
   [0.4451772 ]
   [0.5864549 ]
   ...
   [0.41249344]
   [0.5267891 ]
   [0.4948068 ]]

  [[0.39757156]
   [0.47390112]
   [0.6168524 ]
   ...
   [0.4114784 ]
   [0.5342563 ]
   [0.50476485]]

  ...

  [

##### Store the final data 

In [33]:
# store the train, validation and test set of 30s MFCCs

# each output file name is paired directly with the array written to it, so names and
# arrays cannot get out of step; this also avoids rebinding `files`, which earlier in
# the notebook holds the listing of the audio folder
arrays_to_save = {
    'X_train.npy': X_train,
    'X_val.npy': X_val,
    'X_test.npy': X_test,
    'y_train.npy': y_train,
    'y_val.npy': y_val,
    'y_test.npy': y_test,
}

for file_name, array in arrays_to_save.items():
    np.save(file_name, array)

In [None]:
# write the complete tracks dictionary to disk as a pickle
with open('tracks_final_dict', 'wb') as dump_target:
    pickle.dump(tracks, dump_target)