In [1]:
import numpy as np
import librosa
from pathlib import Path
import os
import tarfile
import pickle
import math
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [4]:
# extract mp3 files from tar into my own folder

path = "C:/Users/marce/OneDrive/Dokumente/Master/CBS/2nd semester/ML/Project/Music/moodtheme_audio/autotagging_moodtheme_audio-02.tar"
folder = "C:/Users/marce/OneDrive/Dokumente/Master/CBS/2nd semester/ML/Project/Music/moodtheme_audio"

with tarfile.open(path, 'r') as tar:
    tar.extractall(folder)


In [6]:
# load labels dictionary
with open('tracks.pkl', 'rb') as f:
    tracks = pickle.load(f)

In [7]:
print(list(tracks.items())[:10])

[(3346, {'artist_id': 517, 'album_id': 521, 'path': '46/3346.mp3', 'duration': 195.0, 'mood/theme': {'calm'}}), (3347, {'artist_id': 517, 'album_id': 521, 'path': '47/3347.mp3', 'duration': 201.8, 'mood/theme': {'calm'}}), (3348, {'artist_id': 517, 'album_id': 521, 'path': '48/3348.mp3', 'duration': 253.3, 'mood/theme': {'calm'}}), (3349, {'artist_id': 517, 'album_id': 521, 'path': '49/3349.mp3', 'duration': 228.4, 'mood/theme': {'calm'}}), (3350, {'artist_id': 517, 'album_id': 521, 'path': '50/3350.mp3', 'duration': 194.7, 'mood/theme': {'calm'}}), (4639, {'artist_id': 87, 'album_id': 718, 'path': '39/4639.mp3', 'duration': 188.0, 'mood/theme': {'relaxing'}}), (6253, {'artist_id': 811, 'album_id': 960, 'path': '53/6253.mp3', 'duration': 271.8, 'mood/theme': {'relaxing'}}), (6254, {'artist_id': 811, 'album_id': 960, 'path': '54/6254.mp3', 'duration': 481.6, 'mood/theme': {'relaxing'}}), (6256, {'artist_id': 811, 'album_id': 960, 'path': '56/6256.mp3', 'duration': 236.5, 'mood/theme': {

In [8]:
#loading loop, takes ~2 min per batch

# Richard's path:
# path = "C:/Users/r/Desktop/DataScience/2nd semester/Machine Learning/Data/autotagging_moodtheme_audio-02/02"

# Marcel's path
path = "C:/Users/marce/OneDrive/Dokumente/Master/CBS/2nd semester/ML/Project/Music/moodtheme_audio/02"

files = Path(path).glob('*.mp3')
names = os.listdir(path)

i=0
populated_IDS = []

for file in files:
    
    ID = names[i].split(".") #do with enumerate
    i= i+1
    
    if int(ID[0]) in tracks.keys():
        
        #storing the raw audio file
        y,sr = librosa.load(file,sr=22050)
        #transforming the audio file into MFCC with 20 coefficients
        y_mfcc = librosa.feature.mfcc(y=y, sr=sr)
        
        #code to access the ID, where ID[0] is the id code
        tracks[int(ID[0])]['raw'] = y
        tracks[int(ID[0])]['mfcc'] = y_mfcc
        
        populated_IDS.append(ID[0])
    

In [10]:
print(populated_IDS[:10])
print(len(populated_IDS))

['1028902', '1053502', '1062502', '1066202', '1066302', '1070602', '1075402', '1081002', '1083302', '1087102']
78


In [11]:
tracks[1028902]

{'artist_id': 366613,
 'album_id': 120946,
 'path': '02/1028902.mp3',
 'duration': 239.3,
 'mood/theme': {'sad'},
 'raw': array([0.00012053, 0.000177  , 0.00013901, ..., 0.        , 0.        ,
        0.        ], dtype=float32),
 'mfcc': array([[-575.73     , -572.6834   , -576.24854  , ..., -592.38684  ,
         -592.38684  , -592.38684  ],
        [  23.13677  ,   27.315403 ,   22.5028   , ...,    0.       ,
            0.       ,    0.       ],
        [  21.908167 ,   25.724945 ,   21.56033  , ...,    0.       ,
            0.       ,    0.       ],
        ...,
        [  -4.4958925,   -4.0129175,   -2.7611818, ...,    0.       ,
            0.       ,    0.       ],
        [  -3.740967 ,   -3.3366613,   -2.3077912, ...,    0.       ,
            0.       ,    0.       ],
        [  -2.8742807,   -2.5723767,   -1.751741 , ...,    0.       ,
            0.       ,    0.       ]], dtype=float32)}

### Preprocess raw audio data

In [12]:
# cut the raw audio data in 30 second windows
print('30 seconds of the audio at a sample rate of 22050 results in', 30*22050, 'elements.')

for key in populated_IDS:
    middle = (int(math.ceil(len(tracks[int(key)]["raw"]) / 2))) - 1
#   print('key:',key)
#   print(len(tracks[int(key)]["raw"]))
#   print(middle)
    tracks[int(key)]['raw_30s'] = tracks[int(key)]['raw'][middle-330750:middle+330750]
    

30 seconds of the audio at a sample rate of 22050 results in 661500 elements.


In [13]:
print(len(tracks[1028902]["raw_30s"]))

661500


In [14]:
# even length test

print(len(tracks[1028902]["raw"]))
middle = int(math.ceil(len(tracks[1028902]["raw"]) // 2)) - 1
print(middle)
test = tracks[1028902]["raw"][middle-330750:middle+330750]
print(len(test))

#tracks[int(key)]['raw_30s'] = tracks[int(key)]['raw'][middle-330750:middle+330750]

5276418
2638208
661500


In [15]:
# odd length test

print(len(tracks[1088002]["raw"]))
middle = int(math.ceil(len(tracks[1088002]["raw"]) // 2)) - 1
print(middle)
test = tracks[1088002]["raw"][middle-330750:middle+330750]
print(len(test))

2046419
1023208
661500


In [16]:
# create 2d np array of raw audio wave arrays
X_list = [] 

for key in populated_IDS:
    X_list.append(tracks[int(key)]["raw_30s"])

X = np.vstack(X_list)

print(X)

[[ 0.03706461  0.01631058 -0.01171767 ...  0.02039137  0.03358239
   0.04638556]
 [ 0.18573937  0.15400901  0.18307237 ... -0.12891799 -0.13615173
  -0.11986807]
 [ 0.01824583  0.00645763 -0.00554469 ... -0.07551558 -0.03006215
   0.00649806]
 ...
 [ 0.1988284   0.02043443  0.07467554 ... -0.10073748  0.07264894
   0.09715492]
 [ 0.39635408  0.38926083  0.3762677  ... -0.13140306 -0.03390444
   0.04893222]
 [ 0.02752009  0.024911    0.11035223 ... -0.24078542 -0.23933424
  -0.10184911]]


In [67]:
# create np array of labels by multi-hot encoding the labels

labels_list = []
for key in populated_IDS:
    labels_list.append(list(tracks[int(key)]['mood/theme']))

print(labels_list)

mlb = MultiLabelBinarizer()
y_hot = mlb.fit_transform(labels_list)

print(mlb.classes_)
print(y_hot)


[['sad'], ['energetic'], ['relaxing', 'calm'], ['relaxing'], ['love', 'energetic'], ['energetic'], ['relaxing'], ['love', 'happy'], ['uplifting', 'happy', 'energetic'], ['motivational', 'happy', 'inspiring'], ['sad'], ['hopeful', 'motivational', 'relaxing'], ['happy', 'energetic'], ['happy'], ['love'], ['uplifting'], ['happy'], ['relaxing'], ['happy'], ['happy', 'energetic'], ['happy'], ['sad'], ['uplifting', 'inspiring'], ['love'], ['energetic'], ['sad'], ['love'], ['energetic'], ['uplifting'], ['happy'], ['sad'], ['happy'], ['love', 'happy'], ['sad'], ['love'], ['relaxing', 'calm'], ['energetic'], ['motivational'], ['motivational'], ['love'], ['love', 'sad'], ['energetic'], ['relaxing'], ['uplifting', 'energetic'], ['calm'], ['happy'], ['love', 'inspiring'], ['motivational'], ['sad'], ['motivational', 'inspiring'], ['inspiring'], ['relaxing'], ['inspiring'], ['energetic'], ['happy'], ['sad'], ['love'], ['relaxing'], ['love'], ['uplifting', 'motivational', 'inspiring'], ['inspiring'],

In [24]:
# split the raw 30s audio data in train:validation:test in the ratio 60:20:20

# Split data into training and test sets
X_train, X_test_raw30s, y_train, y_test_raw30s = train_test_split(X, y, test_size=0.2, random_state=42)

# Split training data into training and validation sets
X_train_raw30s, X_val_raw30s, y_train_raw30s, y_val_raw30s = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [36]:
print('X_train_raw30s:')
print(X_train_raw30s[:5])
print()
print('X_val_raw30s:')
print(X_val_raw30s[:5])
print()
print('X_test_raw30s:')
print(X_test_raw30s[:5])
print()
print("y_train_raw30s:")
print(y_train_raw30s[:5])
print()
print("y_val_raw30s:")
print(y_val_raw30s[:5])
print()
print("y_test_raw30s:")
print(y_test_raw30s[:5])

X_train_raw30s:
[[-0.13711806 -0.1233232  -0.13069163 ... -0.13901985 -0.10607645
  -0.06991287]
 [-0.11391765 -0.09935525 -0.08420548 ...  0.07240532  0.07600844
   0.07708387]
 [-0.12184409 -0.13313131 -0.15494016 ... -0.23650935 -0.2116234
  -0.15660292]
 [ 0.00041884  0.00161055  0.00213846 ... -0.13782282 -0.12460025
  -0.11125866]
 [ 0.1573968   0.19943252  0.17082568 ...  0.01431066  0.14673
   0.08440161]]

X_val_raw30s:
[[ 0.18439496  0.11363991  0.03020738 ...  0.13675877  0.14625196
   0.15062234]
 [-0.79269254 -0.7948322  -0.7597189  ... -0.4764083  -0.35967335
  -0.12030822]
 [-0.12319582 -0.11981892 -0.11754184 ... -0.21788496 -0.22930649
  -0.24478614]
 [ 0.07661212  0.06754164  0.06663017 ...  0.26493463  0.2301525
   0.2773824 ]
 [ 0.13482377  0.1231519   0.11258271 ...  0.04955355  0.07936467
   0.04797056]]

X_test_raw30s:
[[ 0.18893313 -0.00305178 -0.1929783  ... -0.2056819  -0.22154471
  -0.18743934]
 [ 0.03706461  0.01631058 -0.01171767 ...  0.02039137  0.03358239

### Preprocess MFCC data

In [37]:
print(tracks[1028902]["mfcc"].shape) # MFCCs have 20 coefficients, number of audio tensors depends on the length of the track, in this case 10306


(20, 10306)


In [47]:
# recalculate the MFCCs with the raw 30s data and store it in the tracks dictionary

for key in populated_IDS:
    
    # compute mfcc, sample rate: 22050, number of coefficients: 20, number of frames = 1292, frame size (hop_length) = 512
    y_mfcc_30s = librosa.feature.mfcc(y=tracks[int(key)]['raw_30s'], sr=sr, hop_length=512)
    # store in tracks dictionary
    tracks[int(key)]["mfcc_30s"] = y_mfcc_30s


In [48]:
# rows represent the number of mel-frequency cepstral coefficients extracted per frame
# columns represent the number of frames in the audio signal (frame size 512)

print(tracks[1028902]["mfcc_30s"][:5])
print(tracks[1028902]["mfcc_30s"].shape)
print(tracks[1053502]["mfcc_30s"].shape)

[[-229.25056   -247.84926   -285.79434   ... -321.55063   -328.1599
  -294.27643  ]
 [ 168.05162    183.61592    192.00797   ...  211.91943    208.89987
   180.95624  ]
 [  -9.696083   -19.102627   -22.822453  ...   -8.777971    -5.048983
     2.4669373]
 [  -0.9074652    8.102158     4.573628  ...   -3.579409    -1.549556
     8.751568 ]
 [ -21.143959   -15.89756     -3.6335862 ...   43.421406    44.076775
    31.60394  ]]
(20, 1292)
(20, 1292)


In [49]:
len(tracks[1028902]["mfcc_30s"][0])

1292

In [51]:
print(populated_IDS)

['1028902', '1053502', '1062502', '1066202', '1066302', '1070602', '1075402', '1081002', '1083302', '1087102', '1088002', '1095102', '1108302', '1110002', '1110302', '1110502', '1112002', '1116402', '1162702', '1164202', '1167302', '1168602', '1178102', '1179402', '1189902', '1203202', '1209202', '12102', '1210702', '1227102', '1231502', '1231702', '1234502', '1248602', '1249602', '1265902', '1277802', '1279602', '1294702', '1298302', '1299802', '1325602', '1329502', '1338302', '1347802', '1353302', '1356502', '1363602', '1366002', '1371002', '1375202', '1380002', '1385202', '1386702', '1388002', '1396502', '1400402', '1400502', '1404702', '1405302', '1406402', '1418302', '1420502', '1420702', '264302', '390002', '476802', '606902', '702502', '816202', '824302', '842402', '887202', '903602', '933502', '945202', '950102', '998102']


In [53]:
for key in populated_IDS:
    print(key + ":")
    print(tracks[int(key)]["mfcc_30s"].shape)

1028902:
(20, 1292)
1053502:
(20, 1292)
1062502:
(20, 1292)
1066202:
(20, 1292)
1066302:
(20, 1292)
1070602:
(20, 1292)
1075402:
(20, 1292)
1081002:
(20, 1292)
1083302:
(20, 1292)
1087102:
(20, 1292)
1088002:
(20, 1292)
1095102:
(20, 1292)
1108302:
(20, 1292)
1110002:
(20, 1292)
1110302:
(20, 1292)
1110502:
(20, 1292)
1112002:
(20, 1292)
1116402:
(20, 1292)
1162702:
(20, 1292)
1164202:
(20, 1292)
1167302:
(20, 1292)
1168602:
(20, 1292)
1178102:
(20, 1292)
1179402:
(20, 1292)
1189902:
(20, 1292)
1203202:
(20, 1292)
1209202:
(20, 1292)
12102:
(20, 1292)
1210702:
(20, 1292)
1227102:
(20, 1292)
1231502:
(20, 1292)
1231702:
(20, 1292)
1234502:
(20, 1292)
1248602:
(20, 1292)
1249602:
(20, 1292)
1265902:
(20, 1292)
1277802:
(20, 1292)
1279602:
(20, 1292)
1294702:
(20, 1292)
1298302:
(20, 1292)
1299802:
(20, 1292)
1325602:
(20, 1292)
1329502:
(20, 1292)
1338302:
(20, 1292)
1347802:
(20, 1292)
1353302:
(20, 1292)
1356502:
(20, 1292)
1363602:
(20, 1292)
1366002:
(20, 1292)
1371002:
(20, 1292)
13

In [62]:
# stack the 30s MFCCs inside a numpy array, resulting array has dimensions (n_samples, n_coeff, n_frames)
mfccs_list = [] 

for key in populated_IDS:
    mfccs_list.append(tracks[int(key)]["mfcc_30s"])

mfccs = np.stack(mfccs_list, axis=0)

print(mfccs[:3, :, :])
print(mfccs.shape)


[[[-2.29250565e+02 -2.47849258e+02 -2.85794342e+02 ... -3.21550629e+02
   -3.28159912e+02 -2.94276428e+02]
  [ 1.68051620e+02  1.83615921e+02  1.92007965e+02 ...  2.11919434e+02
    2.08899872e+02  1.80956238e+02]
  [-9.69608307e+00 -1.91026268e+01 -2.28224525e+01 ... -8.77797127e+00
   -5.04898310e+00  2.46693730e+00]
  ...
  [-2.35961008e+00 -6.37394047e+00 -1.02743568e+01 ... -1.11533394e+01
   -9.98740959e+00 -1.16045465e+01]
  [-6.65950871e+00 -1.00422621e+01 -1.15371590e+01 ... -1.72731304e+01
   -1.70959091e+01 -1.28553572e+01]
  [-1.18630800e+01 -1.33926849e+01 -1.57755833e+01 ... -9.43335342e+00
   -8.53530121e+00 -3.91024280e+00]]

 [[-1.70756622e+02 -1.96812408e+02 -2.86168579e+02 ... -2.95438171e+02
   -2.19356400e+02 -1.42721542e+02]
  [ 7.93940659e+01  7.22035675e+01  6.46648254e+01 ...  1.69778091e+02
    7.25990143e+01  4.56612244e+01]
  [ 4.28190536e+01  4.74704094e+01  4.23415375e+01 ...  6.07803955e+01
    5.25496826e+01  4.64538574e+01]
  ...
  [ 5.54176569e-01  5.3

To feed MFCCs into a CNN, we need to have the following dimensions:

1. Number of samples: This is the number of examples we have in our dataset.
1. Number of frames: This is the number of time steps or frames we have for each example.
1. Number of MFCC coefficients: This is the number of MFCC coefficients we have for each time step.
1. Number of channels: This is 1 for grayscale images and 3 for RGB images.

In [65]:
# transpose the dimensions of the mfccs array to the order specified above, dimensions (n_samples, n_frames, n_coeff)
mfccs = mfccs.transpose(0, 2, 1)
print(mfccs.shape)

(78, 1292, 20)


In [78]:
# Scale each MFCC to a range between 0 and 1 across all samples and frames to ensure that the model can learn the relevant patterns using the whole data)

# Reshape the MFCCs to a 2D array for scaling
mfccs_2d = mfccs.reshape(-1, mfccs.shape[-1])

print(mfccs_2d)
print(mfccs_2d.shape)

# Scale the MFCCs
mfccs_scaled = MinMaxScaler().fit_transform(mfccs_2d)

# Reshape the scaled MFCCs back to the original shape
mfccs_scaled = mfccs_scaled.reshape(mfccs.shape)

print(mfccs_scaled)
print(mfccs_scaled.shape)


[[-229.25056     168.05162      -9.696083   ...   -2.35961
    -6.6595087   -11.86308   ]
 [-247.84926     183.61592     -19.102627   ...   -6.3739405
   -10.042262    -13.392685  ]
 [-285.79434     192.00797     -22.822453   ...  -10.274357
   -11.537159    -15.775583  ]
 ...
 [ -87.587555     13.004988      3.7670639  ...  -10.488659
   -14.142265     -3.1197896 ]
 [ -29.45456      60.435925     18.289354   ...   -1.4908735
   -12.182734     -8.856272  ]
 [  16.064753     63.203087     19.906433   ...    0.63829553
    -6.16238      -1.4083827 ]]
(100776, 20)
[[[0.4690589  0.6901638  0.36783564 ... 0.49968064 0.44752997 0.39331752]
  [0.43882728 0.73412645 0.3318557  ... 0.46289492 0.41352698 0.37954915]
  [0.37714863 0.7578305  0.31762743 ... 0.42715305 0.39850047 0.3581    ]
  ...
  [0.31902784 0.81407213 0.37134743 ... 0.4190984  0.34084326 0.41518816]
  [0.30828464 0.8055431  0.38561076 ... 0.4297825  0.34262466 0.42327178]
  [0.36336127 0.726614   0.41435906 ... 0.41496372 0.385

In [81]:
# add n_channels = 1 to the MFCC data, dimensions: (n_samples, n_frames, n_coeff, n_channels)
mfccs_scaled = np.expand_dims(mfccs_scaled, axis=-1)
print(mfccs_scaled.shape)

(78, 1292, 20, 1)


In [83]:
# Split the MFCCs in train:validation:test in the ratio 60:20:20

# Split the MFCCs into train and test sets
X_train, X_test_mfcc30s, y_train, y_test_mfcc30s = train_test_split(mfccs_scaled, y_hot, test_size=0.2, random_state=42)

# Split training data into training and validation sets
X_train_mfcc30s, X_val_mfcc30s, y_train_mfcc30s, y_val_mfcc30s = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [86]:
print('X_train_mfcc30s:')
print(X_train_mfcc30s.shape)
print(X_train_mfcc30s[:5])
print()
print('X_val_mfcc30s:')
print(X_val_mfcc30s[:5])
print()
print('X_test_mfcc30s:')
print(X_test_mfcc30s[:5])
print()
print("y_train_mfcc30s:")
print(y_train_mfcc30s.shape)
print(y_train_mfcc30s[:5])
print()
print("y_val_mfcc30s:")
print(y_val_mfcc30s[:5])
print()
print("y_test_mfcc30s:")
print(y_test_mfcc30s[:5])

X_train_mfcc30s:
(46, 1292, 20, 1)
[[[[0.67774   ]
   [0.5948523 ]
   [0.36687022]
   ...
   [0.6421222 ]
   [0.6164288 ]
   [0.50742817]]

  [[0.7283077 ]
   [0.5810684 ]
   [0.28187776]
   ...
   [0.6635252 ]
   [0.6115376 ]
   [0.50559205]]

  [[0.7414038 ]
   [0.53843284]
   [0.2161282 ]
   ...
   [0.7265484 ]
   [0.58427227]
   [0.4969492 ]]

  ...

  [[0.8017187 ]
   [0.4740376 ]
   [0.30714184]
   ...
   [0.774511  ]
   [0.6202567 ]
   [0.6524192 ]]

  [[0.7961562 ]
   [0.4720609 ]
   [0.30010757]
   ...
   [0.7475865 ]
   [0.58090955]
   [0.6216844 ]]

  [[0.7949731 ]
   [0.47768018]
   [0.30396768]
   ...
   [0.72434163]
   [0.57238334]
   [0.59386766]]]


 [[[0.519477  ]
   [0.6003387 ]
   [0.38716605]
   ...
   [0.4300865 ]
   [0.5315058 ]
   [0.5513741 ]]

  [[0.48044172]
   [0.67623496]
   [0.34557223]
   ...
   [0.4077403 ]
   [0.5574274 ]
   [0.5411622 ]]

  [[0.36722496]
   [0.82098114]
   [0.27382547]
   ...
   [0.32118607]
   [0.63549066]
   [0.5696643 ]]

  ...

  [[