In [None]:
import numpy as np
import librosa
from pathlib import Path
import os
import tarfile
import pickle
import math
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

In [None]:
from google.colab import drive

# Colab only: attach Google Drive so the project folder is reachable as a normal path
mount_point = "/content/drive"
drive.mount(mount_point)

# common prefix of every project path used in this notebook
root_path = "/content/drive/My Drive/Audio_Mood_Classification"

In [None]:
# Load the labels dictionary; later cells index it by integer track ID.
# NOTE(review): pickle.load can execute arbitrary code, only open files you trust.
tracks_pickle = f"{root_path}/dictionary/tracks.pkl"
with open(tracks_pickle, "rb") as fh:
    tracks = pickle.load(fh)

In [None]:
# number of tracks described by the pickle file
print(len(tracks))

7087


In [None]:
# Load the raw waveform of every audio file that has an entry in `tracks`.
# NOTE: this loop is slow; the recorded progress bar shows more than an hour
# for 40 % of the files.

audio_dir = f"{root_path}/audio_files"

# Sort the listing: os.listdir returns entries in arbitrary order, and the order
# of populated_IDS fixes the row order of every array built from it.
files = sorted(os.listdir(audio_dir))

populated_IDS = []

for file in tqdm(files):

    # file names are expected to look like "<track id>.<extension>"
    ID = file.split(".")[0]

    try:
        track_id = int(ID)
    except ValueError:
        # entries without a numeric ID (hidden files, sub-folders, ...) are not tracks
        continue

    if track_id not in tracks:
        continue

    # decode the file and resample it to 22050 Hz
    y, sr = librosa.load(f"{audio_dir}/{file}", sr=22050)

    # store the waveform next to the track's labels; MFCCs are not computed here
    tracks[track_id]["raw"] = y

    # IDs are kept as strings; the cells below convert them with int(key)
    populated_IDS.append(ID)

 40%|████      | 7480/18486 [1:22:23<40:21,  4.55it/s]   

In [None]:
# first ten loaded IDs, followed by the number of loaded tracks
print(populated_IDS[:10], len(populated_IDS), sep="\n")

['1000086', '1001307', '1001312', '1002052', '1002753', '1002756', '1002758', '1003417', '1003418', '1003517']
3246


In [None]:
# show the dictionary entry of a single track (ID 1028902)
tracks[1028902]

{'artist_id': 366613,
 'album_id': 120946,
 'path': '02/1028902.mp3',
 'duration': 239.3,
 'mood/theme': {'sad'}}

### Preprocess raw audio data

In [None]:
# cut a 30 second window out of the middle of every raw waveform

window_samples = 30 * 22050           # 661500 samples
half_window = window_samples // 2     # 330750 samples on each side of the middle

print("30 seconds of the audio at a sample rate of 22050 results in", 30*22050, "elements.")

for key in tqdm(populated_IDS):
    raw = tracks[int(key)]["raw"]
    middle = int(math.ceil(len(raw) / 2)) - 1

    # Clamp the start at 0. A negative start would be read by numpy as an offset
    # from the END of the array, which silently yields a wrong (often tiny) slice
    # for tracks of 30 s or less.
    start = max(middle - half_window, 0)
    window = raw[start:start + window_samples]

    # Zero-pad tracks shorter than 30 s so that every window has the same length;
    # the later np.stack of the MFCCs requires identical shapes.
    missing = window_samples - len(window)
    if missing > 0:
        window = np.pad(window, (0, missing))

    tracks[int(key)]["raw_30s"] = window

In [None]:
# length of one cut window; 30 * 22050 = 661500 samples are expected for a track longer than 30 s
print(len(tracks[1028902]["raw_30s"]))

In [None]:
# even length test: repeat the window computation for one track
# (presumably one with an even number of samples; check the printed length)

print(len(tracks[1028902]["raw"]))

# true division, exactly as in the windowing loop; with `//` the ceil would be a no-op
middle = int(math.ceil(len(tracks[1028902]["raw"]) / 2)) - 1
print(middle)

test = tracks[1028902]["raw"][middle-330750:middle+330750]
print(len(test))

In [None]:
# odd length test: repeat the window computation for one track
# (presumably one with an odd number of samples; check the printed length)

print(len(tracks[1088002]["raw"]))

# True division, exactly as in the windowing loop. With `//` the length is floored
# before ceil runs, so for an odd length the midpoint would be one sample lower
# than the one the windowing loop actually uses.
middle = int(math.ceil(len(tracks[1088002]["raw"]) / 2)) - 1
print(middle)

test = tracks[1088002]["raw"][middle-330750:middle+330750]
print(len(test))

In [None]:
# # create 2D numpy array of raw audio wave arrays

# X_list = [] 

# for key in populated_IDS:
#     X_list.append(tracks[int(key)]["raw_30s"])

# X = np.vstack(X_list)

# print(X)

In [None]:
# Multi-hot encode the mood/theme tags: one row per loaded track, in populated_IDS order.

labels_list = [list(tracks[int(track_key)]["mood/theme"]) for track_key in populated_IDS]

# every distinct tag becomes one 0/1 column; mlb.classes_ gives the column order
mlb = MultiLabelBinarizer()
y_hot = mlb.fit_transform(labels_list)

print(mlb.classes_)
print(y_hot)


### Preprocess MFCC data

In [None]:
# print(tracks[1028902]["mfcc"].shape) # MFCCs have 20 coefficients, number of frames (segment of the audio signal) depends on the length of the track, in this case 10306


In [None]:
# Compute the MFCCs of every 30 s window and keep them in the tracks dictionary.
# Settings: sample rate 22050 Hz, hop of 512 samples between consecutive frames
# (hop_length is the step between frames, not the frame size), 20 coefficients
# (librosa's default n_mfcc), which gives 1292 frames per 30 s window.

for key in tqdm(populated_IDS):
    entry = tracks[int(key)]
    entry["mfcc_30s"] = librosa.feature.mfcc(y=entry["raw_30s"], sr=22050, hop_length=512)

In [None]:
# Layout of one MFCC matrix:
#   rows    -> mel-frequency cepstral coefficients
#   columns -> frames of the audio signal (consecutive frames are 512 samples apart)

sample_mfcc = tracks[1028902]["mfcc_30s"]
print(sample_mfcc[:5])
print(sample_mfcc.shape)
print(tracks[1053502]["mfcc_30s"].shape)

In [None]:
# prints the complete list of loaded track IDs (one string per loaded track)
print(populated_IDS)

In [None]:
# spot check on the first ten tracks: their 30s MFCCs should all have the same
# shape, i.e. the same number of frames

for track_key in populated_IDS[:10]:
    print(f"{track_key}:", tracks[int(track_key)]["mfcc_30s"].shape, sep="\n")

In [None]:
# Collect the 30s MFCCs of all loaded tracks (in populated_IDS order) and stack them
# along a new leading axis -> dimensions (n_samples, n_coeff, n_frames)

mfccs_list = [tracks[int(track_key)]["mfcc_30s"] for track_key in populated_IDS]
mfccs = np.stack(mfccs_list, axis=0)

print(mfccs[:3])
print(mfccs.shape)


To feed the MFCCs into a CNN, the input array needs the following four dimensions, in this order:

1. Number of samples: This is the number of examples we have in our dataset.
1. Number of frames: This is the number of time steps or frames we have for each example.
1. Number of MFCC coefficients: This is the number of MFCC coefficients we have for each time step.
1. Number of channels: This is 1 for grayscale images and 3 for RGB images; the MFCC data has a single channel, so it is 1 here.

In [None]:
# transpose the dimensions of the mfccs array to the order specified above, dimensions (n_samples, n_frames, n_coeff)

# Only swap while the array still has the stacked per-track layout (n_coeff, n_frames).
# Without this guard, re-running the cell would swap the two axes back again.
# (If n_coeff ever equalled n_frames the two layouts could not be told apart.)
if mfccs.shape[1:] == mfccs_list[0].shape:
    mfccs = mfccs.transpose(0, 2, 1)
print(mfccs.shape)

In [None]:
# Min-max scale every MFCC coefficient to the range [0, 1]. The minimum and maximum
# of each coefficient are taken over all samples and all frames.
# NOTE(review): the scaler is fit before the train/validation/test split, so the
# validation and test data influence the scaling (data leakage). Consider fitting
# on the training portion only.

# MinMaxScaler expects 2D input: flatten (n_samples, n_frames) into rows, one column per coefficient
n_coeff = mfccs.shape[-1]
mfccs_2d = mfccs.reshape(-1, n_coeff)

print(mfccs_2d)
print(mfccs_2d.shape)

# scale column-wise, then restore the original 3D layout
scaler = MinMaxScaler()
mfccs_scaled = scaler.fit_transform(mfccs_2d).reshape(mfccs.shape)

print(mfccs_scaled)
print(mfccs_scaled.shape)


In [None]:
# add n_channels = 1 to the MFCCs data, dimensions: (n_samples, n_frames, n_coeff, n_channels)

# Only append the channel axis while the array is still 3D. Without this guard,
# every re-run of the cell would add yet another trailing axis.
if mfccs_scaled.ndim == 3:
    mfccs_scaled = np.expand_dims(mfccs_scaled, axis=-1)
print(mfccs_scaled.shape)

In [None]:
# Split the MFCCs and labels into train : validation : test = 60 : 20 : 20.

split_seed = 42

# step 1: hold out 20 % of all samples as the test set
X_train_, X_test, y_train_, y_test = train_test_split(
    mfccs_scaled, y_hot, test_size=0.2, random_state=split_seed
)

# step 2: a quarter of the remaining 80 % (= 20 % of the total) becomes the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train_, y_train_, test_size=0.25, random_state=split_seed
)

In [None]:
# Show the first sample of every split (plus the shape of the training arrays).
# A trailing "" argument produces the blank line between two sections.
print("X_train:", X_train.shape, X_train[:1], "", sep="\n")
print("X_val:", X_val[:1], "", sep="\n")
print("X_test:", X_test[:1], "", sep="\n")
print("y_train:", y_train.shape, y_train[:1], "", sep="\n")
print("y_val:", y_val[:1], "", sep="\n")
print("y_test:", y_test[:1], sep="\n")

In [None]:
# store the train, validation and test set of 30s MFCCs

# folder that holds the MFCC splits of the full dataset; created if it is missing
output_dir = f"{root_path}/train_test_data_full"
os.makedirs(output_dir, exist_ok=True)

# Map each file name to the array produced by the split cell. The previous
# X_train_mfcc30s, ... names are not defined anywhere in this notebook and
# raised a NameError on a fresh kernel.
splits = {
    "X_train.npy": X_train,
    "X_val.npy": X_val,
    "X_test.npy": X_test,
    "y_train.npy": y_train,
    "y_val.npy": y_val,
    "y_test.npy": y_test,
}

for file_name, array in splits.items():
    np.save(f"{output_dir}/{file_name}", array)