# Preprocessing

Now that we have our structured audio data, let's perform some preprocessing to prepare it for analysis.

Steps:

1. Data Splitting: Divide our dataset into training and testing sets to evaluate our model's performance.
1. Audio Segmentation: Split each 30-second audio clip into 10 segments of 3 seconds each. This will increase the number of data points and potentially improve model performance.

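We don't want segments of the same song to end up in both the training and test sets: the model would effectively be tested on songs it has already heard, which inflates the test score. That is why the songs are split into train and test first, and each clip is segmented only afterwards.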

First construct the audio segmentation function.

In [35]:
import os
import soundfile as sf

def split_mp3(file_path, output_dir, segment_duration=3):
    """Split one audio file into consecutive fixed-length segments.

    The file is read with ``soundfile`` and cut into chunks of
    ``segment_duration`` seconds. Each chunk is written to ``output_dir`` as
    ``<name>_part<k><ext>``, with ``k`` starting at 1 and ``<ext>`` being the
    extension of the input file. Trailing audio that does not fill a whole
    segment is dropped.

    Args:
        file_path: Path of the audio file to split.
        output_dir: Directory that receives the segments; created if missing.
        segment_duration: Segment length in seconds (may be fractional).

    Raises:
        ValueError: If ``segment_duration`` does not cover at least one sample.
    """
    y, sr = sf.read(file_path)

    # Work in whole samples so slice bounds stay integers even when
    # segment_duration is a float such as 1.5.
    samples_per_segment = int(sr * segment_duration)
    if samples_per_segment <= 0:
        raise ValueError("segment_duration must cover at least one sample")

    num_segments = len(y) // samples_per_segment

    # Split off the real extension instead of str.replace('.mp3', ...):
    # replace() leaves names such as 'song.MP3' or 'song.wav' untouched, so
    # every segment would be written to the same output path.
    stem, ext = os.path.splitext(os.path.basename(file_path))

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for i in range(num_segments):
        start_index = i * samples_per_segment
        end_index = start_index + samples_per_segment
        segment = y[start_index:end_index]

        output_file = os.path.join(output_dir, f'{stem}_part{i+1}{ext}')
        sf.write(output_file, segment, sr)

Then the function to split the files into training and testing sets.

In [42]:
import random
import shutil

def split_files(source_dir, destination_dir, train_ratio=0.8, seed=None):
    """Copy every genre's files into ``train`` and ``test`` sub-folders.

    ``source_dir`` is expected to contain one sub-directory per genre. For each
    genre the files are shuffled, the first ``int(n * train_ratio)`` are copied
    to ``<destination_dir>/train/<genre>`` and the rest to
    ``<destination_dir>/test/<genre>``.

    Args:
        source_dir: Directory holding one sub-directory per genre.
        destination_dir: Root of the output tree; created if missing.
        train_ratio: Fraction of each genre's files sent to the train split.
        seed: Optional seed for a reproducible split. With ``None`` the
            module-level ``random`` generator is used.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError("train_ratio must be between 0 and 1")

    os.makedirs(destination_dir, exist_ok=True)
    train_dir = os.path.join(destination_dir, 'train')
    test_dir = os.path.join(destination_dir, 'test')
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # Keep using the global generator unless a seed is given, so an earlier
    # random.seed(...) call still takes effect.
    rng = random if seed is None else random.Random(seed)

    # os.listdir order is arbitrary; sorting makes a seeded run repeatable.
    for genre in sorted(os.listdir(source_dir)):
        genre_dir = os.path.join(source_dir, genre)
        if not os.path.isdir(genre_dir):
            continue

        train_genre_dir = os.path.join(train_dir, genre)
        test_genre_dir = os.path.join(test_dir, genre)
        os.makedirs(train_genre_dir, exist_ok=True)
        os.makedirs(test_genre_dir, exist_ok=True)

        # Only regular files: shutil.copy fails on a nested directory.
        files = sorted(
            name for name in os.listdir(genre_dir)
            if os.path.isfile(os.path.join(genre_dir, name))
        )
        rng.shuffle(files)
        num_train_files = int(len(files) * train_ratio)

        for i, file_name in enumerate(files):
            if i < num_train_files:
                target_dir, other_dir = train_genre_dir, test_genre_dir
            else:
                target_dir, other_dir = test_genre_dir, train_genre_dir

            # A previous run may have put this file in the opposite split;
            # remove that copy so no song ends up in both train and test.
            stale_copy = os.path.join(other_dir, file_name)
            if os.path.isfile(stale_copy):
                os.remove(stale_copy)

            shutil.copy(os.path.join(genre_dir, file_name), target_dir)


Now let's try it out. First, the split into train and test sets:

In [43]:
# Copy the raw songs into train/test folders, genre by genre.
source_dir, destination_dir = 'songs', 'songs_train_test'
split_files(source_dir=source_dir, destination_dir=destination_dir)


Now let's segment the clips

In [44]:
# Segment the test split first, then the train split. Each pair is
# (folder produced by the train/test split, folder receiving the segments).
split_dirs = [
    ("./songs_train_test/test", "./split_songs_v2/test"),
    ("./songs_train_test/train", "./split_songs_v2/train"),
]

for root_dir, output_dir in split_dirs:
    for genre in os.listdir(root_dir):
        genre_dir = os.path.join(root_dir, genre)
        # Skip stray non-directory entries (e.g. .DS_Store); listing them as
        # a genre folder would raise NotADirectoryError.
        if not os.path.isdir(genre_dir):
            continue

        output_genre_dir = os.path.join(output_dir, genre)
        os.makedirs(output_genre_dir, exist_ok=True)

        for file in os.listdir(genre_dir):
            if file.endswith('.mp3'):
                file_path = os.path.join(genre_dir, file)
                split_mp3(file_path, output_genre_dir)

Done! Let's move to the fun part, the model construction ⁺₊🎧✩°｡