# Data Preparation

The dataset contains 22 pieces by Frédéric Chopin: 11 Polonaises and 11 Waltzes. The audio files, in mp3 format, were downloaded from https://archive.org/details/musopen-chopin.

For spectrogram creation it is necessary to prepare samples that are 4 seconds long.

In [22]:
# Working directories (relative paths) for each stage of the data preparation.
mp3_path = "data/mp3"  # source mp3 recordings
wav_path = "data/wav"  # recordings converted to wav
samples_path = "data/samples"  # fixed-length wav samples cut from the recordings

In [23]:
import os

def list_files(directory_path):
    """Print the name of every entry found in ``directory_path``.

    A header line is printed first, followed by one entry name per line in
    the order returned by ``os.listdir``. If the directory cannot be read
    (missing, not a directory, permission denied, ...), a single error line
    is printed instead and nothing is raised.

    Args:
        directory_path: Path of the directory to list.
    """
    # Only the directory read is guarded, and only against OS-level failures,
    # so unrelated programming errors are not silently reported as
    # "listing" problems.
    try:
        files = os.listdir(directory_path)
    except OSError as e:
        print(f"Error listing files in '{directory_path}': {e}")
        return

    print(f"Files in '{directory_path}':")
    for file in files:
        print(file)

# Show which mp3 recordings are available before any processing starts.
list_files(mp3_path)


Files in 'data/mp3':
Waltz Op. 69 no. 2 in B minor.mp3
PolonaiseB.5InAFlatMajor.mp3
PolonaiseOp.71No.2InBFlatMajor.mp3
Waltz B. 133 in E flat major.mp3
PolonaiseOp.53InAFlatMajorheroic.mp3
Waltz no.8 - Op.64 no.3_ Ab-major.mp3
Waltz Op. 69 no. 1 in A flat major.mp3
PolonaiseOp.26No.2.mp3
PolonaiseOp.26No.1.mp3
Waltz Op. 64 no. 1 in D flat major.mp3
PolonaiseOp.71No.1InDMinor.mp3
Waltz no. 18 - op. posth: Eb-major.mp3
Waltz Op. 64 no. 2 in C sharp minor.mp3
PolonaiseB.6InG-sharpMinor.mp3
PolonaiseB.36InG-flatMajor.mp3
Waltz Op. 70 no. 1 in G flat major.mp3
Polonaise no.5 - Op.40 no.2_ C-minor.mp3
Waltz Op. 70 no. 3 in D flat major.mp3
PolonaiseB.1InGMinor.mp3
Waltz Op. 70 no. 2 in F minor.mp3
PolonaiseB.13InBFlatMinor.mp3
Waltz no. 19 - op. posth: A minor.mp3


### Converting mp3 to wav

In [24]:
from pydub import AudioSegment
import os

def convert_mp3_to_wav(mp3_file, output_folder):
    """Decode one mp3 file and write it as a wav file into ``output_folder``.

    The output keeps the base name of ``mp3_file`` with its extension
    replaced by ``.wav``. This function does not create ``output_folder``.

    Args:
        mp3_file: Path of the mp3 file to convert.
        output_folder: Directory that receives the wav file.
    """
    decoded = AudioSegment.from_mp3(mp3_file)

    # "<folder>/<name>.mp3" -> "<output_folder>/<name>.wav"
    source_name = os.path.basename(mp3_file)
    stem, _extension = os.path.splitext(source_name)
    target_path = os.path.join(output_folder, stem + ".wav")

    decoded.export(target_path, format="wav")

def convert_all_mp3_to_wav(input_folder, output_folder):
    """Convert every ``.mp3`` file directly inside ``input_folder`` to wav.

    ``output_folder`` is created if it does not exist yet. Entries whose name
    does not end in the lowercase suffix ``.mp3`` are ignored.

    Args:
        input_folder: Directory containing the mp3 files.
        output_folder: Directory that receives the converted wav files.
    """
    os.makedirs(output_folder, exist_ok=True)

    mp3_names = (name for name in os.listdir(input_folder) if name.endswith(".mp3"))
    for name in mp3_names:
        convert_mp3_to_wav(os.path.join(input_folder, name), output_folder)

# Convert every mp3 recording in `mp3_path` into a wav file under `wav_path`.
input_directory = mp3_path
output_directory = wav_path

convert_all_mp3_to_wav(input_directory, output_directory)


### Creating 4s samples

In [25]:
def cut_wav_into_samples(wav_file, output_folder, sample_duration=4000):
    """Split one wav file into consecutive, equally long samples.

    Each sample is exported to ``output_folder`` as ``<name>_<k>.wav`` where
    ``<name>`` is the base name of ``wav_file`` without extension and ``k``
    counts from 1. Only complete samples are written: audio left over at the
    end that is shorter than ``sample_duration`` is discarded.

    Args:
        wav_file: Path of the wav file to cut.
        output_folder: Directory that receives the samples (not created here).
        sample_duration: Length of each sample in milliseconds.
    """
    recording = AudioSegment.from_wav(wav_file)

    # Floor division drops the incomplete tail of the recording.
    complete_chunks = len(recording) // sample_duration

    for number in range(1, complete_chunks + 1):
        stop_ms = number * sample_duration
        start_ms = stop_ms - sample_duration
        chunk = recording[start_ms:stop_ms]

        stem, _extension = os.path.splitext(os.path.basename(wav_file))
        target_path = os.path.join(output_folder, f"{stem}_{number}.wav")
        chunk.export(target_path, format="wav")

def cut_all_wav_into_samples(input_folder, output_folder, sample_duration=4000):
    """Cut every ``.wav`` file directly inside ``input_folder`` into samples.

    ``output_folder`` is created if it does not exist yet. Entries whose name
    does not end in the lowercase suffix ``.wav`` are ignored.

    Args:
        input_folder: Directory containing the wav recordings.
        output_folder: Directory that receives all samples.
        sample_duration: Length of each sample in milliseconds.
    """
    os.makedirs(output_folder, exist_ok=True)

    for entry in os.listdir(input_folder):
        if not entry.endswith(".wav"):
            continue
        source_path = os.path.join(input_folder, entry)
        cut_wav_into_samples(source_path, output_folder, sample_duration)


# Cut every wav recording in `wav_path` into fixed-length samples stored
# under `samples_path`.
input_directory = wav_path
output_directory = samples_path
sample_duration_ms = 4000  # 4 seconds
    
cut_all_wav_into_samples(input_directory, output_directory, sample_duration_ms)


In [15]:
# Listen to one generated sample as a sanity check.
# NOTE(review): this file name still contains spaces, so the cell only works
# before the samples are renamed (spaces -> underscores) further below.
import IPython
IPython.display.Audio('data/samples/Waltz Op. 70 no. 3 in D flat major_42.wav')

### Renaming the files

In [26]:
def normalize_filename(filename):
    """Return ``filename`` with every space character replaced by ``_``.

    Args:
        filename: File name to normalize.

    Returns:
        The normalized name; all other characters are left unchanged.
    """
    # Splitting on a single space keeps empty pieces, so runs of spaces turn
    # into runs of underscores.
    return "_".join(filename.split(" "))

def normalize_files(directory_path):
    """Rename every regular file in ``directory_path`` to its normalized name.

    The new name of each file is obtained from ``normalize_filename``.
    Sub-directories are left untouched, and files whose name is already
    normalized are skipped.

    All renames are planned before the first one is carried out, so a name
    collision is detected while the directory is still unmodified.

    Args:
        directory_path: Directory whose files are renamed in place.

    Raises:
        FileExistsError: If a normalized name is already taken by another
            entry, or two files would receive the same normalized name.
            ``os.rename`` would otherwise silently replace the existing file
            on POSIX systems.
    """
    planned = []
    claimed_names = set()

    # Sorted so that the file named in a collision error is deterministic.
    for name in sorted(os.listdir(directory_path)):
        old_filepath = os.path.join(directory_path, name)
        if not os.path.isfile(old_filepath):
            continue

        new_filename = normalize_filename(name)
        if new_filename == name:
            # Already normalized: nothing to do.
            continue

        new_filepath = os.path.join(directory_path, new_filename)
        if new_filename in claimed_names or os.path.lexists(new_filepath):
            raise FileExistsError(
                f"Cannot rename '{old_filepath}' to '{new_filepath}': "
                "target name is already in use"
            )

        claimed_names.add(new_filename)
        planned.append((old_filepath, new_filepath))

    for old_filepath, new_filepath in planned:
        os.rename(old_filepath, new_filepath)

# Replace the spaces in the names of all generated samples with underscores
# (the directory argument is required; there is no default).
normalize_files(samples_path)


## Creating data folds

In [27]:
import shutil
from sklearn.model_selection import KFold
import glob

def create_and_move_files(data_directory, output_base_directory, fold_indices, *, subset="train"):
    """Copy the WAV files selected by each fold split into ``Fold_<i>`` folders.

    The ``*.wav`` files directly inside ``data_directory`` are collected and
    sorted by path; the indices in ``fold_indices`` refer to positions in that
    sorted list. For the i-th split (counting from 1) the selected files are
    copied (not moved) into ``<output_base_directory>/Fold_<i>``, which is
    created when missing.

    Args:
        data_directory: Directory containing the WAV samples.
        output_base_directory: Directory under which the fold folders are
            created.
        fold_indices: Iterable of ``(train_index, test_index)`` pairs, e.g.
            ``list(KFold(...).split(files))``. It must have been computed for
            as many items as there are WAV files in ``data_directory``.
        subset: ``"train"`` (default) copies the files listed in
            ``train_index`` of each split; ``"test"`` copies those listed in
            ``test_index``.

    Raises:
        ValueError: If ``subset`` is neither ``"train"`` nor ``"test"``.

    NOTE(review): with the default ``subset="train"`` every fold folder holds
    the training portion of its split, so the folders overlap heavily. Pass
    ``subset="test"`` if mutually disjoint folds are intended.
    """
    if subset not in ("train", "test"):
        raise ValueError(f"subset must be 'train' or 'test', got {subset!r}")

    # The files are looked up here instead of being read from a module-level
    # variable, so the function also works outside the notebook. glob returns
    # matches in arbitrary order; sorting fixes the index -> file mapping.
    sample_files = sorted(glob.glob(os.path.join(data_directory, "*.wav")))

    for i, (train_index, test_index) in enumerate(fold_indices, start=1):
        # Create directory for the current fold
        fold_directory = os.path.join(output_base_directory, f"Fold_{i}")
        os.makedirs(fold_directory, exist_ok=True)

        # Copy the selected files into the fold directory
        selected = train_index if subset == "train" else test_index
        for index in selected:
            src_file = sample_files[index]
            dst_file = os.path.join(fold_directory, os.path.basename(src_file))
            shutil.copy(src_file, dst_file)


# Assuming your WAV files are in a directory
# Directory holding the prepared WAV samples
data_directory = samples_path

# Get a list of all WAV files in the directory
# (glob.glob returns the matches in arbitrary order)
wav_files = glob.glob(os.path.join(data_directory, "*.wav"))

# Number of folds (in this case, 10 folds)
n_folds = 10

# Shuffled K-fold splitter; the fixed random_state makes the index split repeatable
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Materialise the (train_index, test_index) pair of every fold
fold_indices = list(kfold.split(wav_files))

# Output base directory
output_base_directory = "data/samples_by_fold"

# Create one Fold_<i> directory per split and copy the selected samples into it
create_and_move_files(data_directory, output_base_directory, fold_indices)
