# Notebook Reserved for Preparing Data for the Training Pipeline

In [14]:
import os
import shutil
import boto3
import joblib
import mido
import miditoolkit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from pathlib import Path
from tqdm.notebook import tqdm

import dotenv
# load variables from a .env file into the process environment
dotenv.load_dotenv()

# apply seaborn's default theme to subsequent matplotlib plots
sns.set()

Python-dotenv could not parse statement starting at line 3


In [15]:
# root folder of the dataset; every input and output path in this notebook
# is built relative to it
DATA_DIR = '../dataset'

In [16]:
# load the cleaned metaMIDI metadata table and preview it
metadata_csv = f"{DATA_DIR}/midi_metadata_file_cleaned.csv"
musicMIDI_meta = pd.read_csv(metadata_csv)
musicMIDI_meta

Unnamed: 0,audio_key,pitch_range,num_measures,bpm,genre,track_roll,inst,sample_rhythm,time_signature,min_velocity,max_velocity,split_data,id,chord_progressions,track_role
0,cmajor,unknown,10,192,electronic,unknown,dulcimer,unknown,2/4,64,87,unknown,0004806f96307e317d116040af5b7861_11,"[['Am', 'Am', 'C', 'C', 'F', 'F', 'Am', 'Am', ...",unknown
1,fmajor,unknown,7,112,electronic,unknown,brass_section,unknown,2/4,127,127,unknown,0004806f96307e317d116040af5b7861_12,"[['F', 'F', 'F', 'F', 'A', 'A', 'A', 'A', 'C',...",unknown
2,fmajor,unknown,12,163,electronic,unknown,percussive_organ,unknown,2/4,31,85,unknown,0004806f96307e317d116040af5b7861_13,"[['Dm', 'Dm', 'Dm', 'Dm', 'F', 'F', 'F', 'F', ...",unknown
3,dminor,unknown,19,109,electronic,unknown,synthstrings_1,unknown,2/4,85,96,unknown,0004806f96307e317d116040af5b7861_2,"[['Dm', 'Dm', 'Dm', 'Dm', 'F', 'F', 'Am', 'Am'...",unknown
4,aminor,unknown,11,179,electronic,unknown,lead_1_square,unknown,2/4,27,113,unknown,0004806f96307e317d116040af5b7861_3,"[['C', 'C', 'G', 'G', 'A', 'A', 'C', 'C', 'A',...",unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62057,cminor,unknown,22,104,folk,unknown,tango_accordion,unknown,4/4,70,95,unknown,fff824db4e363d902648ca4869a2b682_2,"[['C', 'C', 'C', 'C', 'E', 'E', 'E', 'E', 'C',...",unknown
62058,cminor,unknown,15,130,folk,unknown,tango_accordion,unknown,4/4,70,95,unknown,fff824db4e363d902648ca4869a2b682_3,"[['C', 'C', 'C', 'C', 'E', 'E', 'E', 'E', 'G',...",unknown
62059,fmajor,unknown,9,231,classical,unknown,flute,unknown,4/4,79,90,unknown,fffdf64e3d6ff25c5b1a38b485f336c3_1,"[['C', 'C', 'C', 'C', 'F', 'F', 'F', 'F', 'C',...",unknown
62060,fmajor,unknown,10,193,classical,unknown,piccolo,unknown,4/4,55,85,unknown,fffdf64e3d6ff25c5b1a38b485f336c3_2,"[['F', 'F', 'F', 'F', 'B', 'B', 'B', 'B', 'C',...",unknown


In [17]:
# hold out a random 10% of the rows as the validation set
# (fixed random_state so the same rows are picked on every run)
musicMIDI_meta_val = (
    musicMIDI_meta
    .sample(frac=.10, random_state=4678)
    .assign(split_data='val')
)

# every row that was not sampled becomes training data
train_part = (
    musicMIDI_meta
    .drop(musicMIDI_meta_val.index)
    .assign(split_data='train')
)

# stack train rows first, then val rows, and renumber the index from 0
musicMIDI_meta = pd.concat([train_part, musicMIDI_meta_val]).reset_index(drop=True)
musicMIDI_meta

Unnamed: 0,audio_key,pitch_range,num_measures,bpm,genre,track_roll,inst,sample_rhythm,time_signature,min_velocity,max_velocity,split_data,id,chord_progressions,track_role
0,cmajor,unknown,10,192,electronic,unknown,dulcimer,unknown,2/4,64,87,train,0004806f96307e317d116040af5b7861_11,"[['Am', 'Am', 'C', 'C', 'F', 'F', 'Am', 'Am', ...",unknown
1,fmajor,unknown,7,112,electronic,unknown,brass_section,unknown,2/4,127,127,train,0004806f96307e317d116040af5b7861_12,"[['F', 'F', 'F', 'F', 'A', 'A', 'A', 'A', 'C',...",unknown
2,dminor,unknown,19,109,electronic,unknown,synthstrings_1,unknown,2/4,85,96,train,0004806f96307e317d116040af5b7861_2,"[['Dm', 'Dm', 'Dm', 'Dm', 'F', 'F', 'Am', 'Am'...",unknown
3,aminor,unknown,11,179,electronic,unknown,lead_1_square,unknown,2/4,27,113,train,0004806f96307e317d116040af5b7861_3,"[['C', 'C', 'G', 'G', 'A', 'A', 'C', 'C', 'A',...",unknown
4,dminor,unknown,10,193,electronic,unknown,acoustic_bass,unknown,2/4,95,100,train,0004806f96307e317d116040af5b7861_4,"[['D', 'D', 'D', 'D', 'F', 'F', 'F', 'F', 'B',...",unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62057,fmajor,unknown,7,173,pop,unknown,synth_bass_2,unknown,4/4,107,127,val,624ed3c241f36205215b9c4b728b6e80_6,"[['A', 'A', 'A', 'A', 'G', 'G', 'G', 'G', 'A',...",unknown
62058,eminor,unknown,15,120,rock,unknown,acoustic_guitar_steel,unknown,4/4,47,95,val,74d212c7c57c98761756d13aa771888b_3,"[['C#', 'C#', 'C#', 'C#', 'F#', 'F#', 'F#', 'F...",unknown
62059,fmajor,unknown,14,54,blues,unknown,honky_tonk_piano,unknown,4/4,96,96,val,8c86c6e469fd191e753ffd7639ce4ea4_4,"[['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C',...",unknown
62060,dmajor,unknown,10,202,soul,unknown,honky_tonk_piano,unknown,4/4,15,115,val,1aa90de63c0c2dd1117684a1822cb5f8_1,"[['D', 'D', 'D', 'D', 'G', 'G', 'G', 'G', 'G',...",unknown


In [18]:
# Copy every MIDI file into the folder of its split, renaming
# `<id with underscores>.mid` to `<id with hyphens>.mid`, and mirror that
# rename in the metadata `id` column.
#
# NOTE: ids are rewritten in place, so re-running this cell without first
# re-running the cells above will look for the hyphenated names in the source
# folder and report every file as missing.
full_data_dir = f"{DATA_DIR}/processed_musicMIDI/raw_updated"
train_dir = f"{DATA_DIR}/metaMIDI_midi_cv_split_5/train/raw"
val_dir = f"{DATA_DIR}/metaMIDI_midi_cv_split_5/val/raw"

# create the folder if not exist
Path(train_dir).mkdir(parents=True, exist_ok=True)
Path(val_dir).mkdir(parents=True, exist_ok=True)

# destination folder for each value of the split_data column
split_dirs = {'train': train_dir, 'val': val_dir}

# original ids of the rows whose file could not be copied
failed_ids = []

# iterrows() has no length, so pass `total` explicitly to get a real progress bar
for i, row in tqdm(musicMIDI_meta.iterrows(), total=len(musicMIDI_meta), desc="copying MIDI files"):
    target_dir = split_dirs.get(row['split_data'])
    if target_dir is None:
        # rows with any other split label are left untouched
        continue
    try:
        # replace underscore with hyphen in the file name
        new_id = row['id'].replace('_', '-')
        shutil.copy(f"{full_data_dir}/{row['id']}.mid", f"{target_dir}/{new_id}.mid")
    except (OSError, AttributeError) as e:
        # OSError: missing/unreadable source file; AttributeError: non-string id.
        # Keep going (best effort), but remember the row so the failure is visible.
        print(e)
        failed_ids.append(row['id'])
        continue
    # only rename the id once the file really exists under the new name
    musicMIDI_meta.loc[i, 'id'] = new_id

if failed_ids:
    print(f"{len(failed_ids)} of {len(musicMIDI_meta)} files could not be copied; their ids were left unchanged")


0it [00:00, ?it/s]

In [19]:
# write the metadata table to disk without the index column, then preview it
split_csv_path = f"{DATA_DIR}/midi_metadata_file_cleaned_cv_split_5.csv"
musicMIDI_meta.to_csv(split_csv_path, index=False)
musicMIDI_meta.head()

Unnamed: 0,audio_key,pitch_range,num_measures,bpm,genre,track_roll,inst,sample_rhythm,time_signature,min_velocity,max_velocity,split_data,id,chord_progressions,track_role
0,cmajor,unknown,10,192,electronic,unknown,dulcimer,unknown,2/4,64,87,train,0004806f96307e317d116040af5b7861-11,"[['Am', 'Am', 'C', 'C', 'F', 'F', 'Am', 'Am', ...",unknown
1,fmajor,unknown,7,112,electronic,unknown,brass_section,unknown,2/4,127,127,train,0004806f96307e317d116040af5b7861-12,"[['F', 'F', 'F', 'F', 'A', 'A', 'A', 'A', 'C',...",unknown
2,dminor,unknown,19,109,electronic,unknown,synthstrings_1,unknown,2/4,85,96,train,0004806f96307e317d116040af5b7861-2,"[['Dm', 'Dm', 'Dm', 'Dm', 'F', 'F', 'Am', 'Am'...",unknown
3,aminor,unknown,11,179,electronic,unknown,lead_1_square,unknown,2/4,27,113,train,0004806f96307e317d116040af5b7861-3,"[['C', 'C', 'G', 'G', 'A', 'A', 'C', 'C', 'A',...",unknown
4,dminor,unknown,10,193,electronic,unknown,acoustic_bass,unknown,2/4,95,100,train,0004806f96307e317d116040af5b7861-4,"[['D', 'D', 'D', 'D', 'F', 'F', 'F', 'F', 'B',...",unknown
