# Module to Clean Up Current MusicMIDI Metadata

In [1]:
import os
import boto3
import joblib
import mido
import miditoolkit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from pathlib import Path
from tqdm.notebook import tqdm

import dotenv
dotenv.load_dotenv()

sns.set()

In [2]:
# Root directory of the dataset; used as the prefix when building dataset
# file paths in this notebook
DATA_DIR = '../dataset'

In [3]:
# Load the cleaned MIDI metadata table.
#   - the first CSV column is read as the index ...
#   - "chord_progressions" is stored as the text form of a Python literal,
#     so it is parsed back into Python objects with literal_eval
musicMIDI_meta = pd.read_csv(
    f"{DATA_DIR}/midi_metadata_file_cleaned.csv",
    converters={"chord_progressions": literal_eval},
    index_col=[0],
)
#   - ... and then moved back into a regular column
musicMIDI_meta = musicMIDI_meta.reset_index()

# preview the first rows
musicMIDI_meta.head()

Unnamed: 0,audio_key,pitch_range,num_measures,bpm,genre,track_roll,inst,sample_rhythm,time_signature,min_velocity,max_velocity,split_data,id,chord_progressions,track_role,inst_mapping,updated_inst,updated_genre,genre_mapping
0,cmajor,unknown,10,192,electronic,unknown,dulcimer,unknown,2/4,64,87,train,0004806f96307e317d116040af5b7861-11,"[[Am, Am, C, C, F, F, Am, Am, B, B, G, G, B, B...",unknown,3,guitar,electronic,3
1,fmajor,unknown,7,112,electronic,unknown,brass_section,unknown,2/4,127,127,train,0004806f96307e317d116040af5b7861-12,"[[F, F, F, F, A, A, A, A, C, C, G, G, F, F, F,...",unknown,5,brass,electronic,3
2,fmajor,unknown,12,163,electronic,unknown,percussive_organ,unknown,2/4,31,85,train,0004806f96307e317d116040af5b7861-13,"[[Dm, Dm, Dm, Dm, F, F, F, F, B, B, G, G, Dm, ...",unknown,0,keyboard,electronic,3
3,aminor,unknown,11,179,electronic,unknown,lead_square,unknown,2/4,27,113,train,0004806f96307e317d116040af5b7861-3,"[[C, C, G, G, A, A, C, C, A, A, G, G, G, G, C,...",unknown,1,accordian,electronic,3
4,dminor,unknown,7,112,electronic,unknown,lead_square,unknown,2/4,126,126,train,0004806f96307e317d116040af5b7861-5,"[[D, D, D, D, F, F, A, A, A, A, G, G, D, D, D,...",unknown,1,accordian,electronic,3


In [4]:
# Flag rows whose minimum velocity is negative.
# A vectorized column comparison is used instead of a row-by-row apply:
# same boolean result, without calling a Python lambda once per row.
musicMIDI_meta['neg_min_velocity'] = musicMIDI_meta['min_velocity'] < 0

# Flag rows whose maximum velocity is negative.
musicMIDI_meta['neg_max_velocity'] = musicMIDI_meta['max_velocity'] < 0

# Show how many rows fall under each flag value.
musicMIDI_meta['neg_min_velocity'].value_counts(), musicMIDI_meta['neg_max_velocity'].value_counts()

(neg_min_velocity
 False    61237
 Name: count, dtype: int64,
 neg_max_velocity
 False    61237
 Name: count, dtype: int64)

In [5]:
# Flag rows where the min and max velocity are the same value.
# Vectorized column comparison instead of a row-by-row apply.
musicMIDI_meta['min_max_velocity_same'] = (
    musicMIDI_meta['min_velocity'] == musicMIDI_meta['max_velocity']
)
musicMIDI_meta['min_max_velocity_same'].value_counts()

min_max_velocity_same
False    40239
True     20998
Name: count, dtype: int64

In [6]:
# Flag rows where the min velocity is greater than the max velocity
# (an inconsistent record). Vectorized column comparison instead of a
# row-by-row apply.
musicMIDI_meta['min_velocity_greater_than_max_velocity'] = (
    musicMIDI_meta['min_velocity'] > musicMIDI_meta['max_velocity']
)
musicMIDI_meta['min_velocity_greater_than_max_velocity'].value_counts()

min_velocity_greater_than_max_velocity
False    61237
Name: count, dtype: int64

In [10]:
def extract_min_max_velocity(midi_file_path):
    """
        Function to extract the minimum and maximum note velocity from a given MIDI file.

        Only 'note_on' messages with a velocity greater than 0 are considered:
        by MIDI convention a 'note_on' with velocity 0 acts as a note-off, so it
        does not describe how hard a note was played.

        Input: MIDI file path
        Output: (minimum velocity, maximum velocity), or (None, None) when the
                file cannot be read or contains no sounding notes
    """
    try:
        mid = mido.MidiFile(midi_file_path)

        # collect the velocity of every sounding note across all tracks
        velocity_data = [
            msg.velocity
            for track in mid.tracks
            for msg in track
            if msg.type == 'note_on' and msg.velocity > 0
        ]

    except Exception as e:
        # best effort: an unreadable or corrupt file must not stop a batch run
        print(f"Error: {e}")
        return None, None

    # Explicit empty check. Previously the lowest distinct velocity was always
    # dropped, which discarded the real minimum when the file had no
    # velocity-0 messages and failed on files with a single distinct velocity.
    if not velocity_data:
        print(f"Error: no note_on messages with velocity > 0 in {midi_file_path}")
        return None, None

    return min(velocity_data), max(velocity_data)


In [11]:
# For MIDI files whose recorded min and max velocity are the same,
# re-extract both values from the raw MIDI file.
#   - if extraction fails, or the re-extracted values are still equal,
#     the file is ignored
#   - otherwise [id, min_velocity, max_velocity] is collected as an update
# Select the flagged ids up front so only those rows are visited and the
# progress bar has a meaningful total.
same_velocity_ids = musicMIDI_meta.loc[musicMIDI_meta['min_max_velocity_same'], 'id']

temp_data = []
for midi_id in tqdm(same_velocity_ids, total=len(same_velocity_ids)):
    midi_file_path = f'{DATA_DIR}/processed_musicMIDI/raw/{midi_id}.mid'
    min_velocity, max_velocity = extract_min_max_velocity(midi_file_path)

    # extract_min_max_velocity returns (None, None) when it cannot produce values
    if min_velocity is None or max_velocity is None:
        continue

    if min_velocity != max_velocity:
        temp_data.append([midi_id, min_velocity, max_velocity])


# preview the first few collected updates
temp_data[:5]

0it [00:00, ?it/s]

[]