In [1]:
pip install protobuf==3.20.*


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.8 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
import glob
import os

import librosa
import librosa.display

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

import music21
import mido
from mido import MidiFile, MidiTrack, Message

In [4]:
# Get a list of csv files
csv_files = glob.glob("data/csv/*.csv")

# Get a list of wav files
audio_files = glob.glob("data/wav/*.wav")

wav_file_names = {file.split("/")[-1].split(".")[0] for file in audio_files}

all_mfccs = []
all_labels = []

for csv_file in csv_files:
    
    csv_file_name = csv_file.split("/")[-1].split(".")[0]
    
    if csv_file_name in wav_file_names:
        # Load the CSV file that contains columns 'start', 'end', 'pitch', 'syllable'
        df = pd.read_csv(csv_file)
        
        # Load and process each audio file 
        audio_file = f'data/wav/{csv_file_name}.wav'  
        # y is the audio time series, sr is the sampling rate (None to keep the original rate)
        y, sr = librosa.load(audio_file, sr=None) 

        # Iterate over each row in the DataFrame to process each audio file
        for index, row in df.iterrows():
            onset = row['start']
            offset = row['end']
            pitch = row['pitch']
            syllable = row['syllable']
           
            # Convert time in seconds to an equivalent number of audio samples
            start_sample = int(onset * sr)
            end_sample = int(offset * sr)
            audio_segment = y[start_sample:end_sample]
            mfccs = librosa.feature.mfcc(y=audio_segment, sr=sr, n_mfcc=13)
            
            # Aggregate the MFCCs for the segment by taking the mean 
            avg_mfccs = np.mean(mfccs, axis=1)
            # Ground truth labels 
            all_mfccs.append(avg_mfccs)  
            # Omit 'syllable' as the focus is on predicting the pitch
            all_labels.append([onset, offset, pitch]) 

# Convert lists to numpy arrays
all_mfccs = np.array(all_mfccs)
all_labels = np.array(all_labels)

print(all_mfccs)
print(all_labels)



[[-3.62460114e+02  1.00009148e+02 -2.60470276e+01 ... -2.87805271e+00
   1.93404222e+00 -1.60931358e+01]
 [-3.48974030e+02  1.19040260e+02 -6.35902023e+00 ...  9.98298407e-01
  -3.70218897e+00 -1.28588181e+01]
 [-3.77557922e+02  1.49558945e+02 -3.74861479e-01 ...  7.16310382e-01
  -2.34762383e+00 -2.11644688e+01]
 ...
 [-3.14909302e+02  9.83111115e+01 -5.40003853e+01 ... -1.68301463e+00
   6.84126329e+00 -2.31120739e+01]
 [-3.90784760e+02  1.14711784e+02 -2.34533329e+01 ... -5.43308783e+00
   3.70930886e+00 -2.73066101e+01]
 [-3.65752960e+02  7.83080368e+01 -3.62409782e+01 ... -1.18469381e+01
   1.63539067e-01 -2.70787601e+01]]
[[ 2.1984  2.4922 62.    ]
 [ 2.5883  2.957  62.    ]
 [ 2.9922  3.2016 62.    ]
 ...
 [73.7356 74.2529 64.    ]
 [74.2529 74.4396 67.    ]
 [74.5115 75.9195 65.    ]]


In [5]:
print(all_mfccs.shape)
print(all_labels.shape)

(8484, 13)
(8484, 3)


In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(all_mfccs, all_labels, test_size=0.2, random_state=42)

In [7]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape features for LSTM input (n_samples, n_timesteps, n_features)
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])

In [8]:
# Print shapes 
print("X_train shape:", X_train_reshaped.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test_reshaped.shape)
print("y_test shape:", y_test.shape)

X_train shape: (6787, 1, 13)
y_train shape: (6787, 3)
X_test shape: (1697, 1, 13)
y_test shape: (1697, 3)


In [9]:
# Build the LSTM model 
model = Sequential()
# Each input sample is treated as a sequence of one timestep 
# Each timestep in the input data has 13 different features
model.add(LSTM(50, return_sequences=True, input_shape=(1, X_train_reshaped.shape[2])))
model.add(Dropout(0.2))  # Dropout layer for regularization 
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(50, activation='relu'))  
model.add(Dense(1))

# Compile the model
model.compile(loss='mse', optimizer='adam')

# Train the model
model.fit(X_train_reshaped, y_train, validation_data=(X_test_reshaped, y_test),epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7facf8ffc340>

In [10]:
loss = model.evaluate(X_test_reshaped, y_test)
print(f"Test loss: {loss}")

Test loss: 727.662841796875


In [20]:
output_directory = "midi-output/"

def audio_to_midi(audio_file_path, model, scaler, max_frames=100):
    # Load and process the audio file
    y, sr = librosa.load(audio_file_path, sr=None)
    print(y, sr)
    audio_duration = len(y) / sr  # Total duration of the audio file in seconds
    print(audio_file_path, audio_duration)

    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfccs_scaled = scaler.transform(mfccs.T)
    print(mfccs_scaled)

    frame_duration = audio_duration / mfccs_scaled.shape[0]  # Duration of each frame
    
    mid = MidiFile(ticks_per_beat=480)
    track = MidiTrack()
    mid.tracks.append(track)
    
    last_note_start_ticks = 0
    batch_size = 32  
    num_frames = min(mfccs_scaled.shape[0], max_frames)

    for start in range(0, num_frames, batch_size):
        end = min(start + batch_size, num_frames)
        batch_mfccs = mfccs_scaled[start:end]
        batch_mfccs_reshaped = batch_mfccs.reshape(batch_mfccs.shape[0], 1, -1)
        batch_predictions = model.predict(batch_mfccs_reshaped)
        
        for i, prediction in enumerate(batch_predictions):

            pitch = int(prediction[0])
            frame = start + i
            start_time = frame * frame_duration
            end_time = (frame + 1) * frame_duration

            note_start_ticks = int(start_time * mid.ticks_per_beat)
            note_end_ticks = int(end_time * mid.ticks_per_beat)
            duration_ticks = note_end_ticks - note_start_ticks

            note_start_ticks = max(note_start_ticks, last_note_start_ticks + 1)
            last_note_start_ticks = note_start_ticks

            track.append(Message('note_on', note=pitch, velocity=64, time=note_start_ticks))
            track.append(Message('note_off', note=pitch, velocity=64, time=note_start_ticks + duration_ticks))

    # Save the MIDI file
    audio_file_name = os.path.basename(audio_file_path).replace('.wav', '.mid')
    midi_file_path = os.path.join(output_directory, audio_file_name)
    mid.save(midi_file_path)
    print(f"MIDI file saved: {midi_file_path}")

# Loop through each WAV file to generate a MIDI file for it
for audio_file in audio_files:
    audio_to_midi(audio_file, model, scaler)

[-1.5258789e-05  0.0000000e+00  1.5258789e-05 ...  0.0000000e+00
  0.0000000e+00  0.0000000e+00] 44100
data/wav/en016b.wav 70.58823129251701
[[-5.595192   -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.595192   -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.595192   -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 ...
 [-5.595192   -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.595192   -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.595192   -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]]
MIDI file saved: midi-output/en016b.mid
[-1.5258789e-05  0.0000000e+00  1.5258789e-05 ... -1.5258789e-05
  0.0000000e+00 -1.5258789e-05] 44100
data/wav/en020a.wav 44.0
[[-5.77039    -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.77039    -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.77039    -3.5817902   1.3706504 

MIDI file saved: midi-output/en026b.mid
[-1.5258789e-05  0.0000000e+00  1.5258789e-05 ...  1.5258789e-05
  1.5258789e-05  1.5258789e-05] 44100
data/wav/en006b.wav 88.00002267573696
[[-5.192293   -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.192293   -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.192293   -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 ...
 [-5.192293   -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.192293   -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.192293   -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]]
MIDI file saved: midi-output/en006b.mid
[-1.5258789e-05  0.0000000e+00  1.5258789e-05 ...  1.5258789e-05
  0.0000000e+00 -1.5258789e-05] 44100
data/wav/en010b.wav 135.6521768707483
[[-4.632592   -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-4.632592   -3.5817902   1.3706504  ...  0.09520672  0.03423878

[[-5.4691124  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.4691124  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.4691124  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 ...
 [-5.4691124  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.4691124  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.4691124  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]]
MIDI file saved: midi-output/en012a.mid
[-1.5258789e-05  0.0000000e+00  1.5258789e-05 ...  1.5258789e-05
  0.0000000e+00  0.0000000e+00] 44100
data/wav/en004a.wav 64.0
[[-5.6989     -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.6989     -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.6989     -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 ...
 [-5.6989     -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.6989  

MIDI file saved: midi-output/en029b.mid
[-1.5258789e-05  0.0000000e+00  1.5258789e-05 ...  0.0000000e+00
  0.0000000e+00  0.0000000e+00] 44100
data/wav/en009b.wav 70.15385487528344
[[-4.7591352  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-4.7591352  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-4.7591352  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 ...
 [-4.7591352  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-4.7591352  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-4.7591352  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]]
MIDI file saved: midi-output/en009b.mid
[-1.5258789e-05  0.0000000e+00  1.5258789e-05 ... -1.5258789e-05
  0.0000000e+00  0.0000000e+00] 44100
data/wav/en005a.wav 73.84616780045351
[[-5.421398   -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.421398   -3.5817902   1.3706504  ...  0.09520672  0.03423878

[[-5.8828535  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.8828535  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.8828535  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 ...
 [-5.8828535  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.8828535  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.8828535  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]]
MIDI file saved: midi-output/en002a.mid
[-1.5258789e-05  0.0000000e+00  1.5258789e-05 ...  1.5258789e-05
  0.0000000e+00  1.5258789e-05] 44100
data/wav/en003a.wav 62.608707482993196
[[-5.0689635  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.0689635  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 [-5.0689635  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801 ]
 ...
 [-5.0689635  -3.5817902   1.3706504  ...  0.09520672  0.03423878
   2.1661801

In [21]:
import subprocess

# Get a list of all midi files
midi_files = glob.glob("midi-output/*.mid")

for midi_file in midi_files:
    midi_file_name = midi_file.split("/")[-1].split(".")[0]
    midi_stream = music21.converter.parse(midi_file)

    # Convert MIDI file into a music21 Score object
    score = music21.stream.Score()
    for part in midi_stream.parts:
        # Instead of inserting at position 0, insert at the end of the score
        score.append(part)

    # Save it as a PDF 
    ly_file_path = f"lilypond-output/{midi_file_name}.ly"
    score.write('lilypond', ly_file_path)
    
    # Read the generated LilyPond file
    with open(ly_file_path, 'r') as file:
        ly_content = file.read()

    # Insert custom LilyPond settings for layout and paper
    layout_settings = '\\layout {\n' \
                      '  \\context {\n' \
                      '    \\Score\n' \
                      '    \\override SystemSeparator.break-visibility = ##(#f #f #f)\n' \
                      '    ragged-bottom = ##f\n' \
                      '    ragged-last-bottom = ##f\n' \
                      '  }\n' \
                      '}\n'
    
    paper_settings = '\\paper {\n' \
                     '  min-systems-per-page = #5\n' \
                     '  system-system-spacing.padding = #2\n' \
                     '  system-system-spacing.stretchability = #15\n' \
                     '}\n'

    # Add the custom settings to the LilyPond content
    ly_content = ly_content.replace('\\layout {', layout_settings) + paper_settings

    # Write the modified content back to the LilyPond file
    with open(ly_file_path, 'w') as file:
        file.write(ly_content)

    # Using subprocess (requires LilyPond installed on your system):
    output_file_name = f"sheet-music-output/{midi_file_name}_sheet"
    subprocess.run(['lilypond', '--pdf', '-o', output_file_name, ly_file_path])  # Convert to PDF
    print("Sheet music saved:", output_file_name)

Sheet music saved: sheet-music-output/en006a_sheet
Sheet music saved: sheet-music-output/en010a_sheet
Sheet music saved: sheet-music-output/en026b_sheet
Sheet music saved: sheet-music-output/en006b_sheet
Sheet music saved: sheet-music-output/en010b_sheet
Sheet music saved: sheet-music-output/en030a_sheet
Sheet music saved: sheet-music-output/en026a_sheet
Sheet music saved: sheet-music-output/en027a_sheet
Sheet music saved: sheet-music-output/en011b_sheet
Sheet music saved: sheet-music-output/en007b_sheet
Sheet music saved: sheet-music-output/en027b_sheet
Sheet music saved: sheet-music-output/en011a_sheet
Sheet music saved: sheet-music-output/en007a_sheet
Sheet music saved: sheet-music-output/en016b_sheet
Sheet music saved: sheet-music-output/en020a_sheet
Sheet music saved: sheet-music-output/en016a_sheet
Sheet music saved: sheet-music-output/en020b_sheet
Sheet music saved: sheet-music-output/en021b_sheet
Sheet music saved: sheet-music-output/en017a_sheet
Sheet music saved: sheet-music-