# Preprocessing for PianoBear Transcriber

# In [1]:
import os
import mido
import librosa
import numpy as np
import pandas as pd
import json

# MIDI data processing function
def process_midi(file_path):
    """Collect every note-on / note-off message from a MIDI file.

    Args:
        file_path: Path of the MIDI file; passed to ``mido.MidiFile``.

    Returns:
        A list of dicts, one per ``note_on`` / ``note_off`` message, with the
        keys ``'note'``, ``'velocity'``, ``'time'`` and ``'type'``. Messages
        are listed track by track, in the order they occur inside each track.
    """
    wanted_types = ['note_on', 'note_off']
    collected = []
    for track in mido.MidiFile(file_path).tracks:
        collected.extend(
            {
                'note': message.note,
                'velocity': message.velocity,
                # NOTE(review): for messages read from a track this is
                # presumably mido's delta time (in ticks), not an absolute
                # timestamp - confirm before using it as an onset time.
                'time': message.time,
                'type': message.type,
            }
            for message in track
            if message.type in wanted_types
        )
    return collected

# Audio data processing function
def process_audio(file_path):
    """Compute the magnitude spectrogram of an audio file.

    Args:
        file_path: Path of the audio file; passed to ``librosa.load``.

    Returns:
        The element-wise absolute value of ``librosa.stft`` applied to the
        loaded signal, using librosa's default STFT parameters.
    """
    # sr=None: use the file's original sample rate. The rate itself is not
    # needed afterwards, so it is discarded.
    waveform, _ = librosa.load(file_path, sr=None)
    stft_result = librosa.stft(waveform)
    return np.abs(stft_result)

# Load the metadata and set up the file paths
def load_metadata(base_path):
    """Load the MAESTRO metadata JSON and attach full file paths.

    Args:
        base_path: Directory that contains ``maestro-v3.0.0.json``. The
            ``midi_filename`` / ``audio_filename`` entries are joined onto
            this directory.

    Returns:
        A ``pandas.DataFrame`` built from the JSON content, with two extra
        columns: ``midi_path`` and ``audio_path``.

    Raises:
        FileNotFoundError: If the metadata file does not exist.
        KeyError: If the metadata lacks ``midi_filename`` or
            ``audio_filename``.
    """
    metadata_file = os.path.join(base_path, 'maestro-v3.0.0.json')
    # Read as UTF-8 explicitly: without it the platform's locale encoding is
    # used, which can mis-decode or reject non-ASCII text in the metadata.
    with open(metadata_file, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    metadata_df = pd.DataFrame(metadata)

    def to_full_path(relative_name):
        return os.path.join(base_path, relative_name)

    metadata_df['midi_path'] = metadata_df['midi_filename'].apply(to_full_path)
    metadata_df['audio_path'] = metadata_df['audio_filename'].apply(to_full_path)
    return metadata_df

# Process the data in batches and save each batch
def process_and_save_batch(metadata_df, save_dir, batch_size=10):
    """Process the dataset in fixed-size batches and save one file per batch.

    Each row's MIDI file goes through ``process_midi`` and its audio file
    through ``process_audio``. The results of a batch are written to
    ``processed_maestro_data_<first>_<last>.npy`` inside ``save_dir``, where
    ``<first>``/``<last>`` are the positional row indices of the batch.

    Args:
        metadata_df: DataFrame with ``midi_path`` and ``audio_path`` columns.
        save_dir: Directory the batch files are written to; it is created if
            it does not exist yet.
        batch_size: Number of rows per saved file; must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Reject bad sizes explicitly: zero would fail inside range() anyway and a
    # negative value would otherwise silently process nothing.
    if batch_size < 1:
        raise ValueError(f'batch_size must be at least 1, got {batch_size}')

    # Create the output directory before any heavy work, so a missing
    # directory cannot cost a whole processed batch at save time.
    os.makedirs(save_dir, exist_ok=True)

    total_rows = len(metadata_df)
    for start_index in range(0, total_rows, batch_size):
        end_index = min(start_index + batch_size, total_rows)
        batch = metadata_df.iloc[start_index:end_index]

        processed_data = [
            {
                'midi_notes': process_midi(midi_path),
                'audio_spectrogram': process_audio(audio_path),
            }
            for midi_path, audio_path in zip(batch['midi_path'], batch['audio_path'])
        ]

        # Save the batch. The list of dicts is stored as a 1-D object array,
        # which NumPy pickles; reading it back needs
        # np.load(..., allow_pickle=True).
        batch_file_name = f'processed_maestro_data_{start_index}_{end_index - 1}.npy'
        np.save(
            os.path.join(save_dir, batch_file_name),
            np.array(processed_data, dtype=object),
        )
        print(f'Batch from {start_index} to {end_index - 1} processed and saved.')

# Run the pipeline
# base_path is the directory load_metadata reads 'maestro-v3.0.0.json' from and
# joins the MIDI/audio file names onto.
base_path = 'maestro_data/maestro-v3.0.0'
metadata_df = load_metadata(base_path)
# Batch files are written to the parent 'maestro_data' directory.
save_dir = 'maestro_data'
os.makedirs(save_dir, exist_ok=True)
process_and_save_batch(metadata_df, save_dir, batch_size=50)  # set the batch size


# Output of the cell above: KeyboardInterrupt