# 토크나이저 학습용 데이터 전처리 (Program_0으로 일관화)

In [49]:
# 시퀀스 파일들 생성
import os
import json
import pretty_midi
from miditok import REMI, TokenizerConfig

# Maps the Korean sikimsae (ornament) label found in the annotation JSON
# to the special token that is written into the REMI sequence.
SIKIM_TOKEN_MAP = {
    "남도 꺾는청": "<SIKIM_NamdoKkeokneuncheong>",
    "남도요성": "<SIKIM_NamdoYoseong>",
    "서도요성": "<SIKIM_SeodoYoseong>",
    "잔요성": "<SIKIM_JanyoSeong>",
    "전성": "<SIKIM_Jeonseong>",
    "퇴성": "<SIKIM_Toeseong>",
}


def get_sikim_ranges(json_path):
    """Return the sikimsae annotations of one label file.

    Reads ``annotation_data_info.single_tonguing_cd`` from the JSON file at
    *json_path* and keeps the entries whose ``annotation_category`` is
    "시김새" and whose ``annotation_name`` is a key of ``SIKIM_TOKEN_MAP``.

    Returns:
        A list of ``(token, start, end)`` tuples in file order, where
        ``start``/``end`` are ``start_time``/``end_time`` converted to float
        (0.0 when the key is absent).
    """
    with open(json_path, "r", encoding="utf-8") as fp:
        payload = json.load(fp)

    annotations = payload.get("annotation_data_info", {}).get("single_tonguing_cd", [])
    return [
        (
            SIKIM_TOKEN_MAP[entry.get("annotation_name")],
            float(entry.get("start_time", 0)),
            float(entry.get("end_time", 0)),
        )
        for entry in annotations
        if entry.get("annotation_category") == "시김새"
        and entry.get("annotation_name") in SIKIM_TOKEN_MAP
    ]

def insert_sikim_tokens(midi_path, json_path, tokenizer, tol=0.1):
    """Tokenize a MIDI file and put sikimsae tokens in front of annotated notes.

    The MIDI file is encoded with *tokenizer*; every ``Program_*`` token is
    rewritten to ``Program_0`` and, for every ``Pitch_*`` token, the token of
    each sikimsae range that contains the matching note onset (widened by
    *tol* seconds on both sides) is inserted immediately before it.

    Args:
        midi_path: Path of the MIDI file to tokenize.
        json_path: Path of the annotation JSON passed to ``get_sikim_ranges``.
        tokenizer: Object whose ``encode(midi_path)`` result exposes ``.tokens``
            as a sequence of token strings.
        tol: Tolerance in seconds applied to both ends of a sikimsae range.

    Returns:
        The new list of token strings.
    """
    sikim_ranges = get_sikim_ranges(json_path)
    midi = pretty_midi.PrettyMIDI(midi_path)
    tokens = tokenizer.encode(midi_path).tokens

    # The n-th Pitch token is paired with the n-th note purely by position, so
    # the onsets have to be in chronological order. Concatenating the
    # per-instrument note lists (as done before) is only chronological for a
    # single track without overlapping notes.
    # NOTE(review): assumes the tokenizer emits Pitch tokens in onset order as
    # one merged stream and keeps every note — confirm for the configured
    # tokenizer (dropped or deduplicated notes would still shift the pairing).
    note_starts = sorted(
        note.start for inst in midi.instruments for note in inst.notes
    )

    new_tokens = []
    note_idx = 0
    for tok in tokens:
        if tok.startswith("Program_"):
            # Force every program to Program_0.
            new_tokens.append("Program_0")
            continue
        if tok.startswith("Pitch_"):
            # Pitch tokens beyond the known notes are passed through untouched.
            if note_idx < len(note_starts):
                onset = note_starts[note_idx]
                new_tokens.extend(
                    sikim_tok
                    for sikim_tok, st, et in sikim_ranges
                    if (st - tol) <= onset <= (et + tol)
                )
            note_idx += 1
        new_tokens.append(tok)
    return new_tokens

if __name__ == "__main__":
    # Input MIDI files, their annotation JSON files, and the output folder
    # for the generated REMI token sequences.
    midi_dir = "/home/wjg980807/ai_music/dataset/ktm/aihub/midi"
    json_dir = "/home/wjg980807/ai_music/dataset/ktm/aihub/json"
    out_dir = "/home/wjg980807/ai_music/dataset/ktm/aihub/remi_with_sikim_program0"
    os.makedirs(out_dir, exist_ok=True)

    # Tokenizer configuration with the sikimsae tokens registered as special
    # tokens.
    # NOTE(review): `program_range` is intended to restrict programs to 0 —
    # confirm that TokenizerConfig actually recognises this keyword.
    config = TokenizerConfig(
        special_tokens=list(SIKIM_TOKEN_MAP.values()),
        program_range=(0, 0),
        use_time_signatures=True,
        use_tempos=True,
        use_programs=True,
    )
    remi_tokenizer = REMI(config)

    for fname in os.listdir(midi_dir):
        if not fname.endswith(".mid"):
            continue
        stem, _ext = os.path.splitext(fname)
        midi_path = os.path.join(midi_dir, fname)
        json_path = os.path.join(json_dir, f"{stem}.json")
        if not os.path.exists(json_path):
            # MIDI files without an annotation file are skipped silently.
            continue
        tokens = insert_sikim_tokens(midi_path, json_path, remi_tokenizer, tol=0.1)
        out_path = os.path.join(out_dir, f"{stem}.remi.txt")
        with open(out_path, "w", encoding="utf-8") as f:
            # One file per piece: the tokens separated by single spaces.
            f.write(" ".join(tokens))
        print(f"✅ {fname} → 시김새+Program_0 REMI 시퀀스 생성")

✅ AP_F07_06534.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ AP_F06_08109.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ BP_FS3_03090.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ BP_CR2_00014.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ BP_FS2_03552.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ AP_E01_02222.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ BP_FS1_00269.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ AP_F06_04670.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ AP_F06_04688.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ AP_E04_02024.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ AP_E01_02554.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ BM_FS1_00285.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ BP_FS2_01517.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ AP_C04_07615.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ AP_E01_03271.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ AP_C09_07721.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ AP_F02_04127.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ AP_F06_04648.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ AP_F02_07249.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ AP_F02_05947.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ AP_E01_04086.mid → 시김새+Program_0 REMI 시퀀스 생성
✅ BM_FS1_0261

# 토크나이저 학습

In [None]:
# MIDI 변환 (시김새 추가)
import os
import mido
import json

# Maps the Korean sikimsae (ornament) label found in the annotation JSON
# to the romanized name written into the MIDI text events.
SIKIM_ENG_MAP = {
    "남도 꺾는청": "NamdoKkeokneuncheong",
    "남도요성": "NamdoYoseong",
    "서도요성": "SeodoYoseong",
    "잔요성": "JanyoSeong",
    "전성": "Jeonseong",
    "퇴성": "Toeseong",
}


def add_sikimsa_text_events(midi_path, json_path, output_path):
    """Write a copy of a MIDI file with ``SIKIM:<name>`` text meta events added.

    For every entry of ``annotation_data_info.single_tonguing_cd`` in the JSON
    file whose ``annotation_category`` is "시김새", a ``text`` meta message
    ``SIKIM:<name>`` is merged into track 0 at the tick that corresponds to
    the entry's ``start_time`` (seconds). Text events are placed before any
    original message that falls on the same tick.

    Seconds are converted to ticks with the first ``set_tempo`` found in
    track 0 (500000 when there is none); later tempo changes are not taken
    into account.

    Args:
        midi_path: Source MIDI file.
        json_path: Annotation JSON file.
        output_path: Where the modified MIDI file is saved.

    Any exception is caught and reported with a ``[ERROR]`` line so that a
    batch run continues with the next file; nothing is returned.
    """
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        sikim_ranges = []
        for item in data.get("annotation_data_info", {}).get("single_tonguing_cd", []):
            if item.get("annotation_category") == "시김새":
                name = item.get("annotation_name")
                # NOTE(review): unmapped labels fall back to the raw (Korean)
                # name — confirm the MIDI writer can encode such text.
                name_eng = SIKIM_ENG_MAP.get(name, name)
                start = float(item.get("start_time", 0))
                sikim_ranges.append((name_eng, start))

        mid = mido.MidiFile(midi_path)
        track = mid.tracks[0]
        tempo = 500000  # fallback when track 0 carries no set_tempo message
        for msg in track:
            if msg.type == 'set_tempo':
                tempo = msg.tempo
                break

        def sec2tick(sec):
            return int(mido.second2tick(sec, mid.ticks_per_beat, tempo))

        # Collect everything as (absolute tick, priority, message). Priority 0
        # (text) sorts before priority 1 (original message) on the same tick.
        # Sorting on (tick, priority) only — never on the message objects —
        # keeps two labels that share a tick from breaking the sort.
        timeline = [
            (max(sec2tick(start_sec), 0), 0, mido.MetaMessage('text', text=f"SIKIM:{name}"))
            for name, start_sec in sikim_ranges
        ]

        abs_tick = 0
        for msg in track:
            abs_tick += msg.time
            # end_of_track is re-created once at the very end so that text
            # events lying beyond the last original message stay inside the
            # track.
            if msg.type != 'end_of_track':
                timeline.append((abs_tick, 1, msg))
        end_tick = abs_tick

        # list.sort is stable: labels keep their annotation order and original
        # messages keep their track order within one tick.
        timeline.sort(key=lambda entry: (entry[0], entry[1]))

        # Convert absolute ticks back to delta times. Each message is copied
        # with its new delta, so an inserted text event sits exactly on its
        # own tick and the following message only carries the remainder.
        new_msgs = []
        last_tick = 0
        for tick, _priority, msg in timeline:
            new_msgs.append(msg.copy(time=tick - last_tick))
            last_tick = tick
        new_msgs.append(
            mido.MetaMessage('end_of_track', time=max(end_tick - last_tick, 0))
        )

        mid.tracks[0] = mido.MidiTrack(new_msgs)
        mid.save(output_path)
        print(f"✅ {output_path} 변환 완료")
    except Exception as e:
        # Deliberate best effort: report the failing file and carry on.
        print(f"[ERROR] {midi_path}: {e}")

if __name__ == "__main__":
    # Source MIDI files, their annotation JSON files, and the folder that
    # receives the MIDI copies with sikimsae text events.
    midi_dir = "/home/wjg980807/ai_music/dataset/ktm/aihub/midi"
    json_dir = "/home/wjg980807/ai_music/dataset/ktm/aihub/json"
    out_dir  = "/home/wjg980807/ai_music/dataset/ktm/aihub/midi_with_sikimsae"
    os.makedirs(out_dir, exist_ok=True)

    for fname in os.listdir(midi_dir):
        if not fname.endswith(".mid"):
            continue
        stem, _ext = os.path.splitext(fname)
        midi_path = os.path.join(midi_dir, fname)
        json_path = os.path.join(json_dir, f"{stem}.json")
        out_path = os.path.join(out_dir, fname)
        if not os.path.exists(json_path):
            # No label file for this MIDI: report it and move on.
            print(f"❌ {json_path} 라벨 없음, 건너뜀")
            continue
        add_sikimsa_text_events(midi_path, json_path, out_path)

✅ /home/wjg980807/ai_music/dataset/ktm/aihub/midi_with_sikimsae/AP_F07_06534.mid 변환 완료
✅ /home/wjg980807/ai_music/dataset/ktm/aihub/midi_with_sikimsae/AP_F06_08109.mid 변환 완료
✅ /home/wjg980807/ai_music/dataset/ktm/aihub/midi_with_sikimsae/BP_FS3_03090.mid 변환 완료
✅ /home/wjg980807/ai_music/dataset/ktm/aihub/midi_with_sikimsae/BP_CR2_00014.mid 변환 완료
✅ /home/wjg980807/ai_music/dataset/ktm/aihub/midi_with_sikimsae/BP_FS2_03552.mid 변환 완료
✅ /home/wjg980807/ai_music/dataset/ktm/aihub/midi_with_sikimsae/AP_E01_02222.mid 변환 완료
✅ /home/wjg980807/ai_music/dataset/ktm/aihub/midi_with_sikimsae/BP_FS1_00269.mid 변환 완료
✅ /home/wjg980807/ai_music/dataset/ktm/aihub/midi_with_sikimsae/AP_F06_04670.mid 변환 완료
✅ /home/wjg980807/ai_music/dataset/ktm/aihub/midi_with_sikimsae/AP_F06_04688.mid 변환 완료
✅ /home/wjg980807/ai_music/dataset/ktm/aihub/midi_with_sikimsae/AP_E04_02024.mid 변환 완료
✅ /home/wjg980807/ai_music/dataset/ktm/aihub/midi_with_sikimsae/AP_E01_02554.mid 변환 완료
✅ /home/wjg980807/ai_music/dataset/ktm/aihu

In [47]:
from miditok import REMI, TokenizerConfig

# Tokenizer configuration: programs, tempos and time signatures enabled, with
# the six sikimsae tokens registered as special tokens.
# NOTE(review): `program_range` is intended to restrict programs to 0 —
# confirm that TokenizerConfig actually recognises this keyword.
config = TokenizerConfig(
    special_tokens=[
        "<SIKIM_NamdoKkeokneuncheong>",
        "<SIKIM_NamdoYoseong>",
        "<SIKIM_SeodoYoseong>",
        "<SIKIM_JanyoSeong>",
        "<SIKIM_Jeonseong>",
        "<SIKIM_Toeseong>",
    ],
    program_range=(0, 0),
    use_time_signatures=True,
    use_tempos=True,
    use_programs=True,
)
tokenizer = REMI(config)

In [63]:
from miditok import REMI, TokenizerConfig
import glob

# 1. Create the tokenizer configuration (same settings as the preprocessing
#    step: programs, tempos, time signatures and the six sikimsae tokens).
config = TokenizerConfig(
    special_tokens=[
        "<SIKIM_NamdoKkeokneuncheong>",
        "<SIKIM_NamdoYoseong>",
        "<SIKIM_SeodoYoseong>",
        "<SIKIM_JanyoSeong>",
        "<SIKIM_Jeonseong>",
        "<SIKIM_Toeseong>",
    ],
    program_range=(0, 0),
    use_time_signatures=True,
    use_tempos=True,
    use_programs=True,
)
tokenizer = REMI(config)

# 2. Build the vocabulary directly from every distinct token that occurs in
#    the generated .remi.txt sequence files.
txt_files = glob.glob("/home/wjg980807/ai_music/dataset/ktm/aihub/remi_with_sikim_program0/*.remi.txt")
all_tokens = set()
for path in txt_files:
    with open(path, "r", encoding="utf-8") as f:
        all_tokens |= set(f.read().split())

# Ids are assigned in sorted token order; the inverse table is built first and
# the forward table derived from it, so both always agree.
# NOTE(review): these are private attributes set from outside the library —
# confirm that the tokenizer (and save_pretrained) actually reads them.
tokenizer._vocab_inv = dict(enumerate(sorted(all_tokens)))
vocab = {tok: idx for idx, tok in tokenizer._vocab_inv.items()}
tokenizer._vocab = vocab

# 3. Save the tokenizer.
tokenizer.save_pretrained("/home/wjg980807/ai_music/dataset/ktm/aihub/tokenizer_sikim_trained")

In [64]:
import json

save_path = "/home/wjg980807/ai_music/dataset/ktm/aihub/tokenizer_sikim_trained/vocab.json"

# Serialize the vocabulary already attached to the tokenizer (tokenizer._vocab)
# and write it out as human-readable, non-ASCII-escaped JSON.
vocab_json = json.dumps(tokenizer._vocab, ensure_ascii=False, indent=4)
with open(save_path, "w", encoding="utf-8") as f:
    f.write(vocab_json)

print(f"✅ vocab.json 저장 완료 → {save_path}")

✅ vocab.json 저장 완료 → /home/wjg980807/ai_music/dataset/ktm/aihub/tokenizer_sikim_trained/vocab.json


# 테스트

In [54]:
# 1. Load one generated REMI sequence as a list of whitespace-separated tokens.
with open("/home/wjg980807/ai_music/dataset/ktm/aihub/remi_with_sikim_program0/BP_FS3_03102.remi.txt", encoding="utf-8") as f:
    tokens = f.read().split()

# 2. Print only the tokens that contain "SIKIM", in sequence order.
print("🎵 Detected SIKIM tokens from .remi.txt:")
for t in filter(lambda tok: "SIKIM" in tok, tokens):
    print("✅", t)


🎵 Detected SIKIM tokens from .remi.txt:
✅ <SIKIM_JanyoSeong>
✅ <SIKIM_JanyoSeong>
✅ <SIKIM_NamdoKkeokneuncheong>
✅ <SIKIM_NamdoKkeokneuncheong>
✅ <SIKIM_NamdoYoseong>
✅ <SIKIM_JanyoSeong>
✅ <SIKIM_JanyoSeong>
✅ <SIKIM_NamdoKkeokneuncheong>
✅ <SIKIM_NamdoKkeokneuncheong>


In [4]:
import os
import mido

midi_dir = "/home/wjg980807/ai_music/dataset/ktm/aihub/midi_with_sikimsae"

# Report, file by file (sorted by name), the "SIKIM:" text meta events found
# in any track of each converted MIDI file.
for fname in sorted(os.listdir(midi_dir)):
    if fname.endswith(".mid"):
        midi_path = os.path.join(midi_dir, fname)
        midi = mido.MidiFile(midi_path)

        sikim_events = [
            msg.text
            for track in midi.tracks
            for msg in track
            if msg.is_meta and msg.type == 'text' and msg.text.startswith("SIKIM:")
        ]

        if not sikim_events:
            print(f"⚠️ {fname} - 시김새 이벤트 없음")
            continue

        print(f"🎯 {fname} - 시김새 이벤트 {len(sikim_events)}개")
        for evt in sikim_events:
            print("   →", evt)

⚠️ AM_C01_08594.mid - 시김새 이벤트 없음
🎯 AM_C03_07798.mid - 시김새 이벤트 2개
   → SIKIM:JanyoSeong
   → SIKIM:JanyoSeong
⚠️ AM_C04_07566.mid - 시김새 이벤트 없음
⚠️ AM_C04_07589.mid - 시김새 이벤트 없음
⚠️ AM_C04_07605.mid - 시김새 이벤트 없음
🎯 AM_C06_07668.mid - 시김새 이벤트 6개
   → SIKIM:JanyoSeong
   → SIKIM:JanyoSeong
   → SIKIM:JanyoSeong
   → SIKIM:JanyoSeong
   → SIKIM:JanyoSeong
   → SIKIM:Toeseong
⚠️ AM_C07_07618.mid - 시김새 이벤트 없음
⚠️ AM_C08_07643.mid - 시김새 이벤트 없음
🎯 AM_C09_02208.mid - 시김새 이벤트 9개
   → SIKIM:JanyoSeong
   → SIKIM:JanyoSeong
   → SIKIM:JanyoSeong
   → SIKIM:Toeseong
   → SIKIM:JanyoSeong
   → SIKIM:JanyoSeong
   → SIKIM:JanyoSeong
   → SIKIM:Toeseong
   → SIKIM:JanyoSeong
🎯 AM_C09_07698.mid - 시김새 이벤트 3개
   → SIKIM:Toeseong
   → SIKIM:Toeseong
   → SIKIM:JanyoSeong
🎯 AM_E01_02255.mid - 시김새 이벤트 9개
   → SIKIM:Toeseong
   → SIKIM:JanyoSeong
   → SIKIM:JanyoSeong
   → SIKIM:JanyoSeong
   → SIKIM:Toeseong
   → SIKIM:JanyoSeong
   → SIKIM:Toeseong
   → SIKIM:JanyoSeong
   → SIKIM:JanyoSeong
🎯 AM_E01_02406.mid -

# 번외

In [None]:
from miditok import REMI, TokenizerConfig
import glob

# The .remi.txt files used below were written with the romanized sikimsae
# tokens (e.g. "<SIKIM_JanyoSeong>"), so the tokenizer must declare those same
# strings as special tokens; the Korean spellings used here before never occur
# in the training data.
# NOTE(review): `program_range` is intended to restrict programs to 0 —
# confirm that TokenizerConfig actually recognises this keyword.
config = TokenizerConfig(
    use_programs=True,
    use_tempos=True,
    use_time_signatures=True,
    program_range=(0, 0),
    special_tokens=[
        "<SIKIM_NamdoKkeokneuncheong>", "<SIKIM_NamdoYoseong>",
        "<SIKIM_SeodoYoseong>", "<SIKIM_JanyoSeong>",
        "<SIKIM_Jeonseong>", "<SIKIM_Toeseong>"
    ]
)
tokenizer = REMI(config)

# Training must use the list of REMI sequence (.remi.txt) files.
# NOTE(review): `iterator` receives file paths here — confirm that
# tokenizer.train() accepts .remi.txt paths in this form.
txt_files = glob.glob("/home/wjg980807/ai_music/dataset/ktm/aihub/remi_with_sikim_program0/*.remi.txt")
tokenizer.train(2048, iterator=txt_files)
tokenizer.save("/home/wjg980807/ai_music/dataset/ktm/aihub/tokenizer_sikim")




