# xml -> annotation

In [30]:
DATA_PATH=f"../data"
MODEL_PATH=f"../model"
IMAGE_PATH=f"../images/"

DATA_FEATURE_PATH=f"{DATA_PATH}/processed-feature"
DATA_RAW_PATH=f"{DATA_PATH}/raw"
DATA_TEST_PATH=f"{DATA_PATH}/test"

PrIMuS="PrIMuS"
package_aa = "package_aa_short"


xml_path = f'{DATA_RAW_PATH}/{PrIMuS}/Rock-ver/Rock-ver.xml'

# 학습!!!

In [31]:
import torch
import torch.nn as nn

from timm.models.vision_transformer import VisionTransformer
from timm.models.vision_transformer_hybrid import HybridEmbed
from timm.models.resnetv2 import ResNetV2
from timm.models.layers import StdConv2dSame
from einops import repeat

class CustomVisionTransformer(VisionTransformer):
    def __init__(self, img_size, patch_size=16, *args, **kwargs):
        super(CustomVisionTransformer, self).__init__(img_size=img_size, patch_size=patch_size, *args, **kwargs)
        self.height, self.width = img_size
        self.patch_size = patch_size

    def forward_features(self, x):
        B, c, h, w = x.shape
        x = self.patch_embed(x)

        cls_tokens = self.cls_token.expand(B, -1, -1) 
        x = torch.cat((cls_tokens, x), dim=1)
        h, w = h//self.patch_size, w//self.patch_size
        pos_emb_ind = repeat(torch.arange(h)*(self.width//self.patch_size-w), 'h -> (h w)', w=w)+torch.arange(h*w)
        pos_emb_ind = torch.cat((torch.zeros(1), pos_emb_ind+1), dim=0).long()
        x += self.pos_embed[:, pos_emb_ind]
        x = self.pos_drop(x)

        for blk in self.blocks:
            x = blk(x)

        x = self.norm(x)
        return x

def get_encoder(args):
    backbone_layers = list(args.backbone_layers)
    backbone = ResNetV2(
        layers=backbone_layers, num_classes=0, global_pool='', in_chans=args.channels,
        preact=False, stem_type='same', conv_layer=StdConv2dSame)
    min_patch_size = 2**(len(backbone_layers)+1)

    def embed_layer(**x):
        ps = x.pop('patch_size', min_patch_size)
        assert ps % min_patch_size == 0 and ps >= min_patch_size, 'patch_size needs to be multiple of %i with current backbone configuration' % min_patch_size
        return HybridEmbed(**x, patch_size=ps//min_patch_size, backbone=backbone)

    encoder = CustomVisionTransformer(img_size=(args.max_height, args.max_width),
                                      patch_size=args.patch_size,
                                      in_chans=args.channels,
                                      num_classes=0,
                                      embed_dim=args.encoder_dim,
                                      depth=args.encoder_depth,
                                      num_heads=args.encoder_heads,
                                      embed_layer=embed_layer,
                                      global_pool=""
                                      )
    return encoder

In [32]:
from math import ceil

import torch
import torch.nn as nn
import torch.nn.functional as F
from x_transformers.x_transformers import AttentionLayers, TokenEmbedding, AbsolutePositionalEmbedding, Decoder

class ScoreTransformerWrapper(nn.Module):
    def __init__(
        self,
        num_note_tokens,
        num_rhythm_tokens,
        num_pitch_tokens,
        num_lift_tokens,
        max_seq_len,
        attn_layers,
        emb_dim,
        l2norm_embed = False
    ):
        super().__init__()
        assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder'

        dim = attn_layers.dim
        self.max_seq_len = max_seq_len
        self.l2norm_embed = l2norm_embed
        self.lift_emb = TokenEmbedding(emb_dim, num_lift_tokens, l2norm_embed = l2norm_embed)
        self.pitch_emb = TokenEmbedding(emb_dim, num_pitch_tokens, l2norm_embed = l2norm_embed)
        self.rhythm_emb = TokenEmbedding(emb_dim, num_rhythm_tokens, l2norm_embed = l2norm_embed)
        self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len, l2norm_embed = l2norm_embed)

        self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity()
        self.attn_layers = attn_layers
        self.norm = nn.LayerNorm(dim)
        self.init_()

        self.to_logits_lift = nn.Linear(dim, num_lift_tokens)
        self.to_logits_pitch = nn.Linear(dim, num_pitch_tokens)
        self.to_logits_rhythm = nn.Linear(dim, num_rhythm_tokens)
        self.to_logits_note = nn.Linear(dim, num_note_tokens)

    def init_(self):
        if self.l2norm_embed:
            nn.init.normal_(self.lift_emb.emb.weight, std = 1e-5)
            nn.init.normal_(self.pitch_emb.emb.weight, std = 1e-5)
            nn.init.normal_(self.rhythm_emb.emb.weight, std = 1e-5)
            nn.init.normal_(self.pos_emb.emb.weight, std = 1e-5)
            return

        nn.init.kaiming_normal_(self.lift_emb.emb.weight)
        nn.init.kaiming_normal_(self.pitch_emb.emb.weight)
        nn.init.kaiming_normal_(self.rhythm_emb.emb.weight)

    def forward(
        self,
        rhythms,
        pitchs,
        lifts,
        mask = None,
        return_hiddens = True,
        **kwargs
    ):
        x = self.rhythm_emb(rhythms) + self.pitch_emb(pitchs) + self.lift_emb(lifts) + self.pos_emb(rhythms)
        x = self.project_emb(x)
        x, hiddens = self.attn_layers(x, mask = mask, return_hiddens = return_hiddens, **kwargs)
        # select_hiddens = hiddens[0][3]
        
        x = self.norm(x)

        out_lifts = self.to_logits_lift(x)
        out_pitchs = self.to_logits_pitch(x)
        out_rhythms = self.to_logits_rhythm(x)
        out_notes = self.to_logits_note(x)
        return out_rhythms, out_pitchs, out_lifts, out_notes, x

def top_k(logits, thres = 0.9):
    k = ceil((1 - thres) * logits.shape[-1])
    val, ind = torch.topk(logits, k)
    probs = torch.full_like(logits, float('-inf'))
    probs.scatter_(1, ind, val)
    return probs

class ScoreDecoder(nn.Module):
    def __init__(self, transoformer, noteindexes, num_rhythmtoken, ignore_index = -100, pad_value = 0):
        super().__init__()
        self.pad_value = pad_value
        self.ignore_index = ignore_index

        self.net = transoformer
        self.max_seq_len = transoformer.max_seq_len

        note_mask = torch.zeros(num_rhythmtoken)
        note_mask[noteindexes] = 1
        self.note_mask = nn.Parameter(note_mask)

    @torch.no_grad()
    def generate(self, start_tokens, nonote_tokens, seq_len, eos_token = None, temperature = 1., filter_thres = 0.9, min_p_pow=2.0, min_p_ratio=0.02, **kwargs):
        device = start_tokens.device
        was_training = self.net.training
        num_dims = len(start_tokens.shape)

        if num_dims == 1:
            start_tokens = start_tokens[None, :]

        b, t = start_tokens.shape

        self.net.eval()
        out_rhythm = start_tokens
        out_pitch = nonote_tokens
        out_lift = nonote_tokens
        mask = kwargs.pop('mask', None)

        if mask is None:
            mask = torch.full_like(out_rhythm, True, dtype=torch.bool, device=out_rhythm.device)

        for _ in range(seq_len):
            mask = mask[:, -self.max_seq_len:]
            x_lift = out_lift[:, -self.max_seq_len:]
            x_pitch = out_pitch[:, -self.max_seq_len:]
            x_rhymthm = out_rhythm[:, -self.max_seq_len:]
            
            rhythmsp, pitchsp, liftsp, notesp, _ = self.net(x_rhymthm, x_pitch, x_lift,  mask=mask, **kwargs)
            
            filtered_lift_logits = top_k(liftsp[:, -1, :], thres = filter_thres)
            filtered_pitch_logits = top_k(pitchsp[:, -1, :], thres = filter_thres)
            filtered_rhythm_logits = top_k(rhythmsp[:, -1, :], thres = filter_thres)

            lift_probs = F.softmax(filtered_lift_logits / temperature, dim=-1)
            pitch_probs = F.softmax(filtered_pitch_logits / temperature, dim=-1)
            rhythm_probs = F.softmax(filtered_rhythm_logits / temperature, dim=-1)
            
            lift_sample = torch.multinomial(lift_probs, 1)
            pitch_sample = torch.multinomial(pitch_probs, 1)
            rhythm_sample = torch.multinomial(rhythm_probs, 1)

            out_lift = torch.cat((out_lift, lift_sample), dim=-1)
            out_pitch = torch.cat((out_pitch, pitch_sample), dim=-1)
            out_rhythm = torch.cat((out_rhythm, rhythm_sample), dim=-1)
            mask = F.pad(mask, (0, 1), value=True)

            if eos_token is not None and (torch.cumsum(out_rhythm == eos_token, 1)[:, -1] >= 1).all():
                break

        out_lift = out_lift[:, t:]
        out_pitch = out_pitch[:, t:]
        out_rhythm = out_rhythm[:, t:]

        if num_dims == 1:
            out = out.squeeze(0)

        self.net.train(was_training)
        return out_rhythm, out_pitch, out_lift

    def forward(self, rhythms, pitchs, lifts,notes, **kwargs):
        liftsi = lifts[:, :-1]
        liftso = lifts[:, 1:]
        pitchsi = pitchs[:, :-1]
        pitchso = pitchs[:, 1:]
        rhythmsi = rhythms[:, :-1]
        rhythmso = rhythms[:, 1:]
        noteso = notes[:, 1:]

        mask = kwargs.get('mask', None)
        if mask is not None and mask.shape[1] == rhythms.shape[1]:
            mask = mask[:, :-1]
            kwargs['mask'] = mask

        rhythmsp, pitchsp, liftsp, notesp, x = self.net(rhythmsi, pitchsi, liftsi, **kwargs) 
        
        loss_consist = self.calConsistencyLoss(rhythmsp, pitchsp, liftsp,notesp)
        loss_rhythm = F.cross_entropy(rhythmsp.transpose(1, 2), rhythmso, ignore_index = self.ignore_index)
        loss_pitch = F.cross_entropy(pitchsp.transpose(1, 2), pitchso, ignore_index = self.ignore_index)
        loss_lift = F.cross_entropy(liftsp.transpose(1, 2), liftso, ignore_index = self.ignore_index)
        loss_note = F.cross_entropy(notesp.transpose(1, 2), noteso, ignore_index = self.ignore_index)
        
        return dict(
            loss_rhythm=loss_rhythm,
            loss_pitch=loss_pitch,
            loss_lift=loss_lift,
            loss_consist=loss_consist,
            loss_note = loss_note
        )

    def calConsistencyLoss(self, rhythmsp, pitchsp, liftsp,notesp, gamma=10):
        notesp_soft = torch.softmax(notesp, dim=2)
        note_flag = notesp_soft[:,:,1]
        rhythmsp_soft = torch.softmax(rhythmsp, dim=2)
        rhythmsp_note = torch.sum(rhythmsp_soft * self.note_mask, dim=2)

        pitchsp_soft = torch.softmax(pitchsp, dim=2)
        pitchsp_note = torch.sum(pitchsp_soft[:,:,1:], dim=2)

        liftsp_soft = torch.softmax(liftsp, dim=2)
        liftsp_note = torch.sum(liftsp_soft[:,:,1:], dim=2)
        
        loss = gamma * (F.l1_loss(rhythmsp_note, note_flag) + 
                        F.l1_loss(note_flag, liftsp_note) + 
                        F.l1_loss(note_flag, pitchsp_note)) / 3.
        return loss
        
def get_decoder(args):
    return ScoreDecoder(
        ScoreTransformerWrapper(
            num_note_tokens=args.num_note_tokens,
            num_rhythm_tokens=args.num_rhythm_tokens,
            num_pitch_tokens=args.num_pitch_tokens,
            num_lift_tokens=args.num_lift_tokens,
            max_seq_len=args.max_seq_len,
            emb_dim=args.decoder_dim,
            attn_layers=Decoder(
                dim=args.decoder_dim,
                depth=args.decoder_depth,
                heads=args.decoder_heads,
                **args.decoder_args
            )),
        pad_value=args.pad_token,
        num_rhythmtoken = args.num_rhythmtoken,
        noteindexes = args.noteindexes)

In [33]:
import torch
import torch.nn as nn


class TrOMR(nn.Module):
    def __init__(self, args):
        super().__init__()
        self.encoder = get_encoder(args)
        self.decoder = get_decoder(args)
        self.args = args

    def forward(self, inputs, rhythms_seq, pitchs_seq, 
                lifts_seq, note_seq, mask, **kwargs):
        
        encoded = self.encoder(inputs)
        loss = self.decoder(rhythms_seq, pitchs_seq, 
                            lifts_seq, note_seq, 
                            context=encoded, mask=mask, **kwargs)
        return loss

    @torch.no_grad()
    def generate(self, x: torch.Tensor, temperature: float = 0.25):
        start_token = (
                torch.LongTensor([self.args.bos_token]*len(x))[:, None]
            ).to(x.device)
        nonote_token = (
                torch.LongTensor([self.args.nonote_token]*len(x))[:, None]
            ).to(x.device)

        out_lift, out_pitch, out_rhythm = self.decoder.generate(
            start_token, nonote_token , self.args.max_seq_len,
            eos_token=self.args.eos_token, context=self.encoder(x), 
            temperature=temperature)
        
        return out_lift, out_pitch, out_rhythm


In [34]:
from datetime import datetime
import os

import cv2
import torch
import numpy as np
import albumentations as alb
from albumentations.pytorch import ToTensorV2

from transformers import PreTrainedTokenizerFast
from einops import rearrange, reduce, repeat

# from model import TrOMR
import torch.optim as optim

from torch.utils.data import TensorDataset  # 텐서데이터셋
from torch.utils.data import DataLoader  # 데이터로더
from tqdm import tqdm

import sys
import logging

class StaffToScore(object):
    def __init__(self, args):
        self.args = args
        self.size_h = args.max_height
        self.size_w = args.max_width
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = TrOMR(args)
        self.model.load_state_dict(torch.load(args.filepaths.checkpoint), strict=True)
        # self.model=torch.load(args.filepaths.checkpoint)
        self.model.to(self.device)

        self.lifttokenizer = PreTrainedTokenizerFast(
            tokenizer_file=args.filepaths.lifttokenizer
        )
        self.pitchtokenizer = PreTrainedTokenizerFast(
            tokenizer_file=args.filepaths.pitchtokenizer
        )
        self.rhythmtokenizer = PreTrainedTokenizerFast(
            tokenizer_file=args.filepaths.rhythmtokenizer
        )
        self.notetokenizer = PreTrainedTokenizerFast(
            tokenizer_file=args.filepaths.notetokenizer
        )
        self.transform = alb.Compose(
            [
                alb.ToGray(always_apply=True),
                alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
                ToTensorV2(),
            ]
        )

    

    def readimg(self, path):
        img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
        # img = cv2.imread(path)
        # print(f"1 -- resize 전")
        # print(img.shape)

        if img.shape[-1] == 4:
            img = 255 - img[:, :, 3]
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
        elif img.shape[-1] == 3:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        else:
            raise RuntimeError("Unsupport image type!")
        

        h, w, c = img.shape

        # 이미지의 가로세로 비율 계산
        ratio = min(self.size_w / w, self.size_h / h)

        # 이미지를 self.size_w 또는 self.size_h에 맞춰서 resize
        resized_image = cv2.resize(img, (int(w * ratio), int(h * ratio)))

        # 만약 세로 길이가 self.size_h를 넘는다면, 다시 self.size_h에 맞춰서 resize
        if resized_image.shape[0] > self.size_h:
            ratio = self.size_h / resized_image.shape[0]
            resized_image = cv2.resize(resized_image, (int(resized_image.shape[1] * ratio), self.size_h))

        img = resized_image

        # h, w, c = img.shape
        # new_h = self.size_h
        # new_w = int(self.size_h / h * w)
        # new_w = new_w // self.args.patch_size * self.args.patch_size
        # img = cv2.resize(img, (new_w, new_h))

        # 이미지 고정 크기로 맞춰야해서...
        top_pad = (self.size_h - img.shape[0]) // 2
        bottom_pad = self.size_h - img.shape[0] - top_pad
        left_pad = (self.size_w - img.shape[1]) // 2
        right_pad = self.size_w - img.shape[1] - left_pad
        img = np.pad(img, ((top_pad, bottom_pad), (left_pad, right_pad), (0, 0)), mode='constant', constant_values=255)

        # print(img)
        # print(img.shape)

        if "000051760-1_1_1" in path:
            cv2.imwrite(f"resize-000051760-1_1_1.png", img)
        img = self.transform(image=img)["image"][:1]

        # print(f"2 -- resize 후")
        # print(img.shape)
        # print(img.dtype)
        return img

    # def preprocessing(self, rgb):
    #     patches = rearrange(
    #         rgb,
    #         "b c (h s1) (w s2) -> b (h w) (s1 s2 c)",
    #         s1=self.args.patch_size,
    #         s2=self.args.patch_size,
    #     )
    #     return patches
    def preprocessing(self, rgb):
        h, w, c = rgb.shape
        new_h = self.size_h
        new_w = int(self.size_h / h * w)
        new_w = new_w // self.args.patch_size * self.args.patch_size
        img = cv2.resize(rgb, (new_w, new_h))
        img = self.transform(image=img)["image"][:1]
        return img

    def detokenize(self, tokens, tokenizer):
        toks = [tokenizer.convert_ids_to_tokens(tok) for tok in tokens]
        for b in range(len(toks)):
            for i in reversed(range(len(toks[b]))):
                if toks[b][i] is None:
                    toks[b][i] = ""
                toks[b][i] = toks[b][i].replace("Ġ", " ").strip()
                if toks[b][i] in (["[BOS]", "[EOS]", "[PAD]"]):
                    del toks[b][i]
        return toks
    
    def entokenize(self, state, tokens, tokenizer):
        result=[]
        for tok in tokens:
            # toks = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tok))
            toks = tokenizer.encode_plus(tok)["input_ids"]
            # print(f"--{state} encode plus: {tokenizer.encode_plus(tok)}")
            # print("-- toks len: ", len(toks))
            result.append(toks)
        return torch.tensor(result)

    def all_entokenize(self, lift_y_list, pitch_y_list, rhythm_y_list, note_y_list):
        token_lift = self.entokenize("lift", lift_y_list, self.lifttokenizer)
        token_pitch = self.entokenize("pitch",pitch_y_list, self.pitchtokenizer)
        token_rhythm = self.entokenize("rhythm",rhythm_y_list, self.rhythmtokenizer)
        token_note = self.entokenize("note",note_y_list, self.notetokenizer)
        return token_lift, token_pitch, token_rhythm, token_note
    

    def train_model(self, input_seq, lift_seq, pitchs_seq, rhythms_seq, note_seq, mask_seq):
        # 역전파 계산
        optimizer = optim.Adam(self.model.parameters(), lr=0.001)

        # 데이터 준비 및 학습 반복
        num_epochs = 5
        batch_size = 32

        # 로그 설정
        log_date=datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        log_path=f"../src/workspace/log/train-log_img2score_epoch{num_epochs}-piano-{log_date}.log"
        logging.basicConfig(filename=log_path, level=logging.INFO, format='%(asctime)s : %(message)s', datefmt='%Y-%m-%d %H:%M:%S')


        # for tt in range(rhythms_seq.size()[0]):
        #     logging.info(f'리듬은??: {rhythms_seq[tt]}')
        #     logging.info(f'리프트는??: {lift_seq[tt]}')
        #     logging.info(f'피치는??: {pitchs_seq[tt]}')
        #     logging.info(f'마스크는??: {mask_seq[tt]}')


        for epoch in range(num_epochs):
            for i in tqdm(range(0, len(input_seq), batch_size)):
                # input = input_seq.to(self.device)
                # lift = lift_seq.to(self.device)
                # pitch = pitchs_seq.to(self.device)
                # rhythm = rhythms_seq.to(self.device)
                # note = note_seq.to(self.device)
                # mask = mask_seq.to(self.device)

                max_data_len=min(len(input_seq), i+batch_size)

                inputs_batch = input_seq[i:max_data_len].to(self.device)
                lift_seq_batch = lift_seq[i:max_data_len].to(self.device)
                pitchs_seq_batch = pitchs_seq[i:max_data_len].to(self.device)
                rhythms_seq_batch = rhythms_seq[i:max_data_len].to(self.device)
                note_seq_batch = note_seq[i:max_data_len].to(self.device)
                mask_batch = mask_seq[i:max_data_len].to(self.device)

                outputs = self.model.forward(inputs_batch, rhythms_seq_batch, pitchs_seq_batch, lift_seq_batch, note_seq_batch, mask=mask_batch)

                # λ = 0.1 and β = 1.0.
                # LTrOMR = λLce + βLcon
                alpha = 0.1
                beta = 1.0
                loss = alpha*(outputs['loss_rhythm'] + outputs['loss_pitch'] + outputs['loss_lift']) + beta*outputs['loss_consist']

                # 역전파 단계
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # 훈련 과정을 출력
                print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")
                print(f"-- consist Loss({outputs['loss_consist']:.4f}) | rhythm Loss({outputs['loss_rhythm']:.4f}) | pitch Loss({outputs['loss_pitch']:.4f}) | lift loss({outputs['loss_lift']:.4f})")

                # 로그 기록
                logging.info(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")
                logging.info(f"-- consist Loss({outputs['loss_consist']:.4f}) | rhythm Loss({outputs['loss_rhythm']:.4f}) | pitch Loss({outputs['loss_pitch']:.4f}) | lift loss({outputs['loss_lift']:.4f})")
            

        # 체크포인트 저장
        datet=datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        checkpoint_path = f'../src/workspace/checkpoints/img2score_epoch{num_epochs}-piano-{datet}.pth'
        torch.save(self.model.state_dict(), checkpoint_path)
        print(f'--!! saved {checkpoint_path}')
        logging.info(f'Saved checkpoint: {checkpoint_path}')

    # def predict_img2token(self, rgbimgs):
    #     if not isinstance(rgbimgs, list):
    #         rgbimgs = [rgbimgs]
    #     imgs = [self.preprocessing(item) for item in rgbimgs]
    #     imgs = torch.cat(imgs).float().unsqueeze(1)
    #     output = self.model.generate(
    #         imgs.to(self.device), temperature=self.args.get("temperature", 0.2)
    #     )
    #     rhythm, pitch, lift = output
    #     return rhythm, pitch, lift
    
    def predict_token(self, imgpath):
        imgs = []
        if os.path.isdir(imgpath):
            for item in os.listdir(imgpath):
                imgs.append(self.readimg(os.path.join(imgpath, item)))
        else:
            imgs.append(self.readimg(imgpath))
        imgs = torch.cat(imgs).float().unsqueeze(1)
        output = self.model.generate(
            imgs.to(self.device), temperature=self.args.get("temperature", 0.2)
        )
        rhythm, pitch, lift = output
        return rhythm, pitch, lift

    def predict(self, imgpath):
        rhythm, pitch, lift = self.predict_token(imgpath)

        predlift = self.detokenize(lift, self.lifttokenizer)
        predpitch = self.detokenize(pitch, self.pitchtokenizer)
        predrhythm = self.detokenize(rhythm, self.rhythmtokenizer)
        return predrhythm, predpitch, predlift



In [35]:
import os
from omegaconf import OmegaConf


def getconfig(configpath):
    args = OmegaConf.load(configpath)

    workspace = os.path.dirname(configpath)
    for key in args.filepaths.keys():
        args.filepaths[key] = os.path.join(workspace, args.filepaths[key])
    return args


In [36]:
import glob
import os
import re

import argparse
from random import randrange

import cv2
import numpy as np
import pandas as pd
import torch


if __name__ == "__main__":
    cofigpath = "../src/workspace/config.yaml" 
    args = getconfig(cofigpath)

    handler = StaffToScore(args)
    
    x_dataset_path=f"{DATA_RAW_PATH}/{PrIMuS}/{package_aa}/"
    x_all_dataset_path = glob.glob(f"{x_dataset_path}/*")
    del x_dataset_path
    x_pattern = re.compile(r'^[^._].*\.png$') 
    y_pattern = re.compile(r'^[^._].*\.semantic$')

    x_raw_file_list=[]  # image
    y_raw_file_list=[]  # label
    for x_path in x_all_dataset_path:
        files = os.listdir(x_path)
        x_filtered_files = [f"{x_path}/{file}" for file in files if x_pattern.match(file)]
        y_filtered_files = [f"{x_path}/{file}" for file in files if y_pattern.match(file)]
        if len(x_filtered_files) == len(y_filtered_files):
            x_raw_file_list+=x_filtered_files
            y_raw_file_list+=y_filtered_files

    del x_all_dataset_path
    print("x:",len(x_raw_file_list),x_raw_file_list)
    print("y:",len(y_raw_file_list),y_raw_file_list)

    def convert_img(imgpath):
        imgs = []
        # if os.path.isdir(imgpath):
        for item in imgpath:
            # print("---", item, "---")
            con_img=handler.readimg(item)
            imgs.append(con_img)

        imgs = torch.cat(imgs).float().unsqueeze(1)

        return imgs
    

    def read_txt_file(file_path):
        """
        텍스트 파일을 읽어서 내용을 리스트로 반환하는 함수
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.readlines()
            # 각 줄의 개행 문자 제거
            content = [line.strip() for line in content]
        return content[0]
    

    # print(f"------------------y data----------------------------")
    # print(f"-- labeling y : {len(contents)}")
    # print(f"-- labeling y : {contents[0]}")
    # print(f"----------------------------------------------------")

    
    # 각 token에 맞는 string list로 만들기
    def map_pitch(note):
        pitch_mapping = {
            # "nonote": 0,
            "note-C0": 1,
            "note-D0": 2,
            "note-E0": 3,
            "note-F0": 4,
            "note-G0": 5,
            "note-A0": 6,
            "note-B0": 7,
            "note-C1": 8,
            "note-D1": 9,
            "note-E1": 10,
            "note-F1": 11,
            "note-G1": 12,
            "note-A1": 13,
            "note-B1": 14,
            "note-C2": 15,
            "note-D2": 16,
            "note-E2": 17,
            "note-F2": 18,
            "note-G2": 19,
            "note-A2": 20,
            "note-B2": 21,
            "note-C3": 22,
            "note-D3": 23,
            "note-E3": 24,
            "note-F3": 25,
            "note-G3": 26,
            "note-A3": 27,
            "note-B3": 28,
            "note-C4": 29,
            "note-D4": 30,
            "note-E4": 31,
            "note-F4": 32,
            "note-G4": 33,
            "note-A4": 34,
            "note-B4": 35,
            "note-C5": 36,
            "note-D5": 37,
            "note-E5": 38,
            "note-F5": 39,
            "note-G5": 40,
            "note-A5": 41,
            "note-B5": 42,
            "note-C6": 43,
            "note-D6": 44,
            "note-E6": 45,
            "note-F6": 46,
            "note-G6": 47,
            "note-A6": 48,
            "note-B6": 49,
            "note-C7": 50,
            "note-D7": 51,
            "note-E7": 52,
            "note-F7": 53,
            "note-G7": 54,
            "note-A7": 55,
            "note-B7": 56,
            "note-C8": 57,
            "note-D8": 58,
            "note-E8": 59,
            "note-F8": 60,
            "note-G8": 61,
            "note-A8": 62,
            "note-B8": 63,
            "note-C9": 64,
            "note-D9": 65,
            "note-E9": 66,
            "note-F9": 67,
            "note-G9": 68,
            "note-A9": 69,
            "note-B9": 70
        }
        return "nonote" if note not in pitch_mapping else note
    
    def map_rhythm(note):
        duration_mapping =  {
            "[PAD]": 0,
            "[BOS]": 1,
            "[EOS]": 2,
            "+": 3,
            "|": 4,
            "barline": 5,
            "clef-C1": 6,
            "clef-C2": 7,
            "clef-C3": 8,
            "clef-C4": 9,
            "clef-C5": 10,
            "clef-F3": 11,
            "clef-F4": 12,
            "clef-F5": 13,
            "clef-G1": 14,
            "clef-G2": 15,
            "keySignature-AM": 16,
            "keySignature-AbM": 17,
            "keySignature-BM": 18,
            "keySignature-BbM": 19,
            "keySignature-C#M": 20,
            "keySignature-CM": 21,
            "keySignature-CbM": 22,
            "keySignature-DM": 23,
            "keySignature-DbM": 24,
            "keySignature-EM": 25,
            "keySignature-EbM": 26,
            "keySignature-F#M": 27,
            "keySignature-FM": 28,
            "keySignature-GM": 29,
            "keySignature-GbM": 30,
            "multirest-10": 31,
            "multirest-100": 32,
            "multirest-11": 33,
            "multirest-111": 34,
            "multirest-116": 35,
            "multirest-12": 36,
            "multirest-120": 37,
            "multirest-121": 38,
            "multirest-129": 39,
            "multirest-13": 40,
            "multirest-133": 41,
            "multirest-135": 42,
            "multirest-14": 43,
            "multirest-140": 44,
            "multirest-144": 45,
            "multirest-15": 46,
            "multirest-152": 47,
            "multirest-154": 48,
            "multirest-16": 49,
            "multirest-17": 50,
            "multirest-174": 51,
            "multirest-18": 52,
            "multirest-19": 53,
            "multirest-195": 54,
            "multirest-196": 55,
            "multirest-2": 56,
            "multirest-20": 57,
            "multirest-21": 58,
            "multirest-22": 59,
            "multirest-224": 60,
            "multirest-23": 61,
            "multirest-24": 62,
            "multirest-249": 63,
            "multirest-25": 64,
            "multirest-26": 65,
            "multirest-27": 66,
            "multirest-28": 67,
            "multirest-29": 68,
            "multirest-3": 69,
            "multirest-30": 70,
            "multirest-31": 71,
            "multirest-32": 72,
            "multirest-33": 73,
            "multirest-34": 74,
            "multirest-35": 75,
            "multirest-36": 76,
            "multirest-37": 77,
            "multirest-38": 78,
            "multirest-39": 79,
            "multirest-4": 80,
            "multirest-40": 81,
            "multirest-41": 82,
            "multirest-42": 83,
            "multirest-43": 84,
            "multirest-44": 85,
            "multirest-444": 86,
            "multirest-446": 87,
            "multirest-45": 88,
            "multirest-46": 89,
            "multirest-47": 90,
            "multirest-48": 91,
            "multirest-49": 92,
            "multirest-5": 93,
            "multirest-50": 94,
            "multirest-51": 95,
            "multirest-52": 96,
            "multirest-54": 97,
            "multirest-55": 98,
            "multirest-559": 99,
            "multirest-56": 100,
            "multirest-58": 101,
            "multirest-6": 102,
            "multirest-60": 103,
            "multirest-62": 104,
            "multirest-63": 105,
            "multirest-64": 106,
            "multirest-67": 107,
            "multirest-68": 108,
            "multirest-69": 109,
            "multirest-7": 110,
            "multirest-70": 111,
            "multirest-72": 112,
            "multirest-74": 113,
            "multirest-75": 114,
            "multirest-78": 115,
            "multirest-79": 116,
            "multirest-8": 117,
            "multirest-82": 118,
            "multirest-83": 119,
            "multirest-87": 120,
            "multirest-88": 121,
            "multirest-89": 122,
            "multirest-9": 123,
            "multirest-90": 124,
            "multirest-92": 125,
            "multirest-93": 126,
            "multirest-94": 127,
            "multirest-98": 128,
            "note-breve": 129,
            "note-breve.": 130,
            "note-eighth": 131,
            "note-eighth.": 132,
            "note-half": 133,
            "note-half.": 134,
            "note-hundred_twenty_eighth": 135,
            "note-long": 136,
            "note-quarter": 137,
            "note-quarter.": 138,
            "note-sixteenth": 139,
            "note-sixteenth.": 140,
            "note-sixty_fourth": 141,
            "note-sixty_fourth.": 142,
            "note-thirty_second": 143,
            "note-thirty_second.": 144,
            "note-whole": 145,
            "note-whole.": 146,
            "rest-256th": 147,
            "rest-512th": 148,
            "rest-breve": 149,
            "rest-eighth": 150,
            "rest-eighth.": 151,
            "rest-half": 152,
            "rest-half.": 153,
            "rest-hundred_twenty_eighth": 154,
            "rest-long": 155,
            "rest-quarter": 156,
            "rest-quarter.": 157,
            "rest-sixteenth": 158,
            "rest-sixteenth.": 159,
            "rest-sixty_fourth": 160,
            "rest-sixty_fourth.": 161,
            "rest-thirty_second": 162,
            "rest-thirty_second.": 163,
            "rest-whole": 164,
            "rest-whole.": 165,
            "timeSignature-1/16": 166,
            "timeSignature-1/2": 167,
            "timeSignature-1/4": 168,
            "timeSignature-1/8": 169,
            "timeSignature-10/1": 170,
            "timeSignature-10/16": 171,
            "timeSignature-10/4": 172,
            "timeSignature-10/8": 173,
            "timeSignature-100/2": 174,
            "timeSignature-11/16": 175,
            "timeSignature-11/4": 176,
            "timeSignature-11/8": 177,
            "timeSignature-12/16": 178,
            "timeSignature-12/32": 179,
            "timeSignature-12/4": 180,
            "timeSignature-12/8": 181,
            "timeSignature-13/16": 182,
            "timeSignature-13/4": 183,
            "timeSignature-13/8": 184,
            "timeSignature-14/4": 185,
            "timeSignature-14/8": 186,
            "timeSignature-15/16": 187,
            "timeSignature-15/4": 188,
            "timeSignature-15/8": 189,
            "timeSignature-16/16": 190,
            "timeSignature-16/4": 191,
            "timeSignature-16/8": 192,
            "timeSignature-17/16": 193,
            "timeSignature-17/4": 194,
            "timeSignature-17/8": 195,
            "timeSignature-18/16": 196,
            "timeSignature-18/4": 197,
            "timeSignature-18/8": 198,
            "timeSignature-19/16": 199,
            "timeSignature-19/32": 200,
            "timeSignature-19/4": 201,
            "timeSignature-2/1": 202,
            "timeSignature-2/16": 203,
            "timeSignature-2/2": 204,
            "timeSignature-2/32": 205,
            "timeSignature-2/4": 206,
            "timeSignature-2/8": 207,
            "timeSignature-20/4": 208,
            "timeSignature-20/8": 209,
            "timeSignature-21/16": 210,
            "timeSignature-22/16": 211,
            "timeSignature-23/16": 212,
            "timeSignature-23/4": 213,
            "timeSignature-23/8": 214,
            "timeSignature-24/16": 215,
            "timeSignature-24/4": 216,
            "timeSignature-27/16": 217,
            "timeSignature-29/4": 218,
            "timeSignature-3/1": 219,
            "timeSignature-3/16": 220,
            "timeSignature-3/2": 221,
            "timeSignature-3/4": 222,
            "timeSignature-3/8": 223,
            "timeSignature-32/8": 224,
            "timeSignature-33/32": 225,
            "timeSignature-35/16": 226,
            "timeSignature-35/32": 227,
            "timeSignature-37/16": 228,
            "timeSignature-4/1": 229,
            "timeSignature-4/16": 230,
            "timeSignature-4/2": 231,
            "timeSignature-4/4": 232,
            "timeSignature-4/8": 233,
            "timeSignature-5/16": 234,
            "timeSignature-5/2": 235,
            "timeSignature-5/4": 236,
            "timeSignature-5/8": 237,
            "timeSignature-52/4": 238,
            "timeSignature-6/16": 239,
            "timeSignature-6/2": 240,
            "timeSignature-6/4": 241,
            "timeSignature-6/8": 242,
            "timeSignature-63/4": 243,
            "timeSignature-7/1": 244,
            "timeSignature-7/16": 245,
            "timeSignature-7/2": 246,
            "timeSignature-7/4": 247,
            "timeSignature-7/8": 248,
            "timeSignature-8/16": 249,
            "timeSignature-8/2": 250,
            "timeSignature-8/4": 251,
            "timeSignature-8/8": 252,
            "timeSignature-80/4": 253,
            "timeSignature-9/16": 254,
            "timeSignature-9/32": 255,
            "timeSignature-9/4": 256,
            "timeSignature-9/8": 257,
            "timeSignature-C": 258,
            "timeSignature-C/": 259,
            "clef-percussion": 260
        }
        return note if note in duration_mapping else "<unk>"

    def map_lift(note):
        lift_mapping =  {
            # "nonote"    : 0,
            "lift_null" : 1,
            "lift_##"   : 2,
            "lift_#"    : 3,
            "lift_bb"   : 4,
            "lift_b"    : 5,
            "lift_N"    : 6
        }
        return "nonote" if note not in lift_mapping else note
        
    def symbol2pitch_rhythm_lift(symbol_lift, symbol_pitch, symbol_rhythm):
        return map_lift(symbol_lift), map_pitch(symbol_pitch), map_rhythm(symbol_rhythm)
    
    def note2pitch_rhythm_lift(note):
        # note-G#3_eighth
        note_split = note.split("_") # (note-G#3) (eighth)
        note_pitch_lift = note_split[:1][0]
        note_rhythm = note_split[1:][0]
        rhythm=f"note-{note_rhythm}"
        # print("-- note_rhythm: ", rhythm)

        note_note, pitch_lift = note_pitch_lift.split("-") # (note) (G#3)
        if len(pitch_lift)>2:
            pitch = f"note-{pitch_lift[0]+pitch_lift[-1]}" # (G3)
            lift = f"lift_{pitch_lift[1:-1]}"
        else:
            pitch = f"note-{pitch_lift}" 
            lift = f"lift_null"
        # print("-- note_pitch_lift: ", pitch, lift)
        return symbol2pitch_rhythm_lift(lift, pitch, rhythm)
    
    def rest2pitch_rhythm_lift(rest):
        # rest-quarter
        return symbol2pitch_rhythm_lift("nonote", "nonote", rest)
    
    def map_pitch2isnote(pitch_note):
        group_notes = []
        note_split = pitch_note.split("+")
        for note_s in note_split:
            if "nonote" in note_s:
                group_notes.append("nonote")
            elif "note-" in note_s:
                group_notes.append("note")
        return "+".join(group_notes)


    def map_notes2pitch_rhythm_lift_note(note_list):
        result_lift=[]
        result_pitch=[]
        result_rhythm=[]
        result_note=[]

        for notes in note_list:
            group_lift = []
            group_pitch = []
            group_rhythm = []
            group_notes_token_len=0

            # 우선 +로 나누고, 안에 | 있는 지 확인해서 먼저 붙이기
            # note-G#3_eighth + note-G3_eighth + note-G#3_eighth|note-G#3_eighth + rest-quarter
            note_split = notes.split("+")
            for note_s in note_split:
                if "|" in note_s:
                    mapped_lift_chord = []
                    mapped_pitch_chord = []
                    mapped_rhythm_chord = []
                    
                    # note-G#3_eighth|note-G#3_eighth
                    note_split_chord = note_s.split("|") # (note-G#3_eighth) (note-G#3_eighth)
                    for idx, note_s_c in enumerate(note_split_chord):
                        chord_lift, chord_pitch, chord_rhythm = note2pitch_rhythm_lift(note_s_c)

                        mapped_lift_chord.append(chord_lift)
                        mapped_pitch_chord.append(chord_pitch)
                        mapped_rhythm_chord.append(chord_rhythm)

                        # --> '|' 도 token이기 때문에 lift, pitch엔 nonote 추가해주기
                        if idx != len(note_split_chord)-1:
                            mapped_lift_chord.append("nonote")
                            mapped_pitch_chord.append("nonote")

                    group_lift.append("+".join(mapped_lift_chord))
                    group_pitch.append("+".join(mapped_pitch_chord))
                    group_rhythm.append("|".join(mapped_rhythm_chord))

                    # --> '|' 도 token이기 때문에 추가된 token 개수 더하기
                    # 동시에 친 걸 하나의 string으로 해버리는 거니까 주의하기
                    group_notes_token_len+=len(note_split_chord) + len(note_split_chord)-1

                elif "note" in note_s:
                    if "_" in note_s:
                        # note-G#3_eighth
                        note2lift, note2pitch, note2rhythm = note2pitch_rhythm_lift(note_s)
                        group_lift.append(note2lift)
                        group_pitch.append(note2pitch)
                        group_rhythm.append(note2rhythm)
                        group_notes_token_len+=1
                
                elif "rest" in note_s:
                    if "-" in note_s:
                        # rest-quarter
                        rest2lift, rest2pitch, rest2rhythm =rest2pitch_rhythm_lift(note_s)
                        group_lift.append(rest2lift)
                        group_pitch.append(rest2pitch)
                        group_rhythm.append(rest2rhythm)
                        group_notes_token_len+=1
                else:
                    # clef-F4+keySignature-AM+timeSignature-12/8
                    symbol2lift, symbol2pitch, symbol2rhythm = symbol2pitch_rhythm_lift("nonote", "nonote", note_s)
                    group_lift.append(symbol2lift)
                    group_pitch.append(symbol2pitch)
                    group_rhythm.append(symbol2rhythm)
                    group_notes_token_len+=1

            toks_len= group_notes_token_len

            # lift, pitch
            emb_lift="nonote+"
            emb_pitch="nonote+"
            emb_lift+= "+".join(group_lift)
            emb_pitch+= "+".join(group_pitch)
            emb_lift+="+nonote"
            emb_pitch+="+nonote"

            # rhythm
            emb_rhythm="[BOS]"
            emb_rhythm+= "+".join(group_rhythm)
            emb_rhythm+="[EOS]"

            # 뒤에 남은 건 패딩
            if toks_len < 256 - 2:
                for _ in range(256 - toks_len - 2):
                    emb_lift+="+nonote"
                    emb_pitch+="+nonote"        
                    emb_rhythm+="[PAD]"

            result_lift.append(emb_lift)
            result_pitch.append(emb_pitch)
            result_rhythm.append(emb_rhythm)
            result_note.append(map_pitch2isnote(emb_pitch))

        return result_lift, result_pitch, result_rhythm, result_note
    def get_mask(token_rhythm):
        result = []
        for tokens in token_rhythm:
            re=[0 for _ in range(len(tokens))]
            for idx, to in enumerate(tokens):
                if to !=0:
                    re[idx]=1
            result.append(re)
        return torch.tensor(result)
    # ===============================================================================
    # """
    # 1 -- resize 전
    # (298, 2404, 4)
    # 2 -- resize 후
    # torch.Size([1, 128, 1024])
    # torch.float32
    # rgbimgs : torch.Size([1, 1, 128, 1024])
    # """

    batch_=1000
    for i in range(0, len(x_raw_file_list), batch_):
        max_data_len=min(len(x_raw_file_list), i+batch_)

        x_file_list=x_raw_file_list[i:max_data_len]
        y_file_list=y_raw_file_list[i:max_data_len]

        inputs=convert_img(x_file_list)
        del x_file_list

        # 각 파일의 내용을 담을 리스트
        contents = []
        # 각 파일을 읽어서 내용을 리스트에 추가
        for annotation_path in y_file_list:
            # print("--- annotation_path:", annotation_path)
            content = read_txt_file(annotation_path)
            # 사이사이에 + 로 연결해주기
            content=content.replace(" ","+")
            content=content.replace("\t","+")
            contents.append(content)
        del y_file_list

        result_lift, result_pitch, result_rhythm, result_note = map_notes2pitch_rhythm_lift_note(contents)
        # print(len(result_lift), len(result_pitch), len(result_rhythm), len(result_note))

        token_lift, token_pitch, token_rhythm, token_note = handler.all_entokenize(result_lift, result_pitch, result_rhythm, result_note)
        del result_lift, result_pitch, result_rhythm, result_note

        # 부울 형식으로 변환
        mask = get_mask(token_rhythm).bool()

        print("x:", inputs.size())
        print("y:", token_lift.size())

        

        handler.train_model(inputs, token_lift, token_pitch, token_rhythm, token_note, mask)


x: 7 ['../data/raw/PrIMuS/package_aa_short/000102292-1_1_1/000102292-1_1_1.png', '../data/raw/PrIMuS/package_aa_short/000102289-5_1_1/000102289-5_1_1.png', '../data/raw/PrIMuS/package_aa_short/000102293-1_1_1/000102293-1_1_1.png', '../data/raw/PrIMuS/package_aa_short/000102291-1_1_1/000102291-1_1_1.png', '../data/raw/PrIMuS/package_aa_short/000102291-1_1_2/000102291-1_1_2.png', '../data/raw/PrIMuS/package_aa_short/000102289-4_1_1/000102289-4_1_1.png', '../data/raw/PrIMuS/package_aa_short/000102289-3_1_1/000102289-3_1_1.png']
y: 7 ['../data/raw/PrIMuS/package_aa_short/000102292-1_1_1/000102292-1_1_1.semantic', '../data/raw/PrIMuS/package_aa_short/000102289-5_1_1/000102289-5_1_1.semantic', '../data/raw/PrIMuS/package_aa_short/000102293-1_1_1/000102293-1_1_1.semantic', '../data/raw/PrIMuS/package_aa_short/000102291-1_1_1/000102291-1_1_1.semantic', '../data/raw/PrIMuS/package_aa_short/000102291-1_1_2/000102291-1_1_2.semantic', '../data/raw/PrIMuS/package_aa_short/000102289-4_1_1/000102289-

100%|██████████| 1/1 [00:03<00:00,  3.10s/it]


Epoch [1/5], Loss: 0.0434
-- consist Loss(0.0035) | rhythm Loss(0.1591) | pitch Loss(0.1774) | lift loss(0.0625)


100%|██████████| 1/1 [00:03<00:00,  3.05s/it]


Epoch [2/5], Loss: 0.0733
-- consist Loss(0.0343) | rhythm Loss(0.1545) | pitch Loss(0.1748) | lift loss(0.0615)


100%|██████████| 1/1 [00:03<00:00,  3.16s/it]


Epoch [3/5], Loss: 0.0666
-- consist Loss(0.0278) | rhythm Loss(0.1517) | pitch Loss(0.1743) | lift loss(0.0618)


100%|██████████| 1/1 [00:03<00:00,  3.19s/it]


Epoch [4/5], Loss: 0.0511
-- consist Loss(0.0123) | rhythm Loss(0.1509) | pitch Loss(0.1752) | lift loss(0.0620)


100%|██████████| 1/1 [00:03<00:00,  3.14s/it]

Epoch [5/5], Loss: 0.0452
-- consist Loss(0.0061) | rhythm Loss(0.1512) | pitch Loss(0.1770) | lift loss(0.0621)
--!! saved ../src/workspace/checkpoints/img2score_epoch5-piano-2024-05-14_20-37-07.pth





In [37]:
def calculate_SER(S, D, I, N):
    """
    Calculate Symbol Error Rate (SER)
    
    Parameters:
        S (int): Number of substitutions
        D (int): Number of deletions
        I (int): Number of insertions
        N (int): Total number of symbols in the reference sequence
        
    Returns:
        float: Symbol Error Rate
    """
    return (S + D + I) / N if N != 0 else 0.0

if __name__ == '__main__':
    # parser = argparse.ArgumentParser(description='Inference single staff image')
    # parser.add_argument('filepath', type=str, help='path to staff image')

    test_path="../data/test/000051650-1_1_1.png"
    cofigpath = "../src/workspace/config.yaml"
    args = getconfig(cofigpath)
    
    handler = StaffToScore(args)
    predrhythms, predpitchs, predlifts = handler.predict(test_path)

    # Example usage:
    S = 5  # Number of substitutions
    D = 3  # Number of deletions
    I = 2  # Number of insertions
    N = 100  # Total number of symbols in the reference sequence

    SER = calculate_SER(S, D, I, N)
    print("-- Symbol Error Rate (SER):", SER)
    
    mergeds = []
    for i in range(len(predrhythms)):
        predlift = predlifts[i]
        predpitch = predpitchs[i]
        predrhythm = predrhythms[i]
        
        merge = predrhythm[0] + '+'
        for j in range(1, len(predrhythm)):
            if predrhythm[j] == "|":
                merge = merge[:-1]+predrhythm[j]
            elif "note" in predrhythm[j]:
                if predlift[j] in ("lift_##", "lift_#", "lift_bb", "lift_b", "lift_N",):
                    lift = predlift[j].split("_")[-1]
                merge += predpitch[j]+"_"+predrhythm[j].split('note-')[-1]+"+"
            else:
                merge += predrhythm[j]+"+"
        mergeds.append(merge[:-1])
    print("-- Result: ", mergeds)

-- Symbol Error Rate (SER): 0.1
-- Result:  ['clef-G2+timeSignature-C/+nonote_quarter+nonote_quarter+nonote_quarter+nonote_eighth+note-B4_eighth+note-A4_eighth+note-C5_eighth+nonote_eighth+nonote_eighth+nonote_eighth+nonote_eighth+nonote_eighth+note-A4_eighth+nonote_eighth+nonote_eighth+nonote_eighth+nonote_eighth+nonote_eighth+nonote_eighth+note-B4_eighth+nonote_eighth+nonote_eighth+nonote_eighth+note-C5_eighth+nonote_eighth+nonote_eighth+note-A4_eighth+nonote_eighth+nonote_eighth+nonote_eighth+note-D5_eighth+nonote_eighth+nonote_eighth+nonote_eighth+nonote_eighth+nonote_eighth+nonote_eighth+nonote_eighth+nonote_eighth+note-D5_eighth+note-D5_eighth+note-G4_eighth+nonote_eighth+nonote_eighth+nonote_eighth+nonote_eighth+nonote_eighth+note-C5_eighth+nonote_eighth+note-A4_eighth+note-A4_eighth+nonote_eighth+nonote_eighth+nonote_eighth+note-G4_eighth+note-E5_eighth+nonote_eighth+nonote_eighth+note-C5_eighth+nonote_eighth+nonote_eighth+nonote_eighth+nonote_eighth+nonote_eighth+note-B4_eight