In [1]:
!pip install mirdata

Collecting mirdata
  Downloading mirdata-1.0.0-py3-none-any.whl.metadata (9.1 kB)
Collecting chardet>=5.0.0 (from mirdata)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting Deprecated>=1.2.14 (from mirdata)
  Downloading deprecated-1.3.1-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting pretty_midi>=0.2.10 (from mirdata)
  Downloading pretty_midi-0.2.11.tar.gz (5.6 MB)
     ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
     - -------------------------------------- 0.3/5.6 MB ? eta -:--:--
     ---------------------- ----------------- 3.1/5.6 MB 13.2 MB/s eta 0:00:01
     ---------------------------------------- 5.6/5.6 MB 18.0 MB/s  0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finis



In [None]:
#@title Import libraries
import numpy as np
import librosa
import mirdata
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

# Audio config
SR = 22050
N_FFT = 1024
HOP_LENGTH = 256          # ~11.6 ms
N_MELS = 64
FMIN = 80
FMAX = 2000               # guitar range

# Pitch config
UNVOICED = 0              # class 0
MIDI_MIN = 40             # E2
MIDI_MAX = 88             # E6
N_PITCH_CLASS = MIDI_MAX - MIDI_MIN + 2  # + unvoiced
MAX_TRACKS = None         # set số nhỏ nếu test

In [None]:
#@title Load dataset
gset = mirdata.initialize("guitarset")
gset.download()

track_ids = gset.track_ids
if MAX_TRACKS:
    track_ids = track_ids[:MAX_TRACKS]

37.3MB [00:34, 1.12MB/s]                            
3.36GB [18:08, 3.31MB/s]                            
2.99GB [17:33, 3.05MB/s]                            
626MB [05:42, 1.92MB/s]                           
652MB [02:02, 5.56MB/s]                           
248kB [00:01, 189kB/s]                           


In [None]:
#@title Helper functions
def extract_logmel(y, sr):
    mel = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
        fmin=FMIN,
        fmax=FMAX,
        power=2.0
    )
    mel_db = librosa.power_to_db(mel, ref=np.max)
    return mel_db.T   # (T_frames, n_mels)

def get_frame_times(n_frames, sr, hop_length):
    """Time (seconds) of each mel frame"""
    return np.arange(n_frames) * hop_length / sr

STRINGS = ["E", "A", "D", "G", "B", "e"]

def extract_mono_pitch_midi(track):
    pcs = track.pitch_contours

    # ---- chọn timeline tham chiếu ----
    ref_times = None
    for s in STRINGS:
        if pcs.get(s) is not None:
            ref_times = pcs[s].times
            break

    if ref_times is None:
        # không có pitch annotation nào
        return np.array([]), np.array([])

    pitch_strings = []

    for s in STRINGS:
        f0data = pcs.get(s)

        if f0data is None:
            # dây không có annotation → unvoiced
            f0_aligned = np.zeros_like(ref_times)
        else:
            f0 = f0data.frequencies
            times = f0data.times

            # align về ref_times
            f0_aligned = np.interp(
                ref_times,
                times,
                f0,
                left=0.0,
                right=0.0
            )

        pitch_strings.append(f0_aligned)

    pitches = np.stack(pitch_strings, axis=0)   # (<=6, T)
    mono_hz = np.max(pitches, axis=0)

    midi = np.zeros_like(mono_hz, dtype=np.float32)
    voiced = mono_hz > 0
    midi[voiced] = 69 + 12 * np.log2(mono_hz[voiced] / 440.0)

    return ref_times, midi

def align_pitch_to_frames(pitch_times, midi, frame_times):
    """
    Interpolate pitch contour to mel-frame timeline
    """
    return np.interp(
        frame_times,
        pitch_times,
        midi,
        left=0.0,
        right=0.0
    )

def midi_to_class(midi):
    """
    0 = unvoiced
    1..N = pitch classes
    """
    midi = np.round(midi).astype(int)
    cls = np.zeros_like(midi, dtype=np.int32)
    voiced = (midi >= MIDI_MIN) & (midi <= MIDI_MAX)
    cls[voiced] = midi[voiced] - MIDI_MIN + 1
    return cls

def extract_tempo(track, n_frames):
    """
    GuitarSet tempo is global (BPM).
    Handle all known GuitarSet API variants.
    """
    tempo_ann = track.tempo

    if tempo_ann is None:
        return np.zeros(n_frames, dtype=np.float32)

    # Case 1: tempo is a float
    if isinstance(tempo_ann, (int, float)):
        bpm = float(tempo_ann)
        return np.full(n_frames, bpm, dtype=np.float32)

    # Case 2: tempo is list-like (annotation)
    try:
        if len(tempo_ann) == 0:
            return np.zeros(n_frames, dtype=np.float32)
        bpm = tempo_ann[0].value
        return np.full(n_frames, bpm, dtype=np.float32)
    except TypeError:
        # fallback safety
        return np.zeros(n_frames, dtype=np.float32)

In [None]:
#@title Create Pitch + Tempo dataset (frame-level)
X = []
y_pitch = []
y_tempo = []

for tid in tqdm(track_ids):
    track = gset.track(tid)
    if track.audio_mic_path is None:
        continue
    y, sr = librosa.load(track.audio_mic_path, sr=SR, mono=True)
    mel = extract_logmel(y, sr)                     # (T_frames, n_mels)
    frame_times = get_frame_times(len(mel), sr, HOP_LENGTH)
    pitch_times, midi = extract_mono_pitch_midi(track)
    midi_frame = align_pitch_to_frames(
        pitch_times,
        midi,
        frame_times
    )
    pitch_cls = midi_to_class(midi_frame)
    tempo = extract_tempo(track, len(mel))
    X.append(mel)
    y_pitch.append(pitch_cls)
    y_tempo.append(tempo)

100%|██████████| 360/360 [03:02<00:00,  1.98it/s]


In [None]:
#@title Scale and normalize
X = np.vstack(X)                # (N, n_mels)
y_pitch = np.concatenate(y_pitch)
y_tempo = np.concatenate(y_tempo)

print("X:", X.shape)
print("Pitch:", y_pitch.shape)
print("Tempo:", y_tempo.shape)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X: (944904, 64)
Pitch: (944904,)
Tempo: (944904,)


In [None]:
#@title Save dataset (frame-level)
X = X.astype(np.float32)
y_pitch = y_pitch.astype(np.int32)
y_tempo = y_tempo.astype(np.float32)

np.savez_compressed(
    "pitch_tempo.npz",
    X=X,
    pitch=y_pitch,
    tempo=y_tempo
)

print("Saved pitch_tempo.npz")

Saved pitch_tempo.npz


In [None]:
#@title Onset-based tempo feature

import librosa
import numpy as np

N_ACF = 200  # số lag giữ lại (quan trọng)

def extract_onset_acf_feature(y, sr):
    onset_env = librosa.onset.onset_strength(
        y=y,
        sr=sr,
        hop_length=HOP_LENGTH
    )  # (T,)

    # Autocorrelation
    acf = np.correlate(onset_env, onset_env, mode="full")
    acf = acf[len(acf)//2:]  # keep positive lags

    acf = acf[:N_ACF]
    acf = acf / (np.max(acf) + 1e-6)  # normalize

    return acf  # (N_ACF,)

In [None]:
#@title Build onset-tempo dataset

X_temp = []
y_temp = []

for tid in tqdm(track_ids):
    track = gset.track(tid)

    if track.audio_mic_path is None or track.tempo is None:
        continue

    y, sr = librosa.load(
        track.audio_mic_path,
        sr=SR,
        mono=True
    )

    feat = extract_onset_acf_feature(y, sr)

    X_temp.append(feat)
    y_temp.append(float(track.tempo))

X_temp = np.stack(X_temp)
y_temp = np.array(y_temp, dtype=np.float32)

print("X_temp:", X_temp.shape)
print("y_temp:", y_temp.shape)

100%|██████████| 360/360 [00:53<00:00,  6.74it/s]

X_temp: (360, 200)
y_temp: (360,)





In [None]:
#@title Save onset dataset

np.savez(
    "tempo_onset_dataset.npz",
    X=X_temp,          # raw ACF feature
    y=y_temp,          # float BPM
    y_cls=y_cls        # quantized class
)

print("Onset tempo dataset saved to tempo_onset_dataset.npz")

Onset tempo dataset saved to tempo_onset_dataset.npz


In [None]:
#@title Load dataset
import numpy as np
data = np.load("tempo_onset_dataset.npz")
X_temp = data["X"]
y_temp = data["y"]
y_cls = data["y_cls"]
print(X.shape, y.shape, y_cls.shape)

(360, 200) (360,) (360,)


In [None]:
#@title Tempo quantization

TEMPO_MIN = 40
TEMPO_MAX = 200

y_cls = np.clip(
    np.round(y_temp),
    TEMPO_MIN,
    TEMPO_MAX
).astype(int)

y_cls = y_cls - TEMPO_MIN  # class index

N_TEMPO_CLASS = TEMPO_MAX - TEMPO_MIN + 1
print("Tempo classes:", N_TEMPO_CLASS)

Tempo classes: 161


In [None]:
#@title Scale & split

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()
X_s = scaler.fit_transform(X_temp)

X_tr, X_va, y_tr, y_va = train_test_split(
    X_s,
    y_cls,
    test_size=0.2,
    random_state=42
)

In [None]:
#@title Train tempo LR

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

clf = LogisticRegression(
    max_iter=3000,
    n_jobs=-1,
    class_weight="balanced"
)

clf.fit(X_tr, y_tr)

pred = clf.predict(X_va)

tempo_pred = pred + TEMPO_MIN
tempo_gt   = y_va + TEMPO_MIN

mae = np.mean(np.abs(tempo_pred - tempo_gt))

print("Tempo ACC:", accuracy_score(y_va, pred))
print("Tempo MAE (BPM):", mae)

Tempo ACC: 0.8055555555555556
Tempo MAE (BPM): 6.833333333333333


In [None]:
#@title Finetuning LR for tempo
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

C_list = [0.01, 0.1, 0.5, 1.0, 5.0, 10.0]
best_acc = -1
best_C = None
best_clf = None

for C_val in C_list:
    clf = LogisticRegression(
        C=C_val,
        max_iter=3000,
        n_jobs=-1,
        class_weight="balanced"
    )
    clf.fit(X_tr, y_tr)
    pred = clf.predict(X_va)
    acc = accuracy_score(y_va, pred)
    print(f"C={C_val}, Val ACC={acc:.4f}")

    if acc > best_acc:
        best_acc = acc
        best_C = C_val
        best_clf = clf

print(f"\nBest C: {best_C}, Val ACC: {best_acc:.4f}")

C=0.01, Val ACC=0.7500
C=0.1, Val ACC=0.8194
C=0.5, Val ACC=0.7917
C=1.0, Val ACC=0.8056
C=5.0, Val ACC=0.8056
C=10.0, Val ACC=0.8056

Best C: 0.1, Val ACC: 0.8194


In [None]:
#@title Export best tempo LR to .h

HEADER_FILE = "tempo_lr_model.h"
PREFIX = "TEMPO_LR"

W = best_clf.coef_          # (n_class, N_ACF)
b = best_clf.intercept_     # (n_class,)
n_class, n_feat = W.shape

with open(HEADER_FILE, "w") as f:
    f.write("#pragma once\n\n")
    f.write("#include <stdint.h>\n\n")

    f.write(f"#define {PREFIX}_N_CLASS {n_class}\n")
    f.write(f"#define {PREFIX}_N_FEAT  {n_feat}\n\n")

    # weights
    f.write(f"const float {PREFIX}_W[{PREFIX}_N_CLASS][{PREFIX}_N_FEAT] = {{\n")
    for k in range(n_class):
        row = ", ".join([f"{w:.6f}f" for w in W[k]])
        f.write("  { " + row + " },\n")
    f.write("};\n\n")

    # bias
    f.write(f"const float {PREFIX}_B[{PREFIX}_N_CLASS] = {{\n")
    f.write("  " + ", ".join([f"{x:.6f}f" for x in b]) + "\n")
    f.write("};\n\n")

    # scaler for input
    f.write(f"const float {PREFIX}_MEAN[{PREFIX}_N_FEAT] = {{\n")
    f.write("  " + ", ".join([f"{m:.6f}f" for m in scaler.mean_]) + "\n")
    f.write("};\n\n")

    f.write(f"const float {PREFIX}_SCALE[{PREFIX}_N_FEAT] = {{\n")
    f.write("  " + ", ".join([f"{s:.6f}f" for s in scaler.scale_]) + "\n")
    f.write("};\n")

print(f"Exported multiclass tempo LR to {HEADER_FILE}")

Exported multiclass tempo LR to tempo_lr_model.h
