# 0. 준비

In [1]:
from pathlib import Path

In [None]:
# import essentia
# import essentia.standard as es

In [2]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Collect every .wav file directly inside wav_folder, ordered by sorted path.
folder = Path('wav_folder')
wav_paths = sorted(folder.glob('*.wav'))

# The first three files serve as the example inputs for the cells below.
# NOTE(review): this unpacking raises ValueError when fewer than three
# .wav files are found.
audio_path1, audio_path2, audio_path3 = wav_paths[:3]

print(audio_path1)  # wav_folder/1.wav
print(audio_path2)  # wav_folder/2.wav
print(audio_path3)  # wav_folder/3.wav

# 1. 조성 분포

> 분리된 콘다 환경에서 실행 가능

In [None]:
# def extract_key_signature(audio_path):
#     loader = es.MonoLoader(filename=str(audio_path))
#     audio = loader()
    
#     key_extractor = es.KeyExtractor()
#     key, scale, strength = key_extractor(audio)
#     return key, scale, strength

In [None]:
# key, scale, strength = extract_key_signature(audio_path1)

# 2. 템포 변화

## 템포 분산

In [None]:
def check_tempo_variance(audio_path):
    """Return the variance of the per-frame dominant tempo, in log2(BPM).

    The audio at ``audio_path`` is loaded with librosa, an onset-strength
    envelope and its tempogram are computed, and for every frame the tempo
    bin with the largest tempogram value (bin 0 excluded) is taken as the
    dominant tempo. Non-finite tempos are discarded before the variance of
    the log2 values is computed.

    Returns:
        The variance of the log2 dominant tempos, or ``0.0`` when no finite
        dominant tempo remains.
    """
    hop = 512
    signal, rate = librosa.load(audio_path)

    # Onset strength for each time frame.
    envelope = librosa.onset.onset_strength(y=signal, sr=rate, hop_length=hop)
    # Tempogram laid out as (num_bins, num_frames).
    tgram = librosa.feature.tempogram(onset_envelope=envelope, sr=rate, hop_length=hop)

    # BPM value associated with each tempo bin index.
    bin_bpm = librosa.tempo_frequencies(tgram.shape[0], sr=rate, hop_length=hop)

    # Strongest bin per frame, searched over bins 1 and up; the +1 maps the
    # result back to an index into the full bin axis.
    strongest_bin = 1 + np.argmax(tgram[1:], axis=0)
    frame_bpm = bin_bpm[strongest_bin]

    frame_bpm = frame_bpm[np.isfinite(frame_bpm)]
    if frame_bpm.size == 0:
        return 0.0

    # The small offset keeps log2 away from an exact zero argument.
    log_bpm = np.log2(frame_bpm + 1e-6)
    return np.var(log_bpm)

## 활성 템포 분석

In [None]:
def get_onset_tempogram_tempo(audio_path):
    """Compute a tempogram for an audio file.

    At most 30 seconds of ``audio_path`` are loaded with librosa. The
    onset-strength envelope of that signal is then turned into a tempogram
    using a hop length of 512 samples.

    Returns:
        The array produced by ``librosa.feature.tempogram``.
    """
    hop = 512
    signal, rate = librosa.load(audio_path, duration=30)
    envelope = librosa.onset.onset_strength(y=signal, sr=rate, hop_length=hop)
    return librosa.feature.tempogram(
        onset_envelope=envelope,
        sr=rate,
        hop_length=hop,
    )

In [7]:
def get_dominant_tempo(tempogram, threshold=0.5):
    """Summarize how the set of active tempo bins varies across frames.

    A bin counts as "active" in a frame when its tempogram value is greater
    than or equal to ``threshold``.

    Args:
        tempogram: 2-D array indexed as ``[bin, frame]``.
        threshold: Minimum tempogram value for a bin to count as active.

    Returns:
        A tuple ``(tempo_transition_complexity, active_bpm_variance)``:

        * ``tempo_transition_complexity`` is the number of bins whose active
          state differs between consecutive frames, summed over all
          consecutive frame pairs and divided by the number of frames.
        * ``active_bpm_variance`` is the variance of the per-frame count of
          active bins.

        ``(0.0, 0.0)`` is returned when the tempogram has no frames.
    """
    active = np.asarray(tempogram) >= threshold
    n_frames = active.shape[1]
    if n_frames == 0:
        # Nothing to measure: avoids a division by zero and the variance of
        # an empty array.
        return 0.0, 0.0

    active_bpm_variance = np.var(active.sum(axis=0))

    # A bin belongs to the symmetric difference of two consecutive frames'
    # active sets exactly when its active flag flips between those frames,
    # so counting flips replaces building a Python set per frame.
    flips = np.count_nonzero(active[:, 1:] != active[:, :-1])
    tempo_transition_complexity = int(flips) / n_frames

    return tempo_transition_complexity, active_bpm_variance

In [None]:
# Build the tempogram for the first example file, then summarize how its
# active tempo bins change over time.
tempogram = get_onset_tempogram_tempo(audio_path1)
tempo_transition_complexity, active_bpm_variance = get_dominant_tempo(tempogram)
# Bare expression on the last line so the notebook displays both values.
tempo_transition_complexity, active_bpm_variance

(0.17414860681114552, np.float64(326.4930334566612))

# 3. 피치 범위

In [None]:
def get_pitch_range(audio_path):
    """Estimate the pitch span of an audio file after outlier removal.

    At most 30 seconds of ``audio_path`` are loaded. The fundamental
    frequency is tracked with ``librosa.pyin`` between the notes C2 and C7,
    NaN entries are dropped, and values outside the Tukey fences
    (``Q1 - 1.5 * IQR`` to ``Q3 + 1.5 * IQR``) are discarded before the
    extremes are taken.

    Returns:
        A tuple ``(highest, lowest, pitch_range)`` in the units of the pyin
        f0 track, where ``pitch_range = highest - lowest``. When the f0 track
        contains no non-NaN value, ``(0.0, 0.0, 0.0)`` is returned instead
        of failing on an empty array.
    """
    y, sr = librosa.load(audio_path, duration=30)

    f0, _, _ = librosa.pyin(
        y,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C7'),
        sr=sr,
    )

    voiced = f0[~np.isnan(f0)]
    if voiced.size == 0:
        # No pitched frame was detected; percentiles and max/min of an empty
        # array are undefined, so report a zero range.
        return 0.0, 0.0, 0.0

    # Tukey fences: keep values within 1.5 * IQR of the quartiles.
    q1, q3 = np.percentile(voiced, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    kept = voiced[(voiced >= lower_bound) & (voiced <= upper_bound)]

    highest = np.max(kept)
    lowest = np.min(kept)
    return highest, lowest, highest - lowest

In [None]:
# Pitch extremes and span for the first example file.
pitch_high, pitch_low, pitch_range = get_pitch_range(audio_path1)
# Bare expression on the last line so the notebook displays the span.
pitch_range

np.float64(121.74038408403857)

# 4. 반복 구간 비율

## 반복 면적, 반복 비율

In [None]:
def check_repetition_ratio(audio_path, threshold=0.8, min_separation_sec=1.0, hop_length=512):
    """Measure how much of a track's chroma self-similarity matrix repeats.

    A chroma (CQT) sequence is computed for the audio at ``audio_path`` and
    every pair of frames is compared by cosine similarity. A pair counts as
    a repeat when its similarity exceeds ``threshold`` and the two frames
    are at least ``min_separation_sec`` apart (converted to a frame count
    using the sample rate and ``hop_length``).

    Returns:
        A tuple ``(repeated_area, repetition_ratio_value)``: the number of
        repeating frame pairs, and that number divided by the number of
        frame pairs that satisfy the separation constraint (``0`` when no
        pair satisfies it).
    """
    signal, rate = librosa.load(audio_path)
    chroma_frames = librosa.feature.chroma_cqt(y=signal, sr=rate, hop_length=hop_length).T

    ssm = cosine_similarity(chroma_frames)
    min_gap = int(min_separation_sec * rate / hop_length)

    # Absolute frame distance between every pair of frames.
    frames = np.arange(ssm.shape[0])
    gap = np.abs(frames.reshape(-1, 1) - frames.reshape(1, -1))
    far_enough = gap >= min_gap

    total_possible = np.sum(far_enough)
    repeated_area = np.sum((ssm > threshold) & far_enough)

    if total_possible > 0:
        repetition_ratio_value = repeated_area / total_possible
    else:
        repetition_ratio_value = 0

    return repeated_area, repetition_ratio_value

## 평균 self-similarity matrix

In [None]:
from skimage.transform import resize
import os
import numpy as np
import librosa
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

def get_resized_ssm(audio_path, target_size=128, threshold=0.8, min_diag_offset=5):
    """Build a fixed-size repetition mask from a track's self-similarity matrix.

    The chroma (CQT) frames of the audio at ``audio_path`` are compared
    pairwise by cosine similarity. A frame pair is marked as a repeat when
    its similarity exceeds ``threshold`` and the frames are at least
    ``min_diag_offset`` frames apart. The resulting boolean mask is converted
    to float and resized to ``(target_size, target_size)``.

    Returns:
        The resized mask as returned by ``skimage.transform.resize``.
    """
    signal, rate = librosa.load(audio_path)

    # Chroma is (12, T); transposing gives one row per frame, so the
    # similarity matrix is (T, T).
    frames = librosa.feature.chroma_cqt(y=signal, sr=rate).T
    ssm = cosine_similarity(frames)

    # Absolute distance, in frames, between every pair of frames.
    positions = np.arange(ssm.shape[0])
    gap = np.abs(np.subtract.outer(positions, positions))
    repeat_mask = np.logical_and(ssm > threshold, gap >= min_diag_offset)

    out_shape = (target_size, target_size)
    return resize(
        repeat_mask.astype(float),
        out_shape,
        mode='reflect',
        anti_aliasing=True,
        preserve_range=True,
    )

def plot_silimilarity_matrix(similarity_matrix, save_path=None):
    """Render a matrix as a heat map and optionally save it to disk.

    The matrix is drawn with the 'magma' colormap, origin at the bottom
    left, and a colorbar. When ``save_path`` is truthy the figure is written
    there at 300 dpi and a confirmation line is printed. The figure is
    closed before returning, so nothing is left open for display.
    """
    fig, ax = plt.subplots(figsize=(6, 5))
    image = ax.imshow(
        similarity_matrix,
        origin='lower',
        aspect='auto',
        cmap='magma',
        interpolation='nearest',
    )
    ax.set_title("Mean Self-Similarity Matrix")
    fig.colorbar(image, ax=ax, label='Repetition Presence Probability')
    fig.tight_layout()

    if save_path:
        fig.savefig(save_path, dpi=300)
        print(f"Saved visualization to: {save_path}")

    plt.close(fig)

In [None]:
# For each class folder ../downloads/<n>, average the resized repetition
# masks of all its .wav files and save the mean as ../imgs/<n>.png.
n_class = 4
target_size = 128  # side length every mask is resized to before averaging

# savefig does not create missing directories, so make sure the output
# folder exists before any audio is processed.
os.makedirs("../imgs", exist_ok=True)

for n in range(n_class):
    download_dir = f"../downloads/{n}"
    resized_masks = []

    for fname in tqdm(os.listdir(download_dir)):
        if fname.endswith(".wav"):
            path = os.path.join(download_dir, fname)
            resized_matrix = get_resized_ssm(path, target_size)
            resized_masks.append(resized_matrix)

    if not resized_masks:
        # The mean of an empty list is NaN rather than a matrix, which
        # cannot be plotted; skip this class instead of aborting the rest.
        print(f"No .wav files found in {download_dir}; skipping class {n}")
        continue

    # Visualize the mean mask.
    mean_matrix = np.mean(resized_masks, axis=0)
    plot_silimilarity_matrix(mean_matrix, f"../imgs/{n}.png")

# 5. RMS 분산

In [14]:
def check_rms_variation(audio_path):
    """Return the variance of the frame-wise RMS energy of an audio file.

    At most 30 seconds of ``audio_path`` are loaded with librosa. Non-finite
    RMS values are discarded before the variance is taken.

    Returns:
        The variance of the finite RMS values, or ``0.0`` when none remain.
    """
    signal, _ = librosa.load(audio_path, duration=30)

    frame_rms = librosa.feature.rms(y=signal).ravel()
    finite_rms = frame_rms[np.isfinite(frame_rms)]

    if finite_rms.size == 0:
        return 0.0
    return np.var(finite_rms)

In [15]:
# RMS-energy variance for the first example file.
rms_var = check_rms_variation(audio_path1)
# Bare expression on the last line so the notebook displays the value.
rms_var

np.float32(0.008695843)