# load library

In [8]:
import pandas as pd
from pathlib import Path
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# load dataset

In [None]:
df = pd.read_csv('data.csv', encoding='utf-8-sig')
df.head()

Unnamed: 0,video,yt_id,duration,title,singer,audio,lyric,year,match,pn
0,아이유(IU) - Blueming(블루밍) [가사/Lyrics],yCFMJrneOUs,3:38,Blueming,IU,Q1,Q1,2019,2,0
1,아이유(IU) - 좋은 날 [가사/Lyrics],V6WWJNpIJN4,3:56,좋은날,IU,Q1,Q1,2010,2,0
2,아이유(IU) - 이 지금 [가사/Lyrics],RUuRcR7ZQUg,2:57,이 지금,IU,Q4,Q1,2017,2,0
3,다비치 - 안녕이라고 말하지마 [가사/Lyrics],uw83-MnCpAo,3:51,안녕이라고 말하지마,다비치,Q4,Q4,2011,2,0
4,다비치 - 거북이 [가사/Lyrics],ofBinMrHfHQ,3:44,거북이,다비치,Q1,Q1,2013,2,0


---

# load wav path

In [9]:
pos = Path('downloads/2')
pos_wav_paths = sorted(pos.glob('*.wav'))
neg = Path('downloads/3')
neg_wav_paths = sorted(neg.glob('*.wav'))

In [3]:
len(pos_wav_paths), len(neg_wav_paths)

(30, 31)

# create columns

In [5]:
df['tempo_transition_complexity'] = 0
df['active_bpm_variance'] = 0
df['pitch_high'] = 0
df['pitch_low'] = 0
df['pitch_range'] = 0
df['rms_variation'] = 0
df['repeat_area_size'] = 0
df['repetition_ratio'] = 0

# tempo transition complexity & active bpm number variance

In [12]:
def get_onset_tempogram_tempo(audio_path):
  # Load audio
  # y, sr = librosa.load(audio_path, duration=30)
  y, sr = librosa.load(audio_path)
  hop_length = 512

  # Onset strength envelope
  oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)

  # Tempogram (local autocorrelation)
  tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr, hop_length=hop_length)

  return tempogram

def get_dominant_tempo(tempogram, threshold=0.5):

  active_bpm_per_frame = np.sum(tempogram >= threshold, axis=0)
  active_bpm_variance = np.var(active_bpm_per_frame)

  prev_indices = None
  change_counts = 0
  for t in range(tempogram.shape[1]):
      indices = set(np.where(tempogram[:, t] >= threshold)[0])
      if prev_indices is not None:
          change = len(indices.symmetric_difference(prev_indices))
          change_counts += change
      prev_indices = indices
  tempo_transition_complexity = change_counts / tempogram.shape[1]

  return tempo_transition_complexity, active_bpm_variance

In [7]:
from tqdm.auto import tqdm
for wav_path in tqdm(pos_wav_paths):
    video_name = wav_path.stem
    tempogram = get_onset_tempogram_tempo(wav_path)
    tempo_transition_complexity, active_bpm_variance = get_dominant_tempo(tempogram)
    df.loc[df['video'] == video_name, 'tempo_transition_complexity'] = tempo_transition_complexity
    df.loc[df['video'] == video_name, 'active_bpm_variance'] = active_bpm_variance
    
for wav_path in tqdm(neg_wav_paths):
    video_name = wav_path.stem
    tempogram = get_onset_tempogram_tempo(wav_path)
    tempo_transition_complexity, active_bpm_variance = get_dominant_tempo(tempogram)
    df.loc[df['video'] == video_name, 'tempo_transition_complexity'] = tempo_transition_complexity
    df.loc[df['video'] == video_name, 'active_bpm_variance'] = active_bpm_variance
    

  from .autonotebook import tqdm as notebook_tqdm
  0%|          | 0/30 [00:00<?, ?it/s]

  df.loc[df['video'] == video_name, 'tempo_transition_complexity'] = tempo_transition_complexity
  df.loc[df['video'] == video_name, 'active_bpm_variance'] = active_bpm_variance
100%|██████████| 30/30 [00:15<00:00,  1.96it/s]
100%|██████████| 31/31 [00:16<00:00,  1.89it/s]


# tempo variance

In [None]:
def check_tempo_variance(audio_path):
    y, sr = librosa.load(audio_path)
    hop_length = 512

    onset_strength_seq = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length) # 각 타임 프레임 시작점의 강도
    tempogram = librosa.feature.tempogram(onset_envelope=onset_strength_seq, sr=sr, hop_length=hop_length) # (num_bins, num_frames)

    tempos = librosa.tempo_frequencies(tempogram.shape[0], sr=sr, hop_length=hop_length) # tempo bin index → bpm
    dominant_indices = np.argmax(tempogram[1:], axis=0) + 1
    dominant_tempos = tempos[dominant_indices]

    dominant_tempos = dominant_tempos[(~np.isnan(dominant_tempos)) & (~np.isinf(dominant_tempos))]
    dominant_tempos = np.log2(dominant_tempos + 1e-6)

    if len(dominant_tempos) == 0:
        return 0.0

    tempo_var = np.var(dominant_tempos)

    return tempo_var

In [None]:
for wav_path in tqdm(pos_wav_paths):
    video_name = wav_path.stem
    tempo_var = check_tempo_variance(wav_path)
    df.loc[df['video'] == video_name, 'tempo_variance'] = tempo_var

for wav_path in tqdm(neg_wav_paths):
    video_name = wav_path.stem
    tempo_var = check_tempo_variance(wav_path)
    df.loc[df['video'] == video_name, 'tempo_variance'] = tempo_var

100%|██████████| 30/30 [00:14<00:00,  2.12it/s]
100%|██████████| 31/31 [00:15<00:00,  2.03it/s]


# pitch range

In [15]:
def get_pitch_range(audio_path):
  # y, sr = librosa.load(audio_path, duration=30)
  y, sr = librosa.load(audio_path)

  f0, _, _ = librosa.pyin(
      y,
      fmin=librosa.note_to_hz('C2'),
      fmax=librosa.note_to_hz('C7'),
      sr=sr
  )

  # NaN 제거
  f0 = f0[~np.isnan(f0)]

  # IQR 기반 이상치 제거
  q1 = np.percentile(f0, 25)
  q3 = np.percentile(f0, 75)
  iqr = q3 - q1
  lower_bound = q1 - 1.5 * iqr
  upper_bound = q3 + 1.5 * iqr
  f0_filtered = f0[(f0 >= lower_bound) & (f0 <= upper_bound)]
  pitch_range = np.max(f0_filtered) - np.min(f0_filtered)

  return np.max(f0_filtered), np.min(f0_filtered), pitch_range

In [16]:
for wav_path in tqdm(neg_wav_paths):
    video_name = wav_path.stem
    pitch_high, pitch_low, pitch_range = get_pitch_range(wav_path)
    df.loc[df['video'] == video_name, 'pitch_high'] = pitch_high
    df.loc[df['video'] == video_name, 'pitch_low'] = pitch_low
    df.loc[df['video'] == video_name, 'pitch_range'] = pitch_range

for wav_path in tqdm(pos_wav_paths):
    video_name = wav_path.stem
    pitch_high, pitch_low, pitch_range = get_pitch_range(wav_path)
    df.loc[df['video'] == video_name, 'pitch_high'] = pitch_high
    df.loc[df['video'] == video_name, 'pitch_low'] = pitch_low
    df.loc[df['video'] == video_name, 'pitch_range'] = pitch_range

  df.loc[df['video'] == video_name, 'pitch_high'] = pitch_high
  df.loc[df['video'] == video_name, 'pitch_low'] = pitch_low
  df.loc[df['video'] == video_name, 'pitch_range'] = pitch_range
100%|██████████| 31/31 [22:25<00:00, 43.40s/it]
100%|██████████| 30/30 [20:51<00:00, 41.71s/it]


# repetition_ratio

In [None]:
def check_repetition_ratio(audio_path, threshold=0.8, min_separation_sec=1.0, hop_length=512):
    y, sr = librosa.load(audio_path)
    
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length)  # shape: (12, T)
    similarity_matrix = cosine_similarity(chroma.T)  # shape: (T, T)
    T = similarity_matrix.shape[0]

    # Minimum diagonal offset (exclude self or very nearby frames)
    min_diag_offset = int(min_separation_sec * sr / hop_length)

    time_idx = np.arange(T)
    frame_diff = np.abs(time_idx[:, None] - time_idx[None, :])

    # Valid repeat: high similarity + not too close in time
    is_valid_repeat = (similarity_matrix > threshold) & (frame_diff >= min_diag_offset)

    total_possible = np.sum(frame_diff >= min_diag_offset)
    repeated_area = np.sum(is_valid_repeat)
    repetition_ratio_value = repeated_area / total_possible if total_possible > 0 else 0

    return repeated_area, repetition_ratio_value

In [11]:
for wav_path in tqdm(pos_wav_paths):
    video_name = wav_path.stem
    repeated_area, repetition_ratio = check_repetition_ratio(wav_path)
    df.loc[df['video'] == video_name, 'repeat_area_size'] = repeated_area
    df.loc[df['video'] == video_name, 'repetition_ratio'] = repetition_ratio

for wav_path in tqdm(neg_wav_paths):
    video_name = wav_path.stem
    repeated_area, repetition_ratio = check_repetition_ratio(wav_path)
    df.loc[df['video'] == video_name, 'repeat_area_size'] = repeated_area
    df.loc[df['video'] == video_name, 'repetition_ratio'] = repetition_ratio

  df.loc[df['video'] == video_name, 'repetition_ratio'] = repetition_ratio
100%|██████████| 30/30 [00:43<00:00,  1.46s/it]
100%|██████████| 31/31 [00:48<00:00,  1.57s/it]


# mean self-similarity matrix

In [None]:
from skimage.transform import resize
import os
import numpy as np
import librosa
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

def check_repetition_ratio(audio_path, target_size=128):
    y, sr = librosa.load(audio_path)

    chroma = librosa.feature.chroma_cqt(y=y, sr=sr) # (12, T)
    similarity_matrix = cosine_similarity(chroma.T) # 각 타임 프레임 간 유사도 계산 (T, T)

    T = similarity_matrix.shape[0]

    repeated_area = np.sum(similarity_matrix)
    repetition_ratio_value = repeated_area / (T * T)

    resized_mask = resize(similarity_matrix.astype(float), (target_size, target_size),
                          mode='reflect', anti_aliasing=True, preserve_range=True)
    repetition_ratio_value = np.mean(resized_mask > 0.5)

    return repetition_ratio_value, resized_mask, similarity_matrix

def plot_silimilarity_matrix(similarity_matrix, save_path=None):
    plt.figure(figsize=(6, 5))
    plt.imshow(similarity_matrix, origin='lower', aspect='auto', cmap='magma', interpolation='nearest')
    plt.title("Mean Self-Similarity Matrix")
    plt.colorbar(label='Repetition Presence Probability')
    plt.tight_layout()
    
    if save_path:
        plt.savefig(save_path, dpi=300)
        print(f"Saved visualization to: {save_path}")
    
    plt.close()


download_dir = "../downloads/1"
results = []
resized_masks = []

count = 0
max_files = 10
target_size = 128

for fname in tqdm(os.listdir(download_dir)):
    if fname.endswith(".wav"):
        path = os.path.join(download_dir, fname)
        rep_ratio, mask, sim_matrix = check_repetition_ratio(path, target_size)
        resized_masks.append(mask)

# 평균 마스크 시각화
mean_matrix = np.mean(resized_masks, axis=0)
plot_silimilarity_matrix(mean_matrix, '../imgs/1.png')

In [None]:
download_dir = "../downloads/2"
results = []
resized_masks = []

count = 0
max_files = 10
target_size = 128

for fname in tqdm(os.listdir(download_dir)):
    if fname.endswith(".wav"):
        path = os.path.join(download_dir, fname)
        rep_ratio, mask, sim_matrix = check_repetition_ratio(path, target_size)
        resized_masks.append(mask)

# 평균 마스크 시각화
mean_matrix = np.mean(resized_masks, axis=0)
plot_silimilarity_matrix(mean_matrix, '../imgs/2.png')

In [None]:
download_dir = "../downloads/3"
results = []
resized_masks = []

count = 0
max_files = 10
target_size = 128

for fname in tqdm(os.listdir(download_dir)):
    if fname.endswith(".wav"):
        path = os.path.join(download_dir, fname)
        rep_ratio, mask, sim_matrix = check_repetition_ratio(path, target_size)
        resized_masks.append(mask)

# 평균 마스크 시각화
mean_matrix = np.mean(resized_masks, axis=0)
plot_silimilarity_matrix(mean_matrix, '../imgs/3.png')

In [None]:
download_dir = "../downloads/0"
results = []
resized_masks = []

count = 0
max_files = 10
target_size = 128

for fname in tqdm(os.listdir(download_dir)):
    if fname.endswith(".wav"):
        path = os.path.join(download_dir, fname)
        rep_ratio, mask, sim_matrix = check_repetition_ratio(path, target_size)
        resized_masks.append(mask)

# 평균 마스크 시각화
mean_matrix = np.mean(resized_masks, axis=0)
plot_silimilarity_matrix(mean_matrix, '../imgs/0.png')

# rms variation

In [8]:
def check_rms_variation(audio_path):
    # y, sr = librosa.load(audio_path, duration=30)
    y, sr = librosa.load(audio_path)

    rms = librosa.feature.rms(y=y).flatten()

    rms = rms[~np.isnan(rms) & ~np.isinf(rms)]

    if len(rms) == 0:
        return 0.0

    rms_var = np.var(rms)
    return rms_var

In [9]:
for wav_path in tqdm(neg_wav_paths):
    video_name = wav_path.stem
    rms_variation = check_rms_variation(wav_path)
    df.loc[df['video'] == video_name, 'rms_variation'] = rms_variation

for wav_path in tqdm(pos_wav_paths):
    video_name = wav_path.stem
    rms_variation = check_rms_variation(wav_path)
    df.loc[df['video'] == video_name, 'rms_variation'] = rms_variation

  df.loc[df['video'] == video_name, 'rms_variation'] = rms_variation
100%|██████████| 31/31 [00:10<00:00,  3.06it/s]
100%|██████████| 30/30 [00:09<00:00,  3.24it/s]
