In [54]:
import os

def load_audio_files(directory, exclude_dirs=None):
    if exclude_dirs is None:
        exclude_dirs = []
    audio_files = []
    for root, dirs, files in os.walk(directory):
        # 檢查是否要排除當前目錄
        if any(exclude_dir in root for exclude_dir in exclude_dirs):
            continue
        for file in files:
            if file.endswith('.wav'):
                genre = os.path.basename(root)  # 取得子目錄名稱作為類別標籤
                audio_files.append((os.path.join(root, file), genre))
    return audio_files

def load_ground_truth(directory):
    ground_truth = {}
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.bpm'):
                try:
                    with open(os.path.join(root, file), 'r') as f:
                        bpm = float(f.readline().strip())
                    key = os.path.splitext(file)[0]
                    ground_truth[key] = bpm
                except Exception as e:
                    print(f"讀取 {file} 時發生錯誤: {e}")
    return ground_truth

# 設定要排除的目錄
exclude_dirs = ['allwav']

audio_files = load_audio_files('D:/MIR_HW3_DATASET/BallroomData', exclude_dirs=exclude_dirs)
ground_truth = load_ground_truth('D:/MIR_HW3_DATASET/BallroomAnnotations')

# 檢查音訊檔案和真實值的鍵名是否一致
audio_file_names = [os.path.splitext(os.path.basename(file[0]))[0] for file in audio_files]
missing_files = [file for file in audio_file_names if file not in ground_truth]

if missing_files:
    print(f"以下音訊檔案缺少對應的真實值: {missing_files}")

print(f"音訊檔案數量: {len(audio_files)}")
print(f"真實值項目數量: {len(ground_truth)}")

# 顯示部分載入的音訊檔案及其類別
print("部分音訊檔案及其類別標籤：")
for audio_file, genre in audio_files[:10]:  # 只顯示前10個
    print(f"檔案: {audio_file}, 類別: {genre}")

以下音訊檔案缺少對應的真實值: ['Media-106111', 'Media-103618', 'Media-106118']
音訊檔案數量: 698
真實值項目數量: 695
部分音訊檔案及其類別標籤：
檔案: D:/MIR_HW3_DATASET/BallroomData\ChaCha\Albums-Cafe_Paradiso-05.wav, 類別: ChaCha
檔案: D:/MIR_HW3_DATASET/BallroomData\ChaCha\Albums-Cafe_Paradiso-06.wav, 類別: ChaCha
檔案: D:/MIR_HW3_DATASET/BallroomData\ChaCha\Albums-Cafe_Paradiso-07.wav, 類別: ChaCha
檔案: D:/MIR_HW3_DATASET/BallroomData\ChaCha\Albums-Cafe_Paradiso-08.wav, 類別: ChaCha
檔案: D:/MIR_HW3_DATASET/BallroomData\ChaCha\Albums-Fire-01.wav, 類別: ChaCha
檔案: D:/MIR_HW3_DATASET/BallroomData\ChaCha\Albums-Fire-08.wav, 類別: ChaCha
檔案: D:/MIR_HW3_DATASET/BallroomData\ChaCha\Albums-Fire-14.wav, 類別: ChaCha
檔案: D:/MIR_HW3_DATASET/BallroomData\ChaCha\Albums-I_Like_It2-01.wav, 類別: ChaCha
檔案: D:/MIR_HW3_DATASET/BallroomData\ChaCha\Albums-I_Like_It2-02.wav, 類別: ChaCha
檔案: D:/MIR_HW3_DATASET/BallroomData\ChaCha\Albums-Latino_Latino-01.wav, 類別: ChaCha


In [68]:
import librosa
import numpy as np
from scipy.signal import find_peaks
import mir_eval

def compute_tempograms(audio_path):
    y, sr = librosa.load(audio_path)
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    tempogram = librosa.feature.tempogram(onset_envelope=onset_env, sr=sr)
    ac_tempogram = librosa.autocorrelate(onset_env, max_size=tempogram.shape[0])
    return tempogram, ac_tempogram

def find_predominant_tempo_from_tempogram(tempogram, sr):
    tempogram_sum = np.sum(tempogram, axis=1)
    peaks, properties = find_peaks(tempogram_sum, height=0)
    
    if len(peaks) >= 2:
        T1, T2 = sorted(peaks[:2])
        S1, S2 = properties['peak_heights'][:2]
    elif len(peaks) == 1:
        T1 = T2 = peaks[0]
        S1 = S2 = properties['peak_heights'][0]
    else:
        T1 = T2 = S1 = S2 = 0

    T1 = librosa.frames_to_time(T1, sr=sr)
    T2 = librosa.frames_to_time(T2, sr=sr)

    T1 = 60.0 / T1 if T1 > 0 else 0
    T2 = 60.0 / T2 if T2 > 0 else 0

    return T1, T2, S1, S2

def find_predominant_tempo_from_autocorrelation(ac_tempogram, sr):
    peaks, properties = find_peaks(ac_tempogram, height=0)
    
    if len(peaks) >= 2:
        T1, T2 = sorted(peaks[:2])
        S1, S2 = properties['peak_heights'][:2]
    elif len(peaks) == 1:
        T1 = T2 = peaks[0]
        S1 = S2 = properties['peak_heights'][0]
    else:
        T1 = T2 = S1 = S2 = 0

    T1 = librosa.frames_to_time(T1, sr=sr)
    T2 = librosa.frames_to_time(T2, sr=sr)

    T1 = 60.0 / T1 if T1 > 0 else 0
    T2 = 60.0 / T2 if T2 > 0 else 0

    return T1, T2, S1, S2

def relative_saliency(S1, S2):
    return S1 / S2 if S2 != 0 else float('inf')

def evaluate_tempo_estimation(results, ground_truth):
    T_mir_eval = [(gt, 1) for gt in ground_truth]
    T_est_f = [(res['T1_fourier'], res['saliency_fourier']) for res in results]
    T_est_a = [(res['T1_autocorrelation'], res['saliency_autocorrelation']) for res in results]
    
    p_scores_f = []
    lotc_scores_f = []
    p_scores_a = []
    lotc_scores_a = []

    for gt, est_f, est_a in zip(T_mir_eval, T_est_f, T_est_a):
        reference_tempi = np.array([[gt[0], 1]])
        estimated_tempi_f = np.array([[est_f[0], est_f[1]]])
        estimated_tempi_a = np.array([[est_a[0], est_a[1]]])
        
        P_f, ALOTC_f = mir_eval.tempo.detection(reference_tempi, estimated_tempi_f,0.5)
        P_a, ALOTC_a = mir_eval.tempo.detection(reference_tempi, estimated_tempi_a,0.5)
        
        p_scores_f.append(P_f)
        lotc_scores_f.append(ALOTC_f)
        p_scores_a.append(P_a)
        lotc_scores_a.append(ALOTC_a)

    return np.mean(p_scores_f), np.mean(lotc_scores_f), np.mean(p_scores_a), np.mean(lotc_scores_a)


In [69]:
results = []

for audio_file, genre in audio_files:
    tempogram, ac_tempogram = compute_tempograms(audio_file)
    
    T1_f, T2_f, S1_f, S2_f = find_predominant_tempo_from_tempogram(tempogram, sr=22050)
    T1_a, T2_a, S1_a, S2_a = find_predominant_tempo_from_autocorrelation(ac_tempogram, sr=22050)
    
    saliency_f = relative_saliency(S1_f, S2_f)
    saliency_a = relative_saliency(S1_a, S2_a)
    
    result = {
        'file': audio_file,
        'genre': genre,
        'T1_fourier': T1_f,
        'T2_fourier': T2_f,
        'saliency_fourier': saliency_f,
        'T1_autocorrelation': T1_a,
        'T2_autocorrelation': T2_a,
        'saliency_autocorrelation': saliency_a,
    }
    results.append(result)
    print(result)

gt_list = []
for audio_file, genre in audio_files:
    file_name = os.path.splitext(os.path.basename(audio_file))[0]
    if file_name in ground_truth:
        gt_list.append(ground_truth[file_name])
        
p_score_f, lotc_score_f, p_score_a, lotc_score_a = evaluate_tempo_estimation(results, gt_list)
print(f"Fourier Tempogram P-Score: {p_score_f}, ALOTC: {lotc_score_f}")
print(f"Autocorrelation Tempogram P-Score: {p_score_a}, ALOTC: {lotc_score_a}")

{'file': 'D:/MIR_HW3_DATASET/BallroomData\\ChaCha\\Albums-Cafe_Paradiso-05.wav', 'genre': 'ChaCha', 'T1_fourier': 516.796875, 'T2_fourier': 258.3984375, 'saliency_fourier': 0.3993356101465938, 'T1_autocorrelation': 516.796875, 'T2_autocorrelation': 258.3984375, 'saliency_autocorrelation': 0.3133656516693445}
{'file': 'D:/MIR_HW3_DATASET/BallroomData\\ChaCha\\Albums-Cafe_Paradiso-06.wav', 'genre': 'ChaCha', 'T1_fourier': 430.6640625, 'T2_fourier': 234.90767045454544, 'saliency_fourier': 0.2659960824888647, 'T1_autocorrelation': 430.6640625, 'T2_autocorrelation': 234.90767045454544, 'saliency_autocorrelation': 0.2126387456644069}
{'file': 'D:/MIR_HW3_DATASET/BallroomData\\ChaCha\\Albums-Cafe_Paradiso-07.wav', 'genre': 'ChaCha', 'T1_fourier': 516.796875, 'T2_fourier': 258.3984375, 'saliency_fourier': 0.2514209328020112, 'T1_autocorrelation': 516.796875, 'T2_autocorrelation': 258.3984375, 'saliency_autocorrelation': 0.23595910790705524}
{'file': 'D:/MIR_HW3_DATASET/BallroomData\\ChaCha\\Al

AttributeError: 'float' object has no attribute 'size'