In [11]:
import librosa
import numpy as np
import scipy.signal as sps
import matplotlib.pyplot as plt
import os
import mir_eval
from os import listdir
import textwrap

Task 1: tempo estimation

In [12]:
def P_SCORE(T, S, G):
    def Tti(Ti):
        return 1 if abs((G - Ti) / G) <= 0.08 else 0
    Tt1 = Tti(T[0])
    Tt2 = Tti(T[1])
    P = S * Tt1 + (1 - S) * Tt2
    return P

def ALOTC_SCORE(T, G):
    def is_tempo_correct(Ti):
        return abs((G - Ti) / G) <= 0.08
    if is_tempo_correct(T[0]) or is_tempo_correct(T[1]):
        return 1
    else:
        return 0

Q1

In [13]:
def extract_tempo_features(audio_file: str, hop_size: int = 512) -> tuple[list[float], list[float], list[float], list[float]]:
    audio_signal, sampling_rate = librosa.load(audio_file)
    onset_env = librosa.onset.onset_strength(y=audio_signal, sr=sampling_rate, hop_length=hop_size)
    auto_tempogram = librosa.feature.tempogram(onset_envelope=onset_env, sr=sampling_rate, hop_length=hop_size, norm=None)
    fourier_tempogram = librosa.feature.fourier_tempogram(onset_envelope=onset_env, sr=sampling_rate, hop_length=hop_size)
    auto_tempo_vector = np.sum(auto_tempogram, axis=1) / auto_tempogram.shape[1]
    fourier_tempo_vector = np.sum(np.abs(fourier_tempogram), axis=1) / fourier_tempogram.shape[1]
    auto_peak_indices = sps.argrelmax(np.array(auto_tempo_vector))[0]
    fourier_peak_indices = sps.argrelmax(np.array(fourier_tempo_vector))[0]
    auto_frequencies = librosa.tempo_frequencies(len(auto_tempo_vector))
    fourier_frequencies = librosa.fourier_tempo_frequencies(hop_length=hop_size)
    auto_tempo_vector = sorted([[auto_tempo_vector[id], id] for id in auto_peak_indices], key=lambda x: x[0], reverse=True)
    fourier_tempo_vector = sorted([[fourier_tempo_vector[id], id] for id in fourier_peak_indices], key=lambda x: x[0], reverse=True)
    return auto_tempo_vector, fourier_tempo_vector, auto_frequencies, fourier_frequencies
def print_result(label, p_score, alotc_score):
   border = "+" + "-" * 28 + "+"
   line = "|{:^10}|{:^6.2f}|{:^6.2f}|"
   print(border)
   print(line.format(label, p_score, alotc_score))
   print(border)
def get_reference_tempo(annotation_file: str) -> float:
    with open(annotation_file, 'r') as file:
        return float(file.readline().strip())
def calculate_tempo_scores(auto_tempo_vector: list[float], fourier_tempo_vector: list[float], auto_frequencies: list[float], fourier_frequencies: list[float], reference_tempo: float) -> tuple[float, float, float, float]:
    if len(auto_tempo_vector) < 2 or len(fourier_tempo_vector) < 2:
        return 0.0, 0.0, 0.0, 0.0
    top_auto_tempos = [auto_frequencies[auto_tempo_vector[0][1]], auto_frequencies[auto_tempo_vector[1][1]]]
    top_fourier_tempos = [fourier_frequencies[fourier_tempo_vector[0][1]], fourier_frequencies[fourier_tempo_vector[1][1]]]
    auto_tempo_weight = auto_tempo_vector[0][0] / (auto_tempo_vector[0][0] + auto_tempo_vector[1][0])
    fourier_tempo_weight = fourier_tempo_vector[0][0] / (fourier_tempo_vector[0][0] + fourier_tempo_vector[1][0])
    auto_p_score = P_SCORE(top_auto_tempos, auto_tempo_weight, reference_tempo)
    fourier_p_score = P_SCORE(top_fourier_tempos, fourier_tempo_weight, reference_tempo)
    auto_alotc_score = ALOTC_SCORE(top_auto_tempos, reference_tempo)
    fourier_alotc_score = ALOTC_SCORE(top_fourier_tempos, reference_tempo)
    return auto_p_score, fourier_p_score, auto_alotc_score, fourier_alotc_score
Ballroom = ['ChaCha', 'Jive', 'Quickstep', 'Rumba', 'Samba', 'Tango', 'Viennese waltz', 'Waltz']
for genre in Ballroom:
    print(genre)
    avg_p_scores = [0.0, 0.0]
    avg_alotc_scores = [0.0, 0.0]
    genre_folder = os.path.join('Ballroom', 'BallroomData', genre)
    audio_files = os.listdir(genre_folder)
    for audio_file in audio_files:
        audio_file_path = os.path.join(genre_folder, audio_file)
        annotation_file = os.path.join('Ballroom', 'BallroomAnnotations', 'ballroomGroundTruth', audio_file[:-3] + 'bpm')
        reference_tempo = get_reference_tempo(annotation_file)
        auto_tempo_vector, fourier_tempo_vector, auto_frequencies, fourier_frequencies = extract_tempo_features(audio_file_path)
        auto_p_score, fourier_p_score, auto_alotc_score, fourier_alotc_score = calculate_tempo_scores(auto_tempo_vector, fourier_tempo_vector, auto_frequencies, fourier_frequencies, reference_tempo)
        avg_p_scores[0] += auto_p_score
        avg_p_scores[1] += fourier_p_score
        avg_alotc_scores[0] += auto_alotc_score
        avg_alotc_scores[1] += fourier_alotc_score
    num_files = len(audio_files)
    if num_files:
        avg_p_scores = [score / num_files for score in avg_p_scores]
        avg_alotc_scores = [score / num_files for score in avg_alotc_scores]
    print_result("AC", avg_p_scores[0], avg_alotc_scores[0])
    print_result("FOURIER", avg_p_scores[1], avg_alotc_scores[1])

ChaCha
+----------------------------+
|    AC    | 0.47 | 0.91 |
+----------------------------+
+----------------------------+
| FOURIER  | 0.00 | 0.00 |
+----------------------------+
Jive
+----------------------------+
|    AC    | 0.44 | 0.88 |
+----------------------------+
+----------------------------+
| FOURIER  | 0.36 | 0.72 |
+----------------------------+
Quickstep
+----------------------------+
|    AC    | 0.46 | 0.93 |
+----------------------------+
+----------------------------+
| FOURIER  | 0.58 | 0.96 |
+----------------------------+
Rumba
+----------------------------+
|    AC    | 0.46 | 0.91 |
+----------------------------+
+----------------------------+
| FOURIER  | 0.00 | 0.00 |
+----------------------------+
Samba
+----------------------------+
|    AC    | 0.36 | 0.71 |
+----------------------------+
+----------------------------+
| FOURIER  | 0.00 | 0.01 |
+----------------------------+
Tango
+----------------------------+
|    AC    | 0.48 | 0.93 |
+-----------

Q2

In [14]:
def load_audio_file(data_folder, audio_file):
    return librosa.load(os.path.join(data_folder, audio_file))
def compute_tempograms(audio_data, sample_rate, hop_length, window_length):
    onset_env = librosa.onset.onset_strength(y=audio_data, sr=sample_rate, hop_length=hop_length)
    tempogram_auto = librosa.feature.tempogram(onset_envelope=onset_env, sr=sample_rate, hop_length=hop_length, win_length=window_length)
    tempogram_fourier = librosa.feature.fourier_tempogram(onset_envelope=onset_env, sr=sample_rate, hop_length=hop_length, win_length=window_length)
    return tempogram_auto, tempogram_fourier
def compute_tempo_vectors(tempogram_auto, tempogram_fourier):
    tempo_vector_auto = np.sum(tempogram_auto, axis=1) / tempogram_auto.shape[1]
    tempo_vector_fourier = np.sum(np.abs(tempogram_fourier), axis=1) / tempogram_fourier.shape[1]
    return tempo_vector_auto, tempo_vector_fourier
def get_peak_indices(tempo_vector_auto, tempo_vector_fourier):
    peak_indices_auto = sps.argrelmax(np.array(tempo_vector_auto))[0]
    peak_indices_fourier = sps.argrelmax(np.array(tempo_vector_fourier))[0]
    return peak_indices_auto, peak_indices_fourier
def get_top_tempos(tempo_vector_auto, tempo_vector_fourier, peak_indices_auto, peak_indices_fourier, auto_frequencies, fourier_frequencies):
    sorted_auto_tempos = sorted([[tempo_vector_auto[idx], idx] for idx in peak_indices_auto], key=lambda x: x[0], reverse=True)
    sorted_fourier_tempos = sorted([[tempo_vector_fourier[idx], idx] for idx in peak_indices_fourier], key=lambda x: x[0], reverse=True)
    if len(sorted_auto_tempos) < 2:
        sorted_auto_tempos.extend([[0, 0]] * (2 - len(sorted_auto_tempos)))
    if len(sorted_fourier_tempos) < 2:
        sorted_fourier_tempos.extend([[0, 0]] * (2 - len(sorted_fourier_tempos)))
    top2_auto_tempos = [auto_frequencies[sorted_auto_tempos[0][1]], auto_frequencies[sorted_auto_tempos[1][1]]]
    top2_fourier_tempos = [fourier_frequencies[sorted_fourier_tempos[0][1]], fourier_frequencies[sorted_fourier_tempos[1][1]]]
    return top2_auto_tempos, top2_fourier_tempos
def load_reference_bpm(bpm_file):
    with open(bpm_file, 'r') as file:
        return float(file.readline().strip())
def compute_alotc_scores(top_auto_tempos, top_fourier_tempos, reference_bpm):
    alotc_score_auto = ALOTC_SCORE(top_auto_tempos, reference_bpm)
    alotc_score_fourier = ALOTC_SCORE(top_fourier_tempos, reference_bpm)
    return alotc_score_auto, alotc_score_fourier
def process_genre(genre_name, window_length, sample_rate=22050):
    data_folder = 'Ballroom/BallroomData/' + genre_name
    audio_files = os.listdir(data_folder)
    avg_alotc_scores = [0, 0]
    hop_length = 512
    win_length = 0
    for audio_file in audio_files:
        bpm_file = 'Ballroom/BallroomAnnotations/ballroomGroundTruth/' + audio_file[:-3] + 'bpm'
        audio_data, sample_rate = load_audio_file(data_folder, audio_file)
        win_length = int(window_length * sample_rate / hop_length)
        tempogram_auto, tempogram_fourier = compute_tempograms(audio_data, sample_rate, hop_length, win_length)
        tempo_vector_auto, tempo_vector_fourier = compute_tempo_vectors(tempogram_auto, tempogram_fourier)
        peak_indices_auto, peak_indices_fourier = get_peak_indices(tempo_vector_auto, tempo_vector_fourier)
        if len(peak_indices_auto) < 2 or len(peak_indices_fourier) < 2:
            continue
        auto_frequencies = librosa.tempo_frequencies(len(tempo_vector_auto))
        fourier_frequencies = librosa.fourier_tempo_frequencies(hop_length=hop_length)
        top_auto_tempos, top_fourier_tempos = get_top_tempos(tempo_vector_auto, tempo_vector_fourier, peak_indices_auto, peak_indices_fourier, auto_frequencies, fourier_frequencies)
        reference_bpm = load_reference_bpm(bpm_file)
        alotc_score_auto, alotc_score_fourier = compute_alotc_scores(top_auto_tempos, top_fourier_tempos, reference_bpm)
        avg_alotc_scores[0] += alotc_score_auto
        avg_alotc_scores[1] += alotc_score_fourier
    if audio_files:
        avg_alotc_scores = [score / len(audio_files) for score in avg_alotc_scores]
    return avg_alotc_scores
window_lengths = [4, 8, 12]
results = {}
for window_length in window_lengths:
    results[window_length] = {}
    for genre in Ballroom:
        results[window_length][genre] = process_genre(genre, window_length)

for window_length, genres in results.items():
    print('Window Length: {}s'.format(window_length))
    for genre, scores in genres.items():
        print('{:<15} - AC: {:.6f}, FOURIER: {:.6f}'.format(genre, scores[0], scores[1]))

Window Length: 4s
ChaCha          - AC: 0.990991, FOURIER: 0.387387
Jive            - AC: 1.000000, FOURIER: 0.016667
Quickstep       - AC: 0.939024, FOURIER: 0.024390
Rumba           - AC: 0.857143, FOURIER: 0.020408
Samba           - AC: 0.465116, FOURIER: 0.162791
Tango           - AC: 1.000000, FOURIER: 0.081395
Viennese waltz  - AC: 0.938462, FOURIER: 0.015385
Waltz           - AC: 0.281818, FOURIER: 0.372727
Window Length: 8s
ChaCha          - AC: 0.954955, FOURIER: 0.000000
Jive            - AC: 0.933333, FOURIER: 0.016667
Quickstep       - AC: 0.939024, FOURIER: 0.036585
Rumba           - AC: 0.887755, FOURIER: 0.000000
Samba           - AC: 0.709302, FOURIER: 0.011628
Tango           - AC: 0.953488, FOURIER: 0.081395
Viennese waltz  - AC: 0.953846, FOURIER: 0.015385
Waltz           - AC: 0.563636, FOURIER: 0.036364
Window Length: 12s
ChaCha          - AC: 0.891892, FOURIER: 0.000000
Jive            - AC: 0.800000, FOURIER: 0.000000
Quickstep       - AC: 0.865854, FOURIER: 0.02

Q3

In [15]:
import os
import librosa
import numpy as np
import scipy.signal as sps

def extract_tempo_features(audio_file: str, hop_size: int = 512) -> tuple[list[float], list[float], list[float], list[float]]:
    audio_signal, sampling_rate = librosa.load(audio_file)
    onset_env = librosa.onset.onset_strength(y=audio_signal, sr=sampling_rate, hop_length=hop_size)
    auto_tempogram = librosa.feature.tempogram(onset_envelope=onset_env, sr=sampling_rate, hop_length=hop_size, norm=None)
    fourier_tempogram = librosa.feature.fourier_tempogram(onset_envelope=onset_env, sr=sampling_rate, hop_length=hop_size)
    auto_tempo_vector = np.sum(auto_tempogram, axis=1) / auto_tempogram.shape[1]
    fourier_tempo_vector = np.sum(np.abs(fourier_tempogram), axis=1) / fourier_tempogram.shape[1]
    auto_peak_indices = sps.argrelmax(np.array(auto_tempo_vector))[0]
    fourier_peak_indices = sps.argrelmax(np.array(fourier_tempo_vector))[0]
    auto_frequencies = librosa.tempo_frequencies(len(auto_tempo_vector))
    fourier_frequencies = librosa.fourier_tempo_frequencies(hop_length=hop_size)
    auto_tempo_vector = sorted([[auto_tempo_vector[id], id] for id in auto_peak_indices], key=lambda x: x[0], reverse=True)
    fourier_tempo_vector = sorted([[fourier_tempo_vector[id], id] for id in fourier_peak_indices], key=lambda x: x[0], reverse=True)
    return auto_tempo_vector, fourier_tempo_vector, auto_frequencies, fourier_frequencies

def print_result(label, p_score, alotc_score):
    border = "+" + "-" * 28 + "+"
    line = "|{:^10}|{:^6.2f}|{:^6.2f}|"
    print(border)
    print(line.format(label, p_score, alotc_score))
    print(border)

def get_reference_tempo(annotation_file: str) -> float:
    with open(annotation_file, 'r') as file:
        return float(file.readline().strip())

def calculate_early_fusion_tempo_scores(auto_tempo_vector, fourier_tempo_vector, auto_frequencies, fourier_frequencies, reference_tempo):
    if len(auto_tempo_vector) < 2 or len(fourier_tempo_vector) < 2:
        return 0.0, 0.0

    combined_tempos = auto_tempo_vector[:2] + fourier_tempo_vector[:2]
    combined_tempos = sorted(combined_tempos, key=lambda x: x[0], reverse=True)[:2]
    top_tempos = [auto_frequencies[combined_tempos[0][1]] if combined_tempos[0][1] < len(auto_frequencies) else fourier_frequencies[combined_tempos[0][1] - len(auto_frequencies)],
                  auto_frequencies[combined_tempos[1][1]] if combined_tempos[1][1] < len(auto_frequencies) else fourier_frequencies[combined_tempos[1][1] - len(auto_frequencies)]]
    tempo_weight = combined_tempos[0][0] / (combined_tempos[0][0] + combined_tempos[1][0])
    p_score = P_SCORE(top_tempos, tempo_weight, reference_tempo)
    alotc_score = ALOTC_SCORE(top_tempos, reference_tempo)
    return p_score, alotc_score

def calculate_late_fusion_tempo_scores(auto_tempo_vector, fourier_tempo_vector, auto_frequencies, fourier_frequencies, reference_tempo):
    auto_p_score, fourier_p_score, auto_alotc_score, fourier_alotc_score = calculate_tempo_scores(auto_tempo_vector, fourier_tempo_vector, auto_frequencies, fourier_frequencies, reference_tempo)
    avg_p_score = (auto_p_score + fourier_p_score) / 2
    avg_alotc_score = (auto_alotc_score + fourier_alotc_score) / 2
    return avg_p_score, avg_alotc_score

def calculate_tempo_scores(auto_tempo_vector, fourier_tempo_vector, auto_frequencies, fourier_frequencies, reference_tempo):
    if len(auto_tempo_vector) < 2 or len(fourier_tempo_vector) < 2:
        return 0.0, 0.0, 0.0, 0.0

    top_auto_tempos = [auto_frequencies[auto_tempo_vector[0][1]], auto_frequencies[auto_tempo_vector[1][1]]]
    top_fourier_tempos = [fourier_frequencies[fourier_tempo_vector[0][1]], fourier_frequencies[fourier_tempo_vector[1][1]]]
    auto_tempo_weight = auto_tempo_vector[0][0] / (auto_tempo_vector[0][0] + auto_tempo_vector[1][0])
    fourier_tempo_weight = fourier_tempo_vector[0][0] / (fourier_tempo_vector[0][0] + fourier_tempo_vector[1][0])
    auto_p_score = P_SCORE(top_auto_tempos, auto_tempo_weight, reference_tempo)
    fourier_p_score = P_SCORE(top_fourier_tempos, fourier_tempo_weight, reference_tempo)
    auto_alotc_score = ALOTC_SCORE(top_auto_tempos, reference_tempo)
    fourier_alotc_score = ALOTC_SCORE(top_fourier_tempos, reference_tempo)
    return auto_p_score, fourier_p_score, auto_alotc_score, fourier_alotc_score

Ballroom = ['ChaCha', 'Jive', 'Quickstep', 'Rumba', 'Samba', 'Tango', 'Viennese waltz', 'Waltz']
for genre in Ballroom:
    print(genre)
    avg_p_scores = [0.0, 0.0, 0.0]
    avg_alotc_scores = [0.0, 0.0, 0.0]
    genre_folder = os.path.join('Ballroom', 'BallroomData', genre)
    audio_files = os.listdir(genre_folder)
    for audio_file in audio_files:
        audio_file_path = os.path.join(genre_folder, audio_file)
        annotation_file = os.path.join('Ballroom', 'BallroomAnnotations', 'ballroomGroundTruth', audio_file[:-3] + 'bpm')
        reference_tempo = get_reference_tempo(annotation_file)
        auto_tempo_vector, fourier_tempo_vector, auto_frequencies, fourier_frequencies = extract_tempo_features(audio_file_path)
        auto_p_score, fourier_p_score, auto_alotc_score, fourier_alotc_score = calculate_tempo_scores(auto_tempo_vector, fourier_tempo_vector, auto_frequencies, fourier_frequencies, reference_tempo)
        early_p_score, early_alotc_score = calculate_early_fusion_tempo_scores(auto_tempo_vector, fourier_tempo_vector, auto_frequencies, fourier_frequencies, reference_tempo)
        late_p_score, late_alotc_score = calculate_late_fusion_tempo_scores(auto_tempo_vector, fourier_tempo_vector, auto_frequencies, fourier_frequencies, reference_tempo)
        avg_p_scores[0] += auto_p_score
        avg_p_scores[1] += fourier_p_score
        avg_p_scores[2] += late_p_score
        avg_alotc_scores[0] += auto_alotc_score
        avg_alotc_scores[1] += fourier_alotc_score
        avg_alotc_scores[2] += late_alotc_score
    num_files = len(audio_files)
    if num_files:
        avg_p_scores = [score / num_files for score in avg_p_scores]
        avg_alotc_scores = [score / num_files for score in avg_alotc_scores]
    print_result("AC", avg_p_scores[0], avg_alotc_scores[0])
    print_result("FOURIER", avg_p_scores[1], avg_alotc_scores[1])
    print_result("LATE", avg_p_scores[2], avg_alotc_scores[2])

ChaCha
+----------------------------+
|    AC    | 0.47 | 0.91 |
+----------------------------+
+----------------------------+
| FOURIER  | 0.00 | 0.00 |
+----------------------------+
+----------------------------+
|   LATE   | 0.23 | 0.45 |
+----------------------------+
Jive
+----------------------------+
|    AC    | 0.44 | 0.88 |
+----------------------------+
+----------------------------+
| FOURIER  | 0.36 | 0.72 |
+----------------------------+
+----------------------------+
|   LATE   | 0.40 | 0.80 |
+----------------------------+
Quickstep
+----------------------------+
|    AC    | 0.46 | 0.93 |
+----------------------------+
+----------------------------+
| FOURIER  | 0.58 | 0.96 |
+----------------------------+
+----------------------------+
|   LATE   | 0.52 | 0.95 |
+----------------------------+
Rumba
+----------------------------+
|    AC    | 0.46 | 0.91 |
+----------------------------+
+----------------------------+
| FOURIER  | 0.00 | 0.00 |
+-----------------------

Task 2: using dynamic programming for beat tracking

Q4

In [17]:
for genre in Ballroom:
    total_f_score = 0
    folder_path = os.path.join('Ballroom', 'BallroomData', genre)
    files = os.listdir(folder_path)
    for wavfile in files:
        anno_beats_file = os.path.join('Ballroom', 'BallroomAnnotations-master', wavfile.replace('.wav', '.beats'))
        reference_beats = []
        with open(anno_beats_file, 'r') as anno:
            for line in anno:
                reference_beats.append(float(line.split()[0]))
        y, sr = librosa.load(os.path.join(folder_path, wavfile))
        tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
        estimated_beats = librosa.frames_to_time(beats, sr=sr)
        total_f_score += mir_eval.beat.f_measure(np.array(reference_beats), estimated_beats)
    avg_f_score = total_f_score / len(files)
    print(f'+{"-"*40}+')
    print(f'|{"Genre:":<20}{genre:<20}|')
    print(f'|{"Average F-score:":<20}{avg_f_score:.6f}{"":<14}|')
    print(f'+{"-"*40}+')

+----------------------------------------+
|Genre:              ChaCha              |
|Average F-score:    0.883743              |
+----------------------------------------+
+----------------------------------------+
|Genre:              Jive                |
|Average F-score:    0.657477              |
+----------------------------------------+
+----------------------------------------+
|Genre:              Quickstep           |
|Average F-score:    0.613898              |
+----------------------------------------+
+----------------------------------------+
|Genre:              Rumba               |
|Average F-score:    0.790903              |
+----------------------------------------+
+----------------------------------------+
|Genre:              Samba               |
|Average F-score:    0.562916              |
+----------------------------------------+
+----------------------------------------+
|Genre:              Tango               |
|Average F-score:    0.796862              |