# Calculate Song Onsets
To pad some of the downloaded songs so that  
they match the duration of the annotated  
version of the song, we need to calculate  
the time of the first beat (onset) of each recording.

## Import libraries

In [None]:
import pandas as pd
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

## Set Constants

In [None]:
# Input locations (relative to the notebook's working directory).
HARMONIX_DATASET = "../datasets/harmonix/metadata.csv"  # metadata table read with pandas
AUDIO_DIR = "../downloads/harmonix/"  # "<File>.m4a" audio files are looked up here
BEAT_DATA_DIR = "../datasets/harmonix/beats_and_downbeats/"  # "<File>.txt" annotations

# NOTE: despite its name this is the path of the CSV *file* the results are
# written to, not a directory.
OUTPUT_DIR = "../datasets/harmonix/youtube_metadata.csv"

# Sample rate (Hz) handed to librosa's onset detection.
SR = 22050

# Column-oriented accumulator: one independent list per output column, filled
# while iterating over the songs and later turned into a DataFrame.
output = {
    column: []
    for column in ("File", "yt_onset", "yt_dur", "dur", "onset")
}

## Load Dataset

In [None]:
# Read the Harmonix metadata table; the "File" and "Duration" columns are
# consumed later when iterating over the songs.
df = pd.read_csv(filepath_or_buffer=HARMONIX_DATASET)
# Preview the first five rows (shown as the cell output in the notebook).
df.head(n=5)

## Get onset from annotation

In [None]:
def get_harmonix_onset(path):
    """Return the first whitespace-separated token of the first line of a file.

    The caller stores this value as the annotated onset of the song, so the
    annotation file is presumably expected to begin with the time of the
    first beat (verify against the annotation format).

    Args:
        path: Path of the annotation text file to read.

    Returns:
        str: The token exactly as written in the file; it is not converted
        to a number.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If the first line is empty or contains only whitespace.
    """
    # The context manager closes the file even if parsing below fails.
    with open(path, 'r') as f:
        first_line = f.readline()
    return first_line.split()[0]

## Calculate the YouTube download onset

In [None]:
def get_youtube_metadata(path):
    """Estimate the first onset time and the total duration of an audio file.

    Only the first 10 seconds of audio are decoded for onset detection; the
    duration is obtained separately for the whole file.

    Args:
        path: Path of the audio file to analyse.

    Returns:
        tuple: ``(first_onset, duration)``. ``first_onset`` is the time of
        the first detected onset within the analysed window, or NaN when no
        onset is detected there. ``duration`` is the value reported by
        ``librosa.get_duration``.
    """
    # NOTE(review): newer librosa releases rename `filename` to `path` for
    # get_duration; confirm against the installed version before upgrading.
    duration = librosa.get_duration(filename=path)

    # Use SR explicitly at every step so loading, detection and the
    # frame-to-time conversion always agree on the sample rate.
    x, _ = librosa.load(path, sr=SR, duration=10)
    # `y` is passed by keyword: newer librosa releases reject a positional
    # signal argument.
    frames = librosa.onset.onset_detect(y=x, sr=SR)
    times = librosa.frames_to_time(frames, sr=SR)

    # Silent or onset-free intros yield an empty array; report NaN instead
    # of failing with IndexError part-way through a long batch.
    if times.size == 0:
        return float("nan"), duration
    return times[0], duration

## Iterate through songs and get onsets

In [None]:
# Visit every row of the metadata table and, for each song whose audio file
# is present, record the annotated and the detected onset and duration.
for _, item in tqdm(df.iterrows(), total=len(df)):
    file_name = item["File"]
    audio_file = AUDIO_DIR + file_name + ".m4a"
    text_file = BEAT_DATA_DIR + file_name + ".txt"

    # Songs without an audio file on disk are skipped.
    if not os.path.exists(audio_file):
        continue

    dur = item["Duration"]
    onset = get_harmonix_onset(text_file)
    yt_onset, yt_dur = get_youtube_metadata(audio_file)

    # Append only after both lookups succeeded so all columns keep the
    # same length.
    record = {
        "File": file_name,
        "yt_onset": yt_onset,
        "yt_dur": yt_dur,
        "onset": onset,
        "dur": dur,
    }
    for column, value in record.items():
        output[column].append(value)

## Save results to file

In [None]:
# Turn the collected columns into a table and write it out as CSV.
out_df = pd.DataFrame(output)
# `index` is documented as a boolean; pass False explicitly rather than
# relying on None being falsy to leave the row index out of the file.
out_df.to_csv(OUTPUT_DIR, index=False)