# Consolidate & clean Harmonix annotations

## Load libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
# Plot style: matplotlib 3.6 renamed its bundled seaborn styles to
# 'seaborn-v0_8-*' and 3.8 removed the old names, so use whichever spelling
# this installation ships. If neither is listed, fall through to the original
# name so the failure mode is unchanged.
_DARKGRID_STYLES = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
plt.style.use(next(
    (style for style in _DARKGRID_STYLES if style in plt.style.available),
    'seaborn-darkgrid',
))

# Show every column when a DataFrame is rendered (no "..." truncation).
pd.set_option('display.max_columns', None)

# Debug switch; not read by any cell visible in this notebook section.
DEBUG = False

## Load local modules
The `src` import setup below follows the project layout described in [this article](https://medium.com/swlh/how-to-structure-a-python-based-data-science-project-a-short-tutorial-for-beginners-7e00bff14f56).

In [2]:
import os
import sys

# Project root: two directory levels above the notebook's working directory.
ROOT = os.path.join(os.getcwd(), os.pardir, os.pardir)

# Put <ROOT>/src on the import path so the local modules imported below
# can be resolved by bare name.
src_dir = os.path.join(ROOT, 'src')
sys.path += [src_dir]

from constants import *
from labels import load_annotations

## Load clean dataset

In [3]:
# Read the Harmonix table from the intermediate-data directory and
# preview its first five rows.
harmonix_csv_path = os.path.join(ROOT, INTER_DIR, 'harmonix.csv')
harmonix = pd.read_csv(harmonix_csv_path)
harmonix.head(5)

Unnamed: 0,File,Duration,BPM,Time_Signature,Genre
0,0001_12step,142.47,113,4|4,R&B
1,0003_6foot7foot,157.347,84,4|4,Hip-Hop
2,0004_abc,180.955,94,4|4,Pop-Rock
3,0006_aint2proud2beg,181.034,105,4|4,R&B
4,0008_america,222.683,136,4|4,Metal


## Drop rows with no annotations

In [4]:
# Build the expected annotation file path for every song: <ROOT>/<HARMONIX_LABELS>/<File>.txt
harmonix['First_annotation'] = harmonix.File.map(
    lambda filename: os.path.join(ROOT, HARMONIX_LABELS, filename + '.txt')
)
# Flag which of those files are actually present on disk.
harmonix['First_exists'] = harmonix.First_annotation.map(os.path.exists)

print(f"Dataset size before removal: {harmonix.shape[0]}")
# Keep only songs whose annotation file exists. The explicit .copy() makes the
# result an independent frame, so later cells can add columns to it without
# pandas raising SettingWithCopyWarning.
harmonix = harmonix[harmonix['First_exists']].copy()
print(f"Dataset size after removal: {harmonix.shape[0]} (no labels available)")

Dataset size before removal: 890
Dataset size after removal: 890 (no labels available)


## Check label values

In [5]:
# Explore the labels on a copy so `harmonix` itself is left untouched.
raw = harmonix.copy()
# NOTE(review): the second argument (False) presumably switches off label
# cleaning inside load_annotations; confirm against src/labels.py.
annotations = raw.apply(lambda row: load_annotations(row, False), axis=1).values
# Each row produced a 2-tuple; transpose them into two per-song lists.
raw_labels, raw_sections = (list(column) for column in zip(*annotations))
raw['Labels'] = raw_labels
raw['Sections'] = raw_sections

In [6]:
# Pool the section names of every song into one flat array and list the
# distinct values found in the raw annotations.
raw_section_pool = np.concatenate(raw['Sections'].values)
unique = np.unique(raw_section_pool.ravel())
unique

array(['altchorus', 'bigoutro', 'bre', 'break', 'break1', 'break2',
       'break3', 'breakdown', 'breakdown2', 'bridge', 'bridge1',
       'bridge2', 'bridge3', 'build', 'chorus', 'chorus1', 'chorus2',
       'chorus3', 'chorus_instrumental', 'chorushalf', 'chorusinst',
       'choruspart', 'chrous', 'chrous2', 'drumroll', 'end', 'fadein',
       'fast', 'gtr', 'gtr2', 'gtrbreak', 'guitar', 'guitarsolo', 'inrto',
       'inst', 'inst2', 'instbridge', 'instchorus', 'instintro',
       'instrumental', 'instrumental2', 'instrumental3',
       'instrumentalverse', 'intchorus', 'intro', 'intro2', 'intro3',
       'intro4', 'intro5', 'intro6', 'intro7', 'intro8', 'introchorus',
       'intropt2', 'introverse', 'isnt', 'mainriff', 'mainriff2',
       'miniverse', 'oddriff', 'opening', 'outro', 'outro1', 'outro2',
       'outro3', 'outroa', 'postchorus', 'postchorus2', 'postverse',
       'prechors', 'prechorus', 'prechorus2', 'prechorus3', 'prechorus5',
       'prechrous', 'prehorus', 'preve

In [7]:
# Minimum number of sections a song must have to be kept (the original
# filter was "more than 2").
MIN_SECTIONS = 3

# Load annotations with load_annotations' default settings; each row yields a
# 2-tuple that is split into the 'Labels' and 'Sections' columns.
annotations = harmonix.apply(lambda row: load_annotations(row), axis=1).values
labels, sections = map(list, zip(*annotations))
# .assign returns a new frame, so this never writes into a filtered slice of
# an earlier frame (the cause of SettingWithCopyWarning).
harmonix = harmonix.assign(Labels=labels, Sections=sections)

# Filter out songs with very few song sections, using a boolean mask instead
# of a temporary helper column.
has_enough_sections = harmonix['Sections'].map(
    lambda section: section.shape[0] >= MIN_SECTIONS
)
harmonix = harmonix[has_enough_sections].copy()
harmonix.head(1)

Unnamed: 0,File,Duration,BPM,Time_Signature,Genre,First_annotation,First_exists,Labels,Sections
0,0001_12step,142.47,113,4|4,R&B,/Users/admin/Downloads/fypdataset/notebooks/02...,True,"[0.0, 8.495567999999999, 25.486704, 42.4753280...","[intro, verse, chorus, verse, chorus, verse, c..."


In [8]:
# Distinct section names across all songs in `harmonix`, for comparison
# with the raw label vocabulary.
clean_section_pool = np.concatenate(harmonix['Sections'].values)
unique = np.unique(clean_section_pool.ravel())
unique

array(['bridge', 'chorus', 'intro', 'outro', 'silence',
       'solo/instrumental', 'transition', 'verse'], dtype=object)

## Save annotations

In [9]:
# Remove the helper columns that were only needed to check for label files.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
harmonix = harmonix.drop(columns=['First_annotation', 'First_exists'], errors='ignore')

# Make sure the output directory exists before writing.
annotations_dir = os.path.join(ROOT, ANNOTATIONS_DIR)
os.makedirs(annotations_dir, exist_ok=True)

# NOTE(review): 'Labels' and 'Sections' hold one array-like object per row, so
# to_csv stores their string representation; confirm the downstream reader
# parses that form (long arrays may be abbreviated by numpy's repr).
harmonix.to_csv(os.path.join(annotations_dir, 'harmonix.csv'), index=False)