# Import and clean Harmonix Dataset

## Load libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
# The seaborn style sheets bundled with matplotlib were renamed to
# 'seaborn-v0_8-*' in matplotlib 3.6 and the old names were removed in 3.8.
# Use the new name when this matplotlib provides it, otherwise keep the
# original name so older installations behave exactly as before.
_darkgrid_style = (
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)
plt.style.use(_darkgrid_style)

# Show every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)

DEBUG = False

## Load local modules
The `src` path setup below follows [this article](https://medium.com/swlh/how-to-structure-a-python-based-data-science-project-a-short-tutorial-for-beginners-7e00bff14f56).

In [2]:
import os
import sys

# ROOT points two directory levels above the current working directory.
ROOT = os.path.join(os.getcwd(), '..', '..')

# Make the modules under ROOT/src importable. The membership check keeps
# repeated runs of this cell from appending duplicate entries to sys.path.
src_dir = os.path.join(ROOT, 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import the constants this notebook uses by name rather than with `import *`,
# so their origin is explicit and no other notebook variable can be silently
# overwritten by a same-named constant.
from constants import HARMONIX, INTER_DIR

## Load dataset

In [3]:
# Read the Harmonix CSV located at ROOT/HARMONIX and preview the first rows.
harmonix_csv = os.path.join(ROOT, HARMONIX)
harmonix = pd.read_csv(harmonix_csv)
harmonix.head()

Unnamed: 0,File,Title,Artist,Release,Duration,BPM,Ratio Bars in 4,Time Signature,Genre,MusicBrainz Id,Acoustid Id
0,0001_12step,"1, 2 Step",Ciara,Goodies,142.47,113,100.0,4|4,R&B,0408655f-189f-371b-9c41-ec861e1a7810,4708e4ae-a3eb-4b7a-b701-ff3a142b2bcb
1,0003_6foot7foot,6 Foot 7 Foot,Lil Wayne,Tha Carter IV,157.347,84,100.0,4|4,Hip-Hop,83347ae2-5def-378a-a3f5-96ec56c25ab7,
2,0004_abc,ABC,The Jackson 5,Hits for Kids Pop Party 8,180.955,94,94.594595,4|4,Pop-Rock,5f1604ed-5c6b-4a85-8391-15aa61ae7f98,88ddde1c-8009-497a-b295-e61125bb5162
3,0005_again,Again,Flyleaf,Memento Mori,192.067,78,0.0,6|8,Alternative,09aed1ac-4094-3337-86ef-8303531d57f1,
4,0006_aint2proud2beg,Ain’t 2 Proud 2 Beg,TLC,Now & Forever: The Hits,181.034,105,100.0,4|4,R&B,09723bc0-b3e9-4f86-a563-c80d25df049e,


In [4]:
# Capture the column labels of the frame as loaded from the CSV and display
# them to inspect the schema.
columns = harmonix.columns
columns

Index(['File', 'Title', 'Artist', 'Release', 'Duration', 'BPM',
       'Ratio Bars in 4', 'Time Signature', 'Genre', 'MusicBrainz Id',
       'Acoustid Id'],
      dtype='object')

In [5]:
# Normalise the column labels (spaces -> underscores) and remove the columns
# listed below.
#
# This is written as a rename + drop chain instead of assigning a previously
# captured label list to `harmonix.columns`: that assignment fails with a
# length mismatch when the cell is run a second time, because the frame has
# already lost the dropped columns. Here the rename is a no-op on labels that
# are already normalised and `errors='ignore'` skips columns that are already
# gone, so the cell can be re-run safely.
harmonix = (
    harmonix
    .rename(columns=lambda label: label.replace(' ', '_'))
    .drop(
        columns=["Release", "Ratio_Bars_in_4", "MusicBrainz_Id", "Acoustid_Id", "Artist", "Title"],
        errors='ignore',
    )
)
harmonix.head(1)

Unnamed: 0,File,Duration,BPM,Time_Signature,Genre
0,0001_12step,142.47,113,4|4,R&B


## Filter irrelevant data

In [6]:
# Keep only the rows whose Time_Signature is exactly '4|4'.
filtered = harmonix.loc[harmonix['Time_Signature'].eq('4|4')]
filtered.head()

Unnamed: 0,File,Duration,BPM,Time_Signature,Genre
0,0001_12step,142.47,113,4|4,R&B
1,0003_6foot7foot,157.347,84,4|4,Hip-Hop
2,0004_abc,180.955,94,4|4,Pop-Rock
4,0006_aint2proud2beg,181.034,105,4|4,R&B
5,0008_america,222.683,136,4|4,Metal


## Save filtered dataset

In [7]:
# Write the filtered frame to ROOT/INTER_DIR/harmonix.csv without the index.
# `to_csv` does not create missing parent directories, so make sure the
# target directory exists first (a no-op when it is already there).
output_dir = os.path.join(ROOT, INTER_DIR)
os.makedirs(output_dir, exist_ok=True)
filtered.to_csv(os.path.join(output_dir, 'harmonix.csv'), index=False)