# Data Cleaning

## Numeric Feature Data

In [1]:
# Imports
import pandas as pd

In [2]:
# Load the numeric audio-feature table from disk
genre = pd.read_csv(filepath_or_buffer='../data/genre.csv')

In [3]:
# Preview the first five rows of the raw feature table
genre.head(n=5)

Unnamed: 0,files,zero_crossing_rate,spectral_centroid,spectral_rolloff,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,labels
0,<DirEntry 'classical.00095.wav'>,0.124354,1800.888692,3427.243261,-422.539001,89.848892,-18.736307,37.781479,5.998186,4.155726,-1.990408,12.125071,4.1625,13.789366,2.827351,5.892111,6.119672,<DirEntry 'classical
1,<DirEntry 'rock.00034.wav'>,0.134925,2788.250576,5831.991493,-28.014143,65.664986,-7.318477,33.754238,11.125959,14.607222,2.095349,10.01047,-0.225996,10.134683,-2.854557,3.099468,-1.664788,<DirEntry 'rock
2,<DirEntry 'pop.00004.wav'>,0.123709,2753.120791,5986.680118,-20.897747,83.238922,5.046143,16.362432,9.404126,-0.563406,4.042763,6.208806,9.122958,5.869022,-3.498068,-0.633253,-3.170406,<DirEntry 'pop
3,<DirEntry 'jazz.00088.wav'>,0.069027,2427.985227,5925.810932,-192.507477,80.970596,25.695179,18.540184,14.171223,1.25742,4.086614,-1.414906,2.667313,5.245388,3.723577,0.289519,7.333017,<DirEntry 'jazz
4,<DirEntry 'metal.00006.wav'>,0.198043,3236.92534,6227.983557,-140.729111,55.937737,-12.313358,44.344326,-10.599383,10.875739,-0.949337,22.253336,-9.380368,15.242425,-3.061906,9.06085,-8.804061,<DirEntry 'metal


### Creating Labels

In [4]:
# Fixing the file names and labels.
# The raw columns hold what looks like the repr of an os.DirEntry:
#   files  -> "<DirEntry 'classical.00095.wav'>"
#   labels -> "<DirEntry 'classical"
# Strip that wrapper by anchored pattern instead of by fixed slice positions so
# the cell is safe to re-run: once the wrapper is gone the pattern no longer
# matches and the values are left untouched (a positional slice would keep
# chopping characters off already-clean values on every re-execution).
genre['files'] = genre['files'].str.replace(r"^<DirEntry '|'>$", '', regex=True)
genre['labels'] = genre['labels'].str.replace(r"^<DirEntry '", '', regex=True)

In [5]:
# Mapping the labels to numeric values.
# Codes are 1-based and follow the alphabetical order of the genre names
# (blues=1 ... rock=10). label_map is reused for the mel spectrogram data below.
label_map = {
    'blues': 1,
    'classical': 2,
    'country': 3,
    'disco': 4,
    'hiphop': 5,
    'jazz': 6,
    'metal': 7,
    'pop': 8,
    'reggae': 9,
    'rock': 10
}

genre['y'] = genre['labels'].map(label_map)

# Series.map leaves NaN for any label that is not a key of label_map (which
# would also turn `y` into a float column). Fail loudly here rather than
# exporting rows with a missing target.
assert genre['y'].notna().all(), (
    "Labels missing from label_map: "
    f"{sorted(genre.loc[genre['y'].isna(), 'labels'].astype(str).unique())}"
)

In [6]:
# Check the cleaned file names / labels and the new numeric `y` column
genre.head(n=5)

Unnamed: 0,files,zero_crossing_rate,spectral_centroid,spectral_rolloff,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,labels,y
0,classical.00095.wav,0.124354,1800.888692,3427.243261,-422.539001,89.848892,-18.736307,37.781479,5.998186,4.155726,-1.990408,12.125071,4.1625,13.789366,2.827351,5.892111,6.119672,classical,2
1,rock.00034.wav,0.134925,2788.250576,5831.991493,-28.014143,65.664986,-7.318477,33.754238,11.125959,14.607222,2.095349,10.01047,-0.225996,10.134683,-2.854557,3.099468,-1.664788,rock,10
2,pop.00004.wav,0.123709,2753.120791,5986.680118,-20.897747,83.238922,5.046143,16.362432,9.404126,-0.563406,4.042763,6.208806,9.122958,5.869022,-3.498068,-0.633253,-3.170406,pop,8
3,jazz.00088.wav,0.069027,2427.985227,5925.810932,-192.507477,80.970596,25.695179,18.540184,14.171223,1.25742,4.086614,-1.414906,2.667313,5.245388,3.723577,0.289519,7.333017,jazz,6
4,metal.00006.wav,0.198043,3236.92534,6227.983557,-140.729111,55.937737,-12.313358,44.344326,-10.599383,10.875739,-0.949337,22.253336,-9.380368,15.242425,-3.061906,9.06085,-8.804061,metal,7


#### Export

In [7]:
# Write the cleaned feature table to disk, leaving the row index out of the file
genre.to_csv(path_or_buf='../data/genre_clean.csv', index=False)

## Mel Spectrogram Data

In [8]:
# Load the mel spectrogram table from disk
mel_specs = pd.read_csv(filepath_or_buffer='../data/genre_mel_specs.csv')

In [9]:
# Preview the first five rows of the mel spectrogram table
mel_specs.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,84471,84472,84473,84474,84475,84476,84477,84478,84479,84480
0,-38.71436,-33.474228,-27.310455,-25.299803,-28.430004,-28.678144,-27.830578,-26.89418,-34.463097,-30.501217,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,classical
1,-21.302162,-39.085693,-28.659452,-31.36457,-30.419193,-40.327023,-28.70608,-43.529984,-33.345123,-33.197315,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rock
2,-15.267654,-14.026318,-14.920742,-16.21959,-16.906425,-20.542664,-25.68327,-10.716038,-21.445236,-18.547516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pop
3,-31.311068,-36.68953,-42.98152,-38.595932,-35.907497,-39.644302,-43.886433,-42.308525,-35.456673,-33.849125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,jazz
4,-18.864574,-22.681887,-24.525406,-29.33086,-28.273998,-29.35329,-31.51618,-25.65717,-27.893257,-30.826773,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,metal


### Creating Labels

In [10]:
# Rename the label column (read in under the header '84480') and map the genre
# names to numeric values with the same label_map used for the numeric feature data
mel_specs = mel_specs.rename(columns={'84480': 'labels'})
mel_specs['y'] = mel_specs['labels'].map(label_map)

# Series.map leaves NaN for any label that is not a key of label_map; fail
# loudly here rather than exporting rows with a missing target.
assert mel_specs['y'].notna().all(), (
    "Labels missing from label_map: "
    f"{sorted(mel_specs.loc[mel_specs['y'].isna(), 'labels'].astype(str).unique())}"
)

#### Export

In [11]:
# Write the labelled mel spectrogram table to disk, leaving the row index out of the file
mel_specs.to_csv(path_or_buf='../data/genre_mel_specs_clean.csv', index=False)