# Data Cleaning

## Numeric Feature Data

In [1]:
# Imports
import pandas as pd

In [2]:
# Load the numeric feature table from disk
genre = pd.read_csv(filepath_or_buffer='../data/genre.csv')

In [3]:
# Preview the first five rows of the raw table
genre.head(n=5)

Unnamed: 0,files,zero_crossing_rate,spectral_centroid,spectral_rolloff,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,labels
0,<DirEntry 'reggae.00080.wav'>,0.094298,2539.121009,5260.77136,-103.136673,67.205032,2.212647,28.504496,4.172197,11.874951,14.244764,14.292829,3.261752,15.854269,8.042967,4.469097,8.679431,<DirEntry 'reggae
1,<DirEntry 'jazz.00016.wav'>,0.069845,1465.857446,2822.406728,-259.87674,123.187164,-6.390842,37.570335,-2.977656,13.057896,-14.083035,5.310007,-11.961549,3.524627,-9.633516,5.333287,-7.843499,<DirEntry 'jazz
2,<DirEntry 'disco.00052.wav'>,0.169775,2590.650686,5060.527559,-70.502701,90.517845,-48.066078,36.687813,-17.691069,21.595446,-30.198866,24.046898,-20.459778,14.487847,-18.499725,9.128921,-12.056172,<DirEntry 'disco
3,<DirEntry 'jazz.00002.wav'>,0.057857,1064.668667,1895.729578,-256.959015,175.358765,-44.822285,25.65062,-4.255735,-0.222764,-11.312749,-9.189112,-4.09536,-8.30826,-22.548216,-7.608586,-7.651291,<DirEntry 'jazz
4,<DirEntry 'disco.00046.wav'>,0.114198,2259.565542,4889.552594,-125.681534,101.784462,-17.4781,33.672756,-15.236323,23.70314,-8.659072,18.544029,-14.054308,19.970242,-15.139117,12.969249,-14.306309,<DirEntry 'disco


### Creating Labels

In [4]:
# Fixing the file names and labels
# The raw values are wrapped like "<DirEntry 'reggae.00080.wav'>" (files) and
# "<DirEntry 'reggae" (labels). Strip that wrapper by anchored pattern rather
# than by fixed slice offsets: the result on the raw data is the same, but
# values that are already clean no longer match, so re-running this cell is a
# no-op instead of chopping more characters off each time.
genre['files'] = genre['files'].str.replace(r"^<DirEntry '|'>$", '', regex=True)
genre['labels'] = genre['labels'].str.replace(r"^<DirEntry '", '', regex=True)

In [5]:
# Numeric code for each genre name: listed alphabetically, numbered from 1
label_map = {
    name: code
    for code, name in enumerate(
        ['blues', 'classical', 'country', 'disco', 'hiphop',
         'jazz', 'metal', 'pop', 'reggae', 'rock'],
        start=1,
    )
}

# Encode each genre name as its integer code from label_map
genre['y'] = genre['labels'].map(label_map)

# Series.map leaves NaN for any label that is not a key of label_map, so check
# here and fail loudly rather than carrying rows with a missing target forward.
assert genre['y'].notna().all(), (
    'Labels missing from label_map: {}'.format(
        genre.loc[genre['y'].isna(), 'labels'].unique().tolist()
    )
)

In [6]:
# Check the file names, labels and the numeric target column after cleaning
genre.head(n=5)

Unnamed: 0,files,zero_crossing_rate,spectral_centroid,spectral_rolloff,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,labels,y
0,reggae.00080.wav,0.094298,2539.121009,5260.77136,-103.136673,67.205032,2.212647,28.504496,4.172197,11.874951,14.244764,14.292829,3.261752,15.854269,8.042967,4.469097,8.679431,reggae,9
1,jazz.00016.wav,0.069845,1465.857446,2822.406728,-259.87674,123.187164,-6.390842,37.570335,-2.977656,13.057896,-14.083035,5.310007,-11.961549,3.524627,-9.633516,5.333287,-7.843499,jazz,6
2,disco.00052.wav,0.169775,2590.650686,5060.527559,-70.502701,90.517845,-48.066078,36.687813,-17.691069,21.595446,-30.198866,24.046898,-20.459778,14.487847,-18.499725,9.128921,-12.056172,disco,4
3,jazz.00002.wav,0.057857,1064.668667,1895.729578,-256.959015,175.358765,-44.822285,25.65062,-4.255735,-0.222764,-11.312749,-9.189112,-4.09536,-8.30826,-22.548216,-7.608586,-7.651291,jazz,6
4,disco.00046.wav,0.114198,2259.565542,4889.552594,-125.681534,101.784462,-17.4781,33.672756,-15.236323,23.70314,-8.659072,18.544029,-14.054308,19.970242,-15.139117,12.969249,-14.306309,disco,4


#### Export

In [8]:
# Save the cleaned feature table; index=False keeps the row index out of the file
genre.to_csv(path_or_buf='../data/genre_clean.csv', index=False)

## Mel Spectrogram Data

In [15]:
# Load the mel spectrogram table from disk
mel_specs = pd.read_csv(filepath_or_buffer='../data/genre_mel_specs.csv')

In [16]:
# Preview the first five rows of the spectrogram table
mel_specs.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,84471,84472,84473,84474,84475,84476,84477,84478,84479,84480
0,-24.185581,-27.340237,-27.214727,-23.694857,-10.349419,-11.817766,-19.678226,-26.396507,-25.827953,-30.779963,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,reggae
1,-34.482864,-29.062458,-34.239838,-41.848854,-38.88456,-46.740932,-41.780075,-42.449265,-37.310295,-38.989906,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,jazz
2,-31.571878,-38.35721,-37.502674,-32.508842,-22.891088,-28.22428,-33.885254,-35.000267,-35.614574,-36.325577,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,disco
3,-38.70638,-58.00811,-51.76494,-48.831757,-54.74565,-56.27259,-52.029984,-54.15286,-57.492695,-55.96265,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,jazz
4,-13.395082,-22.886148,-39.371693,-39.873516,-44.111927,-41.430832,-35.432774,-43.907005,-44.25971,-45.559254,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,disco


### Creating Labels

In [17]:
# Renaming the label column and mapping them to numeric values using the same map as above
# The genre names are stored in the column named '84480' (see the preview of
# mel_specs), so give it a readable name before encoding it.
mel_specs = mel_specs.rename(columns={'84480': 'labels'})
mel_specs['y'] = mel_specs['labels'].map(label_map)

# Series.map leaves NaN for any label that is not a key of label_map, so check
# here and fail loudly rather than exporting rows with a missing target.
assert mel_specs['y'].notna().all(), (
    'Labels missing from label_map: {}'.format(
        mel_specs.loc[mel_specs['y'].isna(), 'labels'].unique().tolist()
    )
)

#### Export

In [19]:
# Save the labelled spectrogram table; index=False keeps the row index out of the file
mel_specs.to_csv(path_or_buf='../data/genre_mel_specs_clean.csv', index=False)