In [None]:
#!python3 -m pip install tables

In [69]:
import pandas as pd
import tables
from mmsongsdbtools.mmsongsdbtocsvconverter import MMSongsDbToCsvConverter

### What's in H5DF?

In [80]:
example_h5 = '../../MillionSongSubset/A/A/A/TRAAAAW128F429D538.h5'
h5 = tables.open_file(example_h5, mode='r')
h5.root

/ (RootGroup) 'H5 Song File'
  children := ['analysis' (Group), 'metadata' (Group), 'musicbrainz' (Group)]

In [81]:
h5.root.analysis

/analysis (Group) 'Echo Nest analysis of the song'
  children := ['bars_confidence' (EArray), 'bars_start' (EArray), 'beats_confidence' (EArray), 'beats_start' (EArray), 'sections_confidence' (EArray), 'sections_start' (EArray), 'segments_confidence' (EArray), 'segments_loudness_max' (EArray), 'segments_loudness_max_time' (EArray), 'segments_loudness_start' (EArray), 'segments_pitches' (EArray), 'segments_start' (EArray), 'segments_timbre' (EArray), 'songs' (Table), 'tatums_confidence' (EArray), 'tatums_start' (EArray)]

In [82]:
h5.root.analysis.songs

/analysis/songs (Table(1,), shuffle, zlib(1)) 'table of Echo Nest analysis for one song'
  description := {
  "analysis_sample_rate": Int32Col(shape=(), dflt=0, pos=0),
  "audio_md5": StringCol(itemsize=32, shape=(), dflt=b'', pos=1),
  "danceability": Float64Col(shape=(), dflt=0.0, pos=2),
  "duration": Float64Col(shape=(), dflt=0.0, pos=3),
  "end_of_fade_in": Float64Col(shape=(), dflt=0.0, pos=4),
  "energy": Float64Col(shape=(), dflt=0.0, pos=5),
  "idx_bars_confidence": Int32Col(shape=(), dflt=0, pos=6),
  "idx_bars_start": Int32Col(shape=(), dflt=0, pos=7),
  "idx_beats_confidence": Int32Col(shape=(), dflt=0, pos=8),
  "idx_beats_start": Int32Col(shape=(), dflt=0, pos=9),
  "idx_sections_confidence": Int32Col(shape=(), dflt=0, pos=10),
  "idx_sections_start": Int32Col(shape=(), dflt=0, pos=11),
  "idx_segments_confidence": Int32Col(shape=(), dflt=0, pos=12),
  "idx_segments_loudness_max": Int32Col(shape=(), dflt=0, pos=13),
  "idx_segments_loudness_max_time": Int32Col(shape=(), d

In [83]:
h5.root.metadata

/metadata (Group) 'metadata about the song'
  children := ['artist_terms' (EArray), 'artist_terms_freq' (EArray), 'artist_terms_weight' (EArray), 'similar_artists' (EArray), 'songs' (Table)]

In [90]:
h5.root.metadata.songs.cols

/metadata/songs.cols (Cols), 20 columns
  analyzer_version (Column(1,), |S32)
  artist_7digitalid (Column(1,), int32)
  artist_familiarity (Column(1,), float64)
  artist_hotttnesss (Column(1,), float64)
  artist_id (Column(1,), |S32)
  artist_latitude (Column(1,), float64)
  artist_location (Column(1,), |S1024)
  artist_longitude (Column(1,), float64)
  artist_mbid (Column(1,), |S40)
  artist_name (Column(1,), |S1024)
  artist_playmeid (Column(1,), int32)
  genre (Column(1,), |S1024)
  idx_artist_terms (Column(1,), int32)
  idx_similar_artists (Column(1,), int32)
  release (Column(1,), |S1024)
  release_7digitalid (Column(1,), int32)
  song_hotttnesss (Column(1,), float64)
  song_id (Column(1,), |S32)
  title (Column(1,), |S1024)
  track_7digitalid (Column(1,), int32)

In [46]:
h5.root.musicbrainz

/musicbrainz (Group) 'data about the song coming from MusicBrainz'
  children := ['artist_mbtags' (EArray), 'artist_mbtags_count' (EArray), 'songs' (Table)]

In [47]:
h5.root.musicbrainz.songs

/musicbrainz/songs (Table(1,), shuffle, zlib(1)) 'table of data coming from MusicBrainz'
  description := {
  "idx_artist_mbtags": Int32Col(shape=(), dflt=0, pos=0),
  "year": Int32Col(shape=(), dflt=0, pos=1)}
  byteorder := 'little'
  chunkshape := (1024,)

### Convert H5DF to CSV files

We only looked at one subdirectory in the downloaded data. **Can potentially use PySpark to speed up** when converting all subdirectories.

Right now the code is still a bit buggy. Exceptions when input total_features, need further debug.

http://millionsongdataset.com/pages/example-track-description/

In [27]:
# total features according to the MSD website 
total_features = ['artist_mbid', 'artist_mbtags', 'artist_mbtags_count', 'artist_name', 'artist_playmeid', \
                  'artist_terms', 'artist_terms_freq', 'artist_terms_weight', 'audio_md5', 'bars_confidence', \
                  'bars_start', 'beats_confidence', 'beats_start', 'danceability','duration', 'end_of_fade_in', \
                  'energy', 'key', 'key_confidence', 'loudness', 'mode', 'mode_confidence', 'release', \
                  'release_7digitalid', 'sections_confidence', 'sections_start', 'segments_confidence', \
                  'segments_loudness_max', 'segments_loudness_max_time', 'segments_loudness_start', \
                  'segments_pitches', 'segments_start', 'segments_timbre', 'similar_artists', 'song_hotttnesss', \
                  'song_id', 'start_of_fade_out', 'tatums_confidence', 'tatums_start', 'tempo', 'time_signature',\
                  'time_signature_confidence', 'title', 'track_7digitalid', 'track_id', 'year']

# list the features you want to select
features = ['title', 'artist_name', 'danceability','duration', 'song_hotttnesss', 'tempo' ,'year']

directory = '../../MillionSongSubset/A/A/A'

In [29]:
converter = MMSongsDbToCsvConverter('mmsongsdb.csv', features)
converter.convert_directory(directory)

### Data Cleaning

In [62]:
ms = pd.read_csv('mmsongsdb.csv')
ms.head()

Unnamed: 0,title,artist_name,danceability,duration,song_hotttnesss,tempo,year
0,"b""I Didn't Mean To""",b'Casual',0.0,218.93179,0.60212,92.198,0
1,b'Soul Deep',b'The Box Tops',0.0,148.03546,,121.274,1969
2,b'Amor De Cabaret',b'Sonora Santanera',0.0,177.47546,,100.07,0
3,b'Something Girls',b'Adam Ant',0.0,233.40363,,119.293,1982
4,b'Face the Ashes',b'Gob',0.0,209.60608,0.604501,129.738,2007


In [63]:
ms.title = ms.title.apply(lambda x: x[2:-1])
ms.artist_name = ms.artist_name.apply(lambda x: x[2:-1])

In [64]:
ms.head()

Unnamed: 0,title,artist_name,danceability,duration,song_hotttnesss,tempo,year
0,I Didn't Mean To,Casual,0.0,218.93179,0.60212,92.198,0
1,Soul Deep,The Box Tops,0.0,148.03546,,121.274,1969
2,Amor De Cabaret,Sonora Santanera,0.0,177.47546,,100.07,0
3,Something Girls,Adam Ant,0.0,233.40363,,119.293,1982
4,Face the Ashes,Gob,0.0,209.60608,0.604501,129.738,2007
