In [98]:
import pandas as pd
import numpy as np
import h5py
import requests

import matplotlib.pyplot as plt

%matplotlib inline

In [10]:
# Download the Million Song Dataset summary HDF5 file into the external-data
# directory.
FILE_PATH = '../data/external/msd_summary_file.h5'
summary_url = 'http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/msd_summary_file.h5'

# Fetch first and fail loudly on an HTTP error, so that a failed request can
# neither truncate an existing local copy nor save an error page as HDF5 data.
summary = requests.get(summary_url)
summary.raise_for_status()

# Only touch the destination once the payload is known to be good.
with open(FILE_PATH, 'wb') as f:
    f.write(summary.content)

In [89]:
# Read the three per-song tables out of the summary file, then place them
# side by side (axis=1) in a single wide frame and show its size.
df1, df2, df3 = (
    pd.read_hdf(FILE_PATH, key=table_key)
    for table_key in ('analysis/songs', 'metadata/songs', 'musicbrainz/songs')
)
df = pd.concat([df1, df2, df3], axis=1)
df.shape

(1000000, 53)

The `idx_*` columns are all null in the summary tables, so they are excluded from the feature list below.

In [90]:
# Collect every column name that does not carry the 'idx_' prefix.
features = list(filter(lambda col: not col.startswith('idx_'), df.columns))
features

['analysis_sample_rate',
 'audio_md5',
 'danceability',
 'duration',
 'end_of_fade_in',
 'energy',
 'key',
 'key_confidence',
 'loudness',
 'mode',
 'mode_confidence',
 'start_of_fade_out',
 'tempo',
 'time_signature',
 'time_signature_confidence',
 'track_id',
 'analyzer_version',
 'artist_7digitalid',
 'artist_familiarity',
 'artist_hotttnesss',
 'artist_id',
 'artist_latitude',
 'artist_location',
 'artist_longitude',
 'artist_mbid',
 'artist_name',
 'artist_playmeid',
 'genre',
 'release',
 'release_7digitalid',
 'song_hotttnesss',
 'song_id',
 'title',
 'track_7digitalid',
 'year']

In [91]:
# Narrow the frame to the selected feature columns and preview the first rows.
df = df.loc[:, features]
df.head()

Unnamed: 0,analysis_sample_rate,audio_md5,danceability,duration,end_of_fade_in,energy,key,key_confidence,loudness,mode,...,artist_name,artist_playmeid,genre,release,release_7digitalid,song_hotttnesss,song_id,title,track_7digitalid,year
0,22050,aee9820911781c734e7694c5432990ca,0.0,252.05506,2.049,0.0,10,0.777,-4.829,0,...,Faster Pussy cat,44895,,Monster Ballads X-Mas,633681,0.542899,SOQMMHC12AB0180CB8,Silent Night,7032331,2003
1,22050,ed222d07c83bac7689d52753610a513a,0.0,156.55138,0.258,0.0,9,0.808,-10.555,1,...,Karkkiautomaatti,-1,,Karkuteillä,145266,0.299877,SOVFVAK12A8C1350D9,Tanssi vaan,1514808,1995
2,22050,96c7104889a128fef84fa469d60e380c,0.0,138.97098,0.0,0.0,7,0.418,-2.06,1,...,Hudson Mohawke,-1,,Butter,625706,0.617871,SOGTUKN12AB017F4F1,No One Could Ever,6945353,2006
3,22050,0f7da84b6b583e3846c7e022fb3a92a2,0.0,145.05751,0.0,0.0,7,0.125,-4.654,1,...,Yerba Brava,34000,,De Culo,199368,,SOBNYVR12A8C13558C,Si Vos Querés,2168257,2003
4,22050,228dd6392ad8001b0281f533f34c72fd,0.0,514.29832,0.0,0.0,5,0.097,-7.806,0,...,Der Mystic,-1,,Rene Ablaze Presents Winter Sessions,209038,,SOHSBXH12A8C13B0DF,Tangle Of Aspens,2264873,0


## Bring in pulled lyrics IDs

In [99]:
# Load the CSV of tracks we pulled lyrics for. The file has no header row, so
# its two columns are named here; backslash is the escape character.
df_keys = pd.read_csv('https://storage.googleapis.com/w210-capstone/data/lyrics-valid.csv',
                      header=None, escapechar='\\',
                      names=['track_id', 'lyrics'])
# Keep only the ID column as a one-column frame. The builtin `object` replaces
# the `np.object` alias, which was deprecated in NumPy 1.20 and removed in
# NumPy 1.24 (where it raises AttributeError).
df_keys = pd.DataFrame(df_keys.track_id.astype(object))
df_keys.head()

Unnamed: 0,track_id
0,TRAADJU128F92F58E1
1,TRAADQX128F422B4CF
2,TRAAFTE128F429545F
3,TRAAKAG128F4275D2A
4,TRAAMRO128F92F20D7


## Keep only those we pulled lyrics for

In [105]:
# Inner-join on track_id: only rows whose track_id appears in both frames
# are kept. Preview the first rows of the result.
df_final = df_keys.merge(df, how='inner', on='track_id')
df_final.head()

Unnamed: 0,track_id,analysis_sample_rate,audio_md5,danceability,duration,end_of_fade_in,energy,key,key_confidence,loudness,...,artist_name,artist_playmeid,genre,release,release_7digitalid,song_hotttnesss,song_id,title,track_7digitalid,year
0,TRAADJU128F92F58E1,22050,2844827a85108571951d585418f15dae,0.0,294.76526,0.392,0.0,1,0.584,-8.86,...,The Mob,-1,,May Inspire Revolutionary Acts,611800,,SOHJCEN12A8C142B93,I Hear You Laughing ('Ching' 1981 Cassette),6801472,0
1,TRAADQX128F422B4CF,22050,1bb9cb115bdcbad912cb3bea9facb2b8,0.0,175.72526,0.734,0.0,0,0.522,-10.93,...,Casiotone For The Painfully Alone,-1,,Etiquette,234425,0.785892,SOTUNMH12A6D4FA3E6,Nashville Parthenon,2573759,2006
2,TRAAFTE128F429545F,22050,611395c88642b6065ea0e7e74c837ff4,0.0,222.17098,0.136,0.0,7,0.386,-7.664,...,The Corrs,360,,Talk On Corners,110238,0.569505,SOCHGUG12A58A7E184,Love Gives Love Takes ( LP Version ),1140303,1997
3,TRAAKAG128F4275D2A,22050,49256c5df75bec30433c5201b9bff0e4,0.0,110.13179,4.714,0.0,10,0.214,-12.308,...,Soulfly,-1,,Soulfly,212164,,SOVDWHH12A8C13B42A,Sultao Das Matas (Album Version),2308896,0
4,TRAAMRO128F92F20D7,22050,8004243db3c1f8e164b8d7494fe3736b,0.0,139.72853,0.45,0.0,3,0.294,-11.256,...,David Wilcox,61454,,What You Whispered,391414,0.0,SOKNGDE12AB017CA4D,Step Into Your Skin,4343823,0


In [106]:
# Show the (rows, columns) size of the merged frame.
df_final.shape

(108538, 35)

In [107]:
# Write the merged frame to the interim data directory as CSV;
# index=False keeps the row index out of the file.
df_final.to_csv('../data/interim/msd-aggregate.csv', index=False)

## Start looking at full files

In [40]:
# Download one full per-track HDF5 file so its layout can be inspected.
# Fetch before opening the destination and fail on an HTTP error, so a failed
# request cannot leave an empty or bogus file behind.
response = requests.get('https://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/TRAXLZU12903D05F94.h5')
response.raise_for_status()
h = response.content
with open('../data/external/example_file.h5', 'wb') as f:
    f.write(h)

# Open explicitly read-only: h5py's default mode changed between 2.x and 3.x,
# and nothing in the cells that follow here writes to the file.
f1 = h5py.File('../data/external/example_file.h5', 'r')

In [42]:
# List the top-level keys of the example file.
list(f1.keys())

['analysis', 'metadata', 'musicbrainz']

In [78]:
# List the members stored under the 'analysis' key.
list(f1['analysis'].keys())

['bars_confidence',
 'bars_start',
 'beats_confidence',
 'beats_start',
 'sections_confidence',
 'sections_start',
 'segments_confidence',
 'segments_loudness_max',
 'segments_loudness_max_time',
 'segments_loudness_start',
 'segments_pitches',
 'segments_start',
 'segments_timbre',
 'songs',
 'tatums_confidence',
 'tatums_start']

In [79]:
# Read the whole 'songs' dataset into memory as a NumPy array.
# `Dataset.value` was deprecated in h5py 2.x and removed in h5py 3.0;
# indexing with an empty tuple is the supported equivalent.
f1['analysis']['songs'][()]

array([(22050, b'bf53f8113508a466cd2d3fda18b06368', 0., 211.69587, 0.139, 0., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.324, -7.75, 1, 0.434, 198.536, 113.359, 4, 0.634, b'TRAXLZU12903D05F94')],
      dtype=[('analysis_sample_rate', '<i4'), ('audio_md5', 'S32'), ('danceability', '<f8'), ('duration', '<f8'), ('end_of_fade_in', '<f8'), ('energy', '<f8'), ('idx_bars_confidence', '<i4'), ('idx_bars_start', '<i4'), ('idx_beats_confidence', '<i4'), ('idx_beats_start', '<i4'), ('idx_sections_confidence', '<i4'), ('idx_sections_start', '<i4'), ('idx_segments_confidence', '<i4'), ('idx_segments_loudness_max', '<i4'), ('idx_segments_loudness_max_time', '<i4'), ('idx_segments_loudness_start', '<i4'), ('idx_segments_pitches', '<i4'), ('idx_segments_start', '<i4'), ('idx_segments_timbre', '<i4'), ('idx_tatums_confidence', '<i4'), ('idx_tatums_start', '<i4'), ('key', '<i4'), ('key_confidence', '<f8'), ('loudness', '<f8'), ('mode', '<i4'), ('mode_confidence', '<f8'), ('start_of_fade_out', '<f