In [1]:
from IPython.core.display import HTML

In [2]:
# Embed the tutorial PDF in the notebook as an inline frame.
src = "http://labrosa.ee.columbia.edu/millionsong/sites/default/files/tutorial1.pdf"
iframe_markup = "<iframe src='%s' width=1000 height=1000></iframe>" % src
# the HTML object must stay the last expression so the notebook renders it
HTML(iframe_markup)

In [3]:
"""
Tutorial for the Million Song Dataset

by Thierry Bertin-Mahieux (2011) Columbia University
   tb2332@columbia.edu
   Copyright 2011 T. Bertin-Mahieux, All Rights Reserved

This tutorial walks you through a quick experiment
using the Million Song Dataset (MSD). We will actually be working
on the 10K-song subset for the sake of speed, but the code should
carry over seamlessly to the full dataset.

In this tutorial, we do simple metadata analysis. We look at
which artist has the most songs by iterating over the whole
dataset and using an SQLite database.

You need to have the MSD code downloaded from GitHub.
See the MSD website for details:
http://labrosa.ee.columbia.edu/millionsong/

If you have any questions regarding the dataset or this tutorial,
please first take a look at the website. Send us an email
if you haven't found the answer.

Note: this tutorial is developed using Python 2.6
      on an Ubuntu machine. PDF created using 'pyreport'.
"""

# usual imports
import os
import sys
import time
import glob
import datetime
import sqlite3
import numpy as np
# get it at: http://numpy.scipy.org/

In [4]:
# IPython shell escape: print the current working directory, then list its contents
!pwd ; ls

/media/1ADF-0E69/msong
kaggle		   millionsongsubset_full.tar.gz  tutorial_1.ipynb
MillionSongSubset  MSongsDB


In [5]:
# CHANGE IT TO YOUR LOCAL CONFIGURATION
# directory that contains both the dataset folder ('MillionSongSubset')
# and the MSD code folder ('MSongsDB'); it is defined once so that the
# two locations derived from it cannot drift apart
msd_base_path='/media/1ADF-0E69/msong/'

# path to the Million Song Dataset subset and two of its subdirectories
msd_subset_path=os.path.join(msd_base_path,'MillionSongSubset')
msd_subset_data_path=os.path.join(msd_subset_path,'data')
msd_subset_addf_path=os.path.join(msd_subset_path,'AdditionalFiles')
# sanity check; the message names the offending path
assert os.path.isdir(msd_subset_path),'wrong path: %s' % msd_subset_path

# path to the Million Song Dataset code
msd_code_path=os.path.join(msd_base_path,'MSongsDB')
# sanity check; the message names the offending path
assert os.path.isdir(msd_code_path),'wrong path: %s' % msd_code_path

# we add some paths to python so we can import MSD code
# Ubuntu: you can change the environment variable PYTHONPATH
# in your .bashrc file so you do not have to type these lines
msd_python_src_path=os.path.join(msd_code_path,'PythonSrc')
# only append once, so re-running this cell does not pile up duplicates
if msd_python_src_path not in sys.path:
    sys.path.append( msd_python_src_path )

In [6]:
import hdf5_getters as GETTERS

In [7]:
# the following function gives us a nice string for time lag in seconds
def strtimedelta(starttime, stoptime):
    """Return the time lag between two timestamps as a readable string.

    INPUT:
    starttime - start of the interval, in seconds
    stoptime  - end of the interval, in seconds

    RETURN:
    str() of a datetime.timedelta built from (stoptime - starttime) seconds
    """
    elapsed_seconds = stoptime - starttime
    lag = datetime.timedelta(seconds=elapsed_seconds)
    return str(lag)

In [9]:
# we define a very useful function to iterate over files
def apply_to_all_files(basedir, func=lambda x : x, ext='.h5'):
    """ From a base dir, go through all subdirectories,
    find all the files with the given extension and apply the given
    function 'func' to each of them.
    If no 'func' is passed, we do nothing except counting.

    A file is selected when its name matches the pattern '*' + ext and
    does not start with a dot. The pattern is matched against the file
    names reported by os.walk rather than expanded with glob on the full
    path, so directories whose names contain glob metacharacters
    ('[', ']', '?', '*') are searched correctly, and directories whose
    name ends with 'ext' are never handed to 'func'.

    INPUT:
    basedir - root directory of the search
    func    - callable taking a single argument, the path of a file
              (directory joined with the file name); its return value
              is ignored
    ext     - suffix appended to '*' to build the file name pattern

    RETURN:
    number of files
    """
    # local import: the cell stays self-contained, no extra top-level import needed
    import fnmatch

    pattern = "*" + ext
    cnt = 0

    # iterate over all files in subdirectories
    for root, dirs, files in os.walk(basedir):
        # sorting in place makes the traversal order reproducible
        dirs.sort()
        # skip dot-files so the selection matches what the '*' glob used to pick
        matching = [name for name in fnmatch.filter(files, pattern)
                    if not name.startswith('.')]
        matching.sort()

        # count files
        cnt += len(matching)

        # apply function to all files
        for name in matching:
            func(os.path.join(root, name))

    return cnt

# we can now easily count the number of files in the dataset
print 'number of song files: ', apply_to_all_files(msd_subset_data_path)

 number of song files:  10000


In [10]:
# let's collect all artist names in a Python set, so that each distinct
# name is stored only once; the set starts empty and is filled by
# func_to_get_artist_name
all_artists_name = set()

# we define the function to apply to all files
def func_to_get_artist_name(filename):
    """
    Add the artist name stored in one song file to 'all_artists_name'.

    This function does 3 simple things:
    - open the song file
    - get the artist name and put it in the set 'all_artists_name'
    - close the file, even if reading the name fails

    INPUT:
    filename - path of the song file, passed to GETTERS.open_h5_file_read
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        artist_name = GETTERS.get_artist_name(h5)
        all_artists_name.add( artist_name )
    finally:
        # always release the file handle so a failing read does not leak it
        h5.close()

# let's apply the previous function to all the files
# we'll also measure how long it takes
t1 = time.time()
apply_to_all_files(msd_subset_data_path, func=func_to_get_artist_name)
t2 = time.time()

print 'all artists name extracted in:', strtimedelta(t1, t2)
print 'head of artist name set: ', list(all_artists_name)[:10]

print ' '

all artists name extracted in: 0:03:34.563843
head of artist name set:  ['Groundhogs', 'Pale Forest', 'The Real Kids', 'JennyAnyKind', 'Aswad', 'Little Willie John', 'Barry Goldberg', 'Spooky Tooth / Mike Harrison', 'Red Foley', '4 Skins']
