Iterating over the files in a directory: https://www.geeksforgeeks.org/how-to-iterate-over-files-in-directory-using-python/

Importing a local module into a Jupyter notebook: https://stackoverflow.com/questions/53049195/importing-custom-module-into-jupyter-notebook/53052990

Writing `__init__.py` files: https://stackoverflow.com/questions/1944569/how-do-i-write-good-correct-package-init-py-files

In [1]:
import os
import glob
import sys
import numpy as np
import pandas as pd
sys.path.append('../src/')
from util import hdf5_getters as hget
import h5py
import tables

In [2]:
# Input locations, given relative to the notebook's working directory.
# Root directory that is searched recursively for song .h5 files.
rel_msdpath = '../datasets/MillionSongSubset'
# Tab-separated file read by get_user_taste_data as (user, song, count).
rel_userdatapath = '../datasets/train_triplets.txt'
# Tab-separated file read by get_mapping_dict as (song_id, track_id).
rel_mappingpath = '../datasets/msdchallenge/taste_profile_song_to_tracks.txt'

In [3]:
# returns list of all files with extension .h5 in a directory
# iterates through subdirectories as well
def load_all_h5_files(rootdir, ext='.h5'):
    """Return absolute paths of all files matching ``*<ext>`` under ``rootdir``.

    The directory tree is walked recursively; every directory (including
    ``rootdir`` itself) is globbed for names ending in ``ext``.

    Args:
        rootdir: Directory at which the recursive search starts.
        ext: Filename suffix to match, including the leading dot.

    Returns:
        A list of absolute path strings, empty when nothing matches.
    """
    pattern = '*' + ext
    found = []
    for current_dir, _subdirs, _names in os.walk(rootdir):
        matches = glob.glob(os.path.join(current_dir, pattern))
        found.extend(os.path.abspath(match) for match in matches)
    return found

def get_all_titles(rootdir, ext='.h5'):
    """Collect the title of every song file found under ``rootdir``.

    Each file matching ``*<ext>`` anywhere below ``rootdir`` is opened with
    ``hget.open_h5_file_read`` and queried with ``hget.get_title``.

    Args:
        rootdir: Directory at which the recursive search starts.
        ext: Filename suffix to match, including the leading dot.

    Returns:
        A list with one ``hget.get_title`` result per matching file, in
        discovery order (the value type is whatever the getter returns).
    """
    titles = []
    pattern = '*' + ext
    for subdir, _dirs, _names in os.walk(rootdir):
        for path in glob.glob(os.path.join(subdir, pattern)):
            h5 = hget.open_h5_file_read(path)
            try:
                titles.append(hget.get_title(h5))
            finally:
                # Release the handle even if the getter raises, so a single
                # bad file does not leave an HDF5 file open.
                h5.close()
    return titles

# checked for multiple songs - none in the subset
def get_files_with_multiple_songs(rootdir, ext='.h5'):
    """Find song files under ``rootdir`` that hold more than one song.

    Each file matching ``*<ext>`` anywhere below ``rootdir`` is opened with
    ``hget.open_h5_file_read`` and kept when ``hget.get_num_songs`` reports
    a count greater than 1.

    Args:
        rootdir: Directory at which the recursive search starts.
        ext: Filename suffix to match, including the leading dot.

    Returns:
        A list of the matching file paths, as produced by ``glob``.
    """
    multi_song_files = []
    pattern = '*' + ext
    for subdir, _dirs, _names in os.walk(rootdir):
        for path in glob.glob(os.path.join(subdir, pattern)):
            h5 = hget.open_h5_file_read(path)
            try:
                if hget.get_num_songs(h5) > 1:
                    multi_song_files.append(path)
            finally:
                # Release the handle even if the getter raises.
                h5.close()
    return multi_song_files

# want to make sure each h5 has 3 groups: 'analysis', 'metadata', 'musicbrainz'
def assert_rootgroup_contains_same_number_groups(files, expected_groups=3):
    """Report files whose root group does not hold the expected node count.

    Despite the name, nothing is asserted: offending files are collected
    and returned so the caller can inspect them.

    Args:
        files: Iterable of paths to .h5 files.
        expected_groups: Number of nodes every file should have directly
            under its root group. Defaults to 3.

    Returns:
        A list of the paths whose root node count differs from
        ``expected_groups``. Paths are returned rather than file handles
        because every handle is closed before this function returns.
    """
    violations = []
    for path in files:
        h5 = hget.open_h5_file_read(path)
        try:
            # Too few nodes is as much a violation as too many.
            if len(h5.root._f_list_nodes()) != expected_groups:
                violations.append(path)
        finally:
            h5.close()
    return violations

In [4]:
# From a list of h5 files, extracts song metadata and creates a dataframe
def _song_file_to_frame(store):
    """Flatten every array and table node of one open HDFStore into a frame.

    Column names are the node path without its leading slash and with
    ``/`` replaced by ``_``; table columns get ``<node>_<column>``.
    """
    data = pd.DataFrame()
    # Walk nodes under root
    for item in store.root._f_walknodes():
        # Get name for column
        name = item._v_pathname[1:].replace('/', '_')
        # Store arrays: the whole array becomes a single cell
        if type(item) is tables.earray.EArray:
            data[name] = [np.array(item)]
        # Store tables: one column per table field
        elif type(item) is tables.table.Table:
            cols = item.coldescrs.keys()
            for row in item:
                for col in cols:
                    col_name = '_'.join([name, col])
                    try:
                        # Each assignment replaces the column, so for a
                        # multi-row table the last row's value is kept.
                        data[col_name] = row[col]
                    except Exception as e:
                        # Best effort: report the failing field and go on.
                        print(e)
    return data


# From a list of h5 files, extracts song metadata and creates a dataframe
def extract_song_data(files):
    """Build one DataFrame from a list of song .h5 files.

    Every file is opened read-only, flattened by ``_song_file_to_frame``,
    and the per-file frames are concatenated once at the end.

    Args:
        files: Sequence of paths to .h5 files.

    Returns:
        A DataFrame with a fresh 0..n-1 index holding the rows of all
        per-file frames; an empty DataFrame when ``files`` is empty.
    """
    # Get total h5 file count
    size = len(files)
    print(size, 'files found.')

    frames = []
    for f in files:
        # Update progress bar
        # progress(i, size, 'of files processed')

        # Open read-only: these are input files, and the default mode ('a')
        # would open them for writing.
        s_hdf = pd.HDFStore(f, mode='r')
        try:
            frames.append(_song_file_to_frame(s_hdf))
        finally:
            # Close the store even when a file cannot be processed.
            s_hdf.close()

    # A single concat replaces per-file DataFrame.append, which copied the
    # whole accumulated frame on every iteration and no longer exists in
    # pandas 2.x. concat rejects an empty list, hence the guard.
    if frames:
        df = pd.concat(frames, ignore_index=True, sort=False)
    else:
        df = pd.DataFrame()

    # Dev set of columns
    # df = df[['metadata_songs_artist_id','metadata_songs_title','musicbrainz_songs_year','metadata_artist_terms','analysis_songs_analysis_sample_rate','metadata_songs_artist_location','analysis_sections_confidence','analysis_sections_start','analysis_segments_start','analysis_segments_timbre','analysis_segments_pitches','analysis_songs_tempo','analysis_bars_confidence','analysis_bars_start','analysis_beats_confidence','analysis_beats_start','analysis_songs_duration','analysis_songs_energy','analysis_songs_key','analysis_songs_key_confidence','analysis_songs_time_signature','analysis_songs_time_signature_confidence','metadata_similar_artists']]

    # Drop bad columns
    # df.drop(['musicbrainz_artist_mbtags_count','musicbrainz_artist_mbtags',
    #          'musicbrainz_songs_idx_artist_mbtags'], inplace=True, axis=1)

    return df

In [5]:
# converts columns in the df that are represented as byte strings to regular strings
def convert_bytes_to_string(df):
    """Decode byte-string columns of ``df`` to ``str`` in place (UTF-8).

    A column is treated as a byte-string column when its first non-null
    value is a ``bytes`` instance. Columns with no non-null value, and
    frames with no rows, are left untouched.

    Args:
        df: DataFrame to modify in place.

    Returns:
        None. ``df`` itself is updated.
    """
    for col in df.columns:
        non_null = df[col].dropna()
        # Nothing to probe: empty frame or all-null column.
        if non_null.empty:
            continue
        # isinstance also accepts bytes subclasses such as numpy.bytes_.
        if isinstance(non_null.iloc[0], bytes):
            df[col] = df[col].str.decode('utf-8')

In [6]:
# takes a list of h5 files, opens all and closes them properly
def close_all_open_h5(files):
    """Open each file read-only with h5py and close the handle again.

    Args:
        files: Iterable of paths to .h5 files.

    Returns:
        A list of the paths whose handle was open and has been closed by
        this call; an empty list when ``files`` is empty.
    """
    closed = []
    for file in files:
        hFile = h5py.File(file, 'r')
        # Only handles that report themselves as open are closed.
        if hFile:
            hFile.close()
            closed.append(file)

    return closed

In [7]:
def get_user_taste_data(filename):
    """Read a header-less, tab-separated file into a DataFrame.

    Args:
        filename: Path of the file to read.

    Returns:
        A DataFrame whose columns are named ``user``, ``song`` and
        ``count``, in that order.
    """
    column_names = ['user', 'song', 'count']
    return pd.read_csv(filename, names=column_names, header=None, sep='\t')

In [8]:
def get_mapping_dict(filename):
    """Read a header-less, tab-separated two-column file into a dict.

    Args:
        filename: Path of the file to read; its columns are taken as
            ``song_id`` and ``track_id``.

    Returns:
        A dict mapping each ``song_id`` to its ``track_id``. When a
        ``song_id`` occurs more than once, the last row wins.
    """
    frame = pd.read_csv(filename, sep='\t', header=None,
                        names=['song_id', 'track_id'])
    pairs = zip(frame['song_id'], frame['track_id'])
    return {song_id: track_id for song_id, track_id in pairs}

In [9]:
# pipeline to compile workable dataset
def load_data_pipeline(msdpath, userdatapath, mappingpath):
    """Load the three inputs of the analysis in one call.

    Args:
        msdpath: Root directory searched recursively for song .h5 files.
        userdatapath: Path of the tab-separated user/song/count file.
        mappingpath: Path of the tab-separated song_id/track_id file.

    Returns:
        A tuple ``(song_frame, user_frame, mapping)``: the song DataFrame
        with byte-string columns decoded, the user taste DataFrame, and
        the song_id -> track_id dict.
    """
    song_frame = extract_song_data(load_all_h5_files(msdpath))
    # Decoding happens in place; the helper has no return value.
    convert_bytes_to_string(song_frame)

    user_frame = get_user_taste_data(userdatapath)
    return song_frame, user_frame, get_mapping_dict(mappingpath)

In [10]:
%%time
# Run the full load on the relative paths defined earlier. This is slow: the
# recorded output reports about 22 minutes of wall time for 10000 files.
df_song_data, df_user_data, mapping_dict = load_data_pipeline(rel_msdpath, rel_userdatapath, rel_mappingpath)

10000 files found.
CPU times: user 19min 5s, sys: 1min 8s, total: 20min 13s
Wall time: 22min 25s


In [99]:
# Independent copies of the loaded frames (presumably so later cells can be
# re-run without repeating the slow load).
df_song_backup = df_song_data.copy()
df_user_backup = df_user_data.copy()

In [100]:
# NOTE: plain assignment creates aliases, not copies. df_song / df_user refer
# to the same objects as df_song_data / df_user_data, so in-place changes
# made through either name are visible through both.
df_song = df_song_data
df_user = df_user_data

In [111]:
# Join play counts to song metadata on the song id. merge defaults to an
# inner join, so user rows whose song is not in df_song are dropped.
df_merged = df_user.merge(df_song, left_on='song', right_on='metadata_songs_song_id')

In [112]:
# Preview the first rows of the merged frame.
df_merged.head()

Unnamed: 0,user,song,count,analysis_bars_confidence,analysis_bars_start,analysis_beats_confidence,analysis_beats_start,analysis_sections_confidence,analysis_sections_start,analysis_segments_confidence,...,metadata_songs_release,metadata_songs_release_7digitalid,metadata_songs_song_hotttnesss,metadata_songs_song_id,metadata_songs_title,metadata_songs_track_7digitalid,musicbrainz_artist_mbtags,musicbrainz_artist_mbtags_count,musicbrainz_songs_idx_artist_mbtags,musicbrainz_songs_year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOWEZSI12A81C21CE6,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.71, 0.772, 0.791, ...","[0.18795, 0.5345, 0.90098, 1.26745, 1.6303, 1....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.71, 0.772, 0.791, ...","[0.18795, 0.5345, 0.90098, 1.26745, 1.6303, 1....","[1.0, 0.63, 0.519, 0.637, 0.806, 0.526, 0.39, ...","[0.0, 27.81345, 38.36246, 48.17733, 79.82575, ...","[0.0, 1.0, 1.0, 1.0, 0.31, 0.298, 0.8, 0.232, ...",...,Greatest Hits,283353,0.778821,SOWEZSI12A81C21CE6,Tu Quieres Volver,3167278,"[b'classic pop and rock', b'folk']","[1, 1]",0,1987
1,833c530ecda3d99deb8395f70400aa3999783d91,SOWEZSI12A81C21CE6,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.71, 0.772, 0.791, ...","[0.18795, 0.5345, 0.90098, 1.26745, 1.6303, 1....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.71, 0.772, 0.791, ...","[0.18795, 0.5345, 0.90098, 1.26745, 1.6303, 1....","[1.0, 0.63, 0.519, 0.637, 0.806, 0.526, 0.39, ...","[0.0, 27.81345, 38.36246, 48.17733, 79.82575, ...","[0.0, 1.0, 1.0, 1.0, 0.31, 0.298, 0.8, 0.232, ...",...,Greatest Hits,283353,0.778821,SOWEZSI12A81C21CE6,Tu Quieres Volver,3167278,"[b'classic pop and rock', b'folk']","[1, 1]",0,1987
2,d6c5bd2b570b4faf8964d7ed04f3392ff505d2be,SOWEZSI12A81C21CE6,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.71, 0.772, 0.791, ...","[0.18795, 0.5345, 0.90098, 1.26745, 1.6303, 1....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.71, 0.772, 0.791, ...","[0.18795, 0.5345, 0.90098, 1.26745, 1.6303, 1....","[1.0, 0.63, 0.519, 0.637, 0.806, 0.526, 0.39, ...","[0.0, 27.81345, 38.36246, 48.17733, 79.82575, ...","[0.0, 1.0, 1.0, 1.0, 0.31, 0.298, 0.8, 0.232, ...",...,Greatest Hits,283353,0.778821,SOWEZSI12A81C21CE6,Tu Quieres Volver,3167278,"[b'classic pop and rock', b'folk']","[1, 1]",0,1987
3,724534729c9f5dc72a009269c2c225883e4775d2,SOWEZSI12A81C21CE6,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.71, 0.772, 0.791, ...","[0.18795, 0.5345, 0.90098, 1.26745, 1.6303, 1....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.71, 0.772, 0.791, ...","[0.18795, 0.5345, 0.90098, 1.26745, 1.6303, 1....","[1.0, 0.63, 0.519, 0.637, 0.806, 0.526, 0.39, ...","[0.0, 27.81345, 38.36246, 48.17733, 79.82575, ...","[0.0, 1.0, 1.0, 1.0, 0.31, 0.298, 0.8, 0.232, ...",...,Greatest Hits,283353,0.778821,SOWEZSI12A81C21CE6,Tu Quieres Volver,3167278,"[b'classic pop and rock', b'folk']","[1, 1]",0,1987
4,ee7aa84c164038c963cfd02a7e52a5598aa470c3,SOWEZSI12A81C21CE6,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.71, 0.772, 0.791, ...","[0.18795, 0.5345, 0.90098, 1.26745, 1.6303, 1....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.71, 0.772, 0.791, ...","[0.18795, 0.5345, 0.90098, 1.26745, 1.6303, 1....","[1.0, 0.63, 0.519, 0.637, 0.806, 0.526, 0.39, ...","[0.0, 27.81345, 38.36246, 48.17733, 79.82575, ...","[0.0, 1.0, 1.0, 1.0, 0.31, 0.298, 0.8, 0.232, ...",...,Greatest Hits,283353,0.778821,SOWEZSI12A81C21CE6,Tu Quieres Volver,3167278,"[b'classic pop and rock', b'folk']","[1, 1]",0,1987


In [114]:
# NOTE: despite the .csv suffix this writes a zip-compressed pickle, not CSV
# text. Read it back with pd.read_pickle(..., compression='zip').
df_merged.to_pickle('../datasets/msd_tastes_merged.csv', compression='zip')

In [88]:
# NOTE: despite the .csv suffix this writes a zip-compressed pickle, not CSV
# text. Read it back with pd.read_pickle(..., compression='zip').
df_song_data.to_pickle(r'../datasets/msd.csv', compression='zip')

In [90]:
# Reload the zip-compressed pickle of the song frame to check the round trip.
df_try = pd.read_pickle('../datasets/msd.csv', compression='zip')

In [91]:
# Show one cell (row 0, column 4) of the reloaded frame and its Python type.
print(df_try.iloc[0,4])
print(type(df_try.iloc[0,4]))

[1.    1.    0.508 0.334 0.113 0.593]
<class 'numpy.ndarray'>


In [92]:
# Show the same cell (row 0, column 4) of the in-memory frame and its type,
# for comparison with the reloaded copy.
print(df_song_data.iloc[0,4])
print(type(df_song_data.iloc[0,4]))

[1.    1.    0.508 0.334 0.113 0.593]
<class 'numpy.ndarray'>


## Preprocessing

In [36]:
# (rows, columns) of the song frame.
df_song_data.shape

(10000, 74)

In [41]:
# Per-column count of missing values, keeping only columns with at least one.
# `where` turns the zero counts into NaN (which makes the result float) and
# `dropna` then removes them.
df_song_data_nan = pd.isnull(df_song_data).sum().where(lambda x : x > 0).dropna()
df_user_data_nan = pd.isnull(df_user_data).sum().where(lambda x : x > 0).dropna()

In [42]:
# An empty Series here means no column of the user data has missing values.
df_user_data_nan

Series([], dtype: float64)