In [None]:
import tarfile
import h5py
import pandas as pd
import os

In [None]:
# List the members of the MSD subset archive. Nothing is extracted here; the
# cells below read from an already-unpacked "MillionSongSubset" folder.
# The with-block closes the archive even if reading the member list fails.
with tarfile.open("millionsongsubset.tar.gz") as tar:
    tar_members = tar.getmembers()

# To print the names of the files in the archive:
#     for member in tar_members:
#         print(member.name)

In [None]:
# Inspect the layout of one song file from the MSD subset
filename = "MillionSongSubset/B/G/M/TRBGMWQ12903CC23CD.h5"

with h5py.File(filename, "r") as f:
    # Names of the top-level groups stored for the song
    dict_song_keys = list(f.keys())

    # Handles to the three groups: song metadata, analysis of the audio data,
    # and musicbrainz
    metadata, analysis, musicbrainz = f["metadata"], f["analysis"], f["musicbrainz"]

    # Names of the entries inside each group
    metadata_keys = list(metadata.keys())
    analysis_keys = list(analysis.keys())
    musicbrainz_keys = list(musicbrainz.keys())

print(dict_song_keys)

In [None]:
# Choose main dictionary between: "metadata", "analysis" or "musicbrainz"
def parsing_h5file(file_name, main_dictionary):
    """Read every entry of one top-level group of a song's HDF5 file.

    Parameters
    ----------
    file_name : path of the .h5 file; it is opened read-only.
    main_dictionary : name of the top-level group to read
        ("metadata", "analysis" or "musicbrainz").

    Returns
    -------
    dict mapping each key of the group to the value read from it with ``[()]``.
    """
    with h5py.File(file_name, "r") as h5_file:
        group = h5_file[main_dictionary]
        # Every entry is read inside the with-block, before the file is closed
        return {name: group[name][()] for name in group.keys()}

### Getting features for one song

In [None]:
# Path of the example song's HDF5 file (only the path is set here; parsing_h5file opens the file)
file_name = "MillionSongSubset/B/G/M/TRBGMWQ12903CC23CD.h5"
# Read the "metadata" group of that song into a dict and show its keys
metadata_dict = parsing_h5file(file_name, "metadata")
print(metadata_dict.keys())

In [None]:
# Read the "analysis" group of the same song (file_name is set in an earlier cell) and show its keys
analysis_dict = parsing_h5file(file_name, "analysis")
print(analysis_dict.keys())

In [None]:
# Read the "musicbrainz" group of the same song (file_name is set in an earlier cell) and show its keys
musicbrainz_dict = parsing_h5file(file_name, "musicbrainz")
print(musicbrainz_dict.keys())

### Getting features for several songs located in the same folder

In [None]:
# Parsing several songs  
def parsing_h5files(path_to_files, main_dictionary):
    """Load one top-level group from every .h5 file directly inside a folder.

    Note: a later cell in this notebook defines another function with the
    same name that walks sub-folders; once that cell has run, it replaces
    this definition.

    Parameters
    ----------
    path_to_files : folder to scan. Only files directly inside it whose name
        ends with ".h5" are read; sub-folders are not visited.
    main_dictionary : name of the top-level group to read from each file
        ("metadata", "analysis" or "musicbrainz").

    Returns
    -------
    pandas.DataFrame with one row per file and one column per key found in
    the group. The frame is empty when the folder holds no .h5 files.
    """
    # Sorted so the row order does not depend on the order os.listdir returns
    h5_paths = sorted(
        os.path.join(path_to_files, entry)
        for entry in os.listdir(path_to_files)
        if entry.endswith(".h5")
    )

    # One dict per song; the DataFrame is built once at the end
    records = []
    for h5_path in h5_paths:
        with h5py.File(h5_path, "r") as h5_file:
            group = h5_file[main_dictionary]
            records.append({key: group[key][()] for key in group.keys()})

    # Columns come from the records themselves rather than from a variable
    # left over by the loop, so an empty folder yields an empty frame instead
    # of an error, and keys present in only some files are kept.
    return pd.DataFrame(records)

In [None]:
# Folder whose .h5 files are loaded into a DataFrame ("metadata" group of each file)
path_to_files = "MillionSongSubset/B/G/K/"
metadata_df = parsing_h5files(path_to_files, "metadata")
# Uncomment to show the resulting DataFrame
#display(metadata_df)

### Getting features for all songs

In [None]:
# Function to retrieve info from all files in MillionSongSubset
def parsing_h5files(dataset_name, main_dictionary):
    """Load one top-level group from every .h5 file found under a folder tree.

    Note: this reuses the name of the single-folder loader defined in an
    earlier cell and replaces it once this cell has run.

    Parameters
    ----------
    dataset_name : root folder of the dataset; it is walked recursively and
        every file whose name ends with ".h5" is read.
    main_dictionary : name of the top-level group to read from each file
        ("metadata", "analysis" or "musicbrainz").

    Returns
    -------
    pandas.DataFrame with one row per file and one column per key found in
    the group. The frame is empty when no .h5 file is found.
    """
    # One dict per song; the DataFrame is built once at the end
    records = []
    for subdir, dirs, files in os.walk(dataset_name):
        # Sorting dirs in place fixes the order in which os.walk descends,
        # so the row order is reproducible across runs and machines
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.h5'):
                continue
            with h5py.File(os.path.join(subdir, file), 'r') as h5_file:
                group = h5_file[main_dictionary]
                records.append({key: group[key][()] for key in group.keys()})

    # Columns come from the records themselves rather than from a variable
    # left over by the loop, so an empty tree yields an empty frame instead
    # of an error, and keys present in only some files are kept.
    return pd.DataFrame(records)

In [None]:
# Load the "metadata" group of every song under the dataset root.
# Note: this rebinds metadata_df, replacing the single-folder result from the earlier cell.
dataset_name = 'MillionSongSubset'
metadata_df = parsing_h5files(dataset_name, "metadata")
# Uncomment to preview the first rows
#print(metadata_df.head())