# Song Hotness Predictor: Data Acquisition & Preprocessing

In this notebook, we will:
1. Download/Access the Million Song Dataset
2. Load required libraries and the dataset
3. Understand the dataset structure
4. Preprocess the data (handle missing values, data imbalance, and perform feature scaling).

In [1]:
import os
import pandas as pd
import numpy as np
import tarfile
import requests
import h5py
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from tqdm import tqdm

import utils.hdf5_getters as hdf5_getters

## Step 1: Data Acquisition

The Million Song Dataset (MSD) is very large, so here we demonstrate accessing a subset or a sample. We downloaded the dataset from their website: http://millionsongdataset.com/pages/getting-dataset/

In [2]:
dataset_url = "http://labrosa.ee.columbia.edu/~dpwe/tmp/millionsongsubset.tar.gz"
extraction_path = "data"
local_filename = f"{extraction_path}/millionsongsubset.tar.gz"

In [3]:
def download_dataset(url, filename):
    # Download the dataset with a progress bar
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    block_size = 1024  # 1 Kilobyte

    with open(filename, 'wb') as file, tqdm(
        total=total_size, unit='iB', unit_scale=True, desc="Downloading"
    ) as bar:
        for data in response.iter_content(block_size):
            bar.update(len(data))
            file.write(data)
    # Check if file size matches expected size
    actual_size = os.path.getsize(filename)
    if actual_size < total_size:
        raise Exception("Download incomplete: expected {} bytes but got {} bytes".format(total_size, actual_size))
    print("Download complete.")

# Download if file does not exist or if re-download is needed
if not os.path.exists(local_filename):
    try:
        download_dataset(dataset_url, local_filename)
    except Exception as e:
        print("Error during download:", e)
        # Optionally, delete the incomplete file:
        if os.path.exists(local_filename):
            os.remove(local_filename)
else:
    print("Dataset already downloaded.")

# Extract the dataset
if not os.path.exists(extraction_path):
    os.makedirs(extraction_path, exist_ok=True)
    try:
        with tarfile.open(local_filename, "r:gz") as tar:
            tar.extractall(path=extraction_path)
        print("Extraction complete. Files are available in:", extraction_path)
    except (tarfile.TarError, EOFError) as e:
        print("Error during extraction:", e)
        print("The archive may be corrupted. Consider re-downloading the file.")
else:
    print("Dataset already extracted.")

Dataset already downloaded.
Dataset already extracted.


Since the files are in h5 format, we would like to keep it in a CSV format so we can use pandas to inspect the data more easily

In [7]:
# Define a dictionary of fields and their corresponding getter functions.
fields = {
    "artist_familiarity": hdf5_getters.get_artist_familiarity,
    "artist_hotttnesss": hdf5_getters.get_artist_hotttnesss,
    "artist_id": hdf5_getters.get_artist_id,
    "artist_mbid": hdf5_getters.get_artist_mbid,
    "artist_playmeid": hdf5_getters.get_artist_playmeid,
    "artist_7digitalid": hdf5_getters.get_artist_7digitalid,
    "artist_latitude": hdf5_getters.get_artist_latitude,
    "artist_longitude": hdf5_getters.get_artist_longitude,
    "artist_location": hdf5_getters.get_artist_location,
    "artist_name": hdf5_getters.get_artist_name,
    "release": hdf5_getters.get_release,
    "release_7digitalid": hdf5_getters.get_release_7digitalid,
    "song_id": hdf5_getters.get_song_id,
    "song_hotttnesss": hdf5_getters.get_song_hotttnesss,
    "title": hdf5_getters.get_title,
    "track_7digitalid": hdf5_getters.get_track_7digitalid,
    "similar_artists": hdf5_getters.get_similar_artists,
    "artist_terms": hdf5_getters.get_artist_terms,
    "artist_terms_freq": hdf5_getters.get_artist_terms_freq,
    "artist_terms_weight": hdf5_getters.get_artist_terms_weight,
    "analysis_sample_rate": hdf5_getters.get_analysis_sample_rate,
    "audio_md5": hdf5_getters.get_audio_md5,
    "danceability": hdf5_getters.get_danceability,
    "duration": hdf5_getters.get_duration,
    "end_of_fade_in": hdf5_getters.get_end_of_fade_in,
    "energy": hdf5_getters.get_energy,
    "key": hdf5_getters.get_key,
    "key_confidence": hdf5_getters.get_key_confidence,
    "loudness": hdf5_getters.get_loudness,
    "mode": hdf5_getters.get_mode,
    "mode_confidence": hdf5_getters.get_mode_confidence,
    "start_of_fade_out": hdf5_getters.get_start_of_fade_out,
    "tempo": hdf5_getters.get_tempo,
    "time_signature": hdf5_getters.get_time_signature,
    "time_signature_confidence": hdf5_getters.get_time_signature_confidence,
    "track_id": hdf5_getters.get_track_id,
    "segments_start": hdf5_getters.get_segments_start,
    "segments_confidence": hdf5_getters.get_segments_confidence,
    "segments_pitches": hdf5_getters.get_segments_pitches,
    "segments_timbre": hdf5_getters.get_segments_timbre,
    "segments_loudness_max": hdf5_getters.get_segments_loudness_max,
    "segments_loudness_max_time": hdf5_getters.get_segments_loudness_max_time,
    "segments_loudness_start": hdf5_getters.get_segments_loudness_start,
    "sections_start": hdf5_getters.get_sections_start,
    "sections_confidence": hdf5_getters.get_sections_confidence,
    "beats_start": hdf5_getters.get_beats_start,
    "beats_confidence": hdf5_getters.get_beats_confidence,
    "bars_start": hdf5_getters.get_bars_start,
    "bars_confidence": hdf5_getters.get_bars_confidence,
    "tatums_start": hdf5_getters.get_tatums_start,
    "tatums_confidence": hdf5_getters.get_tatums_confidence,
    "artist_mbtags": hdf5_getters.get_artist_mbtags,
    "artist_mbtags_count": hdf5_getters.get_artist_mbtags_count,
    "year": hdf5_getters.get_year,
}

root_dir = "data/MillionSongSubset"
records = []

# Traverse all .h5 files in the directory structure
for dirpath, _, filenames in os.walk(root_dir):
    for file in filenames:
        if file.endswith('.h5'):
            filepath = os.path.join(dirpath, file)
            try:
                h5 = hdf5_getters.open_h5_file_read(filepath)
                record = {}
                # Extract all fields using the provided getter functions
                for field, getter in fields.items():
                    try:
                        # Use songidx=0 for extraction (modify if needed for aggregated files)
                        record[field] = getter(h5, songidx=0)
                    except Exception as e:
                        print(f"Error extracting {field} from {filepath}: {e}")
                        record[field] = None
                h5.close()
                records.append(record)
            except Exception as e:
                print(f"Error processing {filepath}: {e}")

# Convert the list of records to a DataFrame and save as CSV
df = pd.DataFrame(records)
output_csv = "data/million_song_dataset.csv"
df.to_csv(output_csv, index=False)
print(f"CSV saved to: {output_csv}")

CSV saved to: data/million_song_dataset.csv


In [8]:
df.head()

Unnamed: 0,artist_familiarity,artist_hotttnesss,artist_id,artist_mbid,artist_playmeid,artist_7digitalid,artist_latitude,artist_longitude,artist_location,artist_name,...,sections_confidence,beats_start,beats_confidence,bars_start,bars_confidence,tatums_start,tatums_confidence,artist_mbtags,artist_mbtags_count,year
0,0.55746,0.386152,b'AREJXK41187B9A4ACC',b'c43bb0d6-94d7-410f-80fb-e5a243b18d23',2676,16971,46.71067,1.71819,b'France',b'Rapha\xc3\xabl',...,"[1.0, 1.0, 0.508, 0.334, 0.113, 0.593]","[0.04257, 0.52458, 1.00659, 1.48632, 1.96605, ...","[0.886, 0.725, 0.748, 0.721, 0.784, 0.417, 0.4...","[0.52458, 2.4444, 4.3658, 6.30006, 8.22086, 10...","[0.179, 0.373, 0.127, 0.015, 0.012, 0.119, 0.0...","[0.04257, 0.28357, 0.52458, 0.76558, 1.00659, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[],[],2008
1,0.626958,0.43486,b'AR2XRFQ1187FB417FE',b'a69cd724-2f57-4ed0-bfed-ba20401eb84c',5772,92108,,,b'',b'Julie Zenatti',...,"[1.0, 0.502, 0.721, 0.552, 0.575, 0.704, 0.332...","[0.59161, 1.33555, 2.07949, 2.82093, 3.56238, ...","[0.773, 0.247, 0.402, 0.115, 0.092, 0.436, 0.5...","[8.69527, 12.01758, 14.81363, 20.09003, 22.693...","[0.716, 0.306, 0.015, 0.572, 0.529, 0.353, 0.2...","[0.59161, 0.84042, 1.08799, 1.33555, 1.58436, ...","[0.365, 0.321, 0.29, 0.264, 0.238, 0.217, 0.20...",[],[],2004
2,0.425724,0.0,b'ARODOO01187FB44F4A',b'60bd8a1c-c093-4849-8f28-08101ca059b1',20540,1701,,,b'',b'The Baltimore Consort',...,"[1.0, 0.163, 0.07, 0.06]","[1.95344, 3.0622, 4.16548, 5.26467, 6.35701, 7...","[0.506, 0.467, 0.45, 0.682, 0.341, 0.406, 0.38...","[1.95344, 6.35701, 10.70579, 15.07913, 19.4832...","[0.014, 0.025, 0.049, 0.029, 0.11, 0.484, 0.35...","[1.11537, 1.39701, 1.67864, 1.95344, 2.23097, ...","[0.307, 0.305, 0.291, 0.284, 0.281, 0.28, 0.27...",[],[],0
3,0.611495,0.33452,b'ARJGW911187FB586CA',b'44b5b950-2ae2-403a-8c67-82d8fc72033d',-1,92184,,,b'',b'I Hate Sally',...,"[1.0, 0.744, 0.36, 0.628, 0.147, 0.093]","[0.39309, 1.17222, 1.95902, 2.75109, 3.55236, ...","[0.862, 0.509, 0.011, 0.39, 0.479, 0.422, 0.45...","[0.39309, 2.75109, 5.13649, 7.52978, 9.95582, ...","[0.095, 0.147, 0.157, 0.191, 0.115, 0.222, 0.2...","[0.11929, 0.39309, 0.65603, 0.91083, 1.17222, ...","[0.731, 0.628, 0.526, 0.359, 0.287, 0.246, 0.2...",[],[],2007
4,0.367255,0.311616,b'AR9HQ6Y1187FB3C2CB',b'0e6524bd-6641-46a6-bce5-96f06c19aa46',-1,278655,,,b'',b'Orlando Pops Orchestra',...,"[1.0, 0.171, 0.658, 0.609, 0.551, 0.591, 0.525...","[0.49266, 0.99301, 1.49336, 1.99371, 2.49406, ...","[0.382, 0.374, 0.464, 0.343, 0.553, 0.49, 0.63...","[0.99301, 2.99441, 4.96889, 6.9402, 8.92127, 1...","[0.32, 0.081, 0.567, 0.328, 0.112, 0.004, 0.09...","[0.24248, 0.49266, 0.74283, 0.99301, 1.24318, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[],[],0


# OLD CODE BELOW

Here, we have a table/dataframe where each row represents a song and each column represents a feature or metdadata from the extracted H5 files. We see that the columns are organized into different hierarchical groups, such as `analysis`, `metadata`, and `musicbrainz`.

Observe any missing values

In [9]:
print(df.isnull().sum())

artist_familiarity               4
artist_hotttnesss                0
artist_id                        0
artist_mbid                      0
artist_playmeid                  0
artist_7digitalid                0
artist_latitude               6258
artist_longitude              6258
artist_location                  0
artist_name                      0
release                          0
release_7digitalid               0
song_id                          0
song_hotttnesss               4352
title                            0
track_7digitalid                 0
similar_artists                  0
artist_terms                     0
artist_terms_freq                0
artist_terms_weight              0
analysis_sample_rate             0
audio_md5                        0
danceability                     0
duration                         0
end_of_fade_in                   0
energy                           0
key                              0
key_confidence                   0
loudness            

In [10]:
print(df.describe())

       artist_familiarity  artist_hotttnesss  artist_playmeid  \
count         9996.000000       10000.000000     10000.000000   
mean             0.565456           0.385552     25547.117000   
std              0.160161           0.143647     44001.390646   
min              0.000000           0.000000        -1.000000   
25%              0.467611           0.325266        -1.000000   
50%              0.563666           0.380742      2190.000000   
75%              0.668020           0.453858     33312.500000   
max              1.000000           1.082503    242965.000000   

       artist_7digitalid  artist_latitude  artist_longitude  \
count       10000.000000      3742.000000       3742.000000   
mean       109541.695500        37.157357        -63.933358   
std        142079.791611        15.598494         50.508223   
min            -1.000000       -41.280540       -162.436500   
25%         10435.000000        33.748310        -93.291560   
50%         40247.000000        38.9

In [11]:
# print all the coluns
print(df.columns)

# print additional info
print(df.info())

Index(['artist_familiarity', 'artist_hotttnesss', 'artist_id', 'artist_mbid',
       'artist_playmeid', 'artist_7digitalid', 'artist_latitude',
       'artist_longitude', 'artist_location', 'artist_name', 'release',
       'release_7digitalid', 'song_id', 'song_hotttnesss', 'title',
       'track_7digitalid', 'similar_artists', 'artist_terms',
       'artist_terms_freq', 'artist_terms_weight', 'analysis_sample_rate',
       'audio_md5', 'danceability', 'duration', 'end_of_fade_in', 'energy',
       'key', 'key_confidence', 'loudness', 'mode', 'mode_confidence',
       'start_of_fade_out', 'tempo', 'time_signature',
       'time_signature_confidence', 'track_id', 'segments_start',
       'segments_confidence', 'segments_pitches', 'segments_timbre',
       'segments_loudness_max', 'segments_loudness_max_time',
       'segments_loudness_start', 'sections_start', 'sections_confidence',
       'beats_start', 'beats_confidence', 'bars_start', 'bars_confidence',
       'tatums_start', 'tatums

In [12]:
# print first row
print(df.iloc[0])

artist_familiarity                                                      0.55746
artist_hotttnesss                                                      0.386152
artist_id                                                 b'AREJXK41187B9A4ACC'
artist_mbid                             b'c43bb0d6-94d7-410f-80fb-e5a243b18d23'
artist_playmeid                                                            2676
artist_7digitalid                                                         16971
artist_latitude                                                        46.71067
artist_longitude                                                        1.71819
artist_location                                                       b'France'
artist_name                                                   b'Rapha\xc3\xabl'
release                              b'Je Sais Que La Terre Est Plate (Deluxe)'
release_7digitalid                                                       275907
song_id                                 