# Perception and Multimedia Computing Final Project
## A Simple, Euclidean Distance-Based Music Recommendation System
### Julia Wilkins, December 2016

First, import all necessary libraries and extra packages.

In [1]:
import numpy as np, scipy as sp, matplotlib.pyplot as plt, matplotlib, librosa, math, os, glob, json, IPython
from IPython.display import Audio
from tinytag import TinyTag
from scipy.spatial import distance

ModuleNotFoundError: No module named 'numpy'

Next, import your dataset of songs, preferably from iTunes, into the 'data' folder (I have kept a few sample songs in there). We concatenate these file paths into a list called 'file_list' that will be used throughout. 

Note: The more songs you include in the dataset, the longer everything will take and the more CPU time is used. I recommend starting with no more than 40-50 songs, which will still take a long time to process. Try using 20-30 for a quicker result! Also, the year and genre features are extracted from the built-in iTunes/Finder metadata, so your results will be more accurate if your chosen songs have this info.

In [2]:
# Collect every supported audio file from the 'data' folder: all .mp3 matches
# first, then .m4a, then .wav.
file_list = [
    path
    for pattern in ('*.mp3', '*.m4a', '*.wav')
    for path in glob.glob(os.path.join('data', pattern))
]
#print(file_list) # make sure everything is here so far!

## Feature Extraction/Computation

Next, we perform various feature calculations for each song, including BPM, RMS, spectral centroid, and MFCC average. We will use these features to calculate how similar each song is to every other song.

In [3]:
# Calculates approximate BPM of a song using Librosa's tempo estimator.
def calculate_bpm(song_path):
    """Estimate the tempo of the audio file at ``song_path``.

    The file is loaded at 44100 Hz, an onset-strength envelope is computed
    from the signal, and a single tempo estimate is derived from that
    envelope.

    Args:
        song_path: Path of the audio file to analyse.

    Returns:
        The estimated tempo (beats per minute) as a scalar.
    """
    # Keyword arguments are used because recent librosa releases no longer
    # accept the sample rate / signal positionally.
    y, sr = librosa.load(song_path, sr=44100)
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)

    # estimate_tempo only exists in old librosa releases; keep using it where
    # it is available so results on those installs do not change.
    legacy_estimator = getattr(librosa.beat, 'estimate_tempo', None)
    if legacy_estimator is not None:
        return legacy_estimator(onset_env, sr=sr)

    # Its replacement is tempo(): exposed as librosa.feature.tempo on the
    # newest releases and as librosa.beat.tempo on the ones in between.
    tempo_fn = getattr(librosa.feature, 'tempo', None) or librosa.beat.tempo
    tempo = tempo_fn(onset_envelope=onset_env, sr=sr)
    # tempo() returns an array; callers expect one scalar per song.
    return float(np.ravel(tempo)[0])


# Mean spectral centroid of a song.
# A rough measure of the overall brightness / timbre of the track.
def calculate_spectral_centroid(song_path):
    """Return the spectral centroid of ``song_path`` averaged over the track.

    No sample rate is passed to ``librosa.load`` here, so librosa's default
    rate is used (the other feature functions in this file request 44100).
    """
    signal, rate = librosa.load(song_path)
    centroids = librosa.feature.spectral_centroid(y=signal, sr=rate)
    # Collapse every value librosa returned into one number per song.
    return centroids.mean()


# Calculate average Root Mean Squared (RMS) energy for each song.
def calculate_av_RMS(song_path):
    """Return the mean RMS energy of the audio file at ``song_path``.

    Args:
        song_path: Path of the audio file to analyse (loaded at 44100 Hz).

    Returns:
        The mean of all RMS values librosa computes for the signal.
    """
    # sr must be passed by keyword on recent librosa releases.
    y, _ = librosa.load(song_path, sr=44100)
    # librosa renamed feature.rmse to feature.rms; prefer the current name
    # and fall back to the old one on legacy installs.
    rms_fn = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse
    rms_array = rms_fn(y=y)
    return np.mean(rms_array)


# Calculate average Mel Frequency Cepstral Coefficients for each song.
def calculate_MFCC(song_path):
    """Return the MFCCs of ``song_path`` averaged along axis 1.

    Args:
        song_path: Path of the audio file to analyse (loaded at 44100 Hz).

    Returns:
        A 1-D numpy array: the mean of ``librosa.feature.mfcc``'s output
        along axis 1, i.e. one averaged value per row of that output.
    """
    # sr must be passed by keyword on recent librosa releases.
    y, sr = librosa.load(song_path, sr=44100)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    return np.mean(mfcc, axis=1)



## Filling our dictionaries

Our metadata for each track will be stored in a nested dictionary of the form {song1_name : { feature1 : value,  feature2: value ... } ... songn : { feature1 : value ... }}. The outer dictionary contains the paths to each song as keys and an inner dictionary of features and associated results as values. The helper functions below fill each of these mini-dictionaries.

In [4]:
def year_filler(file_list):
    """Map each file path to the year stored in its tag metadata.

    Args:
        file_list: Iterable of audio file paths readable by TinyTag.

    Returns:
        A dict ``{path: year}`` where ``year`` is an int built from the first
        four characters of the tag's year field, or None when the field is
        missing, empty, or does not start with a number.
    """
    year_dict = {}
    for path in file_list:
        year_dict[path] = _parse_year(TinyTag.get(path).year)
    return year_dict


def _parse_year(raw_year):
    """Convert a raw year tag into an int, or None if it is unusable."""
    if not raw_year:
        return None
    try:
        # Only the leading four characters are kept, so values that carry
        # more than the year (e.g. a full date) still yield the year.
        return int(str(raw_year).strip()[:4])
    except ValueError:
        # A malformed tag should not abort processing of the whole dataset.
        return None


def genre_filler(file_list):
    """Map each file path to the genre string from its tag metadata.

    Files whose genre tag is missing or empty are mapped to None.
    """
    genres = {}
    for track in file_list:
        raw_genre = TinyTag.get(track).genre
        genres[track] = str(raw_genre) if raw_genre else None
    return genres


def bpm_filler(file_list):
    """Map each file path to its estimated BPM (from calculate_bpm) as a float."""
    return {track: float(calculate_bpm(track)) for track in file_list}


def RMS_filler(file_list):
    """Map each file path to its average RMS (from calculate_av_RMS) as a float."""
    return {track: float(calculate_av_RMS(track)) for track in file_list}


def MFCC_filler(file_list):
    """Map each file path to the value returned by calculate_MFCC for it.

    Unlike the other fillers, the value is stored as returned, without a
    float() conversion.
    """
    return {track: calculate_MFCC(track) for track in file_list}


def spec_filler(file_list):
    """Map each file path to its average spectral centroid as a float.

    The value comes from calculate_spectral_centroid.
    """
    return {track: float(calculate_spectral_centroid(track)) for track in file_list}

#bpm_filler(file_list)
#year_filler(file_list)

A small helper function that makes populating the nested dictionaries easier in fill_big:

In [5]:
# Copies one feature's per-song values into the main nested dictionary.
# Used by fill_big, once per feature.
def fill_dicts(meta_dict, info_dict, secondary_key):
    """Store each value of ``info_dict`` under ``secondary_key`` in ``meta_dict``.

    For every ``song -> value`` pair in ``info_dict`` this sets
    ``meta_dict[song][secondary_key] = value``. ``meta_dict`` is modified in
    place and also returned; every song in ``info_dict`` must already be a
    key of ``meta_dict``.
    """
    for song, value in info_dict.items():
        meta_dict[song][secondary_key] = value
    return meta_dict

Here we populate a large nested dictionary (meta_dict) of the format meta_dict[song_name][feature] = data. This performs the heavy lifting of calling the functions that have to process the audio, so it can take a while if you have a lot of files.

Note: I have occasionally experienced a strange error when attempting to access the Year of some files. I have not yet worked out how to get around this error, and would recommend printing out each file in year_filler to see which file the function is failing on, and simply removing that track from your dataset. More fixes to come soon!

**BE PATIENT!**

In [6]:
def fill_big(file_list):
    """Build the nested metadata dictionary for every song in ``file_list``.

    Returns a dict keyed by file path. Each value is an inner dict holding
    the keys 'Year', 'Genre', 'BPM', 'Spectral Centroid Average',
    'RMS Average' and 'MFCC Average', filled in by the corresponding filler
    function.
    """
    feature_names = (
        'Year',
        'Genre',
        'BPM',
        'Spectral Centroid Average',
        'RMS Average',
        'MFCC Average',
    )
    # Every song starts with all features present but unset (None).
    meta_dict = {song: dict.fromkeys(feature_names) for song in file_list}

    # (feature key, filler) pairs, run in this order.
    extraction_steps = (
        ('Year', year_filler),
        ('Genre', genre_filler),
        ('BPM', bpm_filler),
        ('RMS Average', RMS_filler),
        ('MFCC Average', MFCC_filler),
        ('Spectral Centroid Average', spec_filler),
    )
    for feature, filler in extraction_steps:
        meta_dict = fill_dicts(meta_dict, filler(file_list), feature)

    return meta_dict
 
# Build the metadata dictionary once for the whole dataset and keep it at
# module level as meta_dict; sim_filler reads it by that name later on.
meta_dict = fill_big(file_list)

In [7]:
# Uncomment this if you want to see what your metadata dictionary looks like! Helpful for understanding how the rest of the calculations are happening.
#print(meta_dict)

## Calculating Similarity 

We will now use our meta_dict dictionary to calculate the distance between each feature. track_distance, which calculates the total weighted distance over a number of features between any two tracks, will be called in sim_filler, as we see below.

As we aren't yet using this data and the distance values for classification/ML of any sort, I did not scale them to output a final similarity value between 0 and 1. I am simply summing the distances between the features of each track (with some objective weighting, as you can see in track_distance), and using the tracks with the lowest distances as the most similar tracks.

There is still work to be done here to improve the weighting/scaling of features. The next step is normalizing each feature to the range 0 to 1 and then taking a weighted average so that our similarity value is between 0 and 1.

In [8]:
# Computes total distance between all features given 2 tracks.
def track_distance(track1, track2, meta_dict):
    """Return the weighted sum of per-feature distances between two tracks.

    Args:
        track1: Key of the first track in ``meta_dict``.
        track2: Key of the second track in ``meta_dict``.
        meta_dict: Nested dict ``{track: {feature: value}}`` containing the
            keys 'Year', 'Genre', 'BPM', 'RMS Average',
            'Spectral Centroid Average' and 'MFCC Average' for both tracks.

    Returns:
        The total distance; smaller means more similar.
    """
    first = meta_dict[track1]
    second = meta_dict[track2]

    # A fixed distance of 5 is used whenever either track has no year data,
    # because a real difference cannot be computed.
    if first['Year'] is None or second['Year'] is None:
        year_dist = 5
    else:
        year_dist = np.abs(first['Year'] - second['Year'])

    bpm_dist = np.abs(first['BPM'] - second['BPM'])
    rms_dist = np.abs(first['RMS Average'] - second['RMS Average'])
    spec_dist = np.abs(first['Spectral Centroid Average'] - second['Spectral Centroid Average'])

    # using scipy built in euclidean distance function
    mfcc_dist = distance.euclidean(first['MFCC Average'], second['MFCC Average'])

    # we want a lower outcome if the genre matches
    genre_dist = 0 if first['Genre'] == second['Genre'] else 1

    # weight some of these depending on metric
    total_dist = year_dist + bpm_dist + (rms_dist * 5) + (spec_dist * 0.05) + mfcc_dist + (genre_dist * 15)
    return total_dist
 
# Example:    
#track_distance('data/02 Bodyache.m4a', 'data/01 Heartsigh.m4a', meta_dict)  

Fill a mini similarity dictionary with keys = song names, values = total similarity distance to original track.

In [9]:
# Builds the distance table for one reference track.
def sim_filler(original, file_list):
    """Map every path in ``file_list`` to its distance from ``original``.

    Distances come from track_distance and are stored as floats. The feature
    data is read from the module-level ``meta_dict``, so that dictionary must
    already be populated.
    """
    return {
        candidate: float(track_distance(original, candidate, meta_dict))
        for candidate in file_list
    }


# Replace the file name below with the song you want to get recommendations relative to.
# This song must be one of the songs in your original dataset.
# The path is built with os.path.join, exactly like the entries of file_list,
# so it matches the meta_dict keys whatever path separator the platform uses.

sim_dict = sim_filler(os.path.join('data', '02 Bodyache.m4a'), file_list)
#print(sim_dict)

In [10]:
# Ranks the similarity dictionary by distance, smallest first.
def sort_sim_dict(sim_dict):
    """Return the track names of ``sim_dict`` ordered by ascending distance.

    Each entry of the returned list is a 1-tuple ``(name,)`` rather than a
    bare string. Tracks with equal distance keep the order they have in
    ``sim_dict``.
    """
    ranked_pairs = list(sim_dict.items())
    ranked_pairs.sort(key=lambda pair: pair[1])
    # Drop the distance but keep the tuple wrapper around each name.
    return [(name,) for name, _distance in ranked_pairs]

# Order the tracks from smallest to largest distance to the chosen song.
result = sort_sim_dict(sim_dict)
#print(result)

## Listen to your results!
Now that we have our similarity values relative to each track, we can sort these distances in ascending order (smallest distance => closest features to original) and then return however many top recommended tracks you want to hear. Listen to them below!

In [12]:
# Listen back to your top n-1 tracks (the first will be your original, so n=3 if you want top 2 most similar. )
# YAY! This is fun.
def player(result, n):
    """Announce and play the first ``n`` entries of ``result``.

    Args:
        result: Ranked list of tracks, closest first. Entries may be the
            1-tuples ``(path,)`` produced by sort_sim_dict or plain path
            strings.
        n: Number of entries to play, counting the entry at index 0 (which
            is announced as the original track). Values larger than
            ``len(result)`` are capped.

    Returns:
        None. Each track's file name is printed and an IPython audio widget
        is displayed for it.
    """
    # Never index past the end of the ranking.
    for i in range(min(n, len(result))):
        entry = result[i]
        # Take the path out of the tuple directly; slicing the tuple's text
        # representation corrupts paths containing backslashes or quotes.
        path = entry[0] if isinstance(entry, (tuple, list)) else entry

        if i == 0:
            print('Your original track was:')
        elif i == 1:
            print('This is your most recommended song!')
        else:
            print('This is your {} most recommended song!'.format(_ordinal(i)))

        # Show only the file name, whatever directory it lives in.
        print(os.path.basename(path))

        # sr must be passed by keyword on recent librosa releases.
        song, sr = librosa.load(path, sr=44100)
        IPython.display.display(Audio(song, rate=sr))


def _ordinal(number):
    """Return ``number`` with its English ordinal suffix, e.g. 2 -> '2nd'."""
    if 10 <= number % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(number % 10, 'th')
    return '{}{}'.format(number, suffix)
        
# Uncomment to see (and hear(!)) the magic happen!
#player(result, 4)