In [11]:
import pandas as pd

import pickle

from scipy.spatial import minkowski_distance

## Load our centroids and import our features DataFrame

In [12]:
# Load the cluster centroids saved by an earlier step.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at a file we produced ourselves.
# The `with` block guarantees the handle is closed even if unpickling fails.
with open('../Data/cluster_centers_tfidf_sent_rm_pca_9', 'rb') as pickle_in:
    centroids = pickle.load(pickle_in)

In [13]:
# Load the pickled songs DataFrame; later cells read its `song_title`, `uri`,
# `unaltered_artist_name`, `unaltered_song_title` and `labels` columns.
df = pd.read_pickle('../Data/labeled_points_tfidf_sent_rm_pca_9')

In [15]:
# Partition the columns of `df` by dtype: columns with one of the listed
# integer/float dtypes go to `features`, every other column to `no_features`.
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]
features = df.select_dtypes(include=numerics)
no_features = df.select_dtypes(exclude=numerics)

## Function to match songs based on their Minkowski distance
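Note: I used Minkowski distance over Manhattan distance or Euclidean distance because I cared about giving greater weight to similar features but did not want to overfit on similar features. Be aware, however, that the function below calls `minkowski_distance` with `p=1`, which is exactly the Manhattan distance; a value of `p` strictly between 1 and 2 is needed to get a metric that sits between Manhattan and Euclidean.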

In [17]:
def match_sentiment(sentiment_df, feats_to_match, n=15, p=1):
    """
    Find the songs whose features lie closest to a given feature vector.

    Parameters
    ----------
    sentiment_df : pandas.DataFrame
        One row per song; every column is used as a numeric feature.
    feats_to_match : array-like
        Feature vector to compare against, one value per column of
        ``sentiment_df``.
    n : int, default 15
        Number of closest songs to return. If ``sentiment_df`` has fewer
        than ``n`` rows, all of them are returned.
    p : float, default 1
        Order of the Minkowski norm. ``p=1`` is the Manhattan distance,
        ``p=2`` the Euclidean distance.

    Returns
    -------
    pandas.Index
        Index labels (not positions) of the ``n`` rows of ``sentiment_df``
        with the smallest distance to ``feats_to_match``, nearest first.
    """
    # minkowski_distance broadcasts its inputs and reduces over the last axis,
    # so a single call yields one distance per row; this avoids a Python-level
    # call per song and also works when the frame has no rows.
    distances = minkowski_distance(sentiment_df.values, feats_to_match, p=p)
    distances_df = pd.DataFrame({'distance': distances},
                                index=sentiment_df.index)
    return distances_df.nsmallest(n, 'distance').index

Note that the same function can find songs close to a cluster centroid as well as songs close to another song.

In [36]:
# Build the query vector from a row whose `song_title` is 'sleepyhead'.
# NOTE(review): `.iloc[:, -10:-1]` picks columns purely by position (the ten
# last columns minus the very last one) and `.values[1]` takes the *second*
# matching row -- presumably the feature columns and the intended recording;
# confirm both against the DataFrame's actual layout.
song_to_match = df.loc[df['song_title'] == 'sleepyhead'].iloc[:, -10:-1].values[1]

In [37]:
# Rank the songs by distance to the query vector; the `labels` column is
# dropped first so that it does not contribute to the distance.
n_smallest = match_sentiment(
    sentiment_df=features.drop('labels', axis=1),
    feats_to_match=song_to_match,
)

Here we can return the songs closest to the supplied features.

In [39]:
# `n_smallest` holds index *labels* (match_sentiment returns the `.index` of
# the closest rows), so rows must be looked up with `.loc`. `.iloc` would
# treat the labels as positions and print the wrong songs whenever `df` does
# not carry a default 0..n-1 index.
for i in n_smallest:
    song = df.loc[i]  # one lookup per song instead of three
    print(song.uri + ',',
          song.unaltered_artist_name + ',',
          song.unaltered_song_title,
          '\n')

spotify:track:4prEPl61C8qZpeo3IkYSMl, Passion Pit, Sleepyhead 

spotify:track:5Z4Wo3FE6P7qtZOSM8pFRH, Ellie Goulding, Paradise 

spotify:track:3h7ETIv6aOQZnI8FqnlA9L, Empire of the Sun, Awakening 

spotify:track:3B4Oyq1XIFSd8cca2oaH5T, Pernice Brothers, Red Desert 

spotify:track:6MUsGPDCrFHrGrY6744BZo, Missy Higgins, Tricks 

spotify:track:5P3CXbKPMc48G1W0dbckfy, Tahiti 80, Heartbeat 

spotify:track:28lW1wU0m8ZpIGE6UZNmUj, Modest Mouse, I Came As a Rat 

spotify:track:1RX6K9yQuBugl1sgU5oIUO, Neon Trees, Girls And Boys In School 

spotify:track:3VlUwLpEmCe9AcLsDlIo1F, Satellite Stories, Polarized 

spotify:track:6aHlRLDcTXxaqKcmyux1lr, The New Pornographers, Wide Eyes 

spotify:track:46OoSPRcpZBPqlxlTaSTP4, Cults, Clear from Far Away 

spotify:track:2RW9qqrhPwMPHlWx57CalB, Ellie Goulding, Goodness Gracious 

spotify:track:6ZUITkPPMScmzjW4jFkBn6, Empire of the Sun, DNA 

spotify:track:3cGlfEVRIBTnm4d9Uwnc56, Oh Land, First To Say Goodnight 

spotify:track:01dWh7ZBJGx6xU7ErnhEBN, Georgie