* Think of a word or words that define each point  
    * Proximity to each of those words creates a new centroid.
    * proximity is determined using pre-trained word2vec
    * basically, check how close a new word is to each of the centroids (which are defined by words)
    * how close it is to each centroid contributes to its distance in that direction

In [1]:
import pandas as pd
import numpy as np

import pickle

from scipy.spatial import minkowski_distance

Find songs closest to each centroid and define those moods

In [2]:
# Load the pickled cluster centers. The context manager closes the file even
# if unpickling raises, which the manual open()/close() pair did not guarantee.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('../Data/cluster_centers_tfidf_pca_12', 'rb') as pickle_in:
    centroids = pickle.load(pickle_in)

In [3]:
# Load the pickled DataFrame that the rest of the notebook works on.
df = pd.read_pickle('../Data/labeled_points_tf-idf_pca_12')

In [84]:
# Cast each label to its own string so the column becomes non-numeric
# (presumably so split_df treats it as song information rather than a
# feature) while keeping one label per row. Series.to_string() would instead
# render the entire column as a single string and broadcast it to every row.
df['labels'] = df.labels.astype(str)

In [88]:
def split_df(df):
    """
    Separate a DataFrame into its numeric and non-numeric columns.

    Returns a ``(features, indices)`` tuple: ``features`` holds the columns
    whose dtype is one of the signed-integer or float dtypes listed below,
    and ``indices`` holds every remaining column.
    """
    numeric_dtypes = [
        'int16', 'int32', 'int64',
        'float16', 'float32', 'float64',
    ]

    return (
        df.select_dtypes(include=numeric_dtypes),
        df.select_dtypes(exclude=numeric_dtypes),
    )

In [89]:
def calculate_distances(feats_df, info_df, feats_to_match):
    """
    Rank every song by its distance to a target feature vector.

    Parameters
    ----------
    feats_df : pandas.DataFrame
        Numeric song features, one row per song.
    info_df : pandas.DataFrame
        Song information; rows are matched to ``feats_df`` by index label.
    feats_to_match : array-like
        Target feature vector, one value per column of ``feats_df``.

    Returns
    -------
    pandas.DataFrame
        A ``distance`` column followed by the columns of ``info_df``, sorted
        by ascending distance and re-indexed from 0.
    """
    # minkowski_distance broadcasts over the leading axis, so a single call
    # yields the p=1 (Manhattan) distance of every row at once instead of
    # invoking a Python-level lambda once per row through DataFrame.apply.
    distances = minkowski_distance(feats_df.values, feats_to_match, p=1)
    distances_df = pd.DataFrame({'distance': distances}, index=feats_df.index)

    # Inner merge on the index attaches each song's information to its distance.
    distances_df = distances_df.merge(info_df,
                                      left_index=True, right_index=True)

    # Closest songs first; drop the old index so position equals rank.
    distances_df = (distances_df.sort_values('distance')
                                .reset_index(drop=True))

    return distances_df

In [90]:
def make_recommendation_indices(n, scale=10):
    """
    Draw ``n`` distinct non-negative integers for picking songs by rank.

    Each candidate is a sample from an exponential distribution with the
    given ``scale``, truncated to an integer, so small values are drawn far
    more often than large ones. Candidates already drawn are discarded.

    Parameters
    ----------
    n : int
        Number of distinct integers to return.
    scale : float, optional
        Scale of the exponential distribution (default 10).

    Returns
    -------
    list of int
        ``n`` distinct integers in the order they were first drawn.

    Notes
    -----
    The loop only ends once ``n`` distinct values have appeared, so an ``n``
    that is large relative to ``scale`` can take a very long time.
    """
    indices = []
    # A set gives O(1) duplicate checks; the list preserves draw order.
    seen = set()
    while len(indices) < n:
        index = int(np.random.exponential(scale=scale))
        if index not in seen:
            seen.add(index)
            indices.append(index)

    return indices

In [99]:
def generate_playlist(feats_to_match, df, n_songs=10):
    """
    Build a playlist of songs near a target feature vector.

    ``df`` is split into numeric features and song information, every song
    is ranked by its distance to ``feats_to_match``, and ``n_songs`` ranks
    are then picked at random from that ordering.

    Returns three arrays, in this order: the artist names, the song titles,
    and the Spotify URIs of the picked songs.
    """
    features, song_info = split_df(df)
    ranked = calculate_distances(features, song_info, feats_to_match)

    # Select the randomly chosen ranks once and read all columns from that.
    picks = ranked.iloc[make_recommendation_indices(n_songs)]

    return (
        picks.unaltered_artist_name.values,
        picks.unaltered_song_title.values,
        picks.spotify_song_uri.values,
    )

In [109]:
# Generate a 10-song playlist around the cluster center at index 5.
artists, songs, uris = generate_playlist(centroids[5], df, n_songs=10)

In [110]:
# Show the playlist's artist names as a plain Python list.
artists.tolist()

['Conor Oberst',
 'Starlight Mints',
 'The Shins',
 'The Promise Ring',
 'Teenage Fanclub',
 'Michael Angelakos',
 'Copeland',
 'Morrissey',
 'The Divine Comedy',
 'Conor Oberst']