# Playlist Recommendation System

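This notebook recommends songs for an input playlist of one or more songs. It compares the input playlist with every playlist in the dataset using cosine similarity, then ranks songs by the summed (or mean) similarity of the playlists that contain them.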

In [6]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Load the track data into a DataFrame. Judging by the df.head() preview, each row is one
# track entry within one playlist -- TODO confirm against the data export.
df = pd.read_json('track_data_9_18_19.json')

In [8]:
# Optional cleaning steps, currently disabled. If re-enabled they would, in order:
#   1. drop rows with missing values,
#   2. drop rows whose artist_and_track is an empty string,
#   3. keep only tracks whose artist_and_track occurs more than once,
#   4. rebuild the index and discard the old one.
# df=df.dropna()
# df=df[df.artist_and_track!='']
# df = df[df.duplicated(subset='artist_and_track', keep=False)]
# df.reset_index(inplace = True)
# df.drop(columns = 'index', inplace = True)

In [9]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,added_by_id,album_id,album_name,artist_ids,artist_names,date_added,playlist_id,track_id,track_name,user_id,rating,artists_join,artist_and_track
0,technozem,0d0ONE5rak6Q91XjwKcJvN,Vurstep,"[7ugvHO0W3IoAWzOgKrHxqr, 0K1lHu1BP65Z1DErnljxUw]","[Appleblim, Forest Drive West]",2018-12-04T21:27:55Z,5tW6vGqn4Z2oPxpgQSncKD,5huzNc7H2kv2qNq7kqscHY,Vurstep - Forest Drive West Remix,technozem,1,Appleblim___Forest Drive West,Appleblim___Forest Drive West|||||Vurstep - Fo...
1,technozem,2qz8u01gOb8Lb7KaTR90DQ,Marble,"[7asRTH6SKIMKZZ59Iw2eA5, 2iWOFT9U8InefnarwZUmv0]","[Gnork, Douala]",2018-12-12T21:33:25Z,5tW6vGqn4Z2oPxpgQSncKD,7Hhfn4AkePvX0jW4jOJPQ3,Space Jam (feat. Douala),technozem,1,Gnork___Douala,Gnork___Douala|||||Space Jam (feat. Douala)
10,technozem,4JooAi5hHhUuJNrbiseE8X,At the Controls,[68Wb5Pcy71lLaKdIB6cBA5],[Breakage],2018-12-11T20:02:07Z,5tW6vGqn4Z2oPxpgQSncKD,5NLRRESuSXLQTPgdqAECCq,Rudeboy Stuff,technozem,1,Breakage,Breakage|||||Rudeboy Stuff
100,technozem,6fXNAGf1ihG0B0Sck2Mo1l,Metropolis,[0Ij7th9uWcDVYNAIOn5W22],[Kornél Kovács],2018-12-29T17:28:10Z,5tW6vGqn4Z2oPxpgQSncKD,0ajH7MyTiRhjUbZVYVgRPc,Panda,technozem,1,Kornél Kovács,Kornél Kovács|||||Panda
1000,technozem,3YQASaeJPm3OxUSCP6Qfo9,GROEF - Des avonds in klein maneschijn,[7DksXfhuJLdqtyHnoKCJLI],[GROEF],2011-04-15T16:06:11Z,4AehzXKZqJ5VrqPHpsmN6E,6sihMoVzSFhHtr2p556f5Z,Jan mijne man/Andro/Andro GROEF,technozem,1,GROEF,GROEF|||||Jan mijne man/Andro/Andro GROEF


In [18]:
# Headline counts for the loaded data: overall shape plus the number of distinct songs,
# users (by user_id and by added_by_id) and playlists.
print(f'Shape: {df.shape}')
print(f'Unique Songs: {df.artist_and_track.nunique()}')
print(f'Unique Reddit Users: {df.user_id.nunique()}') # these are the unique users that added playlists to either r/spotify or r/spotifyplaylists
print(f'Unique Spotify Users: {df.added_by_id.nunique()}') # these are the unique users that created a spotify playlist in the database
print(f'Unique Playlists: {df.playlist_id.nunique()}')

Shape: (1022972, 13)
Unique Songs: 437768
Unique Reddit Users: 419
Unique Spotify Users: 1582
Unique Playlists: 8339


| Spotify Users | Playlists | Unique Songs | Total Song Entries (rows) |
|------|------|------|------|
| 1582 | 8339 | 437768 | 1022972 |

| Avg Playlists/User | Avg Songs/Playlist   |
|-----|------|
|   5.3  | 123|

In [15]:
def search_by_artist_and_track(df, artist, track):
    '''
    Search the database for rows whose artist_and_track value contains both search strings.

    Matching is a case-insensitive, literal (non-regex) substring test. Both strings are looked
    up in the combined artist_and_track value, so each may match anywhere inside it.

    Parameters:
        df: DataFrame with a string column named artist_and_track.
        artist: substring to look for (typically part of the artist name).
        track: substring to look for (typically part of the track name).

    Returns:
        A DataFrame holding every matching row of df; it is empty when nothing matches.

    Side effects:
        Prints the artist_and_track value of the first match. Nothing is printed when there
        is no match.
    '''
    names = df.artist_and_track
    # na=False counts missing artist_and_track values as non-matches, keeping the mask boolean.
    matches_artist = names.str.contains(artist, regex=False, case=False, na=False)
    matches_track = names.str.contains(track, regex=False, case=False, na=False)

    # Build the filtered frame once and reuse it for both the printout and the return value.
    matches = df[matches_artist & matches_track]
    if not matches.empty:
        print(matches.artist_and_track.iloc[0])
    return matches



In [16]:
def make_playlist(user_id, playlist_id, df, list_of_searches):
    '''
    Build a new playlist from a list of (artist, track) searches.

    Each search is resolved with search_by_artist_and_track and only its first match is used.
    The matched row is copied, then these fields are overwritten so the row belongs to the new
    playlist: added_by_id and user_id become user_id, playlist_id becomes playlist_id,
    date_added becomes 'NA' and rating becomes 1.

    Parameters:
        user_id: identifier to record as the owner of the new playlist.
        playlist_id: identifier of the new playlist.
        df: track DataFrame to search; it also defines the columns of the result.
        list_of_searches: iterable of (artist, track) search-string pairs.

    Returns:
        A DataFrame with the same columns as df and one row per search.

    Raises:
        IndexError: if a search has no match.
    '''
    records = []
    for artist, track in list_of_searches:
        matches = search_by_artist_and_track(df, artist, track)
        if matches.empty:
            raise IndexError(f'no track matches artist {artist!r} and track {track!r}')
        # Assign by column name, not by position, so the result does not depend on column order.
        record = matches.iloc[0].to_dict()
        record.update(added_by_id=user_id, date_added='NA', playlist_id=playlist_id,
                      user_id=user_id, rating=1)
        records.append(record)
    # Build the frame once; DataFrame.append was removed in pandas 2.0 and copied on every call.
    return pd.DataFrame(records, columns=df.columns)

In [17]:
def create_sparse(df):
    '''
    Build the playlist-by-song sparse matrix used for the similarity calculations.

    Rows follow the sorted unique playlist_id values and columns follow the sorted unique
    artist_and_track values; cell values come from the rating column. Returns a tuple of the
    CSR matrix and the ordered categorical dtype of the playlists, whose categories give the
    playlist id for each matrix row.
    '''
    def ordered_dtype(values):
        # Sorted categories turn the categorical codes into stable matrix positions.
        return CategoricalDtype(sorted(values.unique()), ordered=True)

    playlist_dtype = ordered_dtype(df.playlist_id)
    song_dtype = ordered_dtype(df.artist_and_track)

    matrix_shape = (playlist_dtype.categories.size, song_dtype.categories.size)
    row_codes = df.playlist_id.astype(playlist_dtype).cat.codes
    col_codes = df.artist_and_track.astype(song_dtype).cat.codes

    playlist_song_matrix = csr_matrix((df['rating'], (row_codes, col_codes)), shape=matrix_shape)
    return playlist_song_matrix, playlist_dtype

In [18]:
def create_sim_dict(playlist_id, df):
    '''
    Map every playlist id in df to its cosine similarity with the given playlist.

    Parameters:
        playlist_id: id of the playlist to compare against; it must be present in df.
        df: track DataFrame accepted by create_sparse.

    Returns:
        A dict of {playlist id: cosine similarity to playlist_id}, with one entry per playlist
        in df (including playlist_id itself).

    Raises:
        ValueError: if playlist_id is not one of the playlists in df.
    '''
    sparse_matrix, playlist_id_c = create_sparse(df)
    playlists = list(playlist_id_c.categories)
    target_row = playlists.index(playlist_id)

    # Only one row of the similarity matrix is needed, so compare the target playlist with all
    # playlists directly. This avoids building the dense n-by-n matrix of all pairs.
    similarities = cosine_similarity(sparse_matrix[target_row], sparse_matrix)[0]
    return dict(zip(playlists, similarities))

In [19]:
def create_rec_df(playlist_id, df):
    '''
    Create recommendations from the sum of playlist similarities.

    Every row of df is tagged with the similarity between its playlist and playlist_id, and the
    similarities are summed per song. Use this instead of the mean of similarities if you want
    song popularity to influence recommendations: a song that appears in more playlists
    collects a larger sum and is more likely to be recommended.

    The rows of playlist_id itself are included in the sums, so its own songs can appear in
    the result.

    Parameters:
        playlist_id: id of the playlist to recommend for; it must be present in df.
        df: track DataFrame containing playlist_id, artist_and_track and rating columns.

    Returns:
        A DataFrame indexed by artist_and_track holding the per-song sums of the numeric
        columns, sorted by decreasing sim.
    '''
    sim_dict = create_sim_dict(playlist_id, df)
    df_sim = pd.DataFrame({'playlist_id': list(sim_dict.keys()), 'sim': list(sim_dict.values())})
    df_rec = df.merge(df_sim, how='left', on='playlist_id')
    # numeric_only=True aggregates only the numeric columns. Without it, pandas >= 2.0 would
    # also try to "sum" the string and list columns.
    grouped = df_rec.groupby(by='artist_and_track').sum(numeric_only=True)
    return grouped.sort_values(by='sim', ascending=False)

In [20]:
def recommend_for_playlist(user_id, playlist_id, df, list_of_searches):
    '''
    Recommend songs for an input playlist using summed playlist similarities.

    The input playlist is built from list_of_searches and added to a copy of the data so that
    its similarity to the existing playlists can be computed; the df passed in is not modified.

    Currently, it's likely that the input playlist songs are included in the recommendations.
    This is intentional in order to set up future work with the spotify api but may be changed
    in a later version.

    Parameters:
        user_id: identifier to record as the owner of the input playlist.
        playlist_id: identifier for the input playlist.
        df: track DataFrame to search and to compare against.
        list_of_searches: iterable of (artist, track) search-string pairs.

    Returns:
        The recommendations DataFrame produced by create_rec_df.
    '''
    my_playlist = make_playlist(user_id, playlist_id, df, list_of_searches)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat([df, my_playlist])
    return create_rec_df(playlist_id, combined)

In [21]:
def simplify(messy_rec, number_tracks):
    '''
    Create a clean artist/track table from a recommendations DataFrame.

    Parameters:
        messy_rec: recommendations as returned by the recommend_for_playlist functions; its
            index holds strings of the form 'artist|||||track'.
        number_tracks: how many of the top rows to keep.

    Returns:
        A DataFrame with columns 'artist' and 'track', one row per kept recommendation, in the
        same order as messy_rec. The result is empty (but keeps both columns) when there is
        nothing to show.
    '''
    rows = []
    for key in messy_rec.head(number_tracks).index:
        # partition always yields exactly one artist and one track, splitting at the first
        # separator; a key with no separator gets an empty track.
        artist, _, track = key.partition('|||||')
        rows.append((artist, track))
    # Naming the columns here keeps an empty result well-formed.
    return pd.DataFrame(rows, columns=['artist', 'track'])

In [22]:
def create_rec_df_mean(playlist_id, df):
    '''
    Create recommendations from the mean of playlist similarities.

    Every row of df is tagged with the similarity between its playlist and playlist_id, and the
    similarities are averaged per song. Use this instead of the sum of similarities if you DO
    NOT want song popularity to influence recommendations. This is a good option if you find
    you're getting the same recommendations often.

    The rows of playlist_id itself are included in the means, so its own songs can appear in
    the result.

    Parameters:
        playlist_id: id of the playlist to recommend for; it must be present in df.
        df: track DataFrame containing playlist_id, artist_and_track and rating columns.

    Returns:
        A DataFrame indexed by artist_and_track holding the per-song means of the numeric
        columns, sorted by decreasing sim.
    '''
    sim_dict = create_sim_dict(playlist_id, df)
    df_sim = pd.DataFrame({'playlist_id': list(sim_dict.keys()), 'sim': list(sim_dict.values())})
    df_rec = df.merge(df_sim, how='left', on='playlist_id')
    # numeric_only=True averages only the numeric columns. Without it, pandas >= 2.0 raises
    # TypeError on the string and list columns.
    grouped = df_rec.groupby(by='artist_and_track').mean(numeric_only=True)
    return grouped.sort_values(by='sim', ascending=False)

In [23]:
def recommend_for_playlist_mean(user_id, playlist_id, df, list_of_searches):
    '''
    Recommend songs for an input playlist using mean playlist similarities.

    The input playlist is built from list_of_searches and added to a copy of the data so that
    its similarity to the existing playlists can be computed; the df passed in is not modified.

    Currently, it's likely that the input playlist songs are included in the recommendations.
    This is intentional in order to set up future work with the spotify api but may be changed
    in a later version.

    Parameters:
        user_id: identifier to record as the owner of the input playlist.
        playlist_id: identifier for the input playlist.
        df: track DataFrame to search and to compare against.
        list_of_searches: iterable of (artist, track) search-string pairs.

    Returns:
        The recommendations DataFrame produced by create_rec_df_mean.
    '''
    my_playlist = make_playlist(user_id, playlist_id, df, list_of_searches)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat([df, my_playlist])
    return create_rec_df_mean(playlist_id, combined)

In [26]:
# Demo: sum-based (popularity-weighted) recommendations for a three-song input playlist,
# asking for the top 30 rows as a clean artist/track table.
simplify(recommend_for_playlist('justin', 'justins jams', df, [('croce','i got a name'),
                                                               ('steely dan','do it again'),
                                                              ('spinners','rubberband')]), 30)

Jim Croce|||||I Got A Name
Steely Dan|||||Do It Again
The Spinners|||||The Rubberband Man


Unnamed: 0,artist,track
0,The Spinners,The Rubberband Man
1,Steely Dan,Do It Again
2,Jim Croce,I Got A Name
3,Electric Light Orchestra,Mr. Blue Sky
4,George Harrison,My Sweet Lord
5,Yusuf / Cat Stevens,Father And Son
6,Looking Glass,Brandy (You're a Fine Girl)
7,Rupert Holmes,Escape (The Pina Colada Song)
8,The Doobie Brothers,Listen to the Music
9,Blue Swede___Björn Skifs,Hooked on a Feeling


In [28]:
# Demo: mean-based recommendations (popularity does not boost a song) for a two-song input
# playlist, asking for the top 20 rows.
simplify(recommend_for_playlist_mean('justin', 'justinsjams', df, [('croce','i got a name'),
                                                     ('steely dan','do it again')]), 20)

Jim Croce|||||I Got A Name
Steely Dan|||||Do It Again


Unnamed: 0,artist,track
0,Katie Hanley___Godspell Ensemble,By My Side
1,Carpenters,Hurting Each Other
2,Olivia Newton-John,Sam
3,The Five Stairsteps,Ooh Child
4,The Delfonics,Funny Feeling
5,Jim Croce,I Got A Name
6,The Feelies,Raised Eyebrows
7,Gloria Gaynor,Never Can Say Goodbye (Re-Recorded)
8,Misfits___Glenn Danzig,Teenagers From Mars
9,The Dillinger Escape Plan,Symptom of Terminal Illness


In [29]:
# Demo: mean-based recommendations for a different two-song input playlist, asking for the
# top 30 rows. The short search strings match on substrings (e.g. 'led' matches Led Zeppelin).
simplify(recommend_for_playlist_mean('Dan', 'Dans jams', df, [('led','stairway'),
                                                             ('paul','band on the run')]), 30)

Led Zeppelin|||||Stairway to Heaven - 1990 Remaster
Paul McCartney___Wings|||||Band On The Run - Remastered 2010


Unnamed: 0,artist,track
0,Led Zeppelin,When the Levee Breaks - Alternate UK Mix in Pr...
1,Dee Snider___Diego Boneta,I Wanna Rock
2,Jani Lane___Nuno Bettencourt___Gary F. Cherone...,More Than Words / Heaven
3,Lou Gramm___Mick Jones___Julianne Hough___Dieg...,Waiting For A Girl Like You
4,Aaron Cabott Jones,Wish Upon a Starfish
5,Mr Krax,A Tight Little Band
6,The Beatles,While My Guitar Gently Weeps - 2018 Mix
7,The Beatles,Help! - Live / Remastered
8,The Beatles,Can't Buy Me Love - Live / Remastered
9,The Beatles,Shout - Anthology 1 Version
