# Playlist Recommendation System

This notebook can be used to recommend songs when given a playlist of one or more songs. 

In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the track table from a JSON file; the path is relative, so 'pre_pivot.json' must be in the
# kernel's working directory.
df = pd.read_json('pre_pivot.json')

In [14]:
# (rows, columns) of the loaded table.
df.shape

(515096, 13)

In [15]:
# Number of distinct artist/track keys (reported as "Unique Songs" in the summary table).
df.artist_and_track.nunique()

116157

In [17]:
# NOTE(review): repeats the earlier `df.shape` cell with the same result; candidate for removal.
df.shape

(515096, 13)

In [18]:
# Number of distinct `added_by_id` values (reported as "Users" in the summary table).
df.added_by_id.nunique()

918

In [19]:
# Number of distinct playlists.
df.playlist_id.nunique()

6049

In [20]:
# Preview the first five rows to show the column layout the functions below rely on.
df.head()

Unnamed: 0,added_by_id,album_id,album_name,artist_ids,artist_names,date_added,playlist_id,track_id,track_name,user_id,rating,artists_join,artist_and_track
0,brendan.ta,4uIDigk79DeZEYV6Z5Yf4s,What Went Down,[6FQqZYVfTNQ1pCqfkwVFEa],[Foals],2016-03-26T00:48:16Z,3pTPXB3vT93AOTSVozP54o,53L6A3I9vf7rgEZnMzx54E,Mountain At My Gates,brendan.ta,1,Foals,Foals|||||Mountain At My Gates
1,brendan.ta,4sFhah3DYcJlYeT47q3rhM,In The Silence (Deluxe Version),[7xUZ4069zcyBM4Bn10NQ1c],[Ásgeir],2016-03-26T00:49:53Z,3pTPXB3vT93AOTSVozP54o,6VNo09sojPBi5mdckQkLbX,King and Cross,brendan.ta,1,Ásgeir,Ásgeir|||||King and Cross
10,brendan.ta,5cOhR878H8hC3UsxYq5Xyv,All My Friends (feat. Tinashe & Chance the Rap...,"[2FwJwEswyIUAljqgjNSHgP, 0NIIxcxNHmOoyBx03SfTC...","[Snakehips, Tinashe, Chance the Rapper]",2016-04-29T05:34:47Z,3pTPXB3vT93AOTSVozP54o,6TaqooOXAEcijL6G1AWS2K,All My Friends (feat. Tinashe & Chance the Rap...,brendan.ta,1,Snakehips___Tinashe___Chance the Rapper,Snakehips___Tinashe___Chance the Rapper|||||Al...
100,brendan.ta,07hs3meNvGu5Fp46pnLQm7,Hungry Ghost,[2N2EFVDEbp2JB8ulEUVIxp],[Violent Soho],2017-02-16T03:09:55Z,3pTPXB3vT93AOTSVozP54o,14NnWDVM8nCssfHAqWZp1T,Covered In Chrome,brendan.ta,1,Violent Soho,Violent Soho|||||Covered In Chrome
1000,brendan.ta,4PKH86wn7Gw4iel2WD564k,Do It Like You,[0aA1GTrIMutjIh4GlPPUVN],[Crooked Colours],2019-02-04T05:22:30Z,3LF6iEL6uCE9rSdLF4m1ga,5h1Gi50T9QxVfzvIF8vciC,Do It Like You,brendan.ta,1,Crooked Colours,Crooked Colours|||||Do It Like You


| Users | Playlists   | Unique Songs   | Total Songs|
|------|------|------|------|
|   918  | 6049| 116157| 515096|

| Avg Playlists/User | Avg Songs/Playlist   |
|-----|------|
|   6.6  | 85.2|

In [21]:
def search_by_artist_and_track(df, artist, track):
    '''
    Search the track table for rows whose ``artist_and_track`` key contains both
    the ``artist`` and the ``track`` substrings.

    Matching is a literal (non-regex), case-insensitive substring test, so
    ``'croce'`` matches ``'Jim Croce'``. The key of the first match is printed
    so the caller can see which song was picked.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a string column ``artist_and_track``.
    artist : str
        Substring to look for in the key (intended for the artist part).
    track : str
        Substring to look for in the key (intended for the track part).

    Returns
    -------
    pandas.DataFrame
        Every row of ``df`` whose key contains both substrings.

    Raises
    ------
    IndexError
        If no row matches. The message names the search terms.
    '''
    keys = df.artist_and_track
    # Build the mask once so each substring scan over the column runs a single time.
    # na=False treats missing keys as non-matches instead of propagating NaN into the mask.
    mask = (keys.str.contains(artist, regex=False, case=False, na=False)
            & keys.str.contains(track, regex=False, case=False, na=False))
    matches = df[mask]
    if matches.empty:
        raise IndexError(
            'No track found matching artist={!r} and track={!r}'.format(artist, track))
    print(matches.artist_and_track.iloc[0])
    return matches



In [22]:
def make_playlist(user_id, playlist_id, df, list_of_searches):
    '''
    Build a playlist frame from a list of ``(artist, track)`` searches, taking
    the first song returned by each search.

    Parameters
    ----------
    user_id : str
        Written into the first and tenth columns of every row.
    playlist_id : str
        Identifier given to the new playlist.
    df : pandas.DataFrame
        Track table to search. Row values are assigned positionally against
        ``df.columns``, so it must have exactly 13 columns in the order the
        values are listed below.
    list_of_searches : iterable of (str, str)
        ``(artist, track)`` substrings passed to ``search_by_artist_and_track``.

    Returns
    -------
    pandas.DataFrame
        One row per search, with the same columns as ``df``, rating fixed at 1
        and the sixth column set to ``'NA'``. Rows are numbered 0..n-1. An
        empty ``list_of_searches`` yields an empty frame that still carries
        ``df.columns``.

    Raises
    ------
    IndexError
        Propagated when a search has no match.
    '''
    rows = []
    for artist, track in list_of_searches:
        first = search_by_artist_and_track(df, artist, track).iloc[0]
        # Positional values: must line up with df.columns.
        rows.append([user_id, first.album_id, first.album_name, first.artist_ids,
                     first.artist_names, 'NA', playlist_id, first.track_id,
                     first.track_name, user_id, 1, first.artists_join,
                     first.artist_and_track])
    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # appending inside the loop copies the whole frame on every iteration.
    return pd.DataFrame(rows, columns=df.columns)

In [23]:
def create_sparse(df):
    '''
    Build the playlist-by-song matrix in CSR form.

    Rows follow the sorted unique ``playlist_id`` values and columns follow the
    sorted unique ``artist_and_track`` values. Each cell holds the ``rating``
    for that pair; repeated (playlist, song) pairs are summed by ``csr_matrix``.

    Returns
    -------
    (scipy.sparse.csr_matrix, CategoricalDtype)
        The matrix and the ordered playlist dtype, whose ``categories`` map row
        positions back to playlist ids.
    '''
    playlist_dtype = CategoricalDtype(sorted(df.playlist_id.unique()), ordered=True)
    song_dtype = CategoricalDtype(sorted(df.artist_and_track.unique()), ordered=True)

    # Integer codes give each value's position within its sorted category list.
    row_codes = df.playlist_id.astype(playlist_dtype).cat.codes
    col_codes = df.artist_and_track.astype(song_dtype).cat.codes

    n_playlists = playlist_dtype.categories.size
    n_songs = song_dtype.categories.size
    matrix = csr_matrix((df['rating'], (row_codes, col_codes)),
                        shape=(n_playlists, n_songs))
    return matrix, playlist_dtype

In [24]:
def create_sim_dict(playlist_id, df):
    '''
    Map every playlist id in ``df`` to its cosine similarity with ``playlist_id``.

    Parameters
    ----------
    playlist_id : str
        Playlist to compare against; must be present in ``df.playlist_id``.
    df : pandas.DataFrame
        Track table accepted by ``create_sparse``.

    Returns
    -------
    dict
        ``{playlist_id: similarity}`` for every playlist, including the target
        itself.

    Raises
    ------
    ValueError
        If ``playlist_id`` is not one of the playlists in ``df``.
    '''
    sparse_matrix, playlist_id_c = create_sparse(df)
    playlists = list(playlist_id_c.categories)
    target_row = playlists.index(playlist_id)
    # Only the target's row is needed, so compare that single row against all
    # playlists instead of materialising the full playlist x playlist matrix.
    sims = cosine_similarity(sparse_matrix[target_row], sparse_matrix)[0]
    return dict(zip(playlists, sims))

In [25]:
def create_rec_df(playlist_id, df):
    '''
    Rank songs by the SUM of the similarities of the playlists they appear in.

    Use this over the mean variant when song popularity should influence the
    ranking: a song on many playlists accumulates a larger sum and is more
    likely to be recommended.

    Parameters
    ----------
    playlist_id : str
        Playlist to recommend for; must be present in ``df``.
    df : pandas.DataFrame
        Track table including the rows of ``playlist_id``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``artist_and_track``, holding the per-song sums of the
        numeric columns plus ``sim``, sorted by ``sim`` descending. Rows of the
        input playlist are not filtered out, so its own songs can appear.
    '''
    sim_dict = create_sim_dict(playlist_id, df)
    df_sim = pd.DataFrame({'playlist_id': list(sim_dict.keys()),
                           'sim': list(sim_dict.values())})
    df_rec = df.merge(df_sim, how='left', on='playlist_id')
    # numeric_only=True keeps the aggregation to numeric columns; newer pandas
    # no longer drops string/list columns silently.
    grouped = df_rec.groupby(by='artist_and_track').sum(numeric_only=True)
    return grouped.sort_values(by='sim', ascending=False)

In [26]:
def recommend_for_playlist(user_id, playlist_id, df, list_of_searches):
    '''
    Recommend songs for an ad-hoc playlist using summed similarities.

    The playlist built from ``list_of_searches`` is added to a copy of the
    table (the caller's ``df`` is not modified) so it can be compared with the
    existing playlists.

    Currently, the input playlist's own songs are likely to be included in the
    recommendations. This is intentional in order to set up future work with
    the spotify api but may be changed in a later version.

    Parameters
    ----------
    user_id : str
        Owner id recorded on the temporary playlist rows.
    playlist_id : str
        Id for the temporary playlist.
    df : pandas.DataFrame
        Track table.
    list_of_searches : iterable of (str, str)
        ``(artist, track)`` substrings identifying the seed songs.

    Returns
    -------
    pandas.DataFrame
        The output of ``create_rec_df`` for the combined table.
    '''
    my_playlist = make_playlist(user_id, playlist_id, df, list_of_searches)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat([df, my_playlist])
    return create_rec_df(playlist_id, combined)

In [27]:
def simplify(messy_rec, number_tracks):
    '''
    Turn a recommendations frame into a clean two-column artist/track table.

    Parameters
    ----------
    messy_rec : pandas.DataFrame
        Recommendations indexed by ``'<artist>|||||<track>'`` strings, as
        returned by the ``recommend_for_playlist`` functions.
    number_tracks : int
        How many rows to keep from the top.

    Returns
    -------
    pandas.DataFrame
        Columns ``artist`` and ``track``, one row per kept recommendation. An
        empty input or ``number_tracks=0`` yields an empty frame that still has
        both columns.
    '''
    top_keys = messy_rec.head(number_tracks).index
    # partition splits on the first separator only, so every key yields exactly
    # an (artist, track) pair even if the track text repeats the separator.
    pairs = [key.partition('|||||')[::2] for key in top_keys]
    return pd.DataFrame(pairs, columns=['artist', 'track'])

In [28]:
def create_rec_df_mean(playlist_id, df):
    '''
    Rank songs by the MEAN of the similarities of the playlists they appear in.

    Use this over the sum variant when song popularity should NOT influence the
    ranking. This is a good option if the same recommendations keep coming
    back.

    Parameters
    ----------
    playlist_id : str
        Playlist to recommend for; must be present in ``df``.
    df : pandas.DataFrame
        Track table including the rows of ``playlist_id``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``artist_and_track``, holding the per-song means of the
        numeric columns plus ``sim``, sorted by ``sim`` descending. Rows of the
        input playlist are not filtered out, so its own songs can appear.
    '''
    sim_dict = create_sim_dict(playlist_id, df)
    df_sim = pd.DataFrame({'playlist_id': list(sim_dict.keys()),
                           'sim': list(sim_dict.values())})
    df_rec = df.merge(df_sim, how='left', on='playlist_id')
    # numeric_only=True is required on newer pandas, where averaging the
    # string/list columns raises instead of dropping them.
    grouped = df_rec.groupby(by='artist_and_track').mean(numeric_only=True)
    return grouped.sort_values(by='sim', ascending=False)

In [29]:
def recommend_for_playlist_mean(user_id, playlist_id, df, list_of_searches):
    '''
    Recommend songs for an ad-hoc playlist using mean similarities.

    The playlist built from ``list_of_searches`` is added to a copy of the
    table (the caller's ``df`` is not modified) so it can be compared with the
    existing playlists.

    Currently, the input playlist's own songs are likely to be included in the
    recommendations. This is intentional in order to set up future work with
    the spotify api but may be changed in a later version.

    Parameters
    ----------
    user_id : str
        Owner id recorded on the temporary playlist rows.
    playlist_id : str
        Id for the temporary playlist.
    df : pandas.DataFrame
        Track table.
    list_of_searches : iterable of (str, str)
        ``(artist, track)`` substrings identifying the seed songs.

    Returns
    -------
    pandas.DataFrame
        The output of ``create_rec_df_mean`` for the combined table.
    '''
    my_playlist = make_playlist(user_id, playlist_id, df, list_of_searches)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat([df, my_playlist])
    return create_rec_df_mean(playlist_id, combined)

In [33]:
# Sum-based recommendations for a three-song seed playlist; keep the top 30 rows.
# Each search prints the key of the song it matched before the table is displayed.
simplify(recommend_for_playlist('justin', 'justins jams', df, [('croce','i got a name'),
                                                               ('steely dan','do it again'),
                                                              ('spinners','rubberband')]), 30)

Jim Croce|||||I Got A Name
Steely Dan|||||Do It Again
The Spinners|||||The Rubberband Man


Unnamed: 0,artist,track
0,The Spinners,The Rubberband Man
1,Steely Dan,Do It Again
2,Jim Croce,I Got A Name
3,Electric Light Orchestra,Mr. Blue Sky
4,Looking Glass,Brandy (You're a Fine Girl)
5,Rupert Holmes,Escape (The Pina Colada Song)
6,Creedence Clearwater Revival,Fortunate Son
7,Yusuf / Cat Stevens,Father And Son
8,Boston,More Than a Feeling
9,The Doobie Brothers,Listen to the Music


In [30]:
# Mean-based recommendations for a two-song seed playlist; keep the top 20 rows.
simplify(recommend_for_playlist_mean('justin', 'justins jams', df, [('croce','i got a name'),
                                                     ('steely dan','do it again')]), 20)

Jim Croce|||||I Got A Name
Steely Dan|||||Do It Again


Unnamed: 0,artist,track
0,Jim Croce,I Got A Name
1,Steely Dan,Do It Again
2,The Doobie Brothers,Listen to the Music - Single Version; 2006 Rem...
3,Dionne Farris,I Know
4,Titus Andronicus,A More Perfect Union
5,Ambrosia___Tom Trefethen,How Much I Feel - Remastered Version
6,Chicago,Feelin' Stronger Every Day
7,Gary Wright,Dream Weaver
8,Dickey Betts___The Allman Brothers Band,Ramblin' Man
9,Linda Ronstadt,Blue Bayou


In [32]:
# Mean-based recommendations for a second two-song seed playlist; keep the top 30 rows.
# NOTE(review): the saved output lists a third matched song (The Spinners) that these two searches
# would not print, so the output looks stale relative to this code; re-run to confirm.
simplify(recommend_for_playlist_mean('Dan', 'Dans jams', df, [('led','stairway'),
                                                             ('paul','band on the run')]), 30)

Led Zeppelin|||||Stairway to Heaven - Remaster
Paul McCartney___Wings|||||Band On The Run
The Spinners|||||The Rubberband Man


Unnamed: 0,artist,track
0,The Spinners,The Rubberband Man
1,Paul McCartney___Wings,Band On The Run
2,The Revivalists,Stand Up
3,Led Zeppelin,Stairway to Heaven - Remaster
4,Randal L Meek,Following Our Dreams
5,John Denver,"Poems, Prayers and Promises - Live at the Univ..."
6,Natural Child,NSA Blues
7,Van Halen,(Oh) Pretty Woman - 2015 Remaster
8,Boston,It's Easy
9,Alabama,Keepin' Up


In [177]:
# NOTE(review): `jasons_recs` is not assigned in any cell visible here; unless it is defined
# elsewhere, this cell will fail on Restart & Run All.
jasons_recs.head(20)

Unnamed: 0_level_0,rating,sim
artist_and_track,Unnamed: 1_level_1,Unnamed: 2_level_1
Beach Fossils|||||Down the Line,20,2.207815
Future Islands|||||A Dream of You and Me,4,1.20502
Peach Pit|||||Peach Pit,20,0.619935
Tame Impala|||||The Less I Know The Better,99,0.452416
Good Morning|||||Warned You,25,0.404688
boy pablo|||||Everytime,25,0.375735
Beach Fossils|||||What a Pleasure,13,0.36921
Tame Impala|||||Feels Like We Only Go Backwards,69,0.357343
Mac DeMarco|||||My Kind of Woman,15,0.355247
The Drums|||||Days,14,0.328813


In [152]:
# NOTE(review): `coles_recs` is not assigned in any cell visible here, and the `weighted?` column in
# the saved output is not produced by any function shown; presumably leftover kernel state. Confirm.
coles_recs.head(20)

Unnamed: 0_level_0,rating,sim,weighted?
artist_and_track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Masego___FKJ|||||Tadow,54,0.479657,0.008883
Childish Gambino|||||Redbone,127,0.443397,0.003491
Nick Hakim|||||Cuffed,19,0.346619,0.018243
Steve Lacy|||||Some,23,0.305193,0.013269
Matt Martians___Syd___Steve Lacy|||||Dent Jusay,10,0.30035,0.030035
Frank Ocean|||||Nikes,38,0.299221,0.007874
Masego|||||Navajo,27,0.283595,0.010504
BJ The Chicago Kid|||||Turnin' Me Up,22,0.272489,0.012386
Sampa the Great|||||Blue Boss,11,0.263032,0.023912
Jorja Smith|||||Teenage Fantasy,15,0.260472,0.017365


In [73]:
# NOTE(review): `sparse_matrix` is never assigned at notebook level in the visible cells (it exists
# only as a local inside create_sparse / create_sim_dict), so this cell presumably relies on leftover
# kernel state. Confirm, or assign it from create_sparse(...) first.
sparse_matrix

<6051x116157 sparse matrix of type '<class 'numpy.int64'>'
	with 489351 stored elements in Compressed Sparse Row format>

In [74]:
# Full pairwise cosine similarity between the rows of `sparse_matrix`.
# NOTE(review): depends on the notebook-level `sparse_matrix`, which no visible cell assigns.
similarities_playlists = cosine_similarity(sparse_matrix)

In [75]:
# Category values of `playlist_id_c` as an array.
# NOTE(review): `playlist_id_c` is not assigned at notebook level in any visible cell.
playlists = np.array(playlist_id_c.categories)

In [76]:
# Plain list copy of `playlists`, which supports `.index(...)` lookups.
playlists_indices = list(playlists)

In [77]:
# NOTE(review): displays the entire list; a slice such as playlists_indices[:10] would keep
# the saved output short.
playlists_indices

['002MBUoZVJntSIsgW6EpFA',
 '0061U3t8hRFhIBw42to0Yw',
 '008wDZ2VZxGTRYLYS82mya',
 '00AP7VyekxNKmWiMj9xisM',
 '00CsN1rCRTN8tKcPKcHCjQ',
 '00CzKdtJgJelMbJJvs2rJN',
 '00LP85DvUnR9CHeqAFv7VL',
 '00SrfsM0Wh4ZfZSrA9OQS3',
 '00UWUyGDpyN7bhsoUZVqdo',
 '00Vv9ktcShVGl695Nqtqjn',
 '00WPgq2AQvZ6vaXnaUddy0',
 '00b9NrNWGW7kB3mlk6lTUd',
 '00nBF6hbRoRLUKp2WvRrtF',
 '00rjDqOCrMtgV7wcxScgHM',
 '00wHFIu00pH4CJWPV4vJwA',
 '01AAC8Hn9BqioBD4AITbYN',
 '01AyypY3Vg6BZ7Fvbpi8AO',
 '01Bj96ebCNSskOmkh8taPP',
 '01GwqHIWoJmhLKfRhu5uvq',
 '01Ivk66sjv7fxG28mSEC3B',
 '01JIbtNjFwWjuddNL0Tq71',
 '01U7nKpzTRK1Hp2V03IMWu',
 '01UbuTuMY0TvMz42KXoy3y',
 '01VpWWi8xzzS6nJ64J5hQA',
 '01WaTuwNZnqwbBkRk2UeG9',
 '01aCMPN9vISPWsMqQtol8S',
 '01gjluhMJeFT0WcvR1937w',
 '01gyv6qrcXgycqwnbhMHO2',
 '01oOxpTnnokNxu5AitOL41',
 '01qHdqpm6S7zv5N3vChwye',
 '01rNo73R3xjRpa3G0VbKVH',
 '01tvqPxen74Nz7TRz7kpSd',
 '01wyjmhDW4Fru5cb0TQ7Dp',
 '023J3zoqmmqR0cUp7hyAcw',
 '02NUIaAZADXg8qYnwCezev',
 '02Tt4tSbfwN8EYTMc3ZzgM',
 '02Yypb6ck9z09yWm41yU5l',
 

In [17]:
# np.save('playlist_similarities',similarities_playlists)