# DEPENDENCIES 

In [1]:
import pandas as pd
import numpy as np
import json
import re 
import sys
import itertools

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")

# SUMMARY


In [2]:
#from IPython.display import Image
#Image("C:/Users/ishku/OneDrive/Desktop/projects/recommender system/statics/summary.png")

# 1. Data Preparation


In [3]:
spotify_df = pd.read_csv("./datasets/data.csv")

In [4]:
spotify_df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991,['Mamie Smith'],0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,['Mamie Smith'],0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,['Oscar Velazquez'],0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,['Mixe'],0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920


In [5]:
data_w_genre = pd.read_csv("./datasets/data_w_genres.csv")

In [6]:
data_w_genre.head()

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres
0,"""Cats"" 1981 Original London Cast",0.575083,0.44275,247260.0,0.386336,0.022717,0.287708,-14.205417,0.180675,115.9835,0.334433,38.0,5,1,12,['show tunes']
1,"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,33.076923,5,1,26,[]
2,"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.285714,0,1,7,[]
3,"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.444444,0,1,27,[]
4,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.605444,0.437333,232428.111111,0.429333,0.037534,0.216111,-11.447222,0.086,120.329667,0.458667,42.555556,11,1,9,[]


In [7]:
data_w_genre.dtypes

artists              object
acousticness        float64
danceability        float64
duration_ms         float64
energy              float64
instrumentalness    float64
liveness            float64
loudness            float64
speechiness         float64
tempo               float64
valence             float64
popularity          float64
key                   int64
mode                  int64
count                 int64
genres               object
dtype: object

In [8]:
data_w_genre['genres'].values[0]

"['show tunes']"

In [9]:
data_w_genre['genres_upd'] = data_w_genre['genres'].apply(lambda x: [re.sub(' ','_',i) for i in re.findall(r"'([^']*)'", x)])

In [10]:
type(data_w_genre['genres_upd'].values[0])

list

Step 1. Similar to before, we will need to extract the artists from the string list.

In [11]:
spotify_df['artists_upd_v1'] = spotify_df['artists'].apply(lambda x: re.findall(r"'([^']*)'", x))

In [12]:
spotify_df['artists'].values[0]

"['Mamie Smith']"

In [13]:
type(spotify_df['artists_upd_v1'].values[0])

list

In [14]:
spotify_df[spotify_df['artists_upd_v1'].apply(lambda x: not x)].head(5)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,artists_upd_v1
1,0.643,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920,[]
34,0.021,"[""The Swingin' Bachelors""]",0.559,288293,0.83,0,0bq1lNI8OPhdRQYZZSSnQk,0.146,11,0.187,-7.876,0,Voodoo Doll,0,1920-02-29,0.0546,107.912,0.352,1920,[]
36,0.79,"[""The Swingin' Bachelors""]",0.745,261013,0.791,0,0e0zhCKclqwgJgA7Tjg2ca,0.0,7,0.102,-8.584,1,Penticton Kitty,0,1920-02-29,0.0595,120.169,0.658,1920,[]
46,0.147,"[""The Swingin' Bachelors""]",0.603,235547,0.719,0,0xKe1lGyWBuk3lx3JIUUUx,0.0,0,0.111,-5.464,0,Cloud 9,0,1920-02-29,0.0271,92.01,0.697,1920,[]
47,0.367,"[""The Swingin' Bachelors""]",0.716,264240,0.836,0,11rj3RWlVnnxW0EL7jwN8R,6e-06,9,0.191,-6.099,1,Big Jim and the Twins,0,1920-02-29,0.0451,105.986,0.635,1920,[]


In [15]:
spotify_df['artists_upd_v2'] = spotify_df['artists'].apply(lambda x: re.findall('\"(.*?)\"',x))
spotify_df['artists_upd'] = np.where(spotify_df['artists_upd_v1'].apply(lambda x: not x), spotify_df['artists_upd_v2'], spotify_df['artists_upd_v1'] )

In [16]:
spotify_df['artists_song'] = spotify_df.apply(lambda row: row['artists_upd'][0]+" " + row['name'],axis = 1)

In [17]:
spotify_df.sort_values(['artists_song','release_date'], ascending = False, inplace = True)

In [18]:
spotify_df[spotify_df['name']=='Hello']

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,...,popularity,release_date,speechiness,tempo,valence,year,artists_upd_v1,artists_upd_v2,artists_upd,artists_song
138035,0.0237,['Poe'],0.75,271680,0.735,0,1UT2yVLkkJyoLVtGbPgHpi,0.124,9,0.129,...,34,1995-09-14,0.109,92.902,0.518,1995,[Poe],[],[Poe],Poe Hello
71560,0.0136,['Oasis'],0.318,203187,0.952,0,4qYlBtzkmby4r1N7etPnUv,0.0152,9,0.167,...,55,1995-10-02,0.094,132.191,0.0537,1995,[Oasis],[],[Oasis],Oasis Hello
157339,0.0339,['OMFG'],0.844,226308,0.864,0,6BAnxKyld909yo6Pk1DO3r,0.216,7,0.386,...,64,2014-12-03,0.0348,104.997,0.796,2014,[OMFG],[],[OMFG],OMFG Hello
12512,0.414,['Lionel Richie'],0.487,251107,0.246,0,0mHyWYXmmCB9iQyK18m3FQ,0.0,9,0.0983,...,57,1983-01-01,0.0376,61.084,0.0562,1983,[Lionel Richie],[],[Lionel Richie],Lionel Richie Hello
51034,0.516,['Lionel Richie'],0.563,248573,0.262,0,1b16zIZIdL2LIMfDiANwIk,0.0,9,0.194,...,42,1983,0.0333,124.206,0.0626,1983,[Lionel Richie],[],[Lionel Richie],Lionel Richie Hello
76768,0.465,"['KAROL G', 'Ozuna']",0.802,194933,0.839,0,2776pPgCa5KrLVUqNBtjfJ,5.2e-05,7,0.117,...,0,2021-01-22,0.0592,94.997,0.861,2021,"[KAROL G, Ozuna]",[],"[KAROL G, Ozuna]",KAROL G Hello
76774,0.465,"['KAROL G', 'Ozuna']",0.802,194933,0.839,0,2ljhKzXJAPBmRN0QIkxKHG,5.2e-05,7,0.117,...,0,2021-01-22,0.0592,94.997,0.861,2021,"[KAROL G, Ozuna]",[],"[KAROL G, Ozuna]",KAROL G Hello
76780,0.465,"['KAROL G', 'Ozuna']",0.802,194933,0.839,0,36YVsPZh2byKbodjUDyjTG,5.2e-05,7,0.117,...,0,2021-01-22,0.0592,94.997,0.861,2021,"[KAROL G, Ozuna]",[],"[KAROL G, Ozuna]",KAROL G Hello
76782,0.465,"['KAROL G', 'Ozuna']",0.802,194933,0.839,0,371SS9OeuFXXUvF3SECivH,5.2e-05,7,0.117,...,0,2021-01-22,0.0592,94.997,0.861,2021,"[KAROL G, Ozuna]",[],"[KAROL G, Ozuna]",KAROL G Hello
93886,0.465,"['KAROL G', 'Ozuna']",0.802,194933,0.839,0,4CRlkAw8jAbVdXCjySXmFl,5.2e-05,7,0.117,...,0,2021-01-22,0.0592,94.997,0.861,2021,"[KAROL G, Ozuna]",[],"[KAROL G, Ozuna]",KAROL G Hello


In [19]:
spotify_df.drop_duplicates('artists_song',inplace = True)

In [20]:
spotify_df[spotify_df['name']=='Hello']

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,...,popularity,release_date,speechiness,tempo,valence,year,artists_upd_v1,artists_upd_v2,artists_upd,artists_song
138035,0.0237,['Poe'],0.75,271680,0.735,0,1UT2yVLkkJyoLVtGbPgHpi,0.124,9,0.129,...,34,1995-09-14,0.109,92.902,0.518,1995,[Poe],[],[Poe],Poe Hello
71560,0.0136,['Oasis'],0.318,203187,0.952,0,4qYlBtzkmby4r1N7etPnUv,0.0152,9,0.167,...,55,1995-10-02,0.094,132.191,0.0537,1995,[Oasis],[],[Oasis],Oasis Hello
157339,0.0339,['OMFG'],0.844,226308,0.864,0,6BAnxKyld909yo6Pk1DO3r,0.216,7,0.386,...,64,2014-12-03,0.0348,104.997,0.796,2014,[OMFG],[],[OMFG],OMFG Hello
12512,0.414,['Lionel Richie'],0.487,251107,0.246,0,0mHyWYXmmCB9iQyK18m3FQ,0.0,9,0.0983,...,57,1983-01-01,0.0376,61.084,0.0562,1983,[Lionel Richie],[],[Lionel Richie],Lionel Richie Hello
76768,0.465,"['KAROL G', 'Ozuna']",0.802,194933,0.839,0,2776pPgCa5KrLVUqNBtjfJ,5.2e-05,7,0.117,...,0,2021-01-22,0.0592,94.997,0.861,2021,"[KAROL G, Ozuna]",[],"[KAROL G, Ozuna]",KAROL G Hello
92762,0.0771,['J. Cole'],0.598,219173,0.7,1,30Chv2SmIry70YwtmtaKnj,1.1e-05,9,0.106,...,58,2014-12-09,0.22,159.899,0.329,2014,[J. Cole],[],[J. Cole],J. Cole Hello
54384,0.155,"['Ice Cube', 'Dr. Dre', 'MC Ren']",0.903,232533,0.61,1,60uesDF4UyLUs61FgiDkdp,0.0,10,0.389,...,58,2000-02-29,0.339,91.46,0.627,2000,"[Ice Cube, Dr. Dre, MC Ren]",[],"[Ice Cube, Dr. Dre, MC Ren]",Ice Cube Hello
73174,0.912,['Evanescence'],0.416,220360,0.178,0,0aYUqsvZG7bAslrUkd9Z0g,0.000138,11,0.342,...,58,2003-03-04,0.031,139.969,0.0699,2003,[Evanescence],[],[Evanescence],Evanescence Hello
38280,0.33,['Adele'],0.578,295502,0.43,0,62PaSfnXSMyLshYJrlTuL3,0.0,5,0.0854,...,71,2015-11-20,0.0305,78.991,0.288,2015,[Adele],[],[Adele],Adele Hello


In [21]:
artists_exploded = spotify_df[['artists_upd','id']].explode('artists_upd')

In [22]:
artists_exploded

Unnamed: 0,artists_upd,id
167865,조정현,2ghebdwe2pNXT4eL34T7pW
173888,김효근,6ra76urfo3pTD050opXTZS
173888,Quinever 퀸에버,6ra76urfo3pTD050opXTZS
166711,김수철,4FG1uHQAVPsYiXIn3tcGRs
75703,龔詩嘉,2w5L5kQVkSRdduNewYPDA5
...,...,...
105296,"Cru"", ""Michel",3hoiinUc5VA9xUEJID7R8V
71705,"Choir"",",7GaUNnAdIMXrvYG7673QIK
139006,"4-Tay"",",0CHKRuAjf8LvCL6SxCnq7V
154041,"4-Tay"",",78859Af0fmA9VTlgnOHTAP


In [23]:
artists_exploded_enriched = artists_exploded.merge(data_w_genre, how = 'left', left_on = 'artists_upd',right_on = 'artists')
artists_exploded_enriched_nonnull = artists_exploded_enriched[~artists_exploded_enriched.genres_upd.isnull()]

In [24]:
artists_exploded_enriched_nonnull[artists_exploded_enriched_nonnull['artists'] =='Adele']

Unnamed: 0,artists_upd,id,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres,genres_upd
173477,Adele,6FZjUIfR6vEEg1bT9elJlf,Adele,0.43887,0.564595,249647.52381,0.500643,0.002624,0.130888,-7.034881,0.052583,121.253167,0.373131,60.47619,0.0,1.0,84.0,"['british soul', 'pop', 'uk pop']","[british_soul, pop, uk_pop]"
208568,Adele,7GgWAITsYJaRM3r50rfh5w,Adele,0.43887,0.564595,249647.52381,0.500643,0.002624,0.130888,-7.034881,0.052583,121.253167,0.373131,60.47619,0.0,1.0,84.0,"['british soul', 'pop', 'uk pop']","[british_soul, pop, uk_pop]"
208569,Adele,4jL6WWKFDqCOPo2hC3VhSS,Adele,0.43887,0.564595,249647.52381,0.500643,0.002624,0.130888,-7.034881,0.052583,121.253167,0.373131,60.47619,0.0,1.0,84.0,"['british soul', 'pop', 'uk pop']","[british_soul, pop, uk_pop]"
208570,Adele,4oy98QoABnGKpvapDkI525,Adele,0.43887,0.564595,249647.52381,0.500643,0.002624,0.130888,-7.034881,0.052583,121.253167,0.373131,60.47619,0.0,1.0,84.0,"['british soul', 'pop', 'uk pop']","[british_soul, pop, uk_pop]"
208571,Adele,08YJEcxGtYXwCGqXMZDiyQ,Adele,0.43887,0.564595,249647.52381,0.500643,0.002624,0.130888,-7.034881,0.052583,121.253167,0.373131,60.47619,0.0,1.0,84.0,"['british soul', 'pop', 'uk pop']","[british_soul, pop, uk_pop]"
208572,Adele,1zwMYTA5nlNjZxYrvBB2pV,Adele,0.43887,0.564595,249647.52381,0.500643,0.002624,0.130888,-7.034881,0.052583,121.253167,0.373131,60.47619,0.0,1.0,84.0,"['british soul', 'pop', 'uk pop']","[british_soul, pop, uk_pop]"
208573,Adele,6VObnIkLVruX4UVyxWhlqm,Adele,0.43887,0.564595,249647.52381,0.500643,0.002624,0.130888,-7.034881,0.052583,121.253167,0.373131,60.47619,0.0,1.0,84.0,"['british soul', 'pop', 'uk pop']","[british_soul, pop, uk_pop]"
208574,Adele,73CMRj62VK8nUS4ezD2wvi,Adele,0.43887,0.564595,249647.52381,0.500643,0.002624,0.130888,-7.034881,0.052583,121.253167,0.373131,60.47619,0.0,1.0,84.0,"['british soul', 'pop', 'uk pop']","[british_soul, pop, uk_pop]"
208575,Adele,0t7fVeEJxO2Xi4H2K5Svc9,Adele,0.43887,0.564595,249647.52381,0.500643,0.002624,0.130888,-7.034881,0.052583,121.253167,0.373131,60.47619,0.0,1.0,84.0,"['british soul', 'pop', 'uk pop']","[british_soul, pop, uk_pop]"
208576,Adele,2A73XBDBQgmdXO8VsXPWIs,Adele,0.43887,0.564595,249647.52381,0.500643,0.002624,0.130888,-7.034881,0.052583,121.253167,0.373131,60.47619,0.0,1.0,84.0,"['british soul', 'pop', 'uk pop']","[british_soul, pop, uk_pop]"


In [25]:
artists_genres_consolidated = artists_exploded_enriched_nonnull.groupby('id')['genres_upd'].apply(list).reset_index()

In [26]:
artists_genres_consolidated['consolidates_genre_lists'] = artists_genres_consolidated['genres_upd'].apply(lambda x: list(set(list(itertools.chain.from_iterable(x)))))

In [27]:
artists_genres_consolidated

Unnamed: 0,id,genres_upd,consolidates_genre_lists
0,000G1xMMuwxNHmwVsBdtj1,"[[candy_pop, dance_rock, new_romantic, new_wav...","[candy_pop, new_wave, rock, new_romantic, powe..."
1,000Npgk5e2SgwGaIsN3ztv,"[[classic_bollywood, classic_pakistani_pop, fi...","[sufi, indian_folk, classic_bollywood, ghazal,..."
2,000ZxLGm7jDlWCHtcXSeBe,"[[boogie-woogie, piano_blues, ragtime, stride]]","[stride, piano_blues, ragtime, boogie-woogie]"
3,000jBcNljWTnyjB4YO7ojf,[[]],[]
4,000mGrJNc2GAgQdMESdgEc,"[[classical, late_romantic_era], [historic_orc...","[late_romantic_era, classical, historic_orches..."
...,...,...,...
139532,7zxiwcqLQyWcLdThrsYklu,"[[adult_standards, bebop, big_band, cool_jazz,...","[jazz_piano, bebop, jazz, swing, lounge, vocal..."
139533,7zxmX2NsbCBNlqHBmkyWxs,"[[bebop, contemporary_post-bop, cool_jazz, har...","[hard_bop, bebop, contemporary_post-bop, jazz,..."
139534,7zzbPkvxJ6iANIxcSz60rm,"[[adult_standards, cool_jazz, jazz, lounge, sw...","[swing, jazz, lounge, vocal_jazz, cool_jazz, a..."
139535,7zzbfi8fvHe6hm342GcNYl,"[[blues_rock, country_rock, folk_rock, jam_ban...","[jam_band, folk_rock, roots_rock, country_rock..."


In [28]:
spotify_df = spotify_df.merge(artists_genres_consolidated[['id','consolidates_genre_lists']], on = 'id',how = 'left')

In [29]:
spotify_df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,...,release_date,speechiness,tempo,valence,year,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists
0,0.568,['조정현'],0.447,237688,0.215,0,2ghebdwe2pNXT4eL34T7pW,1e-06,10,0.0649,...,1989-06-15,0.0272,71.979,0.177,1989,[조정현],[],[조정현],조정현 그아픔까지사랑한거야,
1,0.939,"['김효근', 'Quinever 퀸에버']",0.174,341821,0.321,0,6ra76urfo3pTD050opXTZS,0.756,3,0.695,...,2019-10-18,0.0422,138.3,0.0554,2019,"[김효근, Quinever 퀸에버]",[],"[김효근, Quinever 퀸에버]",김효근 Promise of Thousand Years 천 년의 약속,
2,0.509,['김수철'],0.456,189245,0.321,0,4FG1uHQAVPsYiXIn3tcGRs,1e-06,4,0.156,...,1983-08-15,0.0314,79.114,0.21,1983,[김수철],[],[김수철],김수철 못다핀 꽃한송이,
3,0.134,['龔詩嘉'],0.45,184120,0.782,0,2w5L5kQVkSRdduNewYPDA5,0.0,0,0.0632,...,2015-07-31,0.0306,87.98,0.475,2015,[龔詩嘉],[],[龔詩嘉],龔詩嘉 My Island Home,
4,0.381,['黑豹'],0.353,316160,0.686,0,3KIuCzckjdeeVuswPo20mC,0.0,11,0.0568,...,1992-12-22,0.0395,200.341,0.352,1992,[黑豹],[],[黑豹],黑豹 DON'T BREAK MY HEART,"[chinese_indie, chinese_indie_rock]"


# 2. FEATURE ENGINEERING
- Normalize float variables
- OHE Year and Popularity Variables
- Create TF-IDF features off of artist genres

In [30]:
spotify_df.tail()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,...,release_date,speechiness,tempo,valence,year,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists
158279,0.0248,"[""Lil' Flip"", 'Lea']",0.814,225173,0.387,1,4s0o8TJHfX9LLHa0umnOzT,0.0,0,0.131,...,2004-03-30,0.0945,93.961,0.792,2004,"[ Flip"", ]",[Lil' Flip],"[ Flip"", ]","Flip"", Sunshine (feat. Lea)",
158280,0.0249,"[""World Class Wreckin' Cru"", ""Michel 'Le""]",0.715,351040,0.49,0,3hoiinUc5VA9xUEJID7R8V,0.00017,9,0.139,...,1994-04-06,0.0479,129.309,0.429,1994,"[ Cru"", ""Michel ]","[World Class Wreckin' Cru, Michel 'Le]","[ Cru"", ""Michel ]","Cru"", ""Michel Turn Off The Lights - Rap",
158281,0.994,"[""Vienna Boys' Choir"", 'Uwe Christian Harrer']",0.317,98000,0.0543,0,7GaUNnAdIMXrvYG7673QIK,0.0277,10,0.168,...,1996-01-01,0.0777,81.111,0.29,1996,"[ Choir"", ]",[Vienna Boys' Choir],"[ Choir"", ]","Choir"", O Tannenbaum",
158282,0.0421,"[""Rappin' 4-Tay"", 'Herm']",0.893,232267,0.577,0,0CHKRuAjf8LvCL6SxCnq7V,0.000177,7,0.119,...,1999-09-17,0.185,93.0,0.572,1999,"[ 4-Tay"", ]",[Rappin' 4-Tay],"[ 4-Tay"", ]","4-Tay"", Playa of the Year",
158283,0.0113,"[""Rappin' 4-Tay"", 'MC Breed', 'Too $hort']",0.897,337973,0.414,1,78859Af0fmA9VTlgnOHTAP,0.00011,4,0.101,...,1996,0.246,96.039,0.273,1996,"[ 4-Tay"", , , ]",[Rappin' 4-Tay],"[ 4-Tay"", , , ]","4-Tay"", Never Talk Down",


In [31]:
spotify_df['year'] = spotify_df['release_date'].apply(lambda x: x.split('-')[0])

In [32]:
float_cols = spotify_df.dtypes[spotify_df.dtypes == 'float64'].index.values

In [33]:
ohe_cols = 'popularity'

In [34]:
spotify_df['popularity'].describe()

count    158284.000000
mean         25.765175
std          21.732005
min           0.000000
25%           1.000000
50%          26.000000
75%          42.000000
max         100.000000
Name: popularity, dtype: float64

In [35]:
# create 5 point buckets for popularity 
spotify_df['popularity_red'] = spotify_df['popularity'].apply(lambda x: int(x/5))

In [36]:
# tfidf can't handle nulls so fill any null values with an empty list
spotify_df['consolidates_genre_lists'] = spotify_df['consolidates_genre_lists'].apply(lambda d: d if isinstance(d, list) else [])

In [37]:
spotify_df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,...,speechiness,tempo,valence,year,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists,popularity_red
0,0.568,['조정현'],0.447,237688,0.215,0,2ghebdwe2pNXT4eL34T7pW,1e-06,10,0.0649,...,0.0272,71.979,0.177,1989,[조정현],[],[조정현],조정현 그아픔까지사랑한거야,[],6
1,0.939,"['김효근', 'Quinever 퀸에버']",0.174,341821,0.321,0,6ra76urfo3pTD050opXTZS,0.756,3,0.695,...,0.0422,138.3,0.0554,2019,"[김효근, Quinever 퀸에버]",[],"[김효근, Quinever 퀸에버]",김효근 Promise of Thousand Years 천 년의 약속,[],0
2,0.509,['김수철'],0.456,189245,0.321,0,4FG1uHQAVPsYiXIn3tcGRs,1e-06,4,0.156,...,0.0314,79.114,0.21,1983,[김수철],[],[김수철],김수철 못다핀 꽃한송이,[],4
3,0.134,['龔詩嘉'],0.45,184120,0.782,0,2w5L5kQVkSRdduNewYPDA5,0.0,0,0.0632,...,0.0306,87.98,0.475,2015,[龔詩嘉],[],[龔詩嘉],龔詩嘉 My Island Home,[],3
4,0.381,['黑豹'],0.353,316160,0.686,0,3KIuCzckjdeeVuswPo20mC,0.0,11,0.0568,...,0.0395,200.341,0.352,1992,[黑豹],[],[黑豹],黑豹 DON'T BREAK MY HEART,"[chinese_indie, chinese_indie_rock]",6


In [38]:
#simple function to create OHE features
#this gets passed later on
def ohe_prep(df, column, new_name): 
    """ 
    Create One Hot Encoded features of a specific column

    Parameters: 
        df (pandas dataframe): Spotify Dataframe
        column (str): Column to be processed
        new_name (str): new column name to be used
        
    Returns: 
        tf_df: One hot encoded features 
    """
    
    tf_df = pd.get_dummies(df[column])
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)    
    return tf_df

In [39]:
#function to build entire feature set
def create_feature_set(df, float_cols):
    """ 
    Process spotify df to create a final set of features that will be used to generate recommendations

    Parameters: 
        df (pandas dataframe): Spotify Dataframe
        float_cols (list(str)): List of float columns that will be scaled 
        
    Returns: 
        final: final set of features 
    """
    
    #tfidf genre lists
    tfidf = TfidfVectorizer()
    tfidf_matrix =  tfidf.fit_transform(df['consolidates_genre_lists'].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names()]
    genre_df.reset_index(drop = True, inplace=True)

    #explicity_ohe = ohe_prep(df, 'explicit','exp')    
    year_ohe = ohe_prep(df, 'year','year') * 0.5
    popularity_ohe = ohe_prep(df, 'popularity_red','pop') * 0.15

    #scale float columns
    floats = df[float_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) * 0.2

    #concanenate all features
    final = pd.concat([genre_df, floats_scaled, popularity_ohe, year_ohe], axis = 1)
     
    #add song id
    final['id']=df['id'].values
    
    return final

In [40]:
complete_feature_set = create_feature_set(spotify_df, float_cols=float_cols)#.mean(axis = 0)

In [41]:
complete_feature_set.head()

Unnamed: 0,genre|432hz,genre|_hip_hop,genre|a_cappella,genre|abstract,genre|abstract_beats,genre|abstract_hip_hop,genre|accordeon,genre|accordion,genre|acid_house,genre|acid_jazz,...,year|2013,year|2014,year|2015,year|2016,year|2017,year|2018,year|2019,year|2020,year|2021,id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2ghebdwe2pNXT4eL34T7pW
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,6ra76urfo3pTD050opXTZS
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4FG1uHQAVPsYiXIn3tcGRs
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,2w5L5kQVkSRdduNewYPDA5
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3KIuCzckjdeeVuswPo20mC


# 3. Connect To Spotify API

In [42]:
client_id = '85b78072ce59404892faa25eac54f7bd'
client_secret= '96691d10b1844530bacbccdf7b8d2641'

In [43]:
scope = 'user-library-read'

if len(sys.argv) > 1:
    username = sys.argv[1]
else:
    print("Usage: %s username" % (sys.argv[0],))
    sys.exit()

In [44]:
auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [45]:
token = util.prompt_for_user_token(scope, client_id= client_id, client_secret=client_secret, redirect_uri='http://localhost:8881/')

In [46]:
sp = spotipy.Spotify(auth=token)

In [47]:
#gather playlist names and images. 
#images aren't going to be used until I start building a UI
id_name = {}
list_photo = {}
for i in sp.current_user_playlists()['items']:

    id_name[i['name']] = i['uri'].split(':')[2]
    list_photo[i['uri'].split(':')[2]] = i['images'][0]['url']

In [48]:
id_name

{'ambient shit ig?': '215wrPfRgKrwErTk64EDQY',
 'Kanye the mfking GOAT': '0GmSEaWKYg4SZW7jGvbs9e',
 'Turn the fuck up 🥵🚀💯💸🤑✨😳': '0qAtFzm2FmTivoLpQ859pC',
 'chill tf out': '6Oy8Miur1aV8PWLP6bKMpD',
 'Daily Drivers': '3OsEzFyCk5zfDuNuVH7mef'}

In [49]:
def create_necessary_outputs(playlist_name,id_dic, df):
    """ 
    Pull songs from a specific playlist.

    Parameters: 
        playlist_name (str): name of the playlist you'd like to pull from the spotify API
        id_dic (dic): dictionary that maps playlist_name to playlist_id
        df (pandas dataframe): spotify datafram
        
    Returns: 
        playlist: all songs in the playlist THAT ARE AVAILABLE IN THE KAGGLE DATASET
    """
    
    #generate playlist dataframe
    playlist = pd.DataFrame()
    playlist_name = playlist_name

    for ix, i in enumerate(sp.playlist(id_dic[playlist_name])['tracks']['items']):
        #print(i['track']['artists'][0]['name'])
        playlist.loc[ix, 'artist'] = i['track']['artists'][0]['name']
        playlist.loc[ix, 'name'] = i['track']['name']
        playlist.loc[ix, 'id'] = i['track']['id'] # ['uri'].split(':')[2]
        playlist.loc[ix, 'url'] = i['track']['album']['images'][1]['url']
        playlist.loc[ix, 'date_added'] = i['added_at']

    playlist['date_added'] = pd.to_datetime(playlist['date_added'])  
    
    playlist = playlist[playlist['id'].isin(df['id'].values)].sort_values('date_added',ascending = False)
    
    return playlist

In [50]:
id_name

{'ambient shit ig?': '215wrPfRgKrwErTk64EDQY',
 'Kanye the mfking GOAT': '0GmSEaWKYg4SZW7jGvbs9e',
 'Turn the fuck up 🥵🚀💯💸🤑✨😳': '0qAtFzm2FmTivoLpQ859pC',
 'chill tf out': '6Oy8Miur1aV8PWLP6bKMpD',
 'Daily Drivers': '3OsEzFyCk5zfDuNuVH7mef'}

In [52]:
playlist_ = create_necessary_outputs('ambient shit ig?', id_name,spotify_df)

In [53]:
from skimage import io
import matplotlib.pyplot as plt

def visualize_songs(df):
    """ 
    Visualize cover art of the songs in the inputted dataframe

    Parameters: 
        df (pandas dataframe): Playlist Dataframe
    """
    
    temp = df['url'].values
    plt.figure(figsize=(15,int(0.625 * len(temp))))
    columns = 5
    
    for i, url in enumerate(temp):
        plt.subplot(len(temp) / columns + 1, columns, i + 1)

        image = io.imread(url)
        plt.imshow(image)
        plt.xticks(color = 'w', fontsize = 0.1)
        plt.yticks(color = 'w', fontsize = 0.1)
        plt.xlabel(df['name'].values[i], fontsize = 12)
        plt.tight_layout(h_pad=0.4, w_pad=0)
        plt.subplots_adjust(wspace=None, hspace=None)

    plt.show()

In [54]:
playlist_

Unnamed: 0,artist,name,id,url,date_added
31,ODIE,North Face,63bAGRSSX2V1hhPSP2NpBC,https://i.scdn.co/image/ab67616d00001e02e8f1f5...,2021-07-21 21:51:59+00:00
29,Frank Ocean,Nikes,19YKaevk2bce4odJkP5L22,https://i.scdn.co/image/ab67616d00001e02c5649a...,2021-07-21 21:50:17+00:00
28,Frank Ocean,Seigfried,1BViPjTT585XAhkUUrkts0,https://i.scdn.co/image/ab67616d00001e02c5649a...,2021-07-21 21:49:47+00:00
27,Joji,Will He,7wvwXi9Z66dcH5t202vVkN,https://i.scdn.co/image/ab67616d00001e028e947f...,2021-07-21 21:48:18+00:00
26,Joji,SLOW DANCING IN THE DARK,0rKtyWc8bvkriBthvHKY8d,https://i.scdn.co/image/ab67616d00001e0260ba1d...,2021-07-21 21:47:42+00:00
25,Choker,Juno,6OpU3cvY19M2hFxTbLKy5L,https://i.scdn.co/image/ab67616d00001e0273ce49...,2021-07-07 09:48:29+00:00
23,Daniel Caesar,Who Hurt You?,23c9gmiiv7RCu7twft0Mym,https://i.scdn.co/image/ab67616d00001e02c70176...,2021-07-04 08:03:48+00:00
22,Peach Pit,Tommy's Party,5OuJTtNve7FxUX82eEBupN,https://i.scdn.co/image/ab67616d00001e0200b39b...,2021-07-04 08:00:34+00:00
21,Mac Ayres,Slow Down,3XstzgzP0rp3bzElEnRVHv,https://i.scdn.co/image/ab67616d00001e0222b392...,2021-07-04 08:00:21+00:00
19,COIN,Malibu 1992,0nMqu5vN44GNv8UOQxmBJK,https://i.scdn.co/image/ab67616d00001e02e086dd...,2021-07-04 07:59:37+00:00


In [55]:
visualize_songs(playlist_)

ValueError: Number of rows must be a positive integer, not 5.0

<Figure size 1080x864 with 0 Axes>

In [56]:
def generate_playlist_feature(complete_feature_set, playlist_df, weight_factor):
    """ 
    Summarize a user's playlist into a single vector

    Parameters: 
        complete_feature_set (pandas dataframe): Dataframe which includes all of the features for the spotify songs
        playlist_df (pandas dataframe): playlist dataframe
        weight_factor (float): float value that represents the recency bias. The larger the recency bias, the most priority recent songs get. Value should be close to 1. 
        
    Returns: 
        playlist_feature_set_weighted_final (pandas series): single feature that summarizes the playlist
        complete_feature_set_nonplaylist (pandas dataframe): 
    """
    
    complete_feature_set_playlist = complete_feature_set[complete_feature_set['id'].isin(playlist_df['id'].values)]#.drop('id', axis = 1).mean(axis =0)
    complete_feature_set_playlist = complete_feature_set_playlist.merge(playlist_df[['id','date_added']], on = 'id', how = 'inner')
    complete_feature_set_nonplaylist = complete_feature_set[~complete_feature_set['id'].isin(playlist_df['id'].values)]#.drop('id', axis = 1)
    
    playlist_feature_set = complete_feature_set_playlist.sort_values('date_added',ascending=False)

    most_recent_date = playlist_feature_set.iloc[0,-1]
    
    for ix, row in playlist_feature_set.iterrows():
        playlist_feature_set.loc[ix,'months_from_recent'] = int((most_recent_date.to_pydatetime() - row.iloc[-1].to_pydatetime()).days / 30)
        
    playlist_feature_set['weight'] = playlist_feature_set['months_from_recent'].apply(lambda x: weight_factor ** (-x))
    
    playlist_feature_set_weighted = playlist_feature_set.copy()
    #print(playlist_feature_set_weighted.iloc[:,:-4].columns)
    playlist_feature_set_weighted.update(playlist_feature_set_weighted.iloc[:,:-4].mul(playlist_feature_set_weighted.weight,0))
    playlist_feature_set_weighted_final = playlist_feature_set_weighted.iloc[:, :-4]
    #playlist_feature_set_weighted_final['id'] = playlist_feature_set['id']
    
    return playlist_feature_set_weighted_final.sum(axis = 0), complete_feature_set_nonplaylist

In [58]:
complete_feature_set_playlist_vector_, complete_feature_set_nonplaylist_ = generate_playlist_feature(complete_feature_set, playlist_, 1.09)
#complete_feature_set_playlist_vector_chill, complete_feature_set_nonplaylist_chill = generate_playlist_feature(complete_feature_set, playlist_chill, 

In [59]:
complete_feature_set_playlist_vector_.shape

(2551,)

In [60]:
def generate_playlist_recos(df, features, nonplaylist_features):
    """ 
    Pull songs from a specific playlist.

    Parameters: 
        df (pandas dataframe): spotify dataframe
        features (pandas series): summarized playlist feature
        nonplaylist_features (pandas dataframe): feature set of songs that are not in the selected playlist
        
    Returns: 
        non_playlist_df_top_40: Top 40 recommendations for that playlist
    """
    
    non_playlist_df = df[df['id'].isin(nonplaylist_features['id'].values)]
    non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop('id', axis = 1).values, features.values.reshape(1, -1))[:,0]
    non_playlist_df_top_40 = non_playlist_df.sort_values('sim',ascending = False).head(40)
    non_playlist_df_top_40['url'] = non_playlist_df_top_40['id'].apply(lambda x: sp.track(x)['album']['images'][1]['url'])
    
    return non_playlist_df_top_40

In [62]:
playlist_top40 = generate_playlist_recos(spotify_df, complete_feature_set_playlist_vector_, complete_feature_set_nonplaylist_)

In [67]:
playlist_top40

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,...,valence,year,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists,popularity_red,sim,url
103495,0.915,['Giveon'],0.478,195406,0.373,0,62d6YXEYxmMWAuLpw1EysL,6e-06,5,0.128,...,0.354,2018,[Giveon],[],[Giveon],Giveon Garden Kisses,"[indie_r&b, alternative_r&b]",12,0.762916,https://i.scdn.co/image/ab67616d00001e0236e776...
81053,0.606,['Khalid'],0.551,210667,0.44,0,5kfNriitmkNE8mUbZ7gbq8,3e-05,10,0.11,...,0.341,2018,[Khalid],[],[Khalid],Khalid Saturday Nights,"[pop, alternative_r&b]",14,0.751959,https://i.scdn.co/image/ab67616d00001e0260624c...
50163,0.964,['Pink Sweat$'],0.661,189000,0.14,1,3Um6KoMmiyZqHC9e4XNCoF,1.6e-05,6,0.106,...,0.306,2018,[Pink Sweat$],[],[Pink Sweat$],Pink Sweat$ Honesty,"[bedroom_soul, indie_r&b, pop, alternative_r&b]",13,0.74454,https://i.scdn.co/image/ab67616d00001e027a4e17...
11571,0.456,['UMI'],0.84,199227,0.344,0,630Ug0XtmhhFvAKo0PNuEI,3.4e-05,5,0.35,...,0.526,2018,[UMI],[],[UMI],UMI Remember Me,"[bedroom_soul, indie_r&b, pop, alternative_r&b]",14,0.744135,https://i.scdn.co/image/ab67616d00001e022b2fe5...
81063,0.0765,['Khalid'],0.596,229320,0.552,0,6zeeWid2sgw4lap2jV61PZ,0.334,0,0.104,...,0.112,2018,[Khalid],[],[Khalid],Khalid Better,"[pop, alternative_r&b]",15,0.729387,https://i.scdn.co/image/ab67616d00001e0260624c...
55882,0.454,['Omar Apollo'],0.782,127273,0.494,0,2NCBjlH7FHEG7hXcnvUaWA,6.3e-05,0,0.134,...,0.401,2018,[Omar Apollo],[],[Omar Apollo],Omar Apollo Ugotme,"[indie_pop, bedroom_soul, alternative_r&b, ind...",12,0.723151,https://i.scdn.co/image/ab67616d00001e02819ce0...
112136,0.877,['Frank Ocean'],0.24,188324,0.116,0,41cpvQ2GyGb2BRdIRSsTqK,0.00092,0,0.1,...,0.0937,2018,[Frank Ocean],[],[Frank Ocean],Frank Ocean Moon River,"[alternative_r&b, neo_soul, lgbtq+_hip_hop, hi...",13,0.685484,https://i.scdn.co/image/ab67616d00001e026566b4...
80952,0.231,['Kiana Ledé'],0.763,222387,0.537,0,47cQCF21TczFSmGfpd7c07,0.0,9,0.109,...,0.426,2018,[Kiana Ledé],[],[Kiana Ledé],Kiana Ledé EX,"[deep_pop_r&b, dance_pop, alternative_r&b, ind...",13,0.666252,https://i.scdn.co/image/ab67616d00001e026a8933...
81049,0.0626,"['Khalid', 'Swae Lee']",0.727,238893,0.72,1,5jyyPsIGM2yqkZN9R3TmvN,1e-06,11,0.176,...,0.589,2018,"[Khalid, Swae Lee]",[],"[Khalid, Swae Lee]",Khalid The Ways (with Swae Lee),"[trap, pop, alternative_r&b]",12,0.666226,https://i.scdn.co/image/ab67616d00001e02c027ad...
40395,0.272,['Sabrina Claudio'],0.666,209885,0.39,0,3cQmqM0awej9iAESjhKrI9,9e-06,5,0.111,...,0.325,2017,[Sabrina Claudio],[],[Sabrina Claudio],Sabrina Claudio Confidently Lost,"[indie_r&b, pop, alternative_r&b]",12,0.663356,https://i.scdn.co/image/ab67616d00001e02ca1582...
