In [11]:
#install required packages
!pip install textblob
!pip install spotipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
#imports
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.oauth2 as oauth2
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import re
import json
import csv
import time

## Part 1: Turn data from json to csv

In [13]:
#access files from Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
#just try for one .json file right now

with open('/content/gdrive/MyDrive/2022-23/Final Project/mpd.slice.0-999.json', 'r') as json_file:
    data = json.load(json_file)

# with open('/content/gdrive/MyDrive/pic16b/mpd.slice.0-999.json', 'r') as json_file:
#      data = json.load(json_file)

playlists = data['playlists']

tracks = []

for playlist in playlists:
    playlist_name = playlist['name']
    playlist_tracks = playlist['tracks']
    
    for track in playlist_tracks:
     
        track_info = {
            'playlist_name': playlist_name,
            'track_name': track['track_name'],
            'artist_name': track['artist_name'],
            'album_name': track['album_name'],
            'track_uri': track['track_uri']
        }

        tracks.append(track_info)

fieldnames = ['playlist_name', 'track_name', 'artist_name', 'album_name', 'track_uri']

with open('output.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(tracks)

In [5]:
rawdf = pd.read_csv('output.csv')

In [6]:
rawdf.head()

Unnamed: 0,playlist_name,track_name,artist_name,album_name,track_uri
0,Throwbacks,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott,The Cookbook,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,Throwbacks,Toxic,Britney Spears,In The Zone,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,Throwbacks,Crazy In Love,Beyoncé,Dangerously In Love (Alben für die Ewigkeit),spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,Throwbacks,Rock Your Body,Justin Timberlake,Justified,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,Throwbacks,It Wasn't Me,Shaggy,Hot Shot,spotify:track:1lzr43nnXAijIGYnCT8M8H


## Part 2: Data Preprocessing and Additional Data Acquisition

### Remove duplicates

In [44]:
# Drop song duplicates
def drop_duplicates(df):
    '''
    Input: full dataframe of songs with possible duplicates
    Output: dataframe of songs with duplicates dropped
    '''
    df['songartistconcat'] = df.apply(lambda row: row['artist_name']+row['track_name'],axis = 1)
    return df.drop_duplicates('songartistconcat')

rawdf= drop_duplicates(rawdf)
rawdf.reset_index(drop = True, inplace=True)

#test
print("Are all songs unique: ",len(pd.unique(rawdf.songartistconcat))==len(rawdf))

In [8]:
# get audio features from track uri and create a data frame with artist/track name plus audio features

client_id = '4e23ee764036488a96b598e3e14bea67'#4a8cd8bc834b4c2bb0aac4cc89d4d990'
client_secret = 'b5b133920a274771b15b5e9b6ccf371a' #'321d379d39964c94810f3bb52e8915b1'
redirect_uri = 'https://localhost:3001'

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

track_uris = rawdf['track_uri'].tolist()

In [9]:
#get 100 track_ids for audio_features() function
hundred_uri_chunks = [track_uris[i:i + 100] for i in range(0, len(track_uris), 100)]

audio_features = []

for chunk in hundred_uri_chunks:
    chunk_features = sp.audio_features(chunk)
    audio_features.extend(chunk_features)

df_audio_features = pd.concat([rawdf, pd.DataFrame(audio_features)], axis=1)

In [10]:
df_audio_features

Unnamed: 0,playlist_name,track_name,artist_name,album_name,track_uri,songartistconcat,danceability,energy,key,loudness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,Throwbacks,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott,The Cookbook,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy ElliottLose Control (feat. Ciara & Fat M...,0.904,0.813,4,-7.105,...,0.0471,0.810,125.461,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4
1,Throwbacks,Toxic,Britney Spears,In The Zone,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney SpearsToxic,0.774,0.838,5,-3.914,...,0.2420,0.924,143.040,audio_features,6I9VzXrHxO9rA9A5euc8Ak,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,https://api.spotify.com/v1/tracks/6I9VzXrHxO9r...,https://api.spotify.com/v1/audio-analysis/6I9V...,198800,4
2,Throwbacks,Crazy In Love,Beyoncé,Dangerously In Love (Alben für die Ewigkeit),spotify:track:0WqIKmW4BTrj3eJFmnCKMv,BeyoncéCrazy In Love,0.664,0.758,2,-6.583,...,0.0598,0.701,99.259,audio_features,0WqIKmW4BTrj3eJFmnCKMv,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,https://api.spotify.com/v1/tracks/0WqIKmW4BTrj...,https://api.spotify.com/v1/audio-analysis/0WqI...,235933,4
3,Throwbacks,Rock Your Body,Justin Timberlake,Justified,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin TimberlakeRock Your Body,0.892,0.714,4,-6.055,...,0.0521,0.817,100.972,audio_features,1AWQoqb9bSvzTjaLralEkT,spotify:track:1AWQoqb9bSvzTjaLralEkT,https://api.spotify.com/v1/tracks/1AWQoqb9bSvz...,https://api.spotify.com/v1/audio-analysis/1AWQ...,267267,4
4,Throwbacks,It Wasn't Me,Shaggy,Hot Shot,spotify:track:1lzr43nnXAijIGYnCT8M8H,ShaggyIt Wasn't Me,0.853,0.606,0,-4.596,...,0.3130,0.654,94.759,audio_features,1lzr43nnXAijIGYnCT8M8H,spotify:track:1lzr43nnXAijIGYnCT8M8H,https://api.spotify.com/v1/tracks/1lzr43nnXAij...,https://api.spotify.com/v1/audio-analysis/1lzr...,227600,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34245,thinking of you,I Don't Know,Jon D,Roots,spotify:track:3uCHI1gfOUL5j5swEh0TcH,Jon DI Don't Know,0.669,0.228,2,-12.119,...,0.0944,0.402,83.024,audio_features,3uCHI1gfOUL5j5swEh0TcH,spotify:track:3uCHI1gfOUL5j5swEh0TcH,https://api.spotify.com/v1/tracks/3uCHI1gfOUL5...,https://api.spotify.com/v1/audio-analysis/3uCH...,189184,4
34246,thinking of you,The Answer,Big Words,"Hollywood, a Beautiful Coincidence",spotify:track:0P1oO2gREMYUCoOkzYAyFu,Big WordsThe Answer,0.493,0.727,1,-5.031,...,0.1290,0.289,73.259,audio_features,0P1oO2gREMYUCoOkzYAyFu,spotify:track:0P1oO2gREMYUCoOkzYAyFu,https://api.spotify.com/v1/tracks/0P1oO2gREMYU...,https://api.spotify.com/v1/audio-analysis/0P1o...,263680,4
34247,thinking of you,25.22,Allan Rayman,Roadhouse 01,spotify:track:2oM4BuruDnEvk59IvIXCwn,Allan Rayman25.22,0.702,0.524,7,-10.710,...,0.2980,0.265,140.089,audio_features,2oM4BuruDnEvk59IvIXCwn,spotify:track:2oM4BuruDnEvk59IvIXCwn,https://api.spotify.com/v1/tracks/2oM4BuruDnEv...,https://api.spotify.com/v1/audio-analysis/2oM4...,189213,4
34248,thinking of you,Good Feeling,Jon Jason,Good Feeling,spotify:track:4Ri5TTUgjM96tbQZd5Ua7V,Jon JasonGood Feeling,0.509,0.286,8,-14.722,...,0.1310,0.259,121.633,audio_features,4Ri5TTUgjM96tbQZd5Ua7V,spotify:track:4Ri5TTUgjM96tbQZd5Ua7V,https://api.spotify.com/v1/tracks/4Ri5TTUgjM96...,https://api.spotify.com/v1/audio-analysis/4Ri5...,194720,4


In [11]:
#get 50 track_ids at a time for tracks(),  
fifty_uri_chunks = [track_uris[i:i + 50] for i in range(0, len(track_uris), 50)]

misc_info = []

for i, chunk in enumerate(fifty_uri_chunks):
    chunk_tracks = sp.tracks(chunk)['tracks']
    misc_info.extend(chunk_tracks)
    #use time.sleep() to avoid surpassing rate limit from API when scaling for more data
    #time.sleep(5)

misc_info = [item for item in misc_info if item is not None]  # filter out None values
misc_info=pd.DataFrame(misc_info)

track_popularity=misc_info["popularity"]
artists=misc_info["artists"]
artist_genres = []
artist_popularity = []
artist_ids = []
for artist in artists:
  #take main artist for each song
  artist_ids.append(artist[0]["id"])

#get 50 artist_ids at a time for artists(),  
fifty_artist_uri_chunks = [artist_ids[i:i + 50] for i in range(0, len(artist_ids), 50)]
for i, chunk in enumerate(fifty_artist_uri_chunks):
    artist_info=sp.artists(chunk)
    artist_info=pd.DataFrame(artist_info["artists"])
    artist_popularity.extend(artist_info["popularity"])
    artist_genres.extend(artist_info["genres"])

In [12]:
#concatenate all the relevant columns together 
df_full = pd.concat([track_popularity,df_audio_features, pd.Series(artist_genres), pd.Series(artist_popularity)], axis=1)

#rename added columns
df_full=df_full.rename(columns={0: 'genres', 1:'artist_popularity',"popularity":"track_popularity"})

In [17]:
#save as csv
#df_full.to_csv('df_full.csv')

In [13]:
df_full.shape

(34250, 27)

In [14]:
df_full.tail()

Unnamed: 0,track_popularity,playlist_name,track_name,artist_name,album_name,track_uri,songartistconcat,danceability,energy,key,...,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,genres,artist_popularity
34245,49.0,thinking of you,I Don't Know,Jon D,Roots,spotify:track:3uCHI1gfOUL5j5swEh0TcH,Jon DI Don't Know,0.669,0.228,2,...,83.024,audio_features,3uCHI1gfOUL5j5swEh0TcH,spotify:track:3uCHI1gfOUL5j5swEh0TcH,https://api.spotify.com/v1/tracks/3uCHI1gfOUL5...,https://api.spotify.com/v1/audio-analysis/3uCH...,189184,4,[canadian contemporary r&b],52.0
34246,14.0,thinking of you,The Answer,Big Words,"Hollywood, a Beautiful Coincidence",spotify:track:0P1oO2gREMYUCoOkzYAyFu,Big WordsThe Answer,0.493,0.727,1,...,73.259,audio_features,0P1oO2gREMYUCoOkzYAyFu,spotify:track:0P1oO2gREMYUCoOkzYAyFu,https://api.spotify.com/v1/tracks/0P1oO2gREMYU...,https://api.spotify.com/v1/audio-analysis/0P1o...,263680,4,[],2.0
34247,31.0,thinking of you,25.22,Allan Rayman,Roadhouse 01,spotify:track:2oM4BuruDnEvk59IvIXCwn,Allan Rayman25.22,0.702,0.524,7,...,140.089,audio_features,2oM4BuruDnEvk59IvIXCwn,spotify:track:2oM4BuruDnEvk59IvIXCwn,https://api.spotify.com/v1/tracks/2oM4BuruDnEv...,https://api.spotify.com/v1/audio-analysis/2oM4...,189213,4,"[indie poptimism, modern alternative rock, mod...",50.0
34248,,thinking of you,Good Feeling,Jon Jason,Good Feeling,spotify:track:4Ri5TTUgjM96tbQZd5Ua7V,Jon JasonGood Feeling,0.509,0.286,8,...,121.633,audio_features,4Ri5TTUgjM96tbQZd5Ua7V,spotify:track:4Ri5TTUgjM96tbQZd5Ua7V,https://api.spotify.com/v1/tracks/4Ri5TTUgjM96...,https://api.spotify.com/v1/audio-analysis/4Ri5...,194720,4,,
34249,,thinking of you,Cosmic Angel - Acoustic From Capitol Studios,Grizfolk,Cosmic Angel,spotify:track:5RVuBrXVLptAEbGJdSDzL5,GrizfolkCosmic Angel - Acoustic From Capitol S...,0.639,0.461,6,...,117.583,audio_features,5RVuBrXVLptAEbGJdSDzL5,spotify:track:5RVuBrXVLptAEbGJdSDzL5,https://api.spotify.com/v1/tracks/5RVuBrXVLptA...,https://api.spotify.com/v1/audio-analysis/5RVu...,257195,4,,


### Feature Selection

In [40]:
#read in NOT preprocessed dataset from spotify and preprocess 
df_full=pd.read_csv('/content/gdrive/MyDrive/2022-23/Final Project/df_full.csv')
df_full

Unnamed: 0.1,Unnamed: 0,track_popularity,playlist_name,track_name,artist_name,album_name,track_uri,songartistconcat,danceability,energy,...,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,genres,artist_popularity
0,0,69.0,Throwbacks,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott,The Cookbook,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy ElliottLose Control (feat. Ciara & Fat M...,0.904,0.813,...,125.461,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,"['dance pop', 'hip hop', 'hip pop', 'neo soul'...",72.0
1,1,84.0,Throwbacks,Toxic,Britney Spears,In The Zone,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney SpearsToxic,0.774,0.838,...,143.040,audio_features,6I9VzXrHxO9rA9A5euc8Ak,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,https://api.spotify.com/v1/tracks/6I9VzXrHxO9r...,https://api.spotify.com/v1/audio-analysis/6I9V...,198800,4,"['dance pop', 'pop']",80.0
2,2,21.0,Throwbacks,Crazy In Love,Beyoncé,Dangerously In Love (Alben für die Ewigkeit),spotify:track:0WqIKmW4BTrj3eJFmnCKMv,BeyoncéCrazy In Love,0.664,0.758,...,99.259,audio_features,0WqIKmW4BTrj3eJFmnCKMv,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,https://api.spotify.com/v1/tracks/0WqIKmW4BTrj...,https://api.spotify.com/v1/audio-analysis/0WqI...,235933,4,"['pop', 'r&b']",87.0
3,3,79.0,Throwbacks,Rock Your Body,Justin Timberlake,Justified,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin TimberlakeRock Your Body,0.892,0.714,...,100.972,audio_features,1AWQoqb9bSvzTjaLralEkT,spotify:track:1AWQoqb9bSvzTjaLralEkT,https://api.spotify.com/v1/tracks/1AWQoqb9bSvz...,https://api.spotify.com/v1/audio-analysis/1AWQ...,267267,4,"['dance pop', 'pop']",80.0
4,4,0.0,Throwbacks,It Wasn't Me,Shaggy,Hot Shot,spotify:track:1lzr43nnXAijIGYnCT8M8H,ShaggyIt Wasn't Me,0.853,0.606,...,94.759,audio_features,1lzr43nnXAijIGYnCT8M8H,spotify:track:1lzr43nnXAijIGYnCT8M8H,https://api.spotify.com/v1/tracks/1lzr43nnXAij...,https://api.spotify.com/v1/audio-analysis/1lzr...,227600,4,"['dance pop', 'pop rap', 'reggae fusion']",73.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34245,34245,49.0,thinking of you,I Don't Know,Jon D,Roots,spotify:track:3uCHI1gfOUL5j5swEh0TcH,Jon DI Don't Know,0.669,0.228,...,83.024,audio_features,3uCHI1gfOUL5j5swEh0TcH,spotify:track:3uCHI1gfOUL5j5swEh0TcH,https://api.spotify.com/v1/tracks/3uCHI1gfOUL5...,https://api.spotify.com/v1/audio-analysis/3uCH...,189184,4,['canadian contemporary r&b'],52.0
34246,34246,14.0,thinking of you,The Answer,Big Words,"Hollywood, a Beautiful Coincidence",spotify:track:0P1oO2gREMYUCoOkzYAyFu,Big WordsThe Answer,0.493,0.727,...,73.259,audio_features,0P1oO2gREMYUCoOkzYAyFu,spotify:track:0P1oO2gREMYUCoOkzYAyFu,https://api.spotify.com/v1/tracks/0P1oO2gREMYU...,https://api.spotify.com/v1/audio-analysis/0P1o...,263680,4,[],2.0
34247,34247,31.0,thinking of you,25.22,Allan Rayman,Roadhouse 01,spotify:track:2oM4BuruDnEvk59IvIXCwn,Allan Rayman25.22,0.702,0.524,...,140.089,audio_features,2oM4BuruDnEvk59IvIXCwn,spotify:track:2oM4BuruDnEvk59IvIXCwn,https://api.spotify.com/v1/tracks/2oM4BuruDnEv...,https://api.spotify.com/v1/audio-analysis/2oM4...,189213,4,"['indie poptimism', 'modern alternative rock',...",50.0
34248,34248,,thinking of you,Good Feeling,Jon Jason,Good Feeling,spotify:track:4Ri5TTUgjM96tbQZd5Ua7V,Jon JasonGood Feeling,0.509,0.286,...,121.633,audio_features,4Ri5TTUgjM96tbQZd5Ua7V,spotify:track:4Ri5TTUgjM96tbQZd5Ua7V,https://api.spotify.com/v1/tracks/4Ri5TTUgjM96...,https://api.spotify.com/v1/audio-analysis/4Ri5...,194720,4,,


In [41]:
# df_full = drop_duplicates(df_full)
# print("Are all songs unique: ",len(pd.unique(df_full.songartistconcat))==len(df_full))
# df_full.reset_index(drop = True, inplace=True)

# drop columns that we are not using to calculate cosine similarity
drop=['track_uri','songartistconcat','type','uri','track_href','analysis_url','duration_ms',]
df_full=df_full.drop(drop,axis=1)


In [42]:
df_full.shape

(34250, 21)

In [50]:
#drop na's, just in case
df=df_full.dropna()
df.reset_index(drop = True, inplace=True)
df.shape
#df.drop(0,axis=1)

(34248, 21)

In [52]:
df

Unnamed: 0.1,Unnamed: 0,track_popularity,playlist_name,track_name,artist_name,album_name,danceability,energy,key,loudness,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,time_signature,genres,artist_popularity
0,0,69.0,Throwbacks,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott,The Cookbook,0.904,0.813,4,-7.105,...,0.1210,0.03110,0.006970,0.0471,0.810,125.461,0UaMYEvWZi0ZqiDOoHU3YI,4,"['dance pop', 'hip hop', 'hip pop', 'neo soul'...",72.0
1,1,84.0,Throwbacks,Toxic,Britney Spears,In The Zone,0.774,0.838,5,-3.914,...,0.1140,0.02490,0.025000,0.2420,0.924,143.040,6I9VzXrHxO9rA9A5euc8Ak,4,"['dance pop', 'pop']",80.0
2,2,21.0,Throwbacks,Crazy In Love,Beyoncé,Dangerously In Love (Alben für die Ewigkeit),0.664,0.758,2,-6.583,...,0.2100,0.00238,0.000000,0.0598,0.701,99.259,0WqIKmW4BTrj3eJFmnCKMv,4,"['pop', 'r&b']",87.0
3,3,79.0,Throwbacks,Rock Your Body,Justin Timberlake,Justified,0.892,0.714,4,-6.055,...,0.1410,0.20100,0.000234,0.0521,0.817,100.972,1AWQoqb9bSvzTjaLralEkT,4,"['dance pop', 'pop']",80.0
4,4,0.0,Throwbacks,It Wasn't Me,Shaggy,Hot Shot,0.853,0.606,0,-4.596,...,0.0713,0.05610,0.000000,0.3130,0.654,94.759,1lzr43nnXAijIGYnCT8M8H,4,"['dance pop', 'pop rap', 'reggae fusion']",73.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34243,34243,22.0,thinking of you,Fragile,ROZES,Burn Wild,0.458,0.650,5,-6.581,...,0.0333,0.17400,0.000000,0.0865,0.341,77.953,4e7E3rBA7axwmPmCc0I2XA,4,['channel pop'],41.0
34244,34244,37.0,thinking of you,Diamond Child,Aayushi,Diamond Child,0.416,0.394,11,-9.269,...,0.0641,0.51300,0.001550,0.0988,0.131,81.988,1msfqzqHggvi1mlCT4Z7O5,4,['australian r&b'],40.0
34245,34245,49.0,thinking of you,I Don't Know,Jon D,Roots,0.669,0.228,2,-12.119,...,0.0690,0.79200,0.065000,0.0944,0.402,83.024,3uCHI1gfOUL5j5swEh0TcH,4,['canadian contemporary r&b'],52.0
34246,34246,14.0,thinking of you,The Answer,Big Words,"Hollywood, a Beautiful Coincidence",0.493,0.727,1,-5.031,...,0.2170,0.08730,0.000000,0.1290,0.289,73.259,0P1oO2gREMYUCoOkzYAyFu,4,[],2.0


In [60]:
#convert genres column from string to list of genres

df['genres_list']=df['genres'].apply(lambda x: str(x).split(", "))

print((df['genres_list']))

0        [['dance pop', 'hip hop', 'hip pop', 'neo soul...
1                                   [['dance pop', 'pop']]
2                                         [['pop', 'r&b']]
3                                   [['dance pop', 'pop']]
4              [['dance pop', 'pop rap', 'reggae fusion']]
                               ...                        
34243                                    [['channel pop']]
34244                                 [['australian r&b']]
34245                      [['canadian contemporary r&b']]
34246                                                 [[]]
34247    [['indie poptimism', 'modern alternative rock'...
Name: genres_list, Length: 34248, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['genres_list']=df['genres'].apply(lambda x: str(x).split(", "))


## Part 3: Feature Engineering

In [53]:
def ohe_prep(df, column, new_name): 
    ''' 
    Create one-hot-encoded features of a specific column
    
    Input: 
    df (pandas dataframe): Spotify Dataframe
    column (str): Column to be processed
    new_name (str): new column name to be used
        
    Output: 
    tf_df: One-hot encoded features 
    '''
    
    #get_dummies() converts categorical variable into dummy/indicator variables
    tf_df = pd.get_dummies(df[column])
    print(tf_df)
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)    
    return tf_df

In [58]:
type(df['genres'][0])

str

Genres - one hot encoding, tf-idf version

In [62]:
# # TF-IDF implementation: find most important genre for each song and that genre's prevalence across all songs to weight genre accordingly

# #function from scikit-learn
# tfidf = TfidfVectorizer()

# #get weighted values for each genre
# tfidf_matrix =  tfidf.fit_transform(df['genres_list'].apply(lambda x: " ".join(x)))
# genre_df = pd.DataFrame(tfidf_matrix.toarray())

# genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names_out()]
# if 'genre|unknown' in genre_df.columns:
#   genre_df.drop(columns='genre|unknown') # Drop unknown genre

# #delete index col
# genre_df.reset_index(drop = True, inplace=True)

# #view for first row
# print(genre_df.head())

   genre|21st  genre|432hz  genre|abstract  genre|acid  genre|acousmatic  \
0         0.0          0.0             0.0         0.0               0.0   
1         0.0          0.0             0.0         0.0               0.0   
2         0.0          0.0             0.0         0.0               0.0   
3         0.0          0.0             0.0         0.0               0.0   
4         0.0          0.0             0.0         0.0               0.0   

   genre|acoustic  genre|action  genre|adoracion  genre|adult  \
0             0.0           0.0              0.0          0.0   
1             0.0           0.0              0.0          0.0   
2             0.0           0.0              0.0          0.0   
3             0.0           0.0              0.0          0.0   
4             0.0           0.0              0.0          0.0   

   genre|adventista  ...  genre|ye  genre|yodeling  genre|york  genre|youth  \
0               0.0  ...       0.0             0.0         0.0          0

Song and Artist Popularity -  normalization

In [None]:
# #artist normalization
# artist_pop = df[["artist_popularity"]].reset_index(drop = True)
# #from scikit-learn
# scaler = MinMaxScaler()
# artist_pop_scaled = pd.DataFrame(scaler.fit_transform(artist_pop), columns = artist_pop.columns)

# print(artist_pop_scaled.head())

# #track normalization
# track_pop = df[["track_popularity"]].reset_index(drop = True)
# #from scikit-learn
# scaler = MinMaxScaler()
# track_pop_scaled = pd.DataFrame(scaler.fit_transform(track_pop), columns = track_pop.columns)

# print(track_pop_scaled.head())

   artist_popularity
0               0.72
1               0.80
2               0.87
3               0.80
4               0.73
   track_popularity
0          0.734043
1          0.893617
2          0.223404
3          0.840426
4          0.000000


Audio features - normalization to largest and smallest values of each feature

In [None]:
# df.head()

Unnamed: 0,track_popularity,playlist_name,track_name,artist_name,album_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,time_signature,genres,artist_popularity
0,69.0,Throwbacks,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott,The Cookbook,0.904,0.813,4,-7.105,0,0.121,0.0311,0.00697,0.0471,0.81,125.461,0UaMYEvWZi0ZqiDOoHU3YI,4,"[dance pop, hip hop, hip pop, neo soul, pop ra...",72.0
1,84.0,Throwbacks,Toxic,Britney Spears,In The Zone,0.774,0.838,5,-3.914,0,0.114,0.0249,0.025,0.242,0.924,143.04,6I9VzXrHxO9rA9A5euc8Ak,4,"[dance pop, pop]",80.0
2,21.0,Throwbacks,Crazy In Love,Beyoncé,Dangerously In Love (Alben für die Ewigkeit),0.664,0.758,2,-6.583,0,0.21,0.00238,0.0,0.0598,0.701,99.259,0WqIKmW4BTrj3eJFmnCKMv,4,"[pop, r&b]",87.0
3,79.0,Throwbacks,Rock Your Body,Justin Timberlake,Justified,0.892,0.714,4,-6.055,0,0.141,0.201,0.000234,0.0521,0.817,100.972,1AWQoqb9bSvzTjaLralEkT,4,"[dance pop, pop]",80.0
4,0.0,Throwbacks,It Wasn't Me,Shaggy,Hot Shot,0.853,0.606,0,-4.596,1,0.0713,0.0561,0.0,0.313,0.654,94.759,1lzr43nnXAijIGYnCT8M8H,4,"[dance pop, pop rap, reggae fusion]",73.0


In [None]:
# # one-hot encoding
# key_ohe = ohe_prep(df, 'key','key') * 0.5 #keep data range in same range as other scaled numbers
# mode_ohe = ohe_prep(df, 'mode','mode') * 0.5
# time_signature = ohe_prep(df,'time_signature',"time_signature") * 0.5

# # scale audio columns
# audiofeature_cols=['danceability','energy','key','loudness','speechiness','acousticness','instrumentalness','liveness','valence','tempo']
# floats = df[audiofeature_cols].reset_index(drop = True)
# scaler = MinMaxScaler()
# floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) #* 0.2
# floats_scaled

       0   1   2   3   4   5   6   7   8   9   10  11
0       0   0   0   0   1   0   0   0   0   0   0   0
1       0   0   0   0   0   1   0   0   0   0   0   0
2       0   0   1   0   0   0   0   0   0   0   0   0
3       0   0   0   0   1   0   0   0   0   0   0   0
4       1   0   0   0   0   0   0   0   0   0   0   0
...    ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ..
34243   0   0   0   0   0   1   0   0   0   0   0   0
34244   0   0   0   0   0   0   0   0   0   0   0   1
34245   0   0   1   0   0   0   0   0   0   0   0   0
34246   0   1   0   0   0   0   0   0   0   0   0   0
34247   0   0   0   0   0   0   0   1   0   0   0   0

[34248 rows x 12 columns]
       0  1
0      1  0
1      1  0
2      1  0
3      1  0
4      0  1
...   .. ..
34243  0  1
34244  0  1
34245  0  1
34246  0  1
34247  0  1

[34248 rows x 2 columns]
       0  1  3  4  5
0      0  0  0  1  0
1      0  0  0  1  0
2      0  0  0  1  0
3      0  0  0  1  0
4      0  0  0  1  0
...   .. .. .. .. ..
34243  0

Unnamed: 0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.914980,0.813,0.363636,0.842733,0.125780,0.031225,0.007005,0.0471,0.811623,0.572105
1,0.783401,0.838,0.454545,0.893573,0.118503,0.025000,0.025126,0.2420,0.925852,0.652266
2,0.672065,0.758,0.181818,0.851050,0.218295,0.002390,0.000000,0.0598,0.702405,0.452624
3,0.902834,0.714,0.363636,0.859462,0.146570,0.201807,0.000235,0.0521,0.818637,0.460435
4,0.863360,0.606,0.000000,0.882707,0.074116,0.056325,0.000000,0.3130,0.655311,0.432103
...,...,...,...,...,...,...,...,...,...,...
34243,0.463563,0.650,0.454545,0.851082,0.034615,0.174699,0.000000,0.0865,0.341683,0.355468
34244,0.421053,0.394,1.000000,0.808256,0.066632,0.515060,0.001558,0.0988,0.131263,0.373867
34245,0.677126,0.228,0.181818,0.762849,0.071726,0.795181,0.065327,0.0944,0.402806,0.378592
34246,0.498988,0.727,0.090909,0.875777,0.225572,0.087651,0.000000,0.1290,0.289579,0.334063


Text Sentiment analysis - for track/album name

In [47]:
def getSubjectivity(text):
  '''
  Getting the Text Subjectivity using TextBlob
  '''
  return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
  '''
  Getting the Text Polarity using TextBlob
  '''
  return TextBlob(text).sentiment.polarity

def getAnalysis(score, task="polarity"):
  '''
  Categorizing the Polarity & Subjectivity score (3 categories)
  '''
  if task == "subjectivity":
    if score < 1/3:
      return "low"
    elif score > 1/3:
      return "high"
    else:
      return "medium"
  else:
    if score < 0:
      return 'Negative'
    elif score == 0:
      return 'Neutral'
    else:
      return 'Positive'

def sentiment_analysis(df, text_col):
  '''
  Perform sentiment analysis on text
  ---
  Input:
  df (pandas dataframe): Dataframe of interest
  text_col (str): column of interest
  '''
  df['subjectivity'] = df[text_col].apply(getSubjectivity).apply(lambda x: getAnalysis(x,"subjectivity"))
  df['polarity'] = df[text_col].apply(getPolarity).apply(getAnalysis)
  return df

In [None]:
# # Sentiment analysis
# track_sentiment = sentiment_analysis(df, "track_name")
# album_sentiment = sentiment_analysis(df, "album_name")

# #ohe for sentiment analysis data
# track_subject_ohe = ohe_prep(track_sentiment, 'subjectivity','subjectivity') * 0.25 #weigh less because sentiment analysis less effective on short text
# track_polar_ohe = ohe_prep(track_sentiment, 'polarity','polarity') * 0.25
# album_subject_ohe = ohe_prep(album_sentiment, 'subjectivity','subjectivity') * 0.25 #weigh less because sentiment analysis less effective on short text
# album_polar_ohe = ohe_prep(album_sentiment, 'polarity','polarity') * 0.25

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subjectivity'] = df[text_col].apply(getSubjectivity).apply(lambda x: getAnalysis(x,"subjectivity"))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['polarity'] = df[text_col].apply(getPolarity).apply(getAnalysis)


       high  low  medium
0         0    1       0
1         0    1       0
2         1    0       0
3         1    0       0
4         1    0       0
...     ...  ...     ...
34243     1    0       0
34244     0    1       0
34245     0    1       0
34246     1    0       0
34247     0    1       0

[34248 rows x 3 columns]
       Negative  Neutral  Positive
0             0        1         0
1             0        1         0
2             0        0         1
3             0        0         1
4             0        0         1
...         ...      ...       ...
34243         0        0         1
34244         0        1         0
34245         0        1         0
34246         0        0         1
34247         0        1         0

[34248 rows x 3 columns]
       high  low  medium
0         0    1       0
1         0    1       0
2         1    0       0
3         1    0       0
4         1    0       0
...     ...  ...     ...
34243     1    0       0
34244     0    1       0
342

Putting it all together:

In [63]:
def create_feature_set(df):
    '''
    Process spotify df to create a final set of features that will be used to generate recommendations
    ---
    Input: 
    df (pandas dataframe): Spotify Dataframe
    float_cols (list(str)): List of float columns that will be scaled
            
    Output: 
    final (pandas dataframe): Final set of features 
    '''
    
    # TF-IDF implementation: find most important genre for each song and that genre's prevalence across all songs to weight genre accordingly
    #function from scikit-learn
    tfidf = TfidfVectorizer()
    #get weighted values for each genre
    tfidf_matrix =  tfidf.fit_transform(df['genres_list'].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names_out()]
    if 'genre|unknown' in genre_df.columns:
      genre_df.drop(columns='genre|unknown') # Drop unknown genre
    #reset index col
    genre_df.reset_index(drop = True, inplace=True)

    # one-hot encoding
    key_ohe = ohe_prep(df, 'key','key') * 0.5 #keep data range in same range as other scaled numbers
    mode_ohe = ohe_prep(df, 'mode','mode') * 0.5
    time_signature = ohe_prep(df,'time_signature',"time_signature") * 0.5

    # Sentiment analysis
    track_sentiment = sentiment_analysis(df, "track_name")
    album_sentiment = sentiment_analysis(df, "album_name")
    #ohe for sentiment analysis data
    track_subject_ohe = ohe_prep(track_sentiment, 'subjectivity','subjectivity') * 0.25 #weigh less because sentiment analysis less effective on short text
    track_polar_ohe = ohe_prep(track_sentiment, 'polarity','polarity') * 0.25
    album_subject_ohe = ohe_prep(album_sentiment, 'subjectivity','subjectivity') * 0.25 
    album_polar_ohe = ohe_prep(album_sentiment, 'polarity','polarity') * 0.25

    # scale audio columns
    audiofeature_cols=['danceability','energy','key','loudness','speechiness','acousticness','instrumentalness','liveness','valence','tempo']
    floats = df[audiofeature_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) #* 0.2

    #artist pop normalization
    artist_pop = df[["artist_popularity"]].reset_index(drop = True)
    scaler = MinMaxScaler() #from scikit-learn
    artist_pop_scaled = pd.DataFrame(scaler.fit_transform(artist_pop), columns = artist_pop.columns)

    #track pop normalization
    track_pop = df[["track_popularity"]].reset_index(drop = True)
    scaler = MinMaxScaler() #from scikit-learn
    track_pop_scaled = pd.DataFrame(scaler.fit_transform(track_pop), columns = track_pop.columns)

    # Concanenate all features
    final = pd.concat([genre_df, key_ohe, mode_ohe, time_signature, track_subject_ohe, track_polar_ohe, album_subject_ohe, album_polar_ohe, floats_scaled, artist_pop_scaled, track_pop_scaled], axis = 1)
    
    # Add song id
    final['id']=df['id'].values
    
    return final

In [64]:
final_df=create_feature_set(df)

       0   1   2   3   4   5   6   7   8   9   10  11
0       0   0   0   0   1   0   0   0   0   0   0   0
1       0   0   0   0   0   1   0   0   0   0   0   0
2       0   0   1   0   0   0   0   0   0   0   0   0
3       0   0   0   0   1   0   0   0   0   0   0   0
4       1   0   0   0   0   0   0   0   0   0   0   0
...    ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ..
34243   0   0   0   0   0   1   0   0   0   0   0   0
34244   0   0   0   0   0   0   0   0   0   0   0   1
34245   0   0   1   0   0   0   0   0   0   0   0   0
34246   0   1   0   0   0   0   0   0   0   0   0   0
34247   0   0   0   0   0   0   0   1   0   0   0   0

[34248 rows x 12 columns]
       0  1
0      1  0
1      1  0
2      1  0
3      1  0
4      0  1
...   .. ..
34243  0  1
34244  0  1
34245  0  1
34246  0  1
34247  0  1

[34248 rows x 2 columns]
       0  1  3  4  5
0      0  0  0  1  0
1      0  0  0  1  0
2      0  0  0  1  0
3      0  0  0  1  0
4      0  0  0  1  0
...   .. .. .. .. ..
34243  0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subjectivity'] = df[text_col].apply(getSubjectivity).apply(lambda x: getAnalysis(x,"subjectivity"))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['polarity'] = df[text_col].apply(getPolarity).apply(getAnalysis)


       high  low  medium
0         0    1       0
1         0    1       0
2         1    0       0
3         1    0       0
4         1    0       0
...     ...  ...     ...
34243     1    0       0
34244     0    1       0
34245     0    1       0
34246     1    0       0
34247     0    1       0

[34248 rows x 3 columns]
       Negative  Neutral  Positive
0             0        1         0
1             0        1         0
2             0        0         1
3             0        0         1
4             0        0         1
...         ...      ...       ...
34243         0        0         1
34244         0        1         0
34245         0        1         0
34246         0        0         1
34247         0        1         0

[34248 rows x 3 columns]
       high  low  medium
0         0    1       0
1         0    1       0
2         1    0       0
3         1    0       0
4         1    0       0
...     ...  ...     ...
34243     1    0       0
34244     0    1       0
342

In [65]:
final_df.to_csv('final_features.csv')

In [None]:
#IMPORT FEATURE CSV
final_df=pd.read_csv('/content/gdrive/MyDrive/2022-23/Final Project/final_features.csv')

In [66]:
#drop na's, just in case
final_df=final_df.dropna()
final_df.reset_index(drop = True, inplace=True)
final_df.shape

(34248, 1367)

In [67]:
final_df.head()

Unnamed: 0,genre|21st,genre|432hz,genre|abstract,genre|acid,genre|acousmatic,genre|acoustic,genre|action,genre|adoracion,genre|adult,genre|adventista,...,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artist_popularity,track_popularity,id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.842733,0.12578,0.031225,0.007005,0.0471,0.811623,0.572105,0.72,0.734043,0UaMYEvWZi0ZqiDOoHU3YI
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.893573,0.118503,0.025,0.025126,0.242,0.925852,0.652266,0.8,0.893617,6I9VzXrHxO9rA9A5euc8Ak
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.85105,0.218295,0.00239,0.0,0.0598,0.702405,0.452624,0.87,0.223404,0WqIKmW4BTrj3eJFmnCKMv
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.859462,0.14657,0.201807,0.000235,0.0521,0.818637,0.460435,0.8,0.840426,1AWQoqb9bSvzTjaLralEkT
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.882707,0.074116,0.056325,0.0,0.313,0.655311,0.432103,0.73,0.0,1lzr43nnXAijIGYnCT8M8H


In [68]:
def generate_playlist_feature(complete_feature_set, playlist_df):
    '''
    Summarize a user's playlist into a single vector
    ---
    Input: 
    complete_feature_set (pandas dataframe): Dataframe which includes all of the features for the spotify songs
    playlist_df (pandas dataframe): playlist dataframe
        
    Output: 
    complete_feature_set_playlist_final (pandas series): single vector feature that summarizes the playlist
    complete_feature_set_nonplaylist (pandas dataframe): dataframe which includes all the features for the spotify songs not already in the input playlist
    '''
    
    # Find song features in the playlist
    complete_feature_set_playlist = complete_feature_set[complete_feature_set['id'].isin(playlist_df['id'].values)]
    # Find all non-playlist song features
    complete_feature_set_nonplaylist = complete_feature_set[~complete_feature_set['id'].isin(playlist_df['id'].values)]
    complete_feature_set_playlist_final = complete_feature_set_playlist.drop(columns = "id")
    return complete_feature_set_playlist_final.sum(axis = 0), complete_feature_set_nonplaylist

In [69]:
def generate_playlist_recos(df, features, nonplaylist_features):
    '''
    Generated recommendation based on songs in aspecific playlist.
    ---
    Input: 
    df (pandas dataframe): spotify dataframe
    features (pandas series): summarized playlist feature (single vector)
    nonplaylist_features (pandas dataframe): feature set of songs that are not in the selected playlist
        
    Output: 
    non_playlist_df_top_40: Top 40 recommendations for that playlist
    '''
    
    non_playlist_df = df[df['id'].isin(nonplaylist_features['id'].values)]
    # Find cosine similarity between the playlist and the complete song set
    non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop('id', axis = 1).values, features.values.reshape(1, -1))[:,0]
    non_playlist_df_top_40 = non_playlist_df.sort_values('sim',ascending = False).head(40)
    
    return non_playlist_df_top_40

# Get Playlist Input

In [26]:
import spotipy.oauth2 as oauth2

# Set up OAuth2 authentication
auth = oauth2.SpotifyOAuth(client_id=client_id, client_secret=client_secret, redirect_uri=redirect_uri, scope='user-library-read')

# Get authorization URL
auth_url = auth.get_authorize_url()

# Redirect user to authorization URL
print("Please visit this URL to authorize the app:", auth_url)

# Prompt user to enter authorization code
code = input("Please enter the authorization code: ")

# Get access token using authorization code
token_info = auth.get_access_token(code)
access_token = token_info['access_token']

# Set up Spotify API instance
sp = spotipy.Spotify(auth=access_token)

Please visit this URL to authorize the app: https://accounts.spotify.com/authorize?client_id=4e23ee764036488a96b598e3e14bea67&response_type=code&redirect_uri=https%3A%2F%2Flocalhost%3A3001&scope=user-library-read
Please enter the authorization code: AQALr_2wgaI7FfDlZ720uM-z1XVHYqt9OLcDPRtSTOMWpXQCR1TyEEc0O9k58q6IeZAxBXLH8a9NQV74udDzQ-GoHe76HVWkGeOVify50JGsaz0e6UF2H8J1FW6L1eT7zYRn_Xuy_gssWKpZ_RyBziV4mLyCBHRtvdIY8Wwa7QgAe4ZHIotN81zsNuFT


  token_info = auth.get_access_token(code)


In [27]:
playlist_name = 'Test Playlist'
results = sp.search(q=playlist_name, type='playlist')
playlists = results['playlists']['items']

# Get the first playlist that matches the name
playlist = playlists[0]

# Get tracks in playlist
tracks = sp.playlist_tracks(playlist['id'], fields='items(track(id))')['items']
track_id=[]

#TODO: FIX AND MAKE MORE EFFICIENT
# Loop through tracks and extract features
for track in tracks:
    track_id.append(track['track']['id'])
    # track_info = sp.track(track_id)
    #track_name = track_info['name']
    # track_features = sp.audio_features(track_id)
    # print(track_name, track_features)

In [30]:
len(tracks)

23

In [31]:
#TODO: SCALE LATER FOR N SONGS
hundred_uri_chunks = [track_id[i:i + 50] for i in range(0, len(track_id), 50)]
print(hundred_uri_chunks)

playlist_audio_features = []

for chunk in hundred_uri_chunks:
    print(chunk)
    chunk_features = sp.audio_features(chunk)
    playlist_audio_features.extend(chunk_features)

#playlist_audio_features = pd.concat([rawdf, pd.DataFrame(playlist_audio_features)], axis=1)

[['3KzgdYUlqV6TOG7JCmx2Wg', '7fQ3PYTYdu208fQ3JEm2U7', '3krgfOQI9Szq8cF0Umm1O1', '3H8Sn0mYsZMPPlMCbebOJ5', '7gxW8hMXEUkc1G3m7z9vei', '0elmUoU7eMPwZX1Mw1MnQo', '4Yi5yMQPdItYT5BWUClYU9', '38Hbuh2vq2aVFyajV43nhE', '5lbEPivbiMeV17DD789ex6', '6KnLLk05VRKSDxZjZPBPeG', '4WUcNkpoNSKoe5MUuyzrfC', '6115ks6fGkf7KYRDAi8lNK', '6qMMQzYTKabamnMPlCmfxb', '76F9rF3pQjPm4i8KVwx9Yd', '3qTYzMkbzxqRtA2hlSz4Ba', '3AgY5gLURlcdYBVGv1RVm7', '4hHbeIIKO5Y5uLyIEbY9Gn', '15MKQ6uU9E4vyyzpXCAoXo', '6BKRNurnrKum1I61AZXo9D', '1t09rxY1rMnnlFW8SyjEiU', '4RRrIq088bzxbuODK11gnU', '2WaYW84yWij5NSCpgSeU2R', '3aPlQWU07jGgyHaBHVS5TS']]
['3KzgdYUlqV6TOG7JCmx2Wg', '7fQ3PYTYdu208fQ3JEm2U7', '3krgfOQI9Szq8cF0Umm1O1', '3H8Sn0mYsZMPPlMCbebOJ5', '7gxW8hMXEUkc1G3m7z9vei', '0elmUoU7eMPwZX1Mw1MnQo', '4Yi5yMQPdItYT5BWUClYU9', '38Hbuh2vq2aVFyajV43nhE', '5lbEPivbiMeV17DD789ex6', '6KnLLk05VRKSDxZjZPBPeG', '4WUcNkpoNSKoe5MUuyzrfC', '6115ks6fGkf7KYRDAi8lNK', '6qMMQzYTKabamnMPlCmfxb', '76F9rF3pQjPm4i8KVwx9Yd', '3qTYzMkbzxqRtA2hlSz4Ba', '3AgY5gL

In [32]:
#get 50 track_ids at a time for tracks(),  
fifty_uri_chunks = [track_id[i:i + 50] for i in range(0, len(track_id), 50)]

playlist_misc_info = []

for i, chunk in enumerate(fifty_uri_chunks):
    chunk_tracks = sp.tracks(chunk)['tracks']
    print(chunk_tracks)
    playlist_misc_info.extend(chunk_tracks)
    #use time.sleep() to avoid surpassing rate limit from API when scaling for more data
    #time.sleep(5)

playlist_misc_info = [item for item in playlist_misc_info if item is not None]  # filter out None values
playlist_misc_info=pd.DataFrame(playlist_misc_info)

playlist_track_popularity=playlist_misc_info["popularity"]
playlist_track_name=playlist_misc_info["name"]

playlist_album_name=[]
for song in playlist_misc_info["album"]:
    playlist_album_name.append(song["name"])

playlist_artists=playlist_misc_info["artists"]
playlist_artist_genres = []
playlist_artist_popularity = []
playlist_artist_ids = []
playlist_artist_name = []
for artist in playlist_artists:
  #take main artist for each song
  playlist_artist_ids.append(artist[0]["id"])

#get 50 artist_ids at a time for artists(),  
fifty_artist_uri_chunks = [playlist_artist_ids[i:i + 50] for i in range(0, len(playlist_artist_ids), 50)]
for i, chunk in enumerate(fifty_artist_uri_chunks):
    playlist_artist_info=sp.artists(chunk)
    playlist_artist_info=pd.DataFrame(playlist_artist_info["artists"])
    playlist_artist_popularity.extend(playlist_artist_info["popularity"])
    playlist_artist_genres.extend(playlist_artist_info["genres"])
    playlist_artist_name.extend(playlist_artist_info['name'])

[{'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0EodhzA6yW1bIdD5B4tcmJ'}, 'href': 'https://api.spotify.com/v1/artists/0EodhzA6yW1bIdD5B4tcmJ', 'id': '0EodhzA6yW1bIdD5B4tcmJ', 'name': 'Bobby Darin', 'type': 'artist', 'uri': 'spotify:artist:0EodhzA6yW1bIdD5B4tcmJ'}], 'available_markets': ['AD', 'AE', 'AG', 'AL', 'AM', 'AO', 'AR', 'AT', 'AU', 'AZ', 'BA', 'BB', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'BJ', 'BN', 'BO', 'BR', 'BS', 'BT', 'BW', 'BY', 'BZ', 'CA', 'CD', 'CG', 'CH', 'CI', 'CL', 'CM', 'CO', 'CR', 'CV', 'CW', 'CY', 'CZ', 'DE', 'DJ', 'DK', 'DM', 'DO', 'DZ', 'EC', 'EE', 'EG', 'ES', 'ET', 'FI', 'FJ', 'FM', 'FR', 'GA', 'GB', 'GD', 'GE', 'GH', 'GM', 'GN', 'GQ', 'GR', 'GT', 'GW', 'GY', 'HK', 'HN', 'HR', 'HT', 'HU', 'ID', 'IE', 'IL', 'IN', 'IQ', 'IS', 'IT', 'JM', 'JO', 'JP', 'KE', 'KG', 'KH', 'KI', 'KM', 'KN', 'KR', 'KW', 'KZ', 'LA', 'LB', 'LC', 'LI', 'LK', 'LR', 'LS', 'LT', 'LU', 'LV', 'LY', 'MA', 'MC', 'MD', 'ME', 'MG', 'MH', 'M

In [90]:
playlist_df=pd.concat([pd.Series(playlist_album_name),  playlist_track_name, playlist_track_popularity, pd.Series(playlist_artist_popularity), pd.Series(playlist_artist_genres), pd.DataFrame(playlist_audio_features),  pd.Series(playlist_artist_name)], axis=1)

In [86]:
playlist_df.columns

Index([                 0,             'name',       'popularity',
                        1,                  2,     'danceability',
                 'energy',              'key',         'loudness',
                   'mode',      'speechiness',     'acousticness',
       'instrumentalness',         'liveness',          'valence',
                  'tempo',             'type',               'id',
                    'uri',       'track_href',     'analysis_url',
            'duration_ms',   'time_signature',                  3],
      dtype='object')

In [87]:
print(playlist_df.head())

                                                   0  \
0                                         That's All   
1  Dean Martin: The Capitol Recordings, Vol. 11 (...   
2                                    The Decca Years   
3                                       Danke Schoen   
4                          The Legendary Bobby Darin   

                            name  popularity   1  \
0                 Beyond the Sea          71  61   
1  Ain't That A Kick In The Head          62  66   
2         Something's Gotta Give          56  49   
3                   Danke Schoen          57  47   
4         More - Remastered 2004          61  61   

                                                   2  danceability  energy  \
0  [adult standards, easy listening, lounge, rock...         0.521   0.516   
1  [adult standards, easy listening, lounge, voca...         0.586   0.253   
2              [adult standards, lounge, vocal jazz]         0.541   0.341   
3          [adult standards, easy list

In [91]:
# TODO: ID column has duplicate, remove later
playlist_df=playlist_df.rename(columns={0: 'album_name', 1:'artist_popularity',"popularity":"track_popularity", 3:"artist_name", 2:"genres_list", "name":"track_name" })

In [94]:
playlist_df.columns

Index(['album_name', 'track_name', 'track_popularity', 'artist_popularity',
       'genres_list', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 'analysis_url',
       'duration_ms', 'time_signature', 'artist_name', 'songartistconcat'],
      dtype='object')

In [89]:
# playlist_df = playlist_df.drop(playlist_df.columns[3], axis=1)
# playlist_df.columns

Index(['album_name', 'track_name', 'track_popularity', 'genres_list',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature', 'artist_name'],
      dtype='object')

In [93]:
playlist_df = drop_duplicates(playlist_df)
print("Are all songs unique: ",len(pd.unique(playlist_df.songartistconcat))==len(playlist_df))
playlist_df.reset_index(drop = True, inplace=True)


#drop=['track_uri','songartistconcat','type','uri','track_href','analysis_url','duration_ms',]
#df_full=df_full.drop(drop,axis=1)

playlist_df_final=playlist_df.dropna()
playlist_df_final.reset_index(drop = True, inplace=True)
playlist_df_final.shape

Are all songs unique:  True


(23, 25)

In [95]:
playlist_df.columns

Index(['album_name', 'track_name', 'track_popularity', 'artist_popularity',
       'genres_list', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 'analysis_url',
       'duration_ms', 'time_signature', 'artist_name', 'songartistconcat'],
      dtype='object')

In [None]:
# from re import S
# # Retrieve track features and names
# track_ids = pd.Series(track_id) #[track['track']['id'] for track in tracks]
# track_name=[sp.track(track_ids) for id in track_ids]
# track_features = sp.audio_features(track_ids)

# # Convert to DataFrame
# df = pd.DataFrame(track_features)
# df['name'] = track_names

In [72]:
playlist_audio_features

[{'danceability': 0.521,
  'energy': 0.516,
  'key': 2,
  'loudness': -7.456,
  'mode': 0,
  'speechiness': 0.0369,
  'acousticness': 0.723,
  'instrumentalness': 0,
  'liveness': 0.257,
  'valence': 0.569,
  'tempo': 136.483,
  'type': 'audio_features',
  'id': '3KzgdYUlqV6TOG7JCmx2Wg',
  'uri': 'spotify:track:3KzgdYUlqV6TOG7JCmx2Wg',
  'track_href': 'https://api.spotify.com/v1/tracks/3KzgdYUlqV6TOG7JCmx2Wg',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/3KzgdYUlqV6TOG7JCmx2Wg',
  'duration_ms': 172480,
  'time_signature': 4},
 {'danceability': 0.586,
  'energy': 0.253,
  'key': 6,
  'loudness': -11.93,
  'mode': 0,
  'speechiness': 0.0538,
  'acousticness': 0.604,
  'instrumentalness': 0,
  'liveness': 0.3,
  'valence': 0.718,
  'tempo': 108.168,
  'type': 'audio_features',
  'id': '7fQ3PYTYdu208fQ3JEm2U7',
  'uri': 'spotify:track:7fQ3PYTYdu208fQ3JEm2U7',
  'track_href': 'https://api.spotify.com/v1/tracks/7fQ3PYTYdu208fQ3JEm2U7',
  'analysis_url': 'https://api.spotify.

In [96]:
playlist = create_feature_set(playlist_df)

    0   2   4   5   6   7   8   9   11
0    0   1   0   0   0   0   0   0   0
1    0   0   0   0   1   0   0   0   0
2    0   0   0   0   0   0   0   1   0
3    0   0   0   0   0   0   0   1   0
4    0   0   0   1   0   0   0   0   0
5    0   0   0   1   0   0   0   0   0
6    0   0   0   0   0   0   1   0   0
7    0   0   0   0   0   1   0   0   0
8    0   0   0   0   0   0   1   0   0
9    1   0   0   0   0   0   0   0   0
10   1   0   0   0   0   0   0   0   0
11   0   0   0   0   0   1   0   0   0
12   0   0   0   0   1   0   0   0   0
13   0   0   0   0   0   0   0   1   0
14   0   1   0   0   0   0   0   0   0
15   0   0   0   0   0   0   1   0   0
16   0   0   0   0   1   0   0   0   0
17   0   0   0   0   0   1   0   0   0
18   0   0   0   0   0   0   1   0   0
19   1   0   0   0   0   0   0   0   0
20   0   0   0   0   0   0   0   0   1
21   0   0   0   0   0   0   0   1   0
22   0   0   1   0   0   0   0   0   0
    0  1
0   1  0
1   1  0
2   1  0
3   1  0
4   1  0
5   1  0
6

In [97]:
playlist

Unnamed: 0,genre|adult,genre|and,genre|easy,genre|harlem,genre|jazz,genre|listening,genre|lounge,genre|renaissance,genre|rock,genre|roll,...,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artist_popularity,track_popularity,id
0,0.246066,0.402973,0.279253,0.0,0.291492,0.279253,0.256597,0.0,0.402973,0.402973,...,0.9318,0.013793,0.801626,0.0,0.228938,0.474026,0.612626,0.605263,1.0,3KzgdYUlqV6TOG7JCmx2Wg
1,0.343607,0.0,0.389948,0.0,0.407039,0.389948,0.358312,0.0,0.0,0.0,...,0.478415,0.05696,0.60813,0.0,0.277696,0.796537,0.374347,0.736842,0.865672,7fQ3PYTYdu208fQ3JEm2U7
2,0.411902,0.0,0.0,0.0,0.487943,0.0,0.42953,0.0,0.0,0.0,...,0.803506,0.040358,0.80813,0.0,0.00567,0.616883,0.625578,0.289474,0.776119,3krgfOQI9Szq8cF0Umm1O1
3,0.42021,0.0,0.476883,0.0,0.0,0.476883,0.438193,0.0,0.0,0.0,...,0.804621,0.03576,0.931707,0.0,0.053181,0.744589,0.703949,0.236842,0.791045,3H8Sn0mYsZMPPlMCbebOJ5
4,0.246066,0.402973,0.279253,0.0,0.291492,0.279253,0.256597,0.0,0.402973,0.402973,...,0.801682,0.0,0.40813,0.0,0.213063,0.738095,0.747398,0.605263,0.850746,7gxW8hMXEUkc1G3m7z9vei
5,0.42021,0.0,0.476883,0.0,0.0,0.476883,0.438193,0.0,0.0,0.0,...,0.261857,0.012261,0.923577,0.0,0.098537,0.512987,0.57687,1.0,0.955224,0elmUoU7eMPwZX1Mw1MnQo
6,0.246066,0.402973,0.279253,0.0,0.291492,0.279253,0.256597,0.0,0.402973,0.402973,...,0.139137,1.0,0.645528,0.0,0.700646,0.318182,0.943643,0.605263,0.044776,4Yi5yMQPdItYT5BWUClYU9
7,0.246066,0.402973,0.279253,0.0,0.291492,0.279253,0.256597,0.0,0.402973,0.402973,...,0.08715,0.077139,0.990244,0.0,1.0,0.0,0.535525,0.605263,0.0,38Hbuh2vq2aVFyajV43nhE
8,0.246066,0.402973,0.279253,0.0,0.291492,0.279253,0.256597,0.0,0.402973,0.402973,...,0.445886,0.333333,0.730081,0.0,0.953509,0.560606,0.940798,0.605263,0.059701,5lbEPivbiMeV17DD789ex6
9,0.246066,0.402973,0.279253,0.0,0.291492,0.279253,0.256597,0.0,0.402973,0.402973,...,0.172071,0.478927,0.912195,0.0,0.719923,0.495671,0.105478,0.605263,0.029851,6KnLLk05VRKSDxZjZPBPeG


In [98]:
print(playlist_df['id'])

0     3KzgdYUlqV6TOG7JCmx2Wg
1     7fQ3PYTYdu208fQ3JEm2U7
2     3krgfOQI9Szq8cF0Umm1O1
3     3H8Sn0mYsZMPPlMCbebOJ5
4     7gxW8hMXEUkc1G3m7z9vei
5     0elmUoU7eMPwZX1Mw1MnQo
6     4Yi5yMQPdItYT5BWUClYU9
7     38Hbuh2vq2aVFyajV43nhE
8     5lbEPivbiMeV17DD789ex6
9     6KnLLk05VRKSDxZjZPBPeG
10    4WUcNkpoNSKoe5MUuyzrfC
11    6115ks6fGkf7KYRDAi8lNK
12    6qMMQzYTKabamnMPlCmfxb
13    76F9rF3pQjPm4i8KVwx9Yd
14    3qTYzMkbzxqRtA2hlSz4Ba
15    3AgY5gLURlcdYBVGv1RVm7
16    4hHbeIIKO5Y5uLyIEbY9Gn
17    15MKQ6uU9E4vyyzpXCAoXo
18    6BKRNurnrKum1I61AZXo9D
19    1t09rxY1rMnnlFW8SyjEiU
20    4RRrIq088bzxbuODK11gnU
21    2WaYW84yWij5NSCpgSeU2R
22    3aPlQWU07jGgyHaBHVS5TS
Name: id, dtype: object


In [99]:
playlist = pd.DataFrame(playlist)

In [100]:
a,b = generate_playlist_feature(final_df, playlist)

In [101]:
# summarized vector
print(a)

genre|21st           0.000000
genre|432hz          0.000000
genre|abstract       0.000000
genre|acid           0.000000
genre|acousmatic     0.000000
                       ...   
liveness             0.722000
valence              1.783567
tempo                1.421173
artist_popularity    2.080000
track_popularity     1.914894
Length: 1366, dtype: float64


In [None]:
# summarized final df with nonplaylist songs 
print(b)

       genre|21st  genre|432hz  genre|abstract  genre|acid  genre|acousmatic  \
0             0.0          0.0             0.0         0.0               0.0   
1             0.0          0.0             0.0         0.0               0.0   
2             0.0          0.0             0.0         0.0               0.0   
3             0.0          0.0             0.0         0.0               0.0   
4             0.0          0.0             0.0         0.0               0.0   
...           ...          ...             ...         ...               ...   
34243         0.0          0.0             0.0         0.0               0.0   
34244         0.0          0.0             0.0         0.0               0.0   
34245         0.0          0.0             0.0         0.0               0.0   
34246         0.0          0.0             0.0         0.0               0.0   
34247         0.0          0.0             0.0         0.0               0.0   

       genre|acoustic  genre|action  ge

In [102]:
top_40 = generate_playlist_recos(final_df, a, b)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop('id', axis = 1).values, features.values.reshape(1, -1))[:,0]


In [104]:
indices = top_40.index.tolist()


In [105]:
for i in indices:
  print(df['track_name'][i] + ', ' + df['artist_name'][i])

I Won't Dance - 1998 Digital Remaster, Frank Sinatra
Pencil Thin Mustache, Jimmy Buffett
Isn't She Lovely, Frank Sinatra
One for My Baby (And One More for the Road), Tony Bennett
Everything, Michael Bublé
Rudolph, the Red-Nosed Reindeer, Dean Martin
Let It Snow! Let It Snow! Let It Snow!, Dean Martin
Mambo Italiano - 78rpm Version, Rosemary Clooney
Human Race, Mike Love
You Remind Me of Something, R. Kelly
O Christmas Tree, Tony Bennett
Moondance, Michael Bublé
That's Amore, Dean Martin
Powerful Stuff, Sean Hayes
Let It Snow! Let It Snow! Let It Snow!, Frank Sinatra
I've Got You Under My Skin, Frank Sinatra
Satisfy You, Diddy
(There's No Place Like) Home for the Holidays - 1959 Version, Perry Como
XO, Eden Project
The Way You Look Tonight, Frank Sinatra
Marcas De Ayer, Adriana Mezzadri
Margaritaville, Jimmy Buffett
Sex Weed, R. Kelly
Bells Will Be Ringing, Calvin N. Emery
Lyin King, Jhene Aiko
Holly Jolly Christmas, Michael Bublé
what's normal anyway, Miguel
All I Do, B5
L-O-V-E - 1992

In [103]:
playlist_name = 'Test Playlist'
results = sp.search(q=playlist_name, type='playlist')
playlists = results['playlists']['items']

# Get the first playlist that matches the name
playlist = playlists[0]

# Get tracks in playlist
tracks = sp.playlist_tracks(playlist['id'], fields='items(track(id))')['items']

# Loop through tracks and extract features
for track in tracks:
    track_id = track['track']['id']
    track_info = sp.track(track_id)
    track_name = track_info['name']
    track_features = sp.audio_features(track_id)
    print(track_name, track_features)

Beyond the Sea [{'danceability': 0.521, 'energy': 0.516, 'key': 2, 'loudness': -7.456, 'mode': 0, 'speechiness': 0.0369, 'acousticness': 0.723, 'instrumentalness': 0, 'liveness': 0.257, 'valence': 0.569, 'tempo': 136.483, 'type': 'audio_features', 'id': '3KzgdYUlqV6TOG7JCmx2Wg', 'uri': 'spotify:track:3KzgdYUlqV6TOG7JCmx2Wg', 'track_href': 'https://api.spotify.com/v1/tracks/3KzgdYUlqV6TOG7JCmx2Wg', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/3KzgdYUlqV6TOG7JCmx2Wg', 'duration_ms': 172480, 'time_signature': 4}]
Ain't That A Kick In The Head [{'danceability': 0.586, 'energy': 0.253, 'key': 6, 'loudness': -11.93, 'mode': 0, 'speechiness': 0.0538, 'acousticness': 0.604, 'instrumentalness': 0, 'liveness': 0.3, 'valence': 0.718, 'tempo': 108.168, 'type': 'audio_features', 'id': '7fQ3PYTYdu208fQ3JEm2U7', 'uri': 'spotify:track:7fQ3PYTYdu208fQ3JEm2U7', 'track_href': 'https://api.spotify.com/v1/tracks/7fQ3PYTYdu208fQ3JEm2U7', 'analysis_url': 'https://api.spotify.com/v1/audio-analys

In [None]:
playlist_artist_name

In [None]:
track_info