# Building a Spotify Recommendation Engine for Music Labels
## Recommendation and Clustering
### Flatiron School Data Science Program Phase 4 Project<br>Justin Williams & Khyatee Desai

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from zipfile import ZipFile 
from sklearn import set_config
set_config(print_changed_only=False, display=None)
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os
import sys

# Spotify API

In [2]:
# create a sample playlist
playlist = ['Polly - Moses Sumney', 'Goodbye - Porches','rangerover - Porches','Tal Uno - Barrie',
               "Angel's Song - Arlo Parks",'Randy - Big Thief', 'Butterfly - Adrianne Lenker',
               'All of Me Wants All of You - Sufjan Stevens','Cola - Arlo Parks']

### Get spotify id's for each song

In [3]:
# save my credentials
os.environ["SPOTIPY_CLIENT_ID"] = "a8561b2ef375442f8f89527b255f4c78"
os.environ["SPOTIPY_CLIENT_SECRET"] = "566a59fd72764b9580f8767c5bc0ca7f"

spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

# save song id's, release dates, and song popularity in lists
song_ids = []
release_dates = []
popularities = []
for song in playlist:
    results = spotify.search(q=song, type='track')
    song_ids.append(results['tracks']['items'][0]['id'])
    release_dates.append(results['tracks']['items'][0]['album']['release_date'])
    popularities.append(results['tracks']['items'][0]['popularity'])

### Retrieve Song attributes for each song

In [4]:
# use the song id's gathered above to get audio attributes for each song
attributes = spotify.audio_features(tracks=song_ids)
attributes[0]

{'danceability': 0.389,
 'energy': 0.258,
 'key': 10,
 'loudness': -11.534,
 'mode': 1,
 'speechiness': 0.0315,
 'acousticness': 0.927,
 'instrumentalness': 5.2e-05,
 'liveness': 0.0906,
 'valence': 0.195,
 'tempo': 137.831,
 'type': 'audio_features',
 'id': '6LQOd5wz3RmBFFZxdhSfLT',
 'uri': 'spotify:track:6LQOd5wz3RmBFFZxdhSfLT',
 'track_href': 'https://api.spotify.com/v1/tracks/6LQOd5wz3RmBFFZxdhSfLT',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/6LQOd5wz3RmBFFZxdhSfLT',
 'duration_ms': 218413,
 'time_signature': 4}

In [5]:
# combine the song popularity and release dates with the rest of the attributes
for i in range(len(attributes)):
    attributes[i]['track/artist'] = playlist[i]
    attributes[i]['popularity'] = popularities[i]
    attributes[i]['release_date'] = release_dates[i]
    
# create a dataframe from all the attributes
playlist_df = pd.DataFrame(attributes)

# Clean Spotify Data

### Create Decade Column

In [6]:
# drop unecessary features 
playlist_df.drop(['type', 'id', 'uri', 'track_href', 'analysis_url', 'time_signature'], axis=1, inplace=True)

# set release date as datetime
playlist_df['release_date'] = pd.to_datetime(playlist_df['release_date'])

# make year column datetime
playlist_df['year'] = playlist_df['release_date'].dt.year

# create decade column
playlist_df['decade'] = playlist_df['year'].apply(lambda x: str(x)[:3]+'0s')

### Map Keys/Modes to make more interpretable

In [7]:
# change keys/mode to letter/maj/min representations
keys = {0:'C', 1:'Db',2:'D',3:'Eb',4:'E',5:'F',6:'F#',7:'G',8:'Ab',9:'A',10:'Bb',11:'B'}
modes = {0:'Minor',1:'Major'}

# create new column in df
playlist_df['letter_keys'] = playlist_df['key'].map(keys)
playlist_df['modes'] = playlist_df['mode'].map(modes)

round(playlist_df['letter_keys'].value_counts(normalize=True)*100, 2)

B     33.33
E     22.22
G     11.11
D     11.11
C     11.11
Bb    11.11
Name: letter_keys, dtype: float64

In [8]:
# create a column that concatonates key with mode
playlist_df['key_mode'] = playlist_df['letter_keys'] + " " + playlist_df['modes']

### Impute Outliers

In [9]:
# Bring outliers down/up to 5 standard deviations from the mean
for feat in playlist_df.columns:
    try:
        abv_5_std = playlist_df[feat].mean()+ 5* playlist_df[feat].std()
        below_5_std = playlist_df[feat].mean()- 5* playlist_df[feat].std()
        conditions = [playlist_df[feat]>abv_5_std, playlist_df[feat]<below_5_std]
        choices = [abv_5_std, below_5_std]
        df[feat] = np.select(conditions, choices, playlist_df[feat])
    except:
        pass

### Standardize

In [10]:
# Scale features down to 0-1
playlist_df['scaled_speech'] = (playlist_df['speechiness'] - min(playlist_df['speechiness'])) / (max(playlist_df['speechiness']) - min(playlist_df['speechiness']))
playlist_df['scaled_duration'] = (playlist_df['duration_ms'] - min(playlist_df['duration_ms'])) / (max(playlist_df['duration_ms']) - min(playlist_df['duration_ms']))
playlist_df['scaled_loudness'] = (playlist_df['loudness'] - min(playlist_df['loudness'])) / (max(playlist_df['loudness']) - min(playlist_df['loudness']))
playlist_df['scaled_tempo'] = (playlist_df['tempo'] - min(playlist_df['tempo'])) / (max(playlist_df['tempo']) - min(playlist_df['tempo']))
playlist_df['scaled_pop'] = (playlist_df['popularity'] - min(playlist_df['popularity'])) / (max(playlist_df['popularity']) - min(playlist_df['popularity']))


### Create Dummy Variables

In [11]:
# all the dummies from the big dataset
all_dummies = ['A Major', 'A Minor',
       'Ab Major', 'Ab Minor', 'B Major', 'B Minor', 'Bb Major', 'Bb Minor',
       'C Major', 'C Minor', 'D Major', 'D Minor', 'Db Major', 'Db Minor',
       'E Major', 'E Minor', 'Eb Major', 'Eb Minor', 'F Major', 'F Minor',
       'F# Major', 'F# Minor', 'G Major', 'G Minor', '1920s', '1930s', '1940s',
       '1950s', '1960s', '1970s', '1980s', '1990s', '2000s', '2010s', '2020s']

# make dummies for key/mode and decades
key_dummies = pd.get_dummies(playlist_df['key_mode'])
decade_dummies = pd.get_dummies(playlist_df['decade'])
dummies = pd.concat([ key_dummies, decade_dummies], axis=1)

# check which dummies weren't created - we need to add these in as zeros so the dataframes have the same shape
playlist_df = pd.concat([playlist_df, key_dummies, decade_dummies], axis=1)
zeroes = list(set(all_dummies) - set(dummies.columns))

# add those dummies in as zeroes
playlist_df[zeroes] = 0

# drop first column for dummies
playlist_df.drop(['A Major', '1920s'],axis=1, inplace=True)

### Import Record Labels dataset

In [12]:
labels_df = pd.read_csv('data/cleaned_data_labels.csv',index_col=[0])

# drop unnecesary features
labels_df.drop(['key_mode', 'decade', 'modes', 'letter_keys',  'year', 
         'release_date', 'mode', 'loudness', 'key', 'id', 'explicit', 
         'tempo', 'duration_ms', 'speechiness', 'popularity' ], axis=1, inplace=True)

In [13]:
# dataframe of all Dead Oceans songs
ocean_df = labels_df[labels_df.label == 'Dead Oceans']
ocean_df.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,name,valence,scaled_speech,scaled_duration,scaled_loudness,scaled_tempo,scaled_pop,A Minor,Ab Major,Ab Minor,B Major,B Minor,Bb Major,Bb Minor,C Major,C Minor,D Major,D Minor,Db Major,Db Minor,E Major,E Minor,Eb Major,Eb Minor,F Major,F Minor,F# Major,F# Minor,G Major,G Minor,1930s,1940s,1950s,1960s,1970s,1980s,1990s,2000s,2010s,2020s,artist,label
3919,0.162,0.181,0.486,0.203,0.111,Machine Gun,0.0545,0.152872,0.048459,0.716686,0.63326,0.4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,Slowdive,Dead Oceans
3920,0.0987,0.172,0.745,0.0073,0.235,40 Days,0.491,0.153282,0.035013,0.774223,0.757644,0.37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,Slowdive,Dead Oceans
3921,0.195,0.129,0.647,0.292,0.104,Machine Gun,0.0393,0.15697,0.048824,0.792687,0.633165,0.41,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,Slowdive,Dead Oceans
3922,0.00137,0.431,0.482,0.441,0.238,Ballad Of Sister Sue,0.123,0.126232,0.049432,0.816804,0.524579,0.38,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,Slowdive,Dead Oceans
3923,5.7e-05,0.227,0.48,0.835,0.0477,Catch The Breeze,0.188,0.15902,0.047115,0.841203,0.37978,0.37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,Slowdive,Dead Oceans


### Calculate cosine similarities between playlist and all Dead Oceans songs

In [14]:
# turn both dataframes into numpy arrays
playlist_array = np.array(playlist_df.drop(['decade','duration_ms','key','key_mode','letter_keys',
                     'loudness','mode','modes','popularity','release_date','speechiness',
                         'tempo','track/artist','year'], axis=1))

label_array = np.array(ocean_df.drop(['artist','name','label'], axis=1))

In [15]:
# turn both arrays into sparse matrices
playlist_sparse = sparse.csr_matrix(playlist_array)
label_sparse = sparse.csr_matrix(label_array)

# get cosine similarities between user playlist and record label
cosim = cosine_similarity(playlist_sparse, label_sparse,dense_output=True)
print(cosim[0])


[0.41048198 0.46508308 0.50690708 0.38347746 0.38167026 0.47863051
 0.389911   0.39059759 0.35573562 0.36883794 0.27732504 0.39653992
 0.40443136 0.37710166 0.40054964 0.30535826 0.43224922 0.35005341
 0.3650677  0.39990222 0.35987119 0.35287898 0.4347205  0.40957185
 0.4273254  0.6799827  0.39989644 0.38722497 0.46576319 0.66389508
 0.44183993 0.64154536 0.43386476 0.62117831 0.4472665  0.38027091
 0.36413054 0.42236729 0.62774839 0.42997248 0.43353232 0.43207587
 0.47329969 0.40911743 0.39949446 0.38289718 0.3508459  0.35654525
 0.36190396 0.33774586 0.36539917 0.40163796 0.43061679 0.3559152
 0.35101306 0.44434106 0.39586729 0.46860207 0.45046191 0.60696489
 0.44746828 0.39339233 0.43816977 0.64842426 0.4429689  0.4227782
 0.41794975 0.38316499 0.3656166  0.46221482 0.6138289  0.38686906
 0.64005204 0.32723175 0.6693392  0.42299694 0.6051339  0.40815275
 0.41238729 0.61936899 0.41138316]


### Sort by highest similarities and map output to song names/artists

In [16]:
for i in range(len(cosim)):
    print('Dead Oceans songs similar to', playlist_df.iloc[i]['track/artist'],':\n' )
    print(sorted(list(zip(cosim[i],ocean_df['name'], ocean_df['artist'])), reverse=True)[:2])
    print('================================================================================================================')

Dead Oceans songs similar to Polly - Moses Sumney :

[(0.6799826957661531, 'A Perfect Sonnet', 'Bright Eyes'), (0.6693392046451008, 'Nobody', 'Mitski')]
Dead Oceans songs similar to Goodbye - Porches :

[(0.7235321945507469, 'Day Glo', 'Brazos'), (0.6911031673391842, 'Kyoto', 'Phoebe Bridgers')]
Dead Oceans songs similar to rangerover - Porches :

[(0.7543136314811328, 'At The Bottom Of Everything', 'Bright Eyes'), (0.748801127319026, 'June On The West Coast', 'Bright Eyes')]
Dead Oceans songs similar to Tal Uno - Barrie :

[(0.6550584864922804, 'Evan Finds the Third Room', 'Khruangbin'), (0.6489500292716106, 'Sing', 'Slowdive')]
Dead Oceans songs similar to Angel's Song - Arlo Parks :

[(0.6296033724409232, 'Sing', 'Slowdive'), (0.6176369014227862, 'Sing', 'Slowdive')]
Dead Oceans songs similar to Randy - Big Thief :

[(0.5105669076285985, 'Evan Finds the Third Room', 'Khruangbin'), (0.5105604536560789, 'Dagger', 'Slowdive')]
Dead Oceans songs similar to Butterfly - Adrianne Lenker :


### Put it all in a function

In [17]:
def recommend(playlist, record_label):
    '''input 1: playlist formatted as a list of tuples [(track_name, artist), (track_name, artist),etc]
        input 2: record label name (string)'''
    
  # in the works

### Test on Justin's Playlist

### Test on DJ's Playlist

### Test on Nick's Playlist

# Clustering