In [None]:
import json
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spotipy
from matplotlib import pyplot
from sklearn import datasets # sklearn comes with some toy datasets to practise
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from spotipy.oauth2 import SpotifyClientCredentials

import myconfig

## load data and drop columns

In [None]:
# Load the raw song table. The cells below read the "name_caller", "id_caller.1"
# and "artist" columns from it and drop several id/URL columns, so the CSV is
# expected to contain those columns.
song_cluster = pd.read_csv('list_of_songs.csv')

In [None]:
# read_csv already returns a DataFrame, so it does not need to be wrapped again.
# Preview the first rows instead of rendering the whole table.
song_cluster.head()

In [None]:
# Keep the identifying columns (title, id, artist) in their own frame,
# renamed to "name", "id" and "artist".
song_df = song_cluster[["name_caller", "id_caller.1", "artist"]].rename(
    columns={"name_caller": "name", "id_caller.1": "id"}
)
song_df

In [None]:
# Drop the unnecessary columns (ids, names, URLs, duplicated merge columns,
# duration and time signature) so only the columns used for clustering remain.
# A single drop() checks every label before anything is removed, so a missing
# column can no longer leave the frame half-cleaned.
NON_FEATURE_COLUMNS = [
    "time_signature",
    "name_other",
    "id_caller.1",
    "id_caller.2",
    "name_caller.1",
    "id_other.1",
    "id_other.2",
    "name_other.1",
    "type",
    "id_caller",
    "uri",
    "track_href",
    "analysis_url",
    "duration_ms",
    "name_caller",
    "name",
    "id_other",
    "artist",
]
song_cluster = song_cluster.drop(columns=NON_FEATURE_COLUMNS)

In [None]:
# Feature table used for scaling and clustering.
# This is a second name for the same DataFrame object, not a copy, so columns
# added to X later are also visible through song_cluster.
X = song_cluster
X

## scaling features

In [None]:
# Summary statistics of the columns before scaling.
X.describe()

In [None]:
# Standardise the feature columns with StandardScaler.
# A "cluster" column is excluded when present: it is added to X further down,
# and without this guard re-running the cell would fit the scaler on the labels.
feature_frame = X.drop(columns=["cluster"], errors="ignore")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(feature_frame)
X_scaled_df = pd.DataFrame(X_scaled, columns=feature_frame.columns)
display(feature_frame.head())
print()
display(X_scaled_df.head())

In [None]:
# Summary statistics of the scaled columns, to check the result of the scaling step.
X_scaled_df.describe()

## clustering the songs with k-means

In [None]:
# Fit a 7-cluster k-means model (fixed seed) on the scaled features
kmeans = KMeans(n_clusters=7, random_state=1234).fit(X_scaled_df)
kmeans

In [None]:
# Cluster label of every row the model was fitted on.
kmeans.labels_

In [None]:
# Scatter two of the unscaled feature columns, one colour per k-means cluster.
labels = kmeans.predict(X_scaled_df)
clusters = np.unique(labels)

feature_values = X.to_numpy()   # convert once instead of twice per cluster
x_col, y_col = 1, 5             # positional indexes of the plotted columns

for cluster in clusters:
    in_cluster = labels == cluster
    pyplot.scatter(feature_values[in_cluster, x_col],
                   feature_values[in_cluster, y_col],
                   label="cluster {}".format(cluster))

# label the figure so it can be read on its own
pyplot.xlabel(X.columns[x_col])
pyplot.ylabel(X.columns[y_col])
pyplot.legend()
pyplot.show()

In [None]:
# Predict the cluster of every song and count the songs per cluster
clusters = kmeans.predict(X_scaled_df)
cluster_sizes = pd.Series(clusters).value_counts()
cluster_sizes.sort_index()

In [None]:
# Store each song's predicted cluster label in a new "cluster" column of X
X["cluster"] = clusters
X

### show sample from a specific cluster

In [None]:
# Draw one random row out of cluster 3
in_cluster_3 = X["cluster"] == 3
X.loc[in_cluster_3].sample()

#### playing with the parameters

In [None]:
# Inertia is the sum of squared distances between each sample and its closest
# centroid; it is used below to compare models with different parameters.
kmeans.inertia_

In [None]:
# Experiment with the parameters: 50 clusters, 20 k-means++ initialisations,
# at most 10 iterations each, elkan algorithm
kmeans2_params = dict(
    n_clusters=50,
    init="k-means++",
    n_init=20,
    max_iter=10,
    tol=0,
    algorithm="elkan",
    random_state=1234,
)
kmeans2 = KMeans(**kmeans2_params).fit(X_scaled_df)
print(kmeans2.inertia_)

#### choosing k

In [None]:
K = range(2, 21)
inertia = []

for k in K:
    print("Training a K-Means model with {} clusters! ".format(k))
    print()
    kmeans = KMeans(n_clusters=k,
                    random_state=1234)
    kmeans.fit(X_scaled_df)
    inertia.append(kmeans.inertia_)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16,8))
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('inertia')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Elbow Method showing the optimal k')

#### saving with pickle

In [None]:
# Persist the fitted scaler and the current k-means model for later reuse.
with open("scaler.pickle", "wb") as f:
    pickle.dump(scaler, f)

# Name the file after the model's actual cluster count. No 4-cluster model is
# bound to `kmeans` at this point, so a fixed "kmeans_4.pickle" would mislabel it.
kmeans_filename = "kmeans_{}.pickle".format(kmeans.n_clusters)
with open(kmeans_filename, "wb") as f:
    pickle.dump(kmeans, f)

In [None]:
def load(filename = "filename.pickle"):
    """Unpickle and return the object stored in `filename`.

    Prints "File not found!" and returns None when the file does not exist.
    Only use this on pickle files you created yourself: unpickling runs
    code embedded in the file.
    """
    loaded = None
    try:
        with open(filename, "rb") as handle:
            loaded = pickle.load(handle)
    except FileNotFoundError:
        print("File not found!")
    return loaded



In [None]:
# Read the saved scaler back from disk to check the pickle round trip
scaler2 = load(filename="scaler.pickle")

#### Silhouette

In [None]:
# Silhouette analysis: fit one model per k, save it, and record its silhouette score.
K = range(2, 20)
silhouette = []

for k in K:
    # Loop-local name on purpose: `kmeans` is used later to predict the cluster
    # of new songs and must not be overwritten by this model-selection sweep.
    model_k = KMeans(n_clusters=k,
                     random_state=1234)
    model_k.fit(X_scaled_df)

    # keep every fitted model on disk as kmeans_<k>.pickle
    filename = "kmeans_" + str(k) + ".pickle"
    with open(filename, "wb") as f:
        pickle.dump(model_k, f)

    silhouette.append(silhouette_score(X_scaled_df, model_k.predict(X_scaled_df)))


plt.figure(figsize=(16,8))
plt.plot(K, silhouette, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Silhouette Method showing the optimal k')
plt.show()

In [None]:
# Show the feature table including the "cluster" column added above.
X

In [None]:
# Put name/id/artist and the audio features (with cluster label) side by side
frames = [song_df, X]
spotify_songs = pd.concat(frames, axis="columns")

In [None]:
# Show the combined table of song identifiers, features and cluster labels.
spotify_songs

In [None]:
# Print the title of one random song from cluster 0.
# The value is read directly from the sampled Series; splitting its printed
# representation depends on pandas' display formatting.
cluster_0_names = spotify_songs.loc[spotify_songs['cluster'] == 0, 'name']
print(cluster_0_names.sample().iloc[0])

## Building the Song Recommender

In [None]:
# Create the SpotiPy client from the client id/secret stored in myconfig
credentials = SpotifyClientCredentials(client_id=myconfig.client_id,
                                       client_secret=myconfig.client_secret)
sp = spotipy.Spotify(auth_manager=credentials)

In [None]:
# Export the song database (without the index column) to CSV
spotify_songs.to_csv('spotify_songs.csv', index=False)

In [None]:
# Load the Billboard Hot 100 table
top_100 = pd.read_excel('billboard_top100.xlsx')

# Audio-feature keys that are removed before scaling; they mirror the
# corresponding columns dropped from the song database.
NON_FEATURE_KEYS = {"type", "id", "track_href", "analysis_url",
                    "duration_ms", "time_signature", "uri"}

# Input from the user is looked up on Spotify and turned into audio features
input_song = input("Please enter a song title: ")
results = sp.search(q=input_song,limit=1,market="GB")
found_tracks = results['tracks']['items']
if not found_tracks:
    raise ValueError("No Spotify track found for {!r}".format(input_song))

# Use the track's own id field instead of slicing it out of the URL at a fixed offset
track_id = found_tracks[0]['id']

raw_features = sp.audio_features(track_id)[0]
if raw_features is None:
    raise ValueError("Spotify returned no audio features for track {!r}".format(track_id))

# Keep the remaining features in the order Spotify returned them
input_dict = {key: value for key, value in raw_features.items()
              if key not in NON_FEATURE_KEYS}

In [None]:
# Collect the audio feature values, in dictionary order, as a plain list
input_list = [*input_dict.values()]
input_list

In [None]:
# Shape the feature values as a single-row 2-D array (one sample, n features)
input_array = np.array([input_list])
input_array

In [None]:
#scale audio features with the scaler fitted on the song database
# NOTE(review): assumes the values in input_array are in the same column order
# the scaler was fitted on — confirm against the columns of X.
input_array_scaled = scaler.transform(input_array)
input_array_scaled

In [None]:
# Wrap the scaled values in a one-row DataFrame labelled with the feature names
input_df = pd.DataFrame(input_array_scaled, columns=list(input_dict.keys()))
input_df

In [None]:
#use kmeans predict on audio features dataframe to return cluster for song input
# NOTE(review): this uses whichever model is currently bound to `kmeans`; it has
# to be the model that produced the "cluster" column of spotify_songs — confirm.
cluster_label = kmeans.predict(input_df)
cluster_label

## Recommender Function

In [None]:
#function to show whether song is in the hot 100 and to suggest another song from the same cluster
def user_search():
    """Ask for a song title and recommend another song for it.

    The title is searched on Spotify (first hit, market "GB"), its audio
    features are scaled with the global `scaler` and assigned to a cluster
    with the global `kmeans` model.

    * If the title matches a `song_title` in `top_100` (exact match, ignoring
      case and surrounding whitespace), the artist(s) are printed together
      with another random Hot 100 title.
    * Otherwise a random song from the same cluster of `spotify_songs` is
      printed and returned as an embedded player.

    Relies on the notebook globals `sp`, `scaler`, `kmeans`, `top_100` and
    `spotify_songs`. `kmeans` must be the model that produced
    `spotify_songs['cluster']`, otherwise the labels do not correspond.

    Returns:
        IPython.display.IFrame for the recommended track, or None when the
        song is in the Hot 100, nothing was found on Spotify, or no song
        with the predicted cluster label exists.
    """
    from IPython.display import IFrame

    user_input = input("Please enter a song title: ")

    #searching for track_id of input-song
    results = sp.search(q=user_input,limit=1,market="GB")
    found_tracks = results['tracks']['items']
    if not found_tracks:
        print("No song found on Spotify for this title")
        return None
    # use the track's own id field instead of slicing it out of the URL
    track_id = found_tracks[0]['id']

    # getting the spotify audio features of the song & deleting unnecessary features
    raw_features = sp.audio_features(track_id)[0]
    if raw_features is None:
        print("Spotify has no audio features for this song")
        return None
    non_feature_keys = {"type", "id", "track_href", "analysis_url",
                        "duration_ms", "time_signature", "uri"}
    input_dict = {key: value for key, value in raw_features.items()
                  if key not in non_feature_keys}

    # standard scaling
    # Order the features by the column names the scaler recorded when it was
    # fitted (if it recorded any); fall back to Spotify's own key order.
    feature_names = list(getattr(scaler, "feature_names_in_", input_dict.keys()))
    input_features = pd.DataFrame([input_dict])[feature_names]
    input_array_scaled = scaler.transform(input_features)

    # getting cluster label for user_song
    input_df = pd.DataFrame(data=input_array_scaled, index=None, columns=feature_names)
    cluster_label = int(kmeans.predict(input_df)[0])

    # One exact (case-insensitive) comparison decides both whether the song is
    # in the Hot100 and which artists belong to it. A substring/regex test
    # followed by an exact lookup could disagree and print nothing at all.
    wanted_title = user_input.strip().casefold()
    is_match = top_100['song_title'].str.strip().str.casefold() == wanted_title
    artists_of_song = list(top_100.loc[is_match, 'song_artist'].values)

    if artists_of_song:
        if len(artists_of_song) == 1:
            song_artist = artists_of_song[0]
            print("Your song is by",song_artist,"and is in the Hot100")
        else:
            song_artist1 = artists_of_song[0]
            song_artist2 = artists_of_song[1]
            print("Your song is in the Hot100, but by two different artists: ",song_artist1, "&", song_artist2)
        # do not suggest the song the user just entered, unless nothing else exists
        other_titles = top_100.loc[~is_match, 'song_title']
        if other_titles.empty:
            other_titles = top_100['song_title']
        print("Another song from the Top100 that you might like:", np.random.choice(other_titles))
        return None

    print("Your song is not in the Hot100")
    same_cluster = spotify_songs.loc[spotify_songs['cluster'] == cluster_label]
    if same_cluster.empty:
        print("No song from the same cluster is available")
        return None

    # take the sampled row itself rather than parsing its printed representation
    song_recom = same_cluster.sample().iloc[0]
    print("Maybe you like this song: ",song_recom["name"],"by",song_recom["artist"])
    # NOTE(review): IPython's IFrame may forward the extra keyword arguments as
    # URL query parameters rather than HTML attributes — confirm the effect.
    return IFrame(src="https://open.spotify.com/embed/track/"+str(song_recom['id']),
               width="320",
               height="80",
               frameborder="0",
               allowtransparency="true",
               allow="encrypted-media",
              )


In [None]:
# Run the recommender; it prompts for a song title.
user_search()