# Spotify Song Suggester

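This application suggests a personalized playlist of tracks based on a user's song preference data. This notebook builds the base model behind those suggestions: it loads the tracks dataset, standardizes the numeric track features, reduces them to five principal components, and fits a nearest-neighbors model that returns the tracks most similar to a given track.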

---

## Data Cleaning


In [1]:
# Imports
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

In [2]:
# Read the raw tracks table from the zipped CSV
DATA_PATH = '../data/raw/tracks.csv.zip'
# compression='infer' is the pandas default, written out to show that the
# archive is decompressed by read_csv itself
df = pd.read_csv(DATA_PATH, compression='infer')

# Report (rows, columns), then preview the first five rows
table_shape = df.shape
print(table_shape)
df.head(5)

(586672, 20)


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [3]:
# Transform release_date feature to integer years.
# The raw values are strings such as '1922-02-22' or plain '1922', so the
# first four characters are the year.
# Casting to str before slicing keeps this cell idempotent: once the column
# holds ints, slicing the values directly raises TypeError on a re-run.
# int64 is requested explicitly so the dtype does not depend on the platform's
# default integer size.
df['release_date'] = df['release_date'].astype(str).str[:4].astype('int64')
df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [4]:
# Assemble the modeling matrix.
# These are the numeric columns handed to the scaler, kept in their original
# column order; id, name, artists and id_artists are left out.
features = [
    'popularity', 'duration_ms', 'explicit', 'release_date',
    'danceability', 'energy', 'key', 'loudness',
    'mode', 'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'time_signature',
]

# Column subset of the full table, one row per track
X = df.loc[:, features]

# Report (rows, columns), then preview the first five rows
matrix_shape = X.shape
print(matrix_shape)
X.head(5)

(586672, 16)


Unnamed: 0,popularity,duration_ms,explicit,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,6,126903,0,1922,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,0,98200,0,1922,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,0,181640,0,1922,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,0,176907,0,1922,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,0,163080,0,1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [5]:
# Scale data, then project onto 5 principal components
scaler = StandardScaler()
# random_state pins the projection for reproducibility: depending on the
# scikit-learn version, the default svd_solver='auto' may select the
# randomized solver for a matrix of this size.
pca = PCA(n_components=5, random_state=42)

# Always start from df[features] instead of overwriting X with its own
# transform, so re-running this cell cannot scale and project data that has
# already been projected.
X = pca.fit_transform(scaler.fit_transform(df[features]))
X.shape

(586672, 5)

In [6]:
# NearestNeighbors model
model = NearestNeighbors(n_neighbors=10, algorithm='kd_tree')
neighbors = model.fit(X)

In [7]:
# Find nearest neighbors to test track "My Girl"
my_girl_track = X[38498].reshape(1, -1)
distances, indices = neighbors.kneighbors(my_girl_track)
indices

array([[ 38498, 119126, 343116, 130192, 358146, 435397,  74376,  68876,
        489947,  68345]], dtype=int64)

In [8]:
# indices is 2-D (one row of neighbor positions per query), so each iteration
# yields a whole row of positions and produces one Series of track names.
# The column is selected by label rather than through attribute access.
[df['name'].iloc[row_positions] for row_positions in indices]

[38498                                               My Girl
 119126                                          Hello Walls
 343116                            True Love Will Never Fade
 130192    Kiss the Girl - From "The Little Mermaid"/ Sou...
 358146                                        Никой не може
 435397                                                Sampa
 74376                          Somebody That I Used To Know
 68876                                    The Cuppycake Song
 489947                   Redemption Song - B Is For Bob Mix
 68345                                   Parece Que Fue Ayer
 Name: name, dtype: object]

## Save Base Model

In [9]:
# Save using pickle.
# The with-block closes the file handle as soon as the dump finishes (or
# fails) instead of leaving an unclosed handle behind.
# NOTE(review): only the NearestNeighbors model is written here; the fitted
# scaler and pca are needed to map new tracks into the same 5-D space.
# Confirm they are persisted elsewhere.
with open('../models/base_nn.sav', mode='wb') as model_file:
    pickle.dump(model, model_file)