# Spotify Recommendation System
![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS02VeZIC0KchAraPdqSsXwdbCNR6z_EsWKv2VVvTzZTpqOP_jdXjffB1X3lXgAX4Ubzqw&usqp=CAU)

The dataset was obtained from an open, publicly available source.

The Spotify recommendation system uses collaborative filtering (i.e., it recommends items based on what similar listeners, or "neighbors", prefer). This dataset contains over 19 features, grouped by artist, year, and genre.

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings

In [3]:
# Apply seaborn's default plotting theme.
sns.set()

# Read the track table from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="dataset/spotify.csv")
data.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991,['Mamie Smith'],0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,['Mamie Smith'],0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,['Oscar Velazquez'],0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,['Mixe'],0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920


In [13]:
# Preview the last five rows (five is the default for tail()).
data.tail()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
174384,0.00917,"['DJ Combo', 'Sander-7', 'Tony T']",0.792,147615,0.866,0,46LhBf6TvYjZU2SMvGZAbn,6e-05,6,0.178,-5.089,0,The One,0,2020-12-25,0.0356,125.972,0.186,2020
174385,0.795,['Alessia Cara'],0.429,144720,0.211,0,7tue2Wemjd0FZzRtDrQFZd,0.0,4,0.196,-11.665,1,A Little More,0,2021-01-22,0.036,94.71,0.228,2021
174386,0.806,['Roger Fly'],0.671,218147,0.589,0,48Qj61hOdYmUCFJbpQ29Ob,0.92,4,0.113,-12.393,0,Together,0,2020-12-09,0.0282,108.058,0.714,2020
174387,0.92,['Taylor Swift'],0.462,244000,0.24,1,1gcyHQpBQ1lfXGdhZmWrHP,0.0,0,0.113,-12.077,1,champagne problems,69,2021-01-07,0.0377,171.319,0.32,2021
174388,0.239,['Roger Fly'],0.677,197710,0.46,0,57tgYkWQTNHVFEt6xDKKZj,0.891,7,0.215,-12.237,1,Improvisations,0,2020-12-09,0.0258,112.208,0.747,2020


In [4]:
# Summarise the frame: index range, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174389 entries, 0 to 174388
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   acousticness      174389 non-null  float64
 1   artists           174389 non-null  object 
 2   danceability      174389 non-null  float64
 3   duration_ms       174389 non-null  int64  
 4   energy            174389 non-null  float64
 5   explicit          174389 non-null  int64  
 6   id                174389 non-null  object 
 7   instrumentalness  174389 non-null  float64
 8   key               174389 non-null  int64  
 9   liveness          174389 non-null  float64
 10  loudness          174389 non-null  float64
 11  mode              174389 non-null  int64  
 12  name              174389 non-null  object 
 13  popularity        174389 non-null  int64  
 14  release_date      174389 non-null  object 
 15  speechiness       174389 non-null  float64
 16  tempo             17

In [5]:
# List the dtype of every column.
data.dtypes

acousticness        float64
artists              object
danceability        float64
duration_ms           int64
energy              float64
explicit              int64
id                   object
instrumentalness    float64
key                   int64
liveness            float64
loudness            float64
mode                  int64
name                 object
popularity            int64
release_date         object
speechiness         float64
tempo               float64
valence             float64
year                  int64
dtype: object

### Data Wrangling/Cleaning

In [8]:
# Count the missing values in each column.
data.isna().sum()

acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
id                  0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
valence             0
year                0
dtype: int64

In [9]:
# The data contains no missing values, so move on to the correlation between
# features. Columns that are not needed for that are dropped first.
data_drop = data.drop(
    ['id', 'name', 'artists', 'release_date', 'year'],
    axis='columns',
)

# Pairwise correlation matrix of the remaining columns.
data_drop.corr()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
acousticness,1.0,-0.263217,-0.089169,-0.750852,-0.208176,0.221956,-0.028028,-0.029654,-0.546639,0.064633,-0.396744,-0.022437,-0.22384,-0.166968
danceability,-0.263217,1.0,-0.100757,0.204838,0.200842,-0.215589,0.026266,-0.110033,0.249541,-0.048358,0.123746,0.239962,0.005479,0.536713
duration_ms,-0.089169,-0.100757,1.0,0.060516,-0.033808,0.103621,0.00202,0.028942,0.019791,-0.046849,0.024717,-0.097838,-0.008182,-0.183199
energy,-0.750852,0.204838,0.060516,1.0,0.102561,-0.17775,0.03578,0.134815,0.779267,-0.05616,0.328939,-0.112616,0.266448,0.326418
explicit,-0.208176,0.200842,-0.033808,0.102561,1.0,-0.130609,0.005282,0.037288,0.106249,-0.062503,0.152545,0.353872,0.008075,-0.009275
instrumentalness,0.221956,-0.215589,0.103621,-0.17775,-0.130609,1.0,-0.004619,-0.047941,-0.317562,-0.056731,-0.300625,-0.133966,-0.068656,-0.219188
key,-0.028028,0.026266,0.00202,0.03578,0.005282,-0.004619,1.0,-0.003368,0.025227,-0.127397,0.001951,0.009648,0.005009,0.025592
liveness,-0.029654,-0.110033,0.028942,0.134815,0.037288,-0.047941,-0.003368,1.0,0.062695,0.001677,-0.078959,0.122034,0.008586,-0.005781
loudness,-0.546639,0.249541,0.019791,0.779267,0.106249,-0.317562,0.025227,0.062695,1.0,-0.01925,0.337194,-0.213504,0.217914,0.30252
mode,0.064633,-0.048358,-0.046849,-0.05616,-0.062503,-0.056731,-0.127397,0.001677,-0.01925,1.0,0.007652,-0.040711,0.002438,0.021592


### Data Transformation

In [None]:
# Use the MinMaxScaler from the Scikit-learn library to normalize the numeric
# columns of the dataset to the [0, 1] range, so that large-valued columns do
# not dominate distance-based steps such as the K-means clustering below.

from sklearn.preprocessing import MinMaxScaler

datatypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Leave out the derived 'features' column (added by the clustering cell) so
# that re-running this cell always scales the same set of input columns.
normarization = data.drop(columns=['features'], errors='ignore').select_dtypes(include=datatypes)

# A MinMaxScaler only rescales values when it is fitted and applied;
# constructing one per column (`MinMaxScaler(col)`) changes nothing.
# fit_transform returns a plain array, so it is wrapped back into a DataFrame
# with the original column labels and index. `data` itself is not modified.
normarization = pd.DataFrame(
    MinMaxScaler().fit_transform(normarization),
    columns=normarization.columns,
    index=normarization.index,
)

In [None]:
# Prevent songs with similar characteristics but different genres from
# affecting the recommendation system: a feature that differentiates songs of
# different categories is created with the K-means clustering algorithm.

from sklearn.cluster import KMeans

# A fixed random_state makes the cluster labels reproducible across runs;
# n_init is spelled out so the result does not depend on the library default.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
features = kmeans.fit_predict(normarization)

# Store the cluster label rescaled to [0, 1]. The scaler must be fitted and
# applied (constructing `MinMaxScaler(data['features'])` alone does nothing),
# and it expects 2-D input, hence the reshape / ravel round trip.
# NOTE(review): cluster ids are nominal, so the scaling only bounds how much
# this column can contribute to a distance - confirm that this is intended.
data['features'] = MinMaxScaler().fit_transform(features.reshape(-1, 1)).ravel()

### Spotify Recommendation System

In [10]:
class Spotify_Recommendation():
    def __init__(self, dataset):
        self.dataset = dataset
        
    def spotify_recommend_sys(self, songs, amount=1):
        distance = []
        song = self.dataset[(self.dataset.name.str.lower() == songs.lower())].head(1).values[0]
        rec = self.dataset[self.dataset.name.str.lower() != songs.lower()]
        
        for songs in tqdm(rec.values):
            d = 0
            for col in np.arange(len(rec.columns)):
                if not col in [1, 6, 12, 14, 18]:
                    d = d + np.absolute(float(song[col]) - float(songs[col]))
            distance.append(d)
        rec['distance'] = distance
        rec = rec.sort_values('distance')
        columns = ['artists', 'name']
        return rec[columns][:amount]

In [11]:
# Build the recommender on the full track table, then ask for the twelve
# tracks closest to "Golfing Papa".
recommendation = Spotify_Recommendation(dataset=data)
recommendation.spotify_recommend_sys(songs="Golfing Papa", amount=12)

100%|████████████████████████████████| 174388/174388 [00:04<00:00, 36666.41it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec['distance'] = distance


Unnamed: 0,artists,name
89929,['Graeme E. Pearson & The Mutineers'],The Year 2000
41957,['Olga Svendsen'],Jeg har en ven
76845,['Ignacio Corsini'],Amigazo - Remasterizado
99485,['Billy Stewart'],I'm No Romeo
43884,"['Giacomo Puccini', 'Giuseppe Antonicelli', 'M...",La Bohème: Act I - Ehi! Rodolfo!
133398,['Vicente Fernández'],No Vas A Creer
24127,['Leela Bai'],Mat Josh Men Aa Tu Hosh Men Aa
161721,['Roger Williams'],The Sentimental Touch
20807,['Francisco Canaro'],El Hijo de Julián - Instrumental (Remasterizado)
80757,['Jackie Gleason'],And Away We Go!


In [12]:
# Ask for the eight tracks closest to "Lovers Rock".
recommendation.spotify_recommend_sys(songs="Lovers Rock", amount=8)

100%|████████████████████████████████| 174387/174387 [00:04<00:00, 36020.10it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec['distance'] = distance


Unnamed: 0,artists,name
103171,['Barão Vermelho'],Bete Balanço
55318,['Shinedown'],Save Me
16385,['O-Zone'],Dragostea Din Tei
11168,['Bob Marley & The Wailers'],Positive Vibration
158441,"[""Olivia O'Brien""]",Love Myself
54226,"['Naughty By Nature', 'Zhané']",Jamboree (feat. Zhané)
85047,['The Outlaws'],Song For You
50644,['The Alan Parsons Project'],Mammagamma - Instrumental
