<a href="https://colab.research.google.com/github/yesimcebeci/Spotify-Recommendation-System/blob/main/EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building Recommender Systems with Spotify Data

## Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, AffinityPropagation, AgglomerativeClustering, Birch, MiniBatchKMeans, OPTICS, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import GridSearchCV
import plotly.express as px
import pickle


In [None]:
# Colab-only step: attach Google Drive to this runtime so the CSV stored
# under MyDrive can be read by the loading cell.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Read the playlist export from the mounted Drive.
# `data` is the pandas DataFrame inspected and cleaned by the cells below.
path = "/content/drive/MyDrive/Colab Notebooks/data/playlist.csv"
data = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Preview the first five rows to get a feel for the columns and value ranges.
data.head(n=5)

Unnamed: 0,artist,artist_id,popularity,album,track_name,track_id,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Hozier,2FXC3k01G6Gw61bmprjgqS,0,Hozier (Deluxe),Take Me To Church,7dS5EaCoMnN7DzlpT6aRn2,0.566,0.664,4,-5.303,0,0.0464,0.0,0.116,0.437,128.945,241688,4
1,Mike Posner,2KsP6tYLJlTBvSUxnwlVWa,76,31 Minutes to Takeoff,Cooler Than Me - Single Mix,2V4bv1fNWfTcyRJKmej6Sj,0.768,0.82,7,-4.63,0,0.0474,0.0,0.689,0.625,129.965,213293,4
2,"Tyler, The Creator",4V8LLVI7PbaPR0K2TGSxFF,84,Flower Boy,See You Again (feat. Kali Uchis),7KA4W4McWYRpgf0fWsJZWB,0.558,0.559,6,-9.222,1,0.0959,7e-06,0.109,0.62,78.558,180387,4
3,Bastille,7EQ0qTo7fWT7DPxmxtSYEc,72,Bad Blood,Pompeii,3gbBpTdY8lnQwqxNCcf795,0.679,0.715,9,-6.383,1,0.0407,0.0,0.271,0.571,127.435,214148,4
4,Shakira,0EmeFodog0BfCgMzAIvKQp,83,"Oral Fixation, Vol. 2 (Expanded Edition)",Hips Don't Lie (feat. Wyclef Jean),3ZFTkvIE7kyPt6Nu3PEa7V,0.778,0.824,10,-5.892,0,0.0707,0.0,0.405,0.758,100.024,218093,4


In [None]:
# Summarize each column's dtype and non-null count; columns whose count is
# below the number of entries contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5295 entries, 0 to 5294
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist            5295 non-null   object 
 1   artist_id         5295 non-null   object 
 2   popularity        5295 non-null   int64  
 3   album             5293 non-null   object 
 4   track_name        5293 non-null   object 
 5   track_id          5295 non-null   object 
 6   danceability      5295 non-null   float64
 7   energy            5295 non-null   float64
 8   key               5295 non-null   int64  
 9   loudness          5295 non-null   float64
 10  mode              5295 non-null   int64  
 11  speechiness       5295 non-null   float64
 12  instrumentalness  5295 non-null   float64
 13  liveness          5295 non-null   float64
 14  valence           5295 non-null   float64
 15  tempo             5295 non-null   float64
 16  duration_ms       5295 non-null   int64  


## Data Understanding

In [None]:
# Count the missing values in each column.
data.isna().sum(axis=0)

artist              0
artist_id           0
popularity          0
album               2
track_name          2
track_id            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
time_signature      0
dtype: int64

In [None]:
# Drop every row that has a missing value in any column.
# Rebinding `data` (instead of inplace=True) is the recommended pandas idiom:
# it is chainable and never mutates a frame another reference may still hold.
# Note: the original row labels are kept (the index is not reset), so the
# index has gaps where rows were removed.
data = data.dropna()

In [None]:
# Re-run the summary after dropping rows: every column should now report the
# same non-null count as the total number of entries.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5293 entries, 0 to 5294
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist            5293 non-null   object 
 1   artist_id         5293 non-null   object 
 2   popularity        5293 non-null   int64  
 3   album             5293 non-null   object 
 4   track_name        5293 non-null   object 
 5   track_id          5293 non-null   object 
 6   danceability      5293 non-null   float64
 7   energy            5293 non-null   float64
 8   key               5293 non-null   int64  
 9   loudness          5293 non-null   float64
 10  mode              5293 non-null   int64  
 11  speechiness       5293 non-null   float64
 12  instrumentalness  5293 non-null   float64
 13  liveness          5293 non-null   float64
 14  valence           5293 non-null   float64
 15  tempo             5293 non-null   float64
 16  duration_ms       5293 non-null   int64  
