# Importing the **libraries**

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('Spotify-2000.csv')
data.head()

Unnamed: 0,Index,Title,Artist,Top Genre,Year,Beats Per Minute (BPM),Energy,Danceability,Loudness (dB),Liveness,Valence,Length (Duration),Acousticness,Speechiness,Popularity
0,1,Sunrise,Norah Jones,adult standards,2004,157,30,53,-14,11,68,201,94,3,71
1,2,Black Night,Deep Purple,album rock,2000,135,79,50,-11,17,81,207,17,7,39
2,3,Clint Eastwood,Gorillaz,alternative hip hop,2001,168,69,66,-9,7,52,341,2,17,69
3,4,The Pretender,Foo Fighters,alternative metal,2007,173,96,43,-4,3,37,269,0,4,76
4,5,Waitin' On A Sunny Day,Bruce Springsteen,classic rock,2002,106,82,58,-5,10,87,256,1,3,59


# Removing index column

In [3]:
data = data.drop('Index', axis=1)

In [4]:
data.head()

Unnamed: 0,Title,Artist,Top Genre,Year,Beats Per Minute (BPM),Energy,Danceability,Loudness (dB),Liveness,Valence,Length (Duration),Acousticness,Speechiness,Popularity
0,Sunrise,Norah Jones,adult standards,2004,157,30,53,-14,11,68,201,94,3,71
1,Black Night,Deep Purple,album rock,2000,135,79,50,-11,17,81,207,17,7,39
2,Clint Eastwood,Gorillaz,alternative hip hop,2001,168,69,66,-9,7,52,341,2,17,69
3,The Pretender,Foo Fighters,alternative metal,2007,173,96,43,-4,3,37,269,0,4,76
4,Waitin' On A Sunny Day,Bruce Springsteen,classic rock,2002,106,82,58,-5,10,87,256,1,3,59


# Checking the data types

In [5]:
print(data.dtypes)

Title                     object
Artist                    object
Top Genre                 object
Year                       int64
Beats Per Minute (BPM)     int64
Energy                     int64
Danceability               int64
Loudness (dB)              int64
Liveness                   int64
Valence                    int64
Length (Duration)         object
Acousticness               int64
Speechiness                int64
Popularity                 int64
dtype: object


# Filtering all the numeric values to get coorelation between audio features

In [6]:
numeric_data = data.select_dtypes(include=['int'])
correlation_matrix = numeric_data.corr()
correlation_matrix

Unnamed: 0,Year,Beats Per Minute (BPM),Energy,Danceability,Loudness (dB),Liveness,Valence,Acousticness,Speechiness,Popularity
Year,1.0,0.01257,0.147235,0.077493,0.343764,0.019017,-0.166163,-0.132946,0.054097,-0.158962
Beats Per Minute (BPM),0.01257,1.0,0.156644,-0.140602,0.092927,0.016256,0.059653,-0.122472,0.085598,-0.003181
Energy,0.147235,0.156644,1.0,0.139616,0.735711,0.174118,0.405175,-0.665156,0.205865,0.103393
Danceability,0.077493,-0.140602,0.139616,1.0,0.044235,-0.103063,0.514564,-0.135769,0.125229,0.144344
Loudness (dB),0.343764,0.092927,0.735711,0.044235,1.0,0.098257,0.147041,-0.451635,0.12509,0.165527
Liveness,0.019017,0.016256,0.174118,-0.103063,0.098257,1.0,0.050667,-0.046206,0.092594,-0.111978
Valence,-0.166163,0.059653,0.405175,0.514564,0.147041,0.050667,1.0,-0.239729,0.107102,0.095911
Acousticness,-0.132946,-0.122472,-0.665156,-0.135769,-0.451635,-0.046206,-0.239729,1.0,-0.098256,-0.087604
Speechiness,0.054097,0.085598,0.205865,0.125229,0.12509,0.092594,0.107102,-0.098256,1.0,0.111689
Popularity,-0.158962,-0.003181,0.103393,0.144344,0.165527,-0.111978,0.095911,-0.087604,0.111689,1.0


# creating a new dataframe

In [7]:
# creating a new dataframe
data2 = data[["Beats Per Minute (BPM)", "Loudness (dB)", "Liveness", "Valence", "Acousticness", "Speechiness", "Popularity"]]

# Implementing MinMaxscaler to maintain the range of 0 and 1

In [8]:
from sklearn.preprocessing import MinMaxScaler

for i in data.columns:
    MinMaxScaler(i)

Implementing Kmeans to similar featues

In [9]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10)
clusters = kmeans.fit_predict(data2)

# making a new column and putting back to previous data from predicted data from data2

In [10]:
data["Music Segments"] = clusters
MinMaxScaler(data["Music Segments"])
data["Music Segments"] = data["Music Segments"].map({
    1: "Cluster 1",2: "Cluster 2", 3: "Cluster 3", 4: "Cluster 4",5: "Cluster 5",
    6: "CLuster 6", 7: "Cluster 7", 8: "Cluster 8",9: "Cluster 9",10: "Cluster 10"
})

# removing null values

In [11]:
data = data.dropna()

In [12]:
data.head()

Unnamed: 0,Title,Artist,Top Genre,Year,Beats Per Minute (BPM),Energy,Danceability,Loudness (dB),Liveness,Valence,Length (Duration),Acousticness,Speechiness,Popularity,Music Segments
0,Sunrise,Norah Jones,adult standards,2004,157,30,53,-14,11,68,201,94,3,71,Cluster 1
1,Black Night,Deep Purple,album rock,2000,135,79,50,-11,17,81,207,17,7,39,Cluster 7
2,Clint Eastwood,Gorillaz,alternative hip hop,2001,168,69,66,-9,7,52,341,2,17,69,Cluster 4
3,The Pretender,Foo Fighters,alternative metal,2007,173,96,43,-4,3,37,269,0,4,76,Cluster 4
4,Waitin' On A Sunny Day,Bruce Springsteen,classic rock,2002,106,82,58,-5,10,87,256,1,3,59,Cluster 7


# Visualization of **data**

In [14]:
import plotly.graph_objects as go
PLOT = go.Figure()
for i in list(data["Music Segments"].unique()):

  PLOT.add_trace(go.Scatter3d(x = data[data["Music Segments"] == i]['Beats Per Minute (BPM)'],
                              y = data[data["Music Segments"] == i]['Energy'],
                              z = data[data["Music Segments"] == i]['Danceability'],
                              mode = 'markers', marker_size = 6, marker_line_width = 1,
                              name = str(i)
                              ))
PLOT.update_traces(hovertemplate='Beats Per Minutes (BPM): %{x} <br>Energy: %{y} <br>Danceability: %{z}')

PLOT.update_layout(width = 800, height = 800, autosize = True, showlegend = True,
                   scene = dict(xaxis=dict(title = 'Beats Per Minute (BPM)', titlefont_color = 'black'),
                                yaxis=dict(title = 'Energy', titlefont_color = 'black'),
                                zaxis=dict(title = 'Danceability', titlefont_color = 'black')),
                   font = dict(family = "Gilroy", color  = 'black', size = 12))