In [2]:
# Import the required libraries and dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### Step 1:  Read in the dataset about the current songs' decades of the playlists.

In [3]:
# Read the CSV file into a Pandas DataFrame
# Set the index using the Ticker column
songs_df = pd.read_csv(
    Path("final.csv"),                
    # index_col="decade"
)

# Review the DataFrame
songs_df.head()

Unnamed: 0,id,song_name,artist,pic_url,preview_url,release_year,decade,danceability,energy,key,...,instrumentalness,liveness,valence,tempo,type,uri,track_href,analysis_url,duration_ms,time_signature
0,1H5IfYyIIAlgDX8zguUzns,Suspicious Minds,Elvis Presley,https://i.scdn.co/image/ab67616d0000b273fdc0aa...,https://p.scdn.co/mp3-preview/e7072dbcb87f4c8c...,1969,1960,0.487,0.382,7,...,5e-06,0.411,0.714,116.557,audio_features,spotify:track:1H5IfYyIIAlgDX8zguUzns,https://api.spotify.com/v1/tracks/1H5IfYyIIAlg...,https://api.spotify.com/v1/audio-analysis/1H5I...,261280,4
1,2xar08Fq5xra2KKZs5Bw9j,I've Got a Woman,Ray Charles,https://i.scdn.co/image/ab67616d0000b273f0e951...,https://p.scdn.co/mp3-preview/425d081433204c6d...,1957,1950,0.556,0.61,9,...,2e-06,0.258,0.665,199.373,audio_features,spotify:track:2xar08Fq5xra2KKZs5Bw9j,https://api.spotify.com/v1/tracks/2xar08Fq5xra...,https://api.spotify.com/v1/audio-analysis/2xar...,171467,4
2,64VP3skE86iTvdOlbzuIcO,Great Balls Of Fire,Jerry Lee Lewis,https://i.scdn.co/image/ab67616d0000b27395788c...,,1961,1960,0.533,0.729,7,...,0.0,0.159,0.884,78.67,audio_features,spotify:track:64VP3skE86iTvdOlbzuIcO,https://api.spotify.com/v1/tracks/64VP3skE86iT...,https://api.spotify.com/v1/audio-analysis/64VP...,111536,4
3,6C7aTTCUWRK7dD379yUT3W,Roll Over Beethoven,Chuck Berry,https://i.scdn.co/image/ab67616d0000b273a496dc...,,1959,1950,0.715,0.797,3,...,0.0,0.209,0.841,92.739,audio_features,spotify:track:6C7aTTCUWRK7dD379yUT3W,https://api.spotify.com/v1/tracks/6C7aTTCUWRK7...,https://api.spotify.com/v1/audio-analysis/6C7a...,144600,4
4,3YdKJzcoMZMacISlpY4QoP,I Only Have Eyes for You,The Flamingos,https://i.scdn.co/image/ab67616d0000b2731ba2d2...,https://p.scdn.co/mp3-preview/8b14a85d40cb0931...,1959,1950,0.552,0.313,5,...,0.0017,0.12,0.303,88.554,audio_features,spotify:track:3YdKJzcoMZMacISlpY4QoP,https://api.spotify.com/v1/tracks/3YdKJzcoMZMa...,https://api.spotify.com/v1/audio-analysis/3YdK...,202773,3


In [4]:
songs_df.shape

(1911, 24)

In [5]:
songs_df.drop(songs_df[(songs_df['release_year'] <1950)].index, inplace=True)
songs_df = songs_df.drop(columns=['id', 'pic_url', 'preview_url', 'release_year', 'type', 'uri', 'track_href', 'analysis_url', 'instrumentalness', 'time_signature', 'mode'])
songs_df = songs_df.dropna()
songs_df.reset_index(drop=True, inplace=True)
songs_df.shape

(1810, 13)

In [6]:
songs_df.head(65)

Unnamed: 0,song_name,artist,decade,danceability,energy,key,loudness,speechiness,acousticness,liveness,valence,tempo,duration_ms
0,Suspicious Minds,Elvis Presley,1960,0.487,0.382,7,-10.889,0.0309,0.0422,0.411,0.714,116.557,261280
1,I've Got a Woman,Ray Charles,1950,0.556,0.610,9,-4.729,0.2130,0.8330,0.258,0.665,199.373,171467
2,Great Balls Of Fire,Jerry Lee Lewis,1960,0.533,0.729,7,-7.227,0.0714,0.5630,0.159,0.884,78.670,111536
3,Roll Over Beethoven,Chuck Berry,1950,0.715,0.797,3,-10.154,0.1930,0.8370,0.209,0.841,92.739,144600
4,I Only Have Eyes for You,The Flamingos,1950,0.552,0.313,5,-12.410,0.0296,0.9160,0.120,0.303,88.554,202773
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,(You're The) Devil In Disguise,Elvis Presley,1960,0.483,0.579,5,-13.923,0.1160,0.4810,0.242,0.891,122.527,140680
61,I Don't Want To Set The World On Fire,The Ink Spots,1990,0.445,0.146,5,-16.064,0.0333,0.9620,0.249,0.332,86.637,181067
62,A Teenager's Romance,Ricky Nelson,2000,0.510,0.484,10,-6.942,0.0380,0.7840,0.306,0.683,103.356,140773
63,There's a Moon Out Tonight,The Capris,1950,0.508,0.488,6,-9.486,0.0336,0.8670,0.272,0.600,100.444,133880


In [7]:
cols2scale = ["danceability", "energy", "key", "loudness", "liveness", "valence", "tempo", "duration_ms", "acousticness", "speechiness"]

# Scale the attribute values
songs_scaled_og = StandardScaler().fit_transform(
    songs_df[cols2scale]
)

songs_scaled_og

array([[-0.66220352, -1.02720748,  0.50643942, ...,  0.58550623,
        -0.89432272, -0.484938  ],
       [-0.20246625, -0.02197291,  1.06949554, ..., -0.7256738 ,
         1.65662273,  2.89806462],
       [-0.35571201,  0.502689  ,  0.50643942, ..., -1.60060649,
         0.7856626 ,  0.26745962],
       ...,
       [ 0.79696261,  0.81572257, -0.05661669, ...,  0.55708197,
        -1.01161202, -0.31773852],
       [ 0.71034544,  0.502689  , -1.46425698, ...,  0.92789659,
        -0.47884248,  5.96338825],
       [-0.20246625,  1.31393093, -1.46425698, ...,  0.89033336,
        -0.96819304, -0.24714319]])

In [8]:
# Create a DataFrame with the scaled data
songs_scaled = pd.DataFrame(songs_scaled_og, columns=cols2scale)

# Copy the tickers names from the original data
songs_scaled["decade"] = songs_df.index

# Set the Ticker column as index
songs_scaled = songs_scaled.set_index("decade")

# Display sample data
songs_scaled.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,liveness,valence,tempo,duration_ms,acousticness,speechiness
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,-0.662204,-1.027207,0.506439,-0.504184,1.717403,0.36312,-0.141088,0.585506,-0.894323,-0.484938
1,-0.202466,-0.021973,1.069496,1.134323,0.607338,0.161596,2.7785,-0.725674,1.656623,2.898065
2,-0.355712,0.502689,0.506439,0.469877,-0.11094,1.062285,-1.476754,-1.600606,0.785663,0.26746
3,0.856928,0.802496,-0.619673,-0.30868,0.251826,0.885437,-0.980766,-1.117905,1.669526,2.52651
4,-0.229118,-1.331423,-0.056617,-0.908757,-0.393898,-1.327214,-1.128304,-0.268637,1.924362,-0.509089


In [9]:
# Create the PCA model instance where n_components=4
pca = PCA(n_components=4)


In [10]:
# Fit the songs_scaled data to the PCA
songs_pca_data = pca.fit_transform(songs_scaled_og)

# Review the first five rose of the PCA data
# using bracket notation ([0:5])
songs_pca_data[:5]

array([[ 2.71128417e-01,  6.29924721e-01,  5.18297668e-01,
         3.62261517e-01],
       [-6.66446878e-01, -8.06036085e-01,  2.54819537e+00,
         2.62990734e+00],
       [-4.69677507e-04, -1.50064561e+00,  3.16518828e-01,
         4.13990683e-01],
       [-8.40463283e-02, -2.35846157e+00,  2.64037685e-01,
         1.19381597e+00],
       [ 2.88617596e+00,  2.94272160e-02, -3.99341842e-01,
         7.86559853e-03]])

In [11]:
# Calculate the explained variance
pca.explained_variance_ratio_

array([0.26081851, 0.15145491, 0.12144277, 0.10372028])

In [12]:
# Extract the component weightings to determine which variables load onto which principal component
pca.components_

array([[-0.2117762 , -0.56662852, -0.02442058, -0.49700432, -0.10796521,
        -0.29653164, -0.15316495, -0.12816385,  0.47037025, -0.1604748 ],
       [-0.50287863,  0.08391933,  0.08345771,  0.09347419,  0.00174021,
        -0.59418609,  0.05661353,  0.53033875, -0.27208679, -0.11319174],
       [-0.49357126,  0.09845864, -0.09471335,  0.12202854,  0.53006797,
         0.01588202,  0.47820689, -0.40896305,  0.20996798,  0.03985317],
       [ 0.03177825, -0.07640246,  0.74576869, -0.08326567,  0.15602009,
        -0.08650165, -0.02328409,  0.0111958 ,  0.06376584,  0.62728807]])

In [18]:
cov_mat = np.cov(songs_scaled_og.T)
cor_mat = np.corrcoef(songs_scaled_og.T)
cor_mat

array([[ 1.        ,  0.1232609 , -0.00144256,  0.11450385, -0.0962053 ,
         0.498088  , -0.13364853, -0.00806283, -0.16136793,  0.13801199],
       [ 0.1232609 ,  1.        ,  0.01450272,  0.75696873,  0.15834903,
         0.3676908 ,  0.19639717,  0.14992015, -0.65537549,  0.155016  ],
       [-0.00144256,  0.01450272,  1.        ,  0.00714781, -0.02130247,
        -0.02577767,  0.01964281,  0.03265334, -0.04666269,  0.05365131],
       [ 0.11450385,  0.75696873,  0.00714781,  1.        ,  0.1287848 ,
         0.20075055,  0.12844059,  0.0700411 , -0.47725864,  0.12064425],
       [-0.0962053 ,  0.15834903, -0.02130247,  0.1287848 ,  1.        ,
         0.07184251,  0.02697763, -0.04475277, -0.01447037,  0.0890123 ],
       [ 0.498088  ,  0.3676908 , -0.02577767,  0.20075055,  0.07184251,
         1.        ,  0.13855314, -0.25287396, -0.11742125,  0.07543841],
       [-0.13364853,  0.19639717,  0.01964281,  0.12844059,  0.02697763,
         0.13855314,  1.        , -0.01791979

In [17]:
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
eig_vecs

array([[ 0.2117762 ,  0.50287863,  0.19723488, -0.506956  ,  0.28147573,
        -0.49357126, -0.2795086 , -0.03054664,  0.05872607,  0.03177825],
       [ 0.56662852, -0.08391933,  0.75470438,  0.22444221, -0.04830266,
         0.09845864,  0.16289586,  0.08680866, -0.00112507, -0.07640246],
       [ 0.02442058, -0.08345771,  0.00268568,  0.01960996,  0.02659947,
        -0.09471335, -0.02753147,  0.51086997, -0.40552619,  0.74576869],
       [ 0.49700432, -0.09347419, -0.44753544, -0.09037199,  0.57357549,
         0.12202854,  0.40375824,  0.14043059,  0.04732616, -0.08326567],
       [ 0.10796521, -0.00174021, -0.01782232, -0.15473659, -0.00196648,
         0.53006797, -0.48554006,  0.3022023 ,  0.57592175,  0.15602009],
       [ 0.29653164,  0.59418609, -0.31115003,  0.58783865, -0.16771391,
         0.01588202, -0.23622222,  0.07812879, -0.13738806, -0.08650165],
       [ 0.15316495, -0.05661353,  0.01375898, -0.22794209,  0.13226009,
         0.47820689, -0.36241811, -0.40197965

In [22]:
tot = sum(eig_vals)
var_exp = [(i / tot)*100 for i in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
cum_var_exp

array([ 26.08185109,  41.22734161,  53.37161903,  63.74364729,
        73.81201291,  82.76023194,  90.31408237,  94.58536659,
        98.4776497 , 100.        ])

In [23]:
# demonstration that eigen values are the same for covariance and correlation matrices
eig_vals, eig_vecs = np.linalg.eig(cor_mat)
eig_vals

array([2.60818511, 1.51454905, 0.15223503, 0.38922831, 0.42712842,
       1.21442774, 0.75538504, 0.8948219 , 1.00683656, 1.03720283])

In [65]:
# Creating a DataFrame with the PCA data
df_songs_pca = pd.DataFrame(songs_pca_data, columns=["PC1", "PC2", "PC3", "PC4"])
toplot_df = df_songs_pca.copy()

# Copy the tickers names from the original data
df_songs_pca["decade"] = songs_df['decade']

# Set the Ticker column as index
df_songs_pca = df_songs_pca.set_index("decade")

# Review the DataFrame
df_songs_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3,PC4
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1960,0.271128,0.629925,0.518298,0.362262
1950,-0.666447,-0.806036,2.548195,2.629907
1960,-0.00047,-1.500646,0.316519,0.413991
1950,-0.084046,-2.358462,0.264038,1.193816
1950,2.886176,0.029427,-0.399342,0.007866


In [66]:
# Determining clusters from 2-7

cluster_num = [f"cluster_{x}" for x in range(2,10)]

song_clusters = []
for n, cn in enumerate(cluster_num):
    model = KMeans(n_clusters=n+2, n_init = 10)

    # Fit the model for the df_songs_pca DataFrame
    model.fit(df_songs_pca)

    # Predict the model segments (clusters)
    this_cluster = model.predict(df_songs_pca)
    temp_df = pd.DataFrame({cn: this_cluster})
    # print(temp_df)
    song_clusters.append(temp_df)

song_cluster_df=pd.concat(song_clusters, axis = 1)
song_cluster_df

Unnamed: 0,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9
0,0,0,2,1,1,0,7,2
1,0,2,3,3,3,2,1,3
2,0,2,1,0,0,4,3,1
3,0,2,1,0,0,4,3,5
4,1,1,0,4,4,5,5,4
...,...,...,...,...,...,...,...,...
1805,0,0,1,2,2,3,2,0
1806,0,2,1,2,2,3,2,0
1807,0,2,1,2,5,6,2,0
1808,0,2,3,2,2,3,0,3


In [67]:
toplot_df["song_name"] = songs_df["song_name"]
toplot_df["artist"]=songs_df["artist"]
toplot_df["decade"]=songs_df["decade"].astype(str)
toplot_df=pd.concat([toplot_df, song_cluster_df], axis = 1)
toplot_df.shape


(1810, 15)

In [68]:
toplot_df.head(65)

Unnamed: 0,PC1,PC2,PC3,PC4,song_name,artist,decade,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9
0,0.271128,0.629925,0.518298,0.362262,Suspicious Minds,Elvis Presley,1960,0,0,2,1,1,0,7,2
1,-0.666447,-0.806036,2.548195,2.629907,I've Got a Woman,Ray Charles,1950,0,2,3,3,3,2,1,3
2,-0.000470,-1.500646,0.316519,0.413991,Great Balls Of Fire,Jerry Lee Lewis,1960,0,2,1,0,0,4,3,1
3,-0.084046,-2.358462,0.264038,1.193816,Roll Over Beethoven,Chuck Berry,1950,0,2,1,0,0,4,3,5
4,2.886176,0.029427,-0.399342,0.007866,I Only Have Eyes for You,The Flamingos,1950,1,1,0,4,4,5,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0.721490,-1.326788,1.114330,0.745435,(You're The) Devil In Disguise,Elvis Presley,1960,1,1,0,0,0,4,3,1
61,3.910729,-0.053974,0.390617,0.308707,I Don't Want To Set The World On Fire,The Ink Spots,1990,1,1,0,4,4,5,5,4
62,0.962628,-0.791712,1.133838,0.993854,A Teenager's Romance,Ricky Nelson,2000,1,1,0,0,0,4,3,1
63,1.614803,-0.861142,1.075077,0.166709,There's a Moon Out Tonight,The Capris,1950,1,1,0,0,0,4,3,5


In [69]:
toplot_df.dropna()

Unnamed: 0,PC1,PC2,PC3,PC4,song_name,artist,decade,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9
0,0.271128,0.629925,0.518298,0.362262,Suspicious Minds,Elvis Presley,1960,0,0,2,1,1,0,7,2
1,-0.666447,-0.806036,2.548195,2.629907,I've Got a Woman,Ray Charles,1950,0,2,3,3,3,2,1,3
2,-0.000470,-1.500646,0.316519,0.413991,Great Balls Of Fire,Jerry Lee Lewis,1960,0,2,1,0,0,4,3,1
3,-0.084046,-2.358462,0.264038,1.193816,Roll Over Beethoven,Chuck Berry,1950,0,2,1,0,0,4,3,5
4,2.886176,0.029427,-0.399342,0.007866,I Only Have Eyes for You,The Flamingos,1950,1,1,0,4,4,5,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1805,-0.917927,0.524654,-1.033298,0.632751,Party In The U.S.A.,Miley Cyrus,2000,0,0,1,2,2,3,2,0
1806,-1.134177,-0.932461,-1.052827,0.300490,Can't Get You out of My Head,Kylie Minogue,2000,0,2,1,2,2,3,2,0
1807,-1.298489,0.285540,-1.374582,-0.484277,Unwritten,Natasha Bedingfield,2000,0,2,1,2,5,6,2,0
1808,-2.270271,-0.091437,-0.663547,2.401243,American Boy,Estelle,2000,0,2,3,2,2,3,0,3


In [70]:
toplot_df.shape

(1810, 15)

In [71]:
# save data to json file for plotting on website
# toplot_df.to_json(r'pca_data.json')

In [72]:
bydec_df = toplot_df.copy()
bydec_df = bydec_df.groupby("decade")
bydec_df = bydec_df.mean()
bydec_df


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0_level_0,PC1,PC2,PC3,PC4,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1950,1.565346,-0.822397,0.52947,0.175083,0.715232,1.317881,0.728477,1.887417,1.953642,4.006623,3.417219,3.635762
1960,0.869686,-0.450995,0.442184,-0.054676,0.57561,1.307317,1.053659,1.756098,2.204878,3.717073,3.643902,3.526829
1970,0.142834,0.134661,-0.222912,-0.032405,0.388679,1.207547,1.249057,1.837736,2.641509,3.267925,4.173585,3.950943
1980,-0.456116,-0.035012,-0.482334,-0.186887,0.206612,1.338843,1.421488,1.822314,2.847107,3.384298,3.859504,3.669421
1990,-0.174339,0.373209,-0.070555,0.022547,0.275488,1.084599,1.5141,1.900217,2.62256,2.967462,4.002169,4.08026
2000,-0.538587,0.03575,0.077623,0.058007,0.226337,1.240741,1.580247,1.833333,2.473251,3.230453,3.510288,4.374486


In [73]:
import plotly
import plotly.graph_objs as go
markercolor = toplot_df['cluster_2']
xdata = 'PC1'
ydata = 'PC2'
zdata = 'PC3'

fig1 = go.Scatter3d(x=toplot_df[xdata],
                    y=toplot_df[ydata],
                    z=toplot_df[zdata],
                    hovertemplate =
                    '<b>Song: </b>%{text}'+
                    '<extra></extra>',
                    text = '<b>'+ toplot_df.song_name + '</b><br> Artist: ' + toplot_df.artist +'<br> Decade: ' + toplot_df.decade.astype(str),
                    marker=dict(color = markercolor,
                                opacity=0.9,
                                reversescale=True,
                                colorscale='jet',
                                size=5),
                    line=dict (width=0.02),
                    mode='markers')

#Make Plot.ly Layout
mylayout = go.Layout(scene=dict(xaxis=dict( title=xdata),
                                yaxis=dict( title=ydata),
                                zaxis=dict(title=zdata)),)

#Plot and save html
plotly.offline.plot({"data": [fig1],
                     "layout": mylayout},
                     auto_open=True,
                     filename=("3DPlot.html"))

'3DPlot.html'

In [74]:
X = songs_scaled
cov = (X.T @ X) / (X.shape[0] - 1)
eig_values, eig_vectors = np.linalg.eig(cov)
eig_values.sort()
eig_values


array([0.15231918, 0.38944347, 0.42736454, 0.75580261, 0.89531655,
       1.00739313, 1.03777618, 1.21509907, 1.51538628, 2.60962689])

In [75]:
eig_vectors.size

100