In [55]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### Step 1:  Read in the dataset about the current songs' decades of the playlists.

In [56]:
# Read the CSV file into a Pandas DataFrame
# Set the index using the Ticker column
songs_df = pd.read_csv(
    Path("../final project/Database/final.csv"),                
    index_col="decade"
)

# Review the DataFrame
songs_df.head()

Unnamed: 0_level_0,id,song_name,artist,pic_url,preview_url,release_year,danceability,energy,key,loudness,...,instrumentalness,liveness,valence,tempo,type,uri,track_href,analysis_url,duration_ms,time_signature
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1960,1H5IfYyIIAlgDX8zguUzns,Suspicious Minds,Elvis Presley,https://i.scdn.co/image/ab67616d0000b273fdc0aa...,https://p.scdn.co/mp3-preview/e7072dbcb87f4c8c...,1969,0.487,0.382,7,-10.889,...,5e-06,0.411,0.714,116.557,audio_features,spotify:track:1H5IfYyIIAlgDX8zguUzns,https://api.spotify.com/v1/tracks/1H5IfYyIIAlg...,https://api.spotify.com/v1/audio-analysis/1H5I...,261280,4
1950,2xar08Fq5xra2KKZs5Bw9j,I've Got a Woman,Ray Charles,https://i.scdn.co/image/ab67616d0000b273f0e951...,https://p.scdn.co/mp3-preview/425d081433204c6d...,1957,0.556,0.61,9,-4.729,...,2e-06,0.258,0.665,199.373,audio_features,spotify:track:2xar08Fq5xra2KKZs5Bw9j,https://api.spotify.com/v1/tracks/2xar08Fq5xra...,https://api.spotify.com/v1/audio-analysis/2xar...,171467,4
1960,64VP3skE86iTvdOlbzuIcO,Great Balls Of Fire,Jerry Lee Lewis,https://i.scdn.co/image/ab67616d0000b27395788c...,,1961,0.533,0.729,7,-7.227,...,0.0,0.159,0.884,78.67,audio_features,spotify:track:64VP3skE86iTvdOlbzuIcO,https://api.spotify.com/v1/tracks/64VP3skE86iT...,https://api.spotify.com/v1/audio-analysis/64VP...,111536,4
1950,6C7aTTCUWRK7dD379yUT3W,Roll Over Beethoven,Chuck Berry,https://i.scdn.co/image/ab67616d0000b273a496dc...,,1959,0.715,0.797,3,-10.154,...,0.0,0.209,0.841,92.739,audio_features,spotify:track:6C7aTTCUWRK7dD379yUT3W,https://api.spotify.com/v1/tracks/6C7aTTCUWRK7...,https://api.spotify.com/v1/audio-analysis/6C7a...,144600,4
1950,3YdKJzcoMZMacISlpY4QoP,I Only Have Eyes for You,The Flamingos,https://i.scdn.co/image/ab67616d0000b2731ba2d2...,https://p.scdn.co/mp3-preview/8b14a85d40cb0931...,1959,0.552,0.313,5,-12.41,...,0.0017,0.12,0.303,88.554,audio_features,spotify:track:3YdKJzcoMZMacISlpY4QoP,https://api.spotify.com/v1/tracks/3YdKJzcoMZMa...,https://api.spotify.com/v1/audio-analysis/3YdK...,202773,3


In [57]:
songs_df.shape

(1911, 23)

In [58]:
songs_df.drop(songs_df[(songs_df['release_year'] <1950)].index, inplace=True)
songs_df = songs_df.drop(columns=['id', 'song_name', 'artist', 'pic_url', 'preview_url', 'release_year', 'type', 'uri', 'track_href', 'analysis_url'])
songs_df = songs_df.dropna()
songs_df.shape

(1810, 13)

### Step 2: Scale the `songs_df` DataFrame and create a new DataFrame that contains the scaled data. 

In [59]:
# Scale the attribute values
songs_scaled = StandardScaler().fit_transform(
    songs_df[["danceability", "energy", "key", "loudness", "liveness", "valence", "tempo", "duration_ms", "acousticness"]]
)

In [60]:
# Create a DataFrame with the scaled data
songs_scaled = pd.DataFrame(
    songs_scaled,
    columns=["danceability", "energy", "key", "loudness", "liveness", "valence", "tempo", "duration_ms", "acousticness"]
)

# Copy the tickers names from the original data
songs_scaled["decade"] = songs_df.index

# Set the Ticker column as index
songs_scaled = songs_scaled.set_index("decade")

# Display sample data
songs_scaled.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,liveness,valence,tempo,duration_ms,acousticness
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1960,-0.662204,-1.027207,0.506439,-0.504184,1.717403,0.36312,-0.141088,0.585506,-0.894323
1950,-0.202466,-0.021973,1.069496,1.134323,0.607338,0.161596,2.7785,-0.725674,1.656623
1960,-0.355712,0.502689,0.506439,0.469877,-0.11094,1.062285,-1.476754,-1.600606,0.785663
1950,0.856928,0.802496,-0.619673,-0.30868,0.251826,0.885437,-0.980766,-1.117905,1.669526
1950,-0.229118,-1.331423,-0.056617,-0.908757,-0.393898,-1.327214,-1.128304,-0.268637,1.924362


### Step 3: Initialize the K-means model with three clusters and then fit the `songs_scaled` DataFrame to the model.

In [61]:
# Initialize the K-Means model with n_clusters=4
model = KMeans(n_clusters=4)

In [62]:
# Fit the model for the songs_scaled_scaled DataFrame
model.fit(songs_scaled)

  super()._check_params_vs_input(X, default_n_init=10)


### Step 4. Predict the clusters and then create a new DataFrame with the predicted clusters.

In [63]:
# Predict the model segments (clusters)
songs_clusters = model.predict(songs_scaled)

# View the song segments
print(songs_clusters)

[1 1 3 ... 3 3 1]


In [64]:
# Create a new column in the DataFrame with the predicted clusters
songs_scaled["SongCluster"] = songs_clusters

# Review the DataFrame
songs_scaled.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,liveness,valence,tempo,duration_ms,acousticness,SongCluster
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1960,-0.662204,-1.027207,0.506439,-0.504184,1.717403,0.36312,-0.141088,0.585506,-0.894323,1
1950,-0.202466,-0.021973,1.069496,1.134323,0.607338,0.161596,2.7785,-0.725674,1.656623,1
1960,-0.355712,0.502689,0.506439,0.469877,-0.11094,1.062285,-1.476754,-1.600606,0.785663,3
1950,0.856928,0.802496,-0.619673,-0.30868,0.251826,0.885437,-0.980766,-1.117905,1.669526,3
1950,-0.229118,-1.331423,-0.056617,-0.908757,-0.393898,-1.327214,-1.128304,-0.268637,1.924362,2


### Step 5: Create a scatter plot to visualize the "SongCluster" using  "danceability" as the x-variable and "energy" as the y-variable.  Be sure to style and format your plot.

In [65]:
# Create a scatter plot with x="acousticness",  y="energy"
songs_scaled.hvplot.scatter(
    x="acousticness",
    y="energy",
    by="SongCluster",
    hover_cols = ["decade"], 
    title = "Scatter Plot by Songs - k=4"
)

### Step 6: Reduce the number of features to two principal components on the `songs_scaled` DataFrame, and  calculate the explained variance ratio that results from the PCA data.

In [66]:
# Create the PCA model instance where n_components=4
pca = PCA(n_components=4)


In [67]:
# Fit the songs_scaled data to the PCA
songs_pca_data = pca.fit_transform(songs_scaled)

# Review the first five rose of the PCA data
# using bracket notation ([0:5])
songs_pca_data[:5]

array([[ 0.0498722 ,  1.17907014,  0.45951513,  0.3882913 ],
       [-0.24951133,  0.43756283,  2.42131094, -2.06616765],
       [ 0.2848315 , -1.81868923,  0.64954468,  0.28740555],
       [ 0.61934917, -2.1622679 ,  0.72159565,  1.09777408],
       [ 2.84115309,  0.20929732, -0.389593  ,  0.45380288]])

In [68]:
# Calculate the explained variance
pca.explained_variance_ratio_

array([0.24687966, 0.23595602, 0.11894905, 0.09828696])

### Step 7: Use the calculate PCA DataFrame in Step 6 to create a new DataFrame called, `df_songs_pca`, then add an additional column to the `df_songs_pca` DataFrame that contains the tickers from the original `songs_df` DataFrame.

In [69]:
# Creating a DataFrame with the PCA data
df_songs_pca = pd.DataFrame(songs_pca_data, columns=["PC1", "PC2", "PC3", "PC4"])

# Copy the tickers names from the original data
df_songs_pca["decade"] = songs_df.index

# Set the Ticker column as index
df_songs_pca = df_songs_pca.set_index("decade")

# Review the DataFrame
df_songs_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3,PC4
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1960,0.049872,1.17907,0.459515,0.388291
1950,-0.249511,0.437563,2.421311,-2.066168
1960,0.284831,-1.818689,0.649545,0.287406
1950,0.619349,-2.162268,0.721596,1.097774
1950,2.841153,0.209297,-0.389593,0.453803


### Step 8: Rerun the K-means algorithm on the `df_songs_pca` DataFrame and create a scatter plot using the  "SongCluster" and the two principal components for the x- and y-axes. Be sure to style and format your plot.

In [70]:
# Initialize the K-Means model with n_clusters=4
model = KMeans(n_clusters=4)

# Fit the model for the df_songs_pca DataFrame
model.fit(df_songs_pca)

# Predict the model segments (clusters)
song_clusters = model.predict(df_songs_pca)

# Print the song segments
print(songs_clusters)

  super()._check_params_vs_input(X, default_n_init=10)


[1 1 3 ... 3 3 1]


In [71]:
# Create a copy of the df_songs_pca DataFrame and name it as df_songs_pca_predictions
df_songs_pca_predictions = df_songs_pca.copy()

# Create a new column in the DataFrame with the predicted clusters
df_songs_pca_predictions["SongCluster"] = songs_clusters

# Review the DataFrame
df_songs_pca_predictions.head()

Unnamed: 0_level_0,PC1,PC2,PC3,PC4,SongCluster
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1960,0.049872,1.17907,0.459515,0.388291,1
1950,-0.249511,0.437563,2.421311,-2.066168,1
1960,0.284831,-1.818689,0.649545,0.287406,3
1950,0.619349,-2.162268,0.721596,1.097774,3
1950,2.841153,0.209297,-0.389593,0.453803,2


In [74]:
songcluster_temp_df = df_songs_pca_predictions.reset_index(drop=False)
songcluster_temp_df = songcluster_temp_df.groupby("SongCluster")
songcluster_temp_df = songcluster_temp_df['decade'].mean()
songcluster_temp_df.head()

SongCluster
0    1987.458034
1    1984.072727
2    1972.710280
3    1982.811594
Name: decade, dtype: float64

In [75]:
# Create the scatter plot with x="PC1" and y="PC2"
df_songs_pca_predictions.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="SongCluster",
    title = "Scatter Plot by Song Segment - PCA=2"
)

### Bonus

* Use the elbow method to find the best value for `k` using the PCA data. Use a range from 1 to 11.

* Plot a line chart with all the inertia values computed with the different values of k to visually identify the optimal value for `k`.

In [None]:
# Create a list with the number of k-values to try
# Use a range from 1 to 11
k = list(range(1, 11))

In [None]:
# Create an empy list to store the inertia values
inertia = []

In [None]:
# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_songs_pca`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    model = KMeans(n_clusters=i, random_state=0)
    model.fit(df_songs_pca)
    inertia.append(model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data_pca = {
    "k": k,
    "inertia": inertia
}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow_pca = pd.DataFrame(elbow_data_pca)

In [None]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_plot_pca = df_elbow_pca.hvplot.line(x="k", y="inertia", title="Elbow Curve Using PCA Data", xticks=k)
elbow_plot_pca

**Question:** What is the best value for k when using the PCA data? Does it differ from the best k value found using the original data?

**Answer:** Based on this Elbow Curve, it looks like `k=4` is the correct one.

In [104]:
X = songs_scaled
cov = (X.T @ X) / (X.shape[0] - 1)
eig_values, eig_vectors = np.linalg.eig(cov)
eig_values.sort()
eig_values


array([0.15402809, 0.39242073, 0.43001586, 0.76060683, 0.9456701 ,
       1.00681489, 1.14760155, 1.29311821, 2.56721061, 4.83872097])

In [92]:
eig_vectors.size

100