In [30]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### Step 1:  Read in the dataset about the current songs' decades of the playlists.

In [31]:
# Read the CSV file into a Pandas DataFrame
# Set the index using the Ticker column
songs_df = pd.read_csv(
    Path("../Project4-main/Database/final.csv"),                
    index_col="decade"
)

# Review the DataFrame
songs_df.head()

Unnamed: 0_level_0,id,song_name,artist,pic_url,preview_url,release_year,danceability,energy,key,loudness,...,instrumentalness,liveness,valence,tempo,type,uri,track_href,analysis_url,duration_ms,time_signature
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000,4Vn7TykT27PIygBiZjTR2s,Hotel California - 2013 Remaster,Eagles,https://i.scdn.co/image/ab67616d0000b273061849...,https://p.scdn.co/mp3-preview/6fedc11d0f55bef1...,2003,0.579,0.508,2,-9.484,...,0.000494,0.0575,0.609,147.125,audio_features,spotify:track:4Vn7TykT27PIygBiZjTR2s,https://api.spotify.com/v1/tracks/4Vn7TykT27PI...,https://api.spotify.com/v1/audio-analysis/4Vn7...,391376,4
1980,2WfaOiMkCvy7F5fcp2zZ8L,Take on Me,a-ha,https://i.scdn.co/image/ab67616d0000b273e8dd4d...,https://p.scdn.co/mp3-preview/ed66a8c444c35b2f...,1985,0.573,0.902,6,-7.638,...,0.00125,0.0928,0.876,84.412,audio_features,spotify:track:2WfaOiMkCvy7F5fcp2zZ8L,https://api.spotify.com/v1/tracks/2WfaOiMkCvy7...,https://api.spotify.com/v1/audio-analysis/2Wfa...,225280,4
2000,5ChkMS8OtdzJeqyybCc9R5,Billie Jean,Michael Jackson,https://i.scdn.co/image/ab67616d0000b2734121fa...,https://p.scdn.co/mp3-preview/0f6b8a3524ec4100...,2008,0.92,0.654,11,-3.051,...,0.0153,0.036,0.847,117.046,audio_features,spotify:track:5ChkMS8OtdzJeqyybCc9R5,https://api.spotify.com/v1/tracks/5ChkMS8OtdzJ...,https://api.spotify.com/v1/audio-analysis/5Chk...,293827,4
1990,4jDmJ51x1o9NZB5Nxxc7gY,Careless Whisper,George Michael,https://i.scdn.co/image/ab67616d0000b27364c19b...,https://p.scdn.co/mp3-preview/426887696731457a...,1998,0.574,0.628,2,-8.815,...,0.0,0.271,0.803,153.119,audio_features,spotify:track:4jDmJ51x1o9NZB5Nxxc7gY,https://api.spotify.com/v1/tracks/4jDmJ51x1o9N...,https://api.spotify.com/v1/audio-analysis/4jDm...,300107,4
1980,5C0LFQARavkPpn7JgA4sLk,Every Breath You Take - Remastered 2003,The Police,https://i.scdn.co/image/ab67616d0000b27307ea1b...,,1983,0.82,0.452,1,-9.796,...,0.00294,0.0701,0.74,117.401,audio_features,spotify:track:5C0LFQARavkPpn7JgA4sLk,https://api.spotify.com/v1/tracks/5C0LFQARavkP...,https://api.spotify.com/v1/audio-analysis/5C0L...,253250,4


In [32]:
songs_df.shape

(867, 23)

### Step 2: Scale the `songs_df` DataFrame and create a new DataFrame that contains the scaled data. 

In [33]:
# Scale the attribute values
songs_scaled = StandardScaler().fit_transform(
    songs_df[["danceability", "energy", "key", "loudness", "instrumentalness", "liveness", "valence", "tempo", "duration_ms"]]
)

In [34]:
# Create a DataFrame with the scaled data
songs_scaled = pd.DataFrame(
    songs_scaled,
    columns=["danceability", "energy", "key", "loudness", "instrumentalness", "liveness", "valence", "tempo", "duration_ms"]
)

# Copy the tickers names from the original data
songs_scaled["decade"] = songs_df.index

# Set the Ticker column as index
songs_scaled = songs_scaled.set_index("decade")

# Display sample data
songs_scaled.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,instrumentalness,liveness,valence,tempo,duration_ms
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2000,-0.1297,-0.529605,-0.923714,-0.135865,-0.235623,-0.825537,-0.10678,1.031703,2.530519
1980,-0.168682,1.28674,0.202276,0.351366,-0.22806,-0.566349,1.021755,-1.287454,-0.057849
2000,2.085774,0.143457,1.609763,1.562055,-0.087493,-0.9834,0.89918,-0.080633,1.010358
1990,-0.162185,0.023596,-0.923714,0.04071,-0.240566,0.742075,0.713204,1.253364,1.108223
1980,1.436075,-0.787766,-1.205212,-0.218214,-0.211152,-0.733022,0.446921,-0.067505,0.378023


### Step 3: Initialize the K-means model with three clusters and then fit the `songs_scaled` DataFrame to the model.

In [35]:
# Initialize the K-Means model with n_clusters=6 (6 decades, 50's, 60's, 70's, 80's, 90's, and 00's)
model = KMeans(n_clusters=3)

In [36]:
# Fit the model for the df_stocks_scaled DataFrame
model.fit(songs_scaled)

  super()._check_params_vs_input(X, default_n_init=10)


### Step 4. Predict the clusters and then create a new DataFrame with the predicted clusters.

In [37]:
# Predict the model segments (clusters)
songs_clusters = model.predict(songs_scaled)

# View the stock segments
print(songs_clusters)

[2 1 1 1 1 1 1 0 0 2 1 1 1 2 1 1 2 1 1 1 2 1 1 2 0 0 1 0 2 0 1 1 0 2 1 1 1
 1 2 1 1 2 2 2 2 2 1 1 0 0 1 1 2 0 1 1 2 2 1 1 0 0 1 1 1 2 0 0 1 1 1 2 1 1
 1 2 2 2 2 2 0 2 0 0 2 1 2 1 1 2 2 1 1 0 0 0 0 0 0 0 2 1 2 0 0 1 1 0 0 0 1
 2 1 2 1 1 0 0 2 0 1 0 1 0 2 0 2 0 1 1 1 0 0 2 2 2 1 0 1 0 0 0 0 0 0 2 1 2
 0 0 1 0 0 0 0 0 2 0 2 0 0 0 2 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0
 1 1 1 1 1 0 0 0 1 1 0 0 0 0 1 2 1 0 1 1 0 0 1 0 2 0 0 0 0 0 1 1 1 1 1 0 0
 1 1 1 0 1 1 0 0 1 0 0 1 2 0 0 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 0 0
 1 1 1 1 1 1 1 1 1 1 2 2 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 2
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 2 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 0 1 0 1 1 1 1 1 2 1
 1 1 2 1 1 1 1 1 1 1 2 2 1 1 0 1 2 1 1 2 2 1 0 1 2 2 0 1 2 1 2 2 1 1 1 0 2
 1 2 1 0 1 1 1 1 2 2 1 0 0 1 2 2 1 2 1 2 2 1 2 0 2 2 1 2 1 0 0 1 0 0 0 0 1
 0 0 0 0 1 0 1 1 1 2 0 1 0 1 0 1 0 1 0 0 1 1 0 0 1 1 1 0 1 0 1 0 1 1 2 0 1
 1 0 1 0 1 0 0 1 1 0 1 1 

In [38]:
# Create a new column in the DataFrame with the predicted clusters
songs_scaled["SongCluster"] = songs_clusters

# Review the DataFrame
songs_scaled.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,instrumentalness,liveness,valence,tempo,duration_ms,SongCluster
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2000,-0.1297,-0.529605,-0.923714,-0.135865,-0.235623,-0.825537,-0.10678,1.031703,2.530519,2
1980,-0.168682,1.28674,0.202276,0.351366,-0.22806,-0.566349,1.021755,-1.287454,-0.057849,1
2000,2.085774,0.143457,1.609763,1.562055,-0.087493,-0.9834,0.89918,-0.080633,1.010358,1
1990,-0.162185,0.023596,-0.923714,0.04071,-0.240566,0.742075,0.713204,1.253364,1.108223,1
1980,1.436075,-0.787766,-1.205212,-0.218214,-0.211152,-0.733022,0.446921,-0.067505,0.378023,1


### Step 5: Create a scatter plot to visualize the "SongCluster" using  "danceability" as the x-variable and "energy" as the y-variable.  Be sure to style and format your plot.

In [39]:
# Create a scatter plot with x="danceability",  y="key"
songs_scaled.hvplot.scatter(
    x="danceability",
    y="duration_ms",
    by="SongCluster",
    hover_cols = ["decade"], 
    title = "Scatter Plot by Songs - k=6"
)

### Step 6: Reduce the number of features to two principal components on the `songs_scaled` DataFrame, and  calculate the explained variance ratio that results from the PCA data.

In [40]:
# Create the PCA model instance where n_components=2
pca = PCA(n_components=2)

In [41]:
# Fit the df_stocks_scaled data to the PCA
songs_pca_data = pca.fit_transform(songs_scaled)

# Review the first five rose of the PCA data
# using bracket notation ([0:5])
songs_pca_data[:5]

array([[ 0.00603566,  1.61774338],
       [-1.10574471, -0.42375235],
       [-1.7330371 , -0.65414734],
       [-0.62923804,  0.52916347],
       [ 0.0341721 , -1.16087402]])

In [42]:
# Calculate the explained variance
pca.explained_variance_ratio_

array([0.24182976, 0.1578923 ])

### Step 7: Use the calculate PCA DataFrame in Step 6 to create a new DataFrame called, `df_songs_pca`, then add an additional column to the `df_songs_pca` DataFrame that contains the tickers from the original `songs_df` DataFrame.

In [43]:
# Creating a DataFrame with the PCA data
df_songs_pca = pd.DataFrame(songs_pca_data, columns=["PC1", "PC2"])

# Copy the tickers names from the original data
df_songs_pca["decade"] = songs_df.index

# Set the Ticker column as index
df_songs_pca = df_songs_pca.set_index("decade")

# Review the DataFrame
df_songs_pca.head()

Unnamed: 0_level_0,PC1,PC2
decade,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,0.006036,1.617743
1980,-1.105745,-0.423752
2000,-1.733037,-0.654147
1990,-0.629238,0.529163
1980,0.034172,-1.160874


### Step 8: Rerun the K-means algorithm on the `df_songs_pca` DataFrame and create a scatter plot using the  "SongCluster" and the two principal components for the x- and y-axes. Be sure to style and format your plot.

In [44]:
# Initialize the K-Means model with n_clusters=6
model = KMeans(n_clusters=3)

# Fit the model for the df_stocks_pca DataFrame
model.fit(df_songs_pca)

# Predict the model segments (clusters)
stock_clusters = model.predict(df_songs_pca)

# Print the stock segments
print(songs_clusters)

  super()._check_params_vs_input(X, default_n_init=10)


[2 1 1 1 1 1 1 0 0 2 1 1 1 2 1 1 2 1 1 1 2 1 1 2 0 0 1 0 2 0 1 1 0 2 1 1 1
 1 2 1 1 2 2 2 2 2 1 1 0 0 1 1 2 0 1 1 2 2 1 1 0 0 1 1 1 2 0 0 1 1 1 2 1 1
 1 2 2 2 2 2 0 2 0 0 2 1 2 1 1 2 2 1 1 0 0 0 0 0 0 0 2 1 2 0 0 1 1 0 0 0 1
 2 1 2 1 1 0 0 2 0 1 0 1 0 2 0 2 0 1 1 1 0 0 2 2 2 1 0 1 0 0 0 0 0 0 2 1 2
 0 0 1 0 0 0 0 0 2 0 2 0 0 0 2 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0
 1 1 1 1 1 0 0 0 1 1 0 0 0 0 1 2 1 0 1 1 0 0 1 0 2 0 0 0 0 0 1 1 1 1 1 0 0
 1 1 1 0 1 1 0 0 1 0 0 1 2 0 0 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 0 0
 1 1 1 1 1 1 1 1 1 1 2 2 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 2
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 2 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 0 1 0 1 1 1 1 1 2 1
 1 1 2 1 1 1 1 1 1 1 2 2 1 1 0 1 2 1 1 2 2 1 0 1 2 2 0 1 2 1 2 2 1 1 1 0 2
 1 2 1 0 1 1 1 1 2 2 1 0 0 1 2 2 1 2 1 2 2 1 2 0 2 2 1 2 1 0 0 1 0 0 0 0 1
 0 0 0 0 1 0 1 1 1 2 0 1 0 1 0 1 0 1 0 0 1 1 0 0 1 1 1 0 1 0 1 0 1 1 2 0 1
 1 0 1 0 1 0 0 1 1 0 1 1 

In [45]:
# Create a copy of the df_songs_pca DataFrame and name it as df_songs_pca_predictions
df_songs_pca_predictions = df_songs_pca.copy()

# Create a new column in the DataFrame with the predicted clusters
df_songs_pca_predictions["SongCluster"] = songs_clusters

# Review the DataFrame
df_songs_pca_predictions.head()

Unnamed: 0_level_0,PC1,PC2,SongCluster
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,0.006036,1.617743,2
1980,-1.105745,-0.423752,1
2000,-1.733037,-0.654147,1
1990,-0.629238,0.529163,1
1980,0.034172,-1.160874,1


In [46]:
# Create the scatter plot with x="PC1" and y="PC2"
df_songs_pca_predictions.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="SongCluster",
    title = "Scatter Plot by Stock Segment - PCA=2"
)

### Bonus

* Use the elbow method to find the best value for `k` using the PCA data. Use a range from 1 to 11.

* Plot a line chart with all the inertia values computed with the different values of k to visually identify the optimal value for `k`.

In [47]:
# Create a list with the number of k-values to try
# Use a range from 1 to 11
k = list(range(1, 11))

In [48]:
# Create an empy list to store the inertia values
inertia = []

In [49]:
# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_songs_pca`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    model = KMeans(n_clusters=i, random_state=0)
    model.fit(df_songs_pca)
    inertia.append(model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [50]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data_pca = {
    "k": k,
    "inertia": inertia
}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow_pca = pd.DataFrame(elbow_data_pca)

In [51]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_plot_pca = df_elbow_pca.hvplot.line(x="k", y="inertia", title="Elbow Curve Using PCA Data", xticks=k)
elbow_plot_pca

**Question:** What is the best value for k when using the PCA data? Does it differ from the best k value found using the original data?

**Answer:** Based on this Elbow Curve, it looks like `k=3` is the correct one.