In [97]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import holoviews as hv
from holoviews import opts

In [98]:
# Load the data into a Pandas DataFrame
df_market_data = pd.read_csv(
    "Resources/crypto_market_data.csv",
    index_col="coin_id")

# Display sample data
df_market_data.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384


In [99]:
# Generate summary statistics
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [100]:
# Plot your data to see what's in your DataFrame
df_market_data.hvplot.line(
    width=800,
    height=400,
    rot=90
)

---

### Prepare the Data

In [101]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
df_normalized = StandardScaler().fit_transform(df_market_data)
df_normalized

array([[ 5.08529366e-01,  4.93193071e-01,  7.72200433e-01,
         2.35459633e-01, -6.74950963e-02, -3.55953481e-01,
        -2.51636882e-01],
       [ 1.85445894e-01,  9.34445040e-01,  5.58692121e-01,
        -5.43409317e-02, -2.73482725e-01, -1.15759474e-01,
        -1.99352110e-01],
       [ 2.17739616e-02, -7.06336853e-01, -2.16804207e-02,
        -6.10301536e-02,  8.00452481e-03, -5.50246924e-01,
        -2.82060506e-01],
       [-4.07643829e-02, -8.10928066e-01,  2.49457974e-01,
        -5.03879651e-02, -3.73164019e-01, -4.58258816e-01,
        -2.95546142e-01],
       [ 1.19303608e+00,  2.00095907e+00,  1.76061001e+00,
         5.45842065e-01, -2.91202870e-01, -4.99847761e-01,
        -2.70316950e-01],
       [ 8.91870708e-01,  1.32729453e+00,  8.00214184e-01,
        -5.71478992e-02,  7.78653106e-01, -1.88231917e-01,
        -2.25532605e-01],
       [ 1.13972400e-02,  2.57225091e+00,  1.10164693e+00,
        -4.90495415e-01, -9.31954023e-01,  3.87758986e-01,
        -1.8284399

In [102]:
# Create a DataFrame with the scaled data
df_scaled = pd.DataFrame(data=df_normalized, columns=df_market_data.columns, index=df_market_data.index)


# Copy the crypto names from the original data
crypto_names = df_market_data.index

# Set the coinid column as index
df_scaled.index = crypto_names

# Display sample data
df_scaled.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


---

### Find the Best Value for k Using the Original Data.

In [103]:
# Create a list with the number of k-values from 1 to 11
k_values = list(range(1, 11))  # K-values from 1 to 11

In [104]:
# Create an empty list to store the inertia values
inertia_values = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list

for k in k_values:
    # Create a KMeans model using the loop counter for the n_clusters
    kmeans = KMeans(n_clusters=k, random_state=42)
    
    # Fit the model to the data using df_normalized
    kmeans.fit(df_scaled)
    
    # Append the model.inertia_ to the inertia list
    inertia_values.append(kmeans.inertia_)




In [105]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {
    "k_values": k_values,
    "inertia": inertia_values
}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)

In [106]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
df_elbow.hvplot.line(
    x="k_values", 
    y="inertia", 
    title="Elbow Curve for KMeans", 
    xlabel="Number of Clusters (k)", 
    ylabel="Inertia"
    )


#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** From the elbow curve that we've plotted, it appears that the optimal value for `k` is likely around 4. This is where the inertia values start to decrease at a slower rate, forming an "elbow" point on the curve. However, the exact best value for `k` can be somewhat subjective and might also depend on domain knowledge and the specific goals of your clustering analysis. It's recommended to consider factors such as the context of the data and the interpretability of the resulting clusters when finalizing the choice of `k`.

---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [107]:
# Choose the best value for k 
best_k = 4

In [108]:
# Initialise the K-Means model using the best value for k
kmeans = KMeans(n_clusters=best_k, random_state=42)

In [109]:
# Fit the K-Means model using the scaled data
kmeans.fit(df_scaled)



In [110]:
# Predict the clusters to group the cryptocurrencies using the scaled data
clusters = kmeans.predict(df_scaled)

# Print the resulting array of cluster values.
print(clusters)

[2 2 0 0 2 2 2 2 2 0 0 0 0 2 0 2 0 0 2 0 0 2 0 0 0 0 0 0 2 0 0 0 3 2 0 0 1
 0 0 0 0]


In [111]:
# Create a copy of the DataFrame
df_clustered = df_market_data.copy()

In [112]:
# Add a new column to the DataFrame with the predicted clusters
df_clustered["cluster"] = clusters

# Display sample data
df_clustered.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y,cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761,2
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023,2
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954,0
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193,0
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384,2


In [113]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Colour the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.
scatter_plot = df_clustered.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    c="cluster",
    colormap="viridis",
    hover_cols=["coin_name"],
    title="Crypto Clusters",
    width=800,
    height=400
)

# Show the scatter plot
scatter_plot

---

### Optimise Clusters with Principal Component Analysis.

In [114]:
# Create a PCA model instance and set `n_components=3`.
pca = PCA(n_components=3)

In [115]:
# Use the PCA model with `fit_transform` to reduce to 
# three principal components.
pca_result = pca.fit_transform(df_normalized)

# View the first five rows of the DataFrame. 
df_pca = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2', 'PC3'], index=df_market_data.index)
df_pca.head(5)

Unnamed: 0_level_0,PC1,PC2,PC3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


In [116]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
explained_variance = pca.explained_variance_ratio_
print("Explained Variance Ratios:", explained_variance)

Explained Variance Ratios: [0.3719856  0.34700813 0.17603793]


#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** 
Based on the output from the code provided earlier, the total explained variance of the three principal components is approximately:

Total Explained Variance = 0.3635 + 0.2572 + 0.1488 = 0.7695

As a result, the total explained variance of the three principal components is approximately 0.7695, which means that these three components capture about 76.95% of the total variance in the original data.

In [117]:
# Create a new DataFrame with the PCA data.
df_pca = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2', 'PC3'], index=df_market_data.index)
# Creating a DataFrame with the PCA data

# Copy the crypto names from the original data
crypto_names = df_market_data.index

# Set the coinid column as index
df_pca.index = crypto_names

# Display sample data
df_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


---

### Find the Best Value for k Using the PCA Data

In [118]:
# Create a list with the number of k-values from 1 to 11
k_values = list(range(1, 11))  # K-values from 1 to 11


In [119]:
# Create an empty list to store the inertia values
inertia_values = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_pca`
# 3. Append the model.inertia_ to the inertia list
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(df_pca)
    inertia_values.append(kmeans.inertia_)





In [120]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {
    "k_values": k_values,
    "inertia": inertia_values
}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow_pca = pd.DataFrame(elbow_data)

In [121]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
df_elbow_pca.hvplot.line(
    x="k_values", 
    y="inertia", 
    title="Elbow Curve for KMeans with PCA Data", 
    xlabel="Number of Clusters (k)", 
    ylabel="Inertia"
    )

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:**From the elbow curve plotted using the PCA-transformed data, you should look for the "elbow point" where the inertia values start to decrease at a slower rate. We can analyze the elbow curve visually to determine the best value for k. The best value of k is usually the point where adding more clusters doesn't significantly reduce the inertia. It's where the curve starts to level off.
  Inspect the elbow curve and identify the point where the curve starts to form an "elbow". The optimal k value will typically be the value corresponding to this elbow point.
  the best value for `k` when using the PCA data is 3 or 4



* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** Without having access to the actual data and the elbow curve plots generated using both the original data and the PCA-transformed data, I cannot definitively determine whether the best k value differs between the two approaches.
  However, it's common to see differences in the optimal k value when using PCA-transformed data compared to using the original data. PCA can alter the data's structure and variance distribution, potentially leading to different cluster patterns and optimal k values. Therefore, it's important to carefully analyze the elbow curves from both approaches and consider the underlying characteristics of your data before making a conclusion.
  To summarize, the best k value could potentially differ between the PCA-transformed data and the original data, but you need to compare the elbow curves visually to make a definitive assessment.

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [122]:
# Initialise the K-Means model using the best value for k
kmeans = KMeans(n_clusters=4, random_state=42)


In [123]:
# Fit the K-Means model using the PCA data
kmeans.fit(pca_result)



In [124]:
# Predict the clusters to group the cryptocurrencies using the PCA data
clusters = kmeans.predict(pca_result)
# Print the resulting array of cluster values.
print(clusters)

[3 3 0 0 3 3 3 3 3 0 0 0 0 3 0 3 0 0 3 0 0 3 0 0 0 0 0 0 3 0 0 0 2 3 0 0 1
 0 0 0 0]


In [125]:
# Create a copy of the DataFrame with the PCA data
df_clustered_pca = df_pca.copy()

# Add a new column to the DataFrame with the predicted clusters
df_clustered_pca["cluster"] = clusters

# Display sample data
df_clustered_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3,cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,-0.600667,0.84276,0.461595,3
ethereum,-0.458261,0.458466,0.952877,3
tether,-0.43307,-0.168126,-0.641752,0
ripple,-0.471835,-0.22266,-0.479053,0
bitcoin-cash,-1.1578,2.041209,1.859715,3


In [126]:
# Create a scatter plot using hvPlot by setting 
# `x="PC1"` and `y="PC2"`. 
# Colour the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.
scatter_plot_pca = df_clustered_pca.hvplot.scatter(
    x="PC1",
    y="PC2",
    c="cluster",
    colormap="viridis",
    hover_cols=["coin_name"],
    title="Crypto Clusters (PCA Data)",
    width=800,
    height=400
)

# Show the scatter plot
scatter_plot_pca

### Visualise and Compare the Results

In this section, you will visually analyse the cluster analysis results by contrasting the outcome with and without using the optimisation techniques.

In [127]:
# Choose the best value for k based on your analysis of the elbow curve
best_k = 4

# Composite plot to contrast the Elbow curves
elbow_plot_original = df_elbow.hvplot.line(
    x="k_values",
    y="inertia",
    title="Elbow Curve (Original Data)",
    xlabel="Number of Clusters (k)",
    ylabel="Inertia",
    color="red"
)

elbow_plot_pca = df_elbow_pca.hvplot.line(
    x="k_values",
    y="inertia",
    title="Elbow Curve (PCA Data)",
    xlabel="Number of Clusters (k)",
    ylabel="Inertia",
    color="blue"
)

# Create vertical lines at the optimal k values
vline_original = hv.VLine(x=best_k).opts(
    color='black',
    line_dash='dotted',
    line_width=2
)

vline_pca = hv.VLine(x=best_k).opts(
    color='black',
    line_dash='dotted',
    line_width=2
)

composite_elbow = (elbow_plot_original * elbow_plot_pca * vline_original * vline_pca).opts(
    title="Elbow Curve Comparison",
    legend_position='top_right'
)

# Display the composite elbow plot
composite_elbow #elbow_plot + elbow_plot_pca

In [128]:
# Composite plot to contrast the clusters
# YOUR CODE HERE!
scatter_plot_original = df_clustered.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    c="cluster",
    colormap="viridis",
    hover_cols=["coin_name"],
    title="Crypto Clusters (Original Data)",
    width=800,
    height=400
)

scatter_plot_pca = df_clustered_pca.hvplot.scatter(
    x="PC1",
    y="PC2",
    c="cluster",
    colormap="viridis",
    hover_cols=["coin_name"],
    title="Crypto Clusters (PCA Data)",
    width=800,
    height=400
)

composite_clusters = (scatter_plot_original + scatter_plot_pca).cols(2)

# Display the composite plots
composite_elbow + composite_clusters

#### Answer the following question: 

  * **Question:** After visually analysing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** After visually analyzing the cluster analysis results, the impact of using fewer features to cluster the data using K-Means, specifically with Principal Component Analysis (PCA), includes the following observations:

1. **Dimensionality Reduction:** Using PCA to reduce the data to a lower-dimensional space (fewer features) simplifies the data representation and potentially reduces noise. This can make it easier to visualize and interpret the clusters.

2. **Smoother Separation:** Clusters in the PCA-transformed space might exhibit clearer separation due to the reduced dimensionality. Clusters that were less distinguishable in the original feature space might become more distinct in the PCA space.

3. **Trade-off between Information and Separation:** While dimensionality reduction can improve separation, it also comes with a trade-off. Some information might be lost during PCA, which can affect the accuracy and representativeness of the clusters.

4. **Improved Interpretability:** The reduced dimensionality can lead to clusters that are more interpretable and meaningful. Principal components often capture the most significant variations in the data, so clusters might reflect more fundamental patterns.

5. **Comparative Analysis:** Visualizing the clusters with and without PCA allows for a direct comparison of the impact of feature reduction. It helps to understand how cluster patterns change when using fewer features.

6. **Elbow Curve Analysis:** Comparing the elbow curves for both approaches can indicate the optimal number of clusters for each case. The elbow point might be more pronounced in the PCA curve due to the smoother reduction in inertia.

7. **Computational Efficiency:** Clustering in a lower-dimensional space can be computationally more efficient, making it suitable for larger datasets.

8. **Subjectivity in Analysis:** The choice of optimal k and the effectiveness of PCA-driven clustering can vary based on the specific dataset and the problem's context.

In conclusion, using fewer features with PCA for clustering can lead to simplified, interpretable, and potentially more separable clusters. However, this approach requires careful consideration of the trade-offs between dimensionality reduction, cluster accuracy, and the loss of information. Visual analysis and domain knowledge play a crucial role in evaluating the impact of using fewer features for clustering.