In [263]:
# Import required libraries and dependencies
# The Bokeh extension: "import hvplot.pandas hvplot.extension('bokeh')" is loaded to display the charts.

import pandas as pd
import hvplot.pandas
import hvplot.pandas
hvplot.extension('bokeh')
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [264]:
# checking current environment to ensure charts can be displayed

import sys
print(sys.executable)


C:\Users\Lis\anaconda3\python.exe


In [265]:
# checking the packages installed 

!conda list hvplot
!conda list pandas



# packages in environment at C:\Users\Lis\anaconda3:
#
# Name                    Version                   Build  Channel
hvplot                    0.9.2                      py_0    pyviz
# packages in environment at C:\Users\Lis\anaconda3:
#
# Name                    Version                   Build  Channel
geopandas-base            0.9.0                      py_1  
pandas                    2.0.3           py311hf62ec03_0  


In [266]:
# Load the data into a Pandas DataFrame
market_data_df = pd.read_csv(
    "Resources/crypto_market_data.csv",
    index_col="coin_id")

# Display sample data
market_data_df.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [267]:
# Generate summary statistics
market_data_df.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [268]:
# Check datatypes of numerical columns
market_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41 entries, bitcoin to digibyte
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   price_change_percentage_24h   41 non-null     float64
 1   price_change_percentage_7d    41 non-null     float64
 2   price_change_percentage_14d   41 non-null     float64
 3   price_change_percentage_30d   41 non-null     float64
 4   price_change_percentage_60d   41 non-null     float64
 5   price_change_percentage_200d  41 non-null     float64
 6   price_change_percentage_1y    41 non-null     float64
dtypes: float64(7)
memory usage: 2.6+ KB


In [269]:
# Plot your data to see what's in your DataFrame
market_data_df.hvplot.line(
    width=800,
    height=400,
    rot=90
)

---

### Prepare the Data

In [270]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
# Scaling data using the `StandardScaler()` module to initialize the object
# the object can then be used to fit and transfrom the data. 
# Standardization:scaling all numerical columns of the data to have a mean of 0 and Stdv of 1

# Selecting numeric columns from DataFrame to scale (Datatype = `float64`)
numeric_columns = df_market_data.select_dtypes(include=['float64'])

# Initiating StandardScaler() module
scaler = StandardScaler()

# Fitting and transforming the scaler on the numeric columns
scaled_market_data = scaler.fit_transform(numeric_columns)

# another way without initializing StandardScaler:
# data_scaled = StandardScaler().fit_transform(df[[specific_columns]])

In [271]:
# Create a DataFrame with the scaled data

scaled_market_data_df = pd.DataFrame(scaled_market_data, columns=numeric_columns.columns, index=market_data_df.index)

# Copy the crypto names from the original data
# using `.copy()` ensures that original DataFrame is not modified, but the new copy 

coin_id_column_copy = market_data_df.index.copy()

# Set the "coin_id" column as index

scaled_market_data_df.index = coin_id_column_copy

# Display sample data

scaled_market_data_df.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


In [272]:
# Reducing the number of columns

scaled_market_data_df = scaled_market_data_df.drop(columns=["price_change_percentage_200d", "price_change_percentage_1y"])

scaled_market_data_df.head()
                                       
                                       

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203


---

### Find the Best Value for k Using the Original Data.

In [273]:
# Create a list with the number of k-values from 1 to 11

k_values = list(range(1, 10))


In [274]:

# Create an empty list to store the inertia values

inertia = []

# Create a for loop to compute the inertia with each possible value of k
# The loop iterates over a range of values for `k`. 
# Inside the loop:

for i in k_values: 

# 1. Create a KMeans model using the loop counter for the n_clusters

    kmeans_model = KMeans(n_clusters=i, random_state=1)

# 2. Fit the model to the data using `df_market_data_scaled`

    kmeans_model.fit(scaled_market_data_df)

# 3. Append the model.inertia_ to the inertia list
# `.inertia_` attribute of the fitted `KMeans` model holds the sum of squared distances between each point in a cluster and the centroid of that cluster.
# It measures how well the data points have been clustered

    inertia.append(kmeans_model.inertia_)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [275]:
# Create a dictionary with the data to plot the Elbow curve

elbow_curve_scaled_data = {"k": k_values, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve

elbow_curve_scaled_data_df = pd.DataFrame(elbow_curve_scaled_data)

elbow_curve_scaled_data_df.head()


Unnamed: 0,k,inertia
0,1,205.0
1,2,142.150526
2,3,96.147929
3,4,70.273584
4,5,59.284781


In [276]:
# Plot a line chart with all the inertia values computed with 

elbow_curve_scaled_data_plot = elbow_curve_scaled_data_df.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k
)


# Graph: the lowest value of inertia, at which incrementing the value of k does not alter the inertia value, is when k=3 or 4.

elbow_curve_scaled_data_plot


#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** 

---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [277]:
# Initialize the K-Means model using the best value for k

optimal_clusters_kmodel =KMeans(n_clusters=4, random_state=1)





In [278]:
# Fit the K-Means model using the scaled data

optimal_clusters_kmodel.fit(scaled_market_data_df)




  super()._check_params_vs_input(X, default_n_init=10)


In [279]:
# Predict the clusters to group the cryptocurrencies using the scaled data

optimal_clusters_kmodel_prediction = optimal_clusters_kmodel.predict(scaled_market_data_df)

# Print the resulting array of cluster values.

print(optimal_model_cluster_prediction)


[2 2 0 0 2 2 2 2 2 0 0 0 0 2 0 2 0 0 2 0 0 2 0 0 0 0 0 0 2 0 0 0 0 2 0 0 1
 0 0 0 0]


In [280]:
# Create a copy of the DataFrame

scaled_market_data_predictions_df = scaled_market_data_df.copy()




In [281]:
# Add a new column to the DataFrame with the predicted clusters
# In this DataFrame, the "Market Clusters" column provides the number of cluster (0, 1, or 2), in which each Cryptocurrency "coin_id" falls into

scaled_market_data_predictions_df["Marker_Clusters"] = optimal_model_cluster_prediction

# Display sample data

scaled_market_data_predictions_df.head()



Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,Marker_Clusters
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,2
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,2
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,0
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,0
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,2


In [301]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.
# When plotting the data, the scatter plot shows the three clusters overlapping one another, which is not clear. 
# PCA is then used to reduce the number of features into 2 to 3 components. KMeans is re-run again to better visualize the distinct clusters. 

scaled_market_data_predictions_plot = scaled_market_data_predictions_df.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    by="Marker_Clusters",
    hover_cols=['coin_id'],
    tittle="Cryptocurrency Clustering",
    xlabel="24h Price Change (%)",
    ylabel="7d Price Change(%)"
    
)
    

scaled_market_data_predictions_plot




---

### Optimize Clusters with Principal Component Analysis.

In [283]:
# Create a PCA model instance and set `n_components=3`.

pca_model_instance = PCA(n_components=3)


In [284]:
# Use the PCA model with `fit_transform` to reduce to three principal components.
# using the original scaled DataFrame without the predisted clusters. 

pca_scaled_market_data = pca_model_instance.fit_transform(scaled_market_data_df)


# View the first five rows of the DataFrame. 
# Review the first 3 rows of the array

pca_scaled_market_data[:3]


array([[ 0.79510406, -0.66290603, -0.15388459],
       [ 0.41376913, -1.04777288,  0.15762251],
       [-0.19550822,  0.51826138, -0.20153516]])

In [285]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
# PCA_1=48%, PCA_2=27%, PCA_3=17%, TOTAL= 92%

pca_model_instance.explained_variance_ratio_


array([0.47862164, 0.26608254, 0.1684978 ])

#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** 

In [286]:
# Create a new DataFrame with the PCA data (from array to DataFrame)

pca_scaled_market_data_df = pd.DataFrame(pca_scaled_market_data, columns=["PCA1", "PCA2", "PCA3"])

# Copy the crypto names from the original data 
# using previous variable created "coin_id_column_copy"

pca_scaled_market_data_df["coin_id"] = coin_id_column_copy

# Set the coinid column as index

pca_scaled_market_data_df = pca_scaled_market_data_df.set_index("coin_id")


# Display sample data

pca_scaled_market_data_df.head()


Unnamed: 0_level_0,PCA1,PCA2,PCA3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,0.795104,-0.662906,-0.153885
ethereum,0.413769,-1.047773,0.157623
tether,-0.195508,0.518261,-0.201535
ripple,-0.260748,0.340601,-0.145741
bitcoin-cash,1.961019,-2.239724,-0.182981


---

### Find the Best Value for k Using the PCA Data

In [287]:
# Create a list with the number of k-values from 1 to 11

pca_k_values = list(range(1, 12))



In [288]:
# Create an empty list to store the inertia values

pca_inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:

for i in pca_k_values:

# 1. Create a KMeans model using the loop counter for the n_clusters

    pca_k_model = KMeans(n_clusters=i, random_state=0)

# 2. Fit the model to the data using `df_market_data_pca`

    pca_k_model.fit(pca_scaled_market_data_df)

# 3. Append the model.inertia_ to the inertia list

    pca_inertia.append(pca_k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [289]:
print(len(pca_k_values))
print(len(pca_inertia))


11
11


In [290]:
# Create a dictionary with the data to plot the Elbow curve

pca_elbow_curve_data = {"k": pca_k_values, "inertia": pca_inertia}

# Create a DataFrame with the data to plot the Elbow curve

pca_elbow_curve_data_df = pd.DataFrame(pca_elbow_curve_data)

# Review the PCA DataFrame

pca_elbow_curve_data_df.head()

Unnamed: 0,k,inertia
0,1,187.206406
1,2,124.570023
2,3,78.622143
3,4,53.184194
4,5,42.373171


In [291]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.

pca_elbow_curve_data_plot = pca_elbow_curve_data_df.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k
)

pca_elbow_curve_data_plot



#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:**


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** 

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [292]:
# Initialize the K-Means model using the best value for k

pca_k_model = KMeans(n_clusters=4)




In [293]:
# Fit the K-Means model using the PCA data

pca_k_model.fit(pca_scaled_market_data_df)




  super()._check_params_vs_input(X, default_n_init=10)


In [294]:
# Predict the clusters to group the cryptocurrencies using the PCA data

pca_clusters = pca_k_model.predict(pca_scaled_market_data_df)

# Print the resulting array of cluster values.

print(pca_clusters)


[1 1 0 0 1 1 1 1 1 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 3 1 0 0 2
 0 0 0 0]


In [295]:
# Create a copy of the DataFrame with the PCA data

pca_scaled_market_data_predictions_df = pca_scaled_market_data_df.copy()


# Add a new column to the DataFrame with the predicted clusters

pca_scaled_market_data_predictions_df["Crypto Optimal Clusters"] = pca_clusters

# Display sample data

pca_scaled_market_data_predictions_df.head()

Unnamed: 0_level_0,PCA1,PCA2,PCA3,Crypto Optimal Clusters
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,0.795104,-0.662906,-0.153885,1
ethereum,0.413769,-1.047773,0.157623,1
tether,-0.195508,0.518261,-0.201535,0
ripple,-0.260748,0.340601,-0.145741,0
bitcoin-cash,1.961019,-2.239724,-0.182981,1


In [300]:
# Create a scatter plot using hvPlot by setting 
# `x="PC1"` and `y="PC2"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.


pca_scaled_market_data_plot = pca_scaled_market_data_predictions_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="Crypto Optimal Clusters")


pca_scaled_market_data_plot

### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [299]:
# Composite plot to contrast the Elbow curves
#pca_elbow_curve_data_plot = pca_elbow_curve_data_df.hvplot.line(
    # x="k",
    # y="inertia",
    # title="Elbow Curve",
    # xticks=k)

#elbow_curve_scaled_data_plot = elbow_curve_scaled_data_df.hvplot.line(
    # x="k",
    # y="inertia",
    # title="Elbow Curve",
    # xticks=k)

composite_elbow_curve_plot = elbow_curve_scaled_data_plot + pca_elbow_curve_data_plot

composite_elbow_curve_plot




In [304]:
# Composite plot to contrast the clusters
# YOUR CODE HERE!

# scaled_market_data_predictions_plot = scaled_market_data_predictions_df.hvplot.scatter(
    # x="price_change_percentage_24h",
    # y="price_change_percentage_7d",
    # by="Marker_Clusters",
    # hover_cols=['coin_id'],
    # tittle="Cryptocurrency Clustering",
    # xlabel="24h Price Change (%)",
    # ylabel="7d Price Change(%)")
    
# pca_scaled_market_data_plot = pca_scaled_market_data_predictions_df.hvplot.scatter(
    # x="PCA1",
    # y="PCA2",
    # by="Crypto Optimal Clusters")

composite_clusters_data_plot = scaled_market_data_predictions_plot + pca_scaled_market_data_plot 


composite_clusters_data_plot



#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** 