In [87]:
#
# Import required libraries and dependencies
#

from   datetime              import datetime

print('Loading Libraries',datetime.now());

# Intentionally ignoring certain waring messages that we know are not relevant to this program, l
# like deprecated functions or changed default parameters.
# This is done to avoid cluttering the output with warnings that are not relevant to this program.
# other condittions with higher seriousness will still be displayed.

import warnings
warnings.filterwarnings("ignore")

import pandas                    as pd;
import hvplot.pandas;
from   sklearn.cluster       import KMeans;
from   sklearn.decomposition import PCA;
from   sklearn.preprocessing import StandardScaler;

print('Libraries Loaded ',datetime.now());

# displaying time for loading libraries, just to evaluate the time it takes to load libraries
# program is being tested under two architectures:
# one is a MacBook Pro M2 Ultra with 32GB of RAM.
# the other is a Alienware M18 with 32 GB of RAM.

Loading Libraries 2023-08-27 17:18:04.909924
Libraries Loaded  2023-08-27 17:18:04.910053


In [88]:
#
# Data Dicitionary, with variables used in the program
#
# Camel Case variables are used for variables that are used in the program
#
# df_Source_Data        :  DataFrame with the source data, CryptoCurrency data
# crypto_Scaled_Array   :  Array with the scaled data
# crypto_Transformed    :  DataFrame with the transformed data
# coins_Names           :  Array with the coins names
# inertia_1             :  Array with the inertia data
# inertia_2             :  Array with the inertia data
# elbow_Data_1          :  dictionary with the Elbow data 1
# elbow_Data_2          :  dictionary with the Elbow data 2
# elbow_DF_1            :  DataFrame with the Elbow data 1
# elbow_DF_2            :  DataFrame with the Elbow data 2
# clusters_Predicted    :  Array with the predicted clusters
# cluster_PCA_Data      :  DataFrame with the PCA data
# cluster_PCA_DF        :  DataFrame with the PCA data and the predicted clusters

In [89]:
# 
# Load the data into a Pandas DataFrame
# The overall asumption is that the data is already clean and ready to be used


print('Loading Data',datetime.now())
df_Source_Data = pd.read_csv("Resources/crypto_market_data.csv",index_col="coin_id")

#
# Display sample data
#

print(dfSourceData.head(10))
print('Data Loaded ',datetime.now())

Loading Data 2023-08-27 17:18:13.058292
                 price_change_percentage_24h  price_change_percentage_7d  \
coin_id                                                                    
bitcoin                              1.08388                     7.60278   
ethereum                             0.22392                    10.38134   
tether                              -0.21173                     0.04935   
ripple                              -0.37819                    -0.60926   
bitcoin-cash                         2.90585                    17.09717   
binancecoin                          2.10423                    12.85511   
chainlink                           -0.23935                    20.69459   
cardano                              0.00322                    13.99302   
litecoin                            -0.06341                     6.60221   
bitcoin-cash-sv                      0.92530                     3.29641   

                 price_change_percentage_14d  p

In [90]:
# Generate summary statistics
print('Summary Statistics',datetime.now())
df_Source_Data.describe()

Summary Statistics 2023-08-27 17:18:18.417780


Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [91]:

#
# Plot your data to see what's in your DataFrame
#
print('Plotting Data',datetime.now())
df_Source_Data.hvplot.line(width=1200,height=600,rot=90)


Plotting Data 2023-08-27 17:18:28.022848


---

### Prepare the Data

In [92]:
#
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
# According to the graph above, at least one cryptocurrency is in the thousands, while the rest are in the hundreds.
# This means that the data is skewed and needs to be normalized.
# 
print('Normalizing Data',datetime.now())
crypto_Scaled_Array = StandardScaler().fit_transform(df_Source_Data)
print(crypto_Scaled_Array)
print('Data Normalized',datetime.now())

Normalizing Data 2023-08-27 17:19:47.945307
[[ 5.08529366e-01  4.93193071e-01  7.72200433e-01  2.35459633e-01
  -6.74950963e-02 -3.55953481e-01 -2.51636882e-01]
 [ 1.85445894e-01  9.34445040e-01  5.58692121e-01 -5.43409317e-02
  -2.73482725e-01 -1.15759474e-01 -1.99352110e-01]
 [ 2.17739616e-02 -7.06336853e-01 -2.16804207e-02 -6.10301536e-02
   8.00452481e-03 -5.50246924e-01 -2.82060506e-01]
 [-4.07643829e-02 -8.10928066e-01  2.49457974e-01 -5.03879651e-02
  -3.73164019e-01 -4.58258816e-01 -2.95546142e-01]
 [ 1.19303608e+00  2.00095907e+00  1.76061001e+00  5.45842065e-01
  -2.91202870e-01 -4.99847761e-01 -2.70316950e-01]
 [ 8.91870708e-01  1.32729453e+00  8.00214184e-01 -5.71478992e-02
   7.78653106e-01 -1.88231917e-01 -2.25532605e-01]
 [ 1.13972400e-02  2.57225091e+00  1.10164693e+00 -4.90495415e-01
  -9.31954023e-01  3.87758986e-01 -1.82843995e-02]
 [ 1.02529802e-01  1.50800146e+00  6.48885061e-01  3.28959245e-01
  -4.86348899e-01  6.50796233e-02 -1.55428416e-01]
 [ 7.74971820e-02  3

In [93]:
#
# Create a DataFrame with the scaled data
#
print('Before DataFrame transformation',datetime.now())
crypto_Transformed = pd.DataFrame(crypto_Scaled_Array, columns=['price_change_percentage_24h', 
                                                                'price_change_percentage_7d',
                                                                'price_change_percentage_14d', 
                                                                'price_change_percentage_30d', 
                                                                'price_change_percentage_60d', 
                                                                'price_change_percentage_200d',	
                                                                'price_change_percentage_1y'])
print(crypto_Transformed)
#
# Copy the crypto names from the original data
#
coins_Names                   = list(dfSourceData.index) #create a list of the coins names
print(coins_Names)
#
# Set the coinid column as index
#
crypto_Transformed['coin_id'] = coins_Names                             #create a new column with the coin names
crypto_Transformed            = crypto_Transformed.set_index('coin_id') #set the coin names as index
#
# Display sample data
#
#print('After DataFrame transformation',datetime.now())
print(crypto_Transformed)


Before DataFrame transformation 2023-08-27 17:20:45.406000
    price_change_percentage_24h  price_change_percentage_7d  \
0                      0.508529                    0.493193   
1                      0.185446                    0.934445   
2                      0.021774                   -0.706337   
3                     -0.040764                   -0.810928   
4                      1.193036                    2.000959   
5                      0.891871                    1.327295   
6                      0.011397                    2.572251   
7                      0.102530                    1.508001   
8                      0.077497                    0.334297   
9                      0.448952                   -0.190684   
10                     0.331280                   -1.614844   
11                     0.034352                   -0.733026   
12                     0.155710                   -0.922491   
13                     0.262723                    1.792602

---

### Find the Best Value for k Using the Original Data.

In [94]:
# Create a list with the number of k-values from 1 to 11
k = list(range(1,12))
print('Values of k',k)

Values of k [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


In [95]:
#
# Create an empty list to store the inertia values
#
inertia_1 = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1,n_init=10,max_iter=1000) #n_init=10,max_iter=1000 provided to avoid depcrecation warning
    k_model.fit(cryptoTransformed)
    inertia_1.append(k_model.inertia_)
print('Values of Inertia ',inertia_1)

Values of Inertia  [287.0, 195.82021818036043, 123.19048183836958, 79.02243535120977, 63.85866780584266, 53.05778846567061, 44.4067905846164, 37.07823336746088, 32.83218742836354, 28.165433305979256, 24.964637877542405]


In [124]:
# Create a dictionary with the data to plot the Elbow curve
print('Creating Elbow Curve',datetime.now())
elbow_Data_1 = {"k": k, "inertia": inertia_1}


# Create a DataFrame with the data to plot the Elbow curve
elbow_DF_1 = pd.DataFrame(elbow_Data_1)
elbow_DF_1

Creating Elbow Curve 2023-08-27 17:32:11.736754


Unnamed: 0,k,inertia
0,1,287.0
1,2,195.820218
2,3,123.190482
3,4,79.022435
4,5,63.858668
5,6,53.057788
6,7,44.406791
7,8,37.078233
8,9,32.832187
9,10,28.165433


In [125]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
print('Plotting Elbow Curve',datetime.now())
elbow_DF_1.hvplot.line(x="k",y="inertia",title= "Elbow Curve",xticks=k,width=1200,height=600)

Plotting Elbow Curve 2023-08-27 17:32:18.907535


#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** The best value for 'k' is **4**

---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [126]:
# Initialize the K-Means model using the best value for k
print('Initializing K-Means Model',datetime.now())
model = KMeans(n_clusters=4, random_state=1)
print('K-Means Model Initialized ',datetime.now())

Initializing K-Means Model 2023-08-27 17:32:23.246257
K-Means Model Initialized  2023-08-27 17:32:23.246338


In [127]:
# Fit the K-Means model using the scaled data
print('Fitting K-Means Model',datetime.now())
model.fit(crypto_Transformed)
print('K-Means Model Fitted ',datetime.now())

Fitting K-Means Model 2023-08-27 17:32:25.897469
K-Means Model Fitted  2023-08-27 17:32:25.907527


In [128]:
#
# Predict the clusters to group the cryptocurrencies using the scaled data
#
print('Predicting Clusters',datetime.now())
k4 = model.predict(crypto_Transformed)

#
# Print the resulting array of cluster values.
#

print(k4)
print('Clusters Predicted ',datetime.now())


Predicting Clusters 2023-08-27 17:32:27.783720
[2 2 0 0 2 2 2 2 2 0 0 0 0 2 0 2 0 0 2 0 0 2 0 0 0 0 0 0 2 0 0 0 3 2 0 0 1
 0 0 0 0]
Clusters Predicted  2023-08-27 17:32:27.785690


In [129]:
# Create a copy of the DataFrame
print('Creating copy of DataFrame',datetime.now())
clusters_Predicted = crypto_Transformed.copy()
print('Copy of DataFrame Created ',datetime.now())

Creating copy of DataFrame 2023-08-27 17:32:34.407124
Copy of DataFrame Created  2023-08-27 17:32:34.407342


In [130]:
# Add a new column to the DataFrame with the predicted clusters
print('Adding Predicted Clusters',datetime.now())
clusters_Predicted['predicted_cluster'] = k4

# Display sample data
print(clusters_Predicted.head())
print('Predicted Clusters Added',datetime.now())

Adding Predicted Clusters 2023-08-27 17:32:37.876367
              price_change_percentage_24h  price_change_percentage_7d  \
coin_id                                                                 
bitcoin                          0.508529                    0.493193   
ethereum                         0.185446                    0.934445   
tether                           0.021774                   -0.706337   
ripple                          -0.040764                   -0.810928   
bitcoin-cash                     1.193036                    2.000959   

              price_change_percentage_14d  price_change_percentage_30d  \
coin_id                                                                  
bitcoin                          0.772200                     0.235460   
ethereum                         0.558692                    -0.054341   
tether                          -0.021680                    -0.061030   
ripple                           0.249458                    -0.0

In [131]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.
print('Plotting Predicted Clusters',datetime.now())
clusters_Predicted.hvplot.scatter(x='price_change_percentage_24h',y='price_change_percentage_7d',
                                 by='predicted_cluster',hover_cols = 'coin_id',legend='top_right',width=1200,height=600)

Plotting Predicted Clusters 2023-08-27 17:32:42.069615


---

### Optimize Clusters with Principal Component Analysis.

In [132]:
# Create a PCA model instance and set `n_components=3`.
print('Creating PCA Model',datetime.now())
pca = PCA(n_components=3)
print('PCA Model Created ',datetime.now())  

Creating PCA Model 2023-08-27 17:32:46.054390
PCA Model Created  2023-08-27 17:32:46.054463


In [133]:
#
# Use the PCA model with `fit_transform` to reduce to three principal components.
#
print('Fitting PCA Model',datetime.now())   
clusters_PCA = pca.fit_transform(clustersPredicted)
print('PCA Model Fitted ',datetime.now())
#
# View the first five rows of the DataFrame. 
#
clusters_PCA[:5]

Fitting PCA Model 2023-08-27 17:32:50.354175
PCA Model Fitted  2023-08-27 17:32:50.355331


array([[ 0.44890795, -1.24537573, -0.8506404 ],
       [ 0.49536709, -0.899823  , -1.31755943],
       [-0.81884571,  0.07189909,  0.69501507],
       [-0.84035711,  0.08005407,  0.54435952],
       [ 0.81324004, -2.66952035, -1.64332113]])

In [134]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
print('Explained Variance',datetime.now())  
pca.explained_variance_ratio_

Explained Variance 2023-08-27 17:32:54.141265


array([0.34871677, 0.31363391, 0.22627118])

In [135]:
#calculate the Total Explained Variance by summing all 3 Explained Variance Ratios
print('Total Explained Variance',datetime.now())
sum(pca.explained_variance_ratio_)

Total Explained Variance 2023-08-27 17:32:56.449623


0.8886218549859446

#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** 0.34871677 + 0.31363391 + 0.22627118 = 0.88862186

**Answer may change depending on re-execution of the whole code**

In [136]:
# Create a new DataFrame with the PCA data.
print('Creating PCA DataFrame',datetime.now())  
cluster_PCA_df = pd.DataFrame(clusters_PCA,columns = ["PCA1", "PCA2", "PCA3"])
print(cluster_PCA_df)

# Copy the crypto names from the original data
cluster_PCA_df['coin_id'] = list(clusters_Predicted.index)
print(cluster_PCA_df)

# Set the coinid column as index
cluster_PCA_df = cluster_PCA_df.set_index('coin_id')

# Display sample data
print(cluster_PCA_df)
print('PCA DataFrame Created ',datetime.now())


Creating PCA DataFrame 2023-08-27 17:32:59.282607
        PCA1      PCA2      PCA3
0   0.448908 -1.245376 -0.850640
1   0.495367 -0.899823 -1.317559
2  -0.818846  0.071899  0.695015
3  -0.840357  0.080054  0.544360
4   0.813240 -2.669520 -1.643321
5   0.822871 -1.682096 -0.905203
6   0.913868 -1.128684 -2.730038
7   0.811102 -1.113378 -1.598655
8   0.258728 -0.946884 -1.103560
9  -1.003921 -0.119410  0.278564
10 -1.363289  1.245349  0.950712
11 -0.830643  0.077047  0.707661
12 -1.135458  0.171401  0.441569
13  2.084978 -2.688979 -0.776469
14 -0.633198 -0.016911  0.249426
15 -0.161908 -0.310904 -1.818521
16 -0.996987  1.292574  0.793701
17 -0.990512  0.449877  0.052002
18  0.878129 -1.093136 -1.774115
19 -0.874815  0.008933  0.694993
20 -1.135199  0.792113  0.499484
21  0.434801 -1.232506 -0.839794
22 -1.110348  0.540593  0.854934
23 -0.758629  0.094085  0.466658
24  0.222636  0.083977  1.584956
25 -0.826142  0.049070  0.700753
26 -1.059844  0.097044 -0.149781
27 -1.808829  1.520236  0.

---

### Find the Best Value for k Using the PCA Data

In [137]:
# Create a list with the number of k-values from 1 to 11
k = list(range(1,12))
print('k-values ',k)

k-values  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


In [138]:
# Create an empty list to store the inertia values
inertia_2 = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_pca`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1,n_init=10,max_iter=1000)
    k_model.fit(cluster_PCA_df)
    inertia_2.append(k_model.inertia_)
print('Values of Inertia 2 ', inertia_2)

Values of Inertia 2  [290.6226939879639, 203.5797257883004, 112.854846058156, 44.13060225321705, 33.217427255152344, 23.37167668829717, 17.17220012359737, 13.593410892098134, 11.155098209251477, 8.958913480156944, 7.345260405169598]


In [139]:
# Create a dictionary with the data to plot the Elbow curve
print('Creating Elbow Curve 2',datetime.now())  
elbow_Data_2 = {"k": k, "inertia": inertia_2}
# Create a DataFrame with the data to plot the Elbow curve
elbow_DF_2 = pd.DataFrame(elbow_Data_2)
print(elbow_DF_2)
print('Elbow Curve 2 Created ',datetime.now())

Creating Elbow Curve 2 2023-08-27 17:33:13.803813
     k     inertia
0    1  290.622694
1    2  203.579726
2    3  112.854846
3    4   44.130602
4    5   33.217427
5    6   23.371677
6    7   17.172200
7    8   13.593411
8    9   11.155098
9   10    8.958913
10  11    7.345260
Elbow Curve 2 Created  2023-08-27 17:33:13.804939


In [140]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
print('Plotting Elbow Curve 2',datetime.now())
elbow_DF_2.hvplot.line(x="k", title="Elbow Curve", xticks=k,width=1200,height=600)


Plotting Elbow Curve 2 2023-08-27 17:33:16.777651


#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** The best k-value is  `k=4` when using PCA data


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** No, it is the same `k` value as found using the original data

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [141]:
# Initialize the K-Means model using the best value for k
print('Initializing K-Means Model 2',datetime.now())    
model = KMeans(n_clusters=4, random_state=1)
print('K-Means Model 2 Initialized ',datetime.now())

Initializing K-Means Model 2 2023-08-27 17:33:20.441508
K-Means Model 2 Initialized  2023-08-27 17:33:20.441583


In [142]:
# Fit the K-Means model using the PCA data
print('Fitting K-Means Model 2',datetime.now())
model.fit(cluster_PCA_df)
print('K-Means Model 2 Fitted ',datetime.now())

Fitting K-Means Model 2 2023-08-27 17:33:22.240866
K-Means Model 2 Fitted  2023-08-27 17:33:22.250480


In [143]:
# Predict the clusters to group the cryptocurrencies using the PCA data
k4 = model.predict(cluster_PCA_df)
# Print the resulting array of cluster values.
k4

array([1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 3, 1, 0, 0, 2, 0, 0, 0, 0],
      dtype=int32)

In [144]:
# Create a copy of the DataFrame with the PCA data
print('Creating copy of DataFrame with PCA data',datetime.now())    
copy_Cluster_PCA_df= cluster_PCA_df.copy()

# Add a new column to the DataFrame with the predicted clusters
copy_Cluster_PCA_df['predicted_cluster'] = k4

# Display sample data
print(copy_Cluster_PCA_df)
print('Copy of DataFrame with PCA data Created ',datetime.now())

Creating copy of DataFrame with PCA data 2023-08-27 17:33:28.766337
                          PCA1      PCA2      PCA3  predicted_cluster
coin_id                                                              
bitcoin               0.448908 -1.245376 -0.850640                  1
ethereum              0.495367 -0.899823 -1.317559                  1
tether               -0.818846  0.071899  0.695015                  0
ripple               -0.840357  0.080054  0.544360                  0
bitcoin-cash          0.813240 -2.669520 -1.643321                  1
binancecoin           0.822871 -1.682096 -0.905203                  1
chainlink             0.913868 -1.128684 -2.730038                  1
cardano               0.811102 -1.113378 -1.598655                  1
litecoin              0.258728 -0.946884 -1.103560                  1
bitcoin-cash-sv      -1.003921 -0.119410  0.278564                  0
crypto-com-chain     -1.363289  1.245349  0.950712                  0
usd-coin             -

In [145]:
# Create a scatter plot using hvPlot by setting 
# `x="PCA1"` and `y="PCA2"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.
print('Plotting Predicted Clusters 2',datetime.now())   
copy_Cluster_PCA_df.hvplot.scatter(x="PCA1",y="PCA2",by ='predicted_cluster',hover_cols='coin_id',legend='top_right',width=1200,height=600)

Plotting Predicted Clusters 2 2023-08-27 17:33:32.682675


### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [146]:
# Composite plot to contrast the Elbow curves

elbow_DF_1.hvplot.line(x="k", y="inertia", title="Elbow Curve 1", xticks=k) + \
elbow_DF_2.hvplot.line(x="k", y="inertia", title="Elbow Curve 2", xticks=k)

In [None]:
# Composite plot to contrast the clusters
clustersPredicted.hvplot.scatter( x='price_change_percentage_24h',  
                                  y='price_change_percentage_7d', 
                                  by='predicted_cluster', hover_cols = 'coin_id') + \
copyClusterPCAdf.hvplot.scatter(x="PCA1", y="PCA2", by = 'predicted_cluster', hover_cols='coin_id')

#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** The impact of using PCA data resulted in tighter grouped clusters, with more entries within cluster 0 and cluster 1 than the original analysis did.