# Imports

In [194]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

---

# Global variables and constants

In [195]:
# Seed handed to K-Means so that repeated runs give the same clusters
random_state = 1

---

# Function Definitions

In [196]:
# Create a function to compute inertia values for K-Means clustering
def compute_inertia_values(k_values, data):
    """
    Compute inertia values for K-Means clustering.

    A fresh K-Means model is fitted for every entry of `k_values`, seeded with
    the notebook-level `random_state` so the results are reproducible.

    Inputs:
        k_values (list): A list of integers representing the number of clusters to evaluate.
        data (DataFrame): The data to be clustered.

    Output:
        list: A list of inertia values, one for each value of k, in the same order as `k_values`.
    """

    # Create an empty list to store the inertia values
    inertia = []

    # Fit one model per candidate number of clusters
    for k in k_values:

        # `n_init` is set explicitly: leaving it unset triggers a scikit-learn
        # FutureWarning (its default is changing), and 10 is the default that
        # was in effect when the warning was emitted, so results are unchanged.
        k_model = KMeans(n_clusters=k, n_init=10, random_state=random_state)

        # Fit the model to the data using `data`
        k_model.fit(data)

        # Record the inertia (within-cluster sum of squares) of the fitted model
        inertia.append(k_model.inertia_)

    # Return the list of inertia
    return inertia


In [197]:

# Create a DataFrame for plotting the Elbow curve
def elbow(k_values, inertia_values):
    """
    Build the table that backs an Elbow curve.

    Inputs:
        k_values (list): The cluster counts that were evaluated.
        inertia_values (list): The inertia obtained for each entry of `k_values`.

    Outputs:
        DataFrame: Two columns, "k" and "inertia", with one row per evaluated k.
    """
    # Pair each k with its inertia, column-wise, in a single construction step
    return pd.DataFrame({"k": k_values, "inertia": inertia_values})


In [198]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k
def plot_elbow_curve(df_elbow, x_col, y_col, title, xticks):
    """
    Draw the Elbow curve (inertia against k) as an hvPlot line chart.

    Inputs:
        df_elbow (DataFrame): The data behind the Elbow curve.
        x_col (string): Name of the column plotted on the x-axis.
        y_col (string): Name of the column plotted on the y-axis.
        title (string): Title shown on the plot.
        xticks (list): Values to use as x-axis ticks.

    Outputs:
        plot_curve: The line plot returned by `df_elbow.hvplot.line`.
    """
    # Gather the plot settings first, then hand them over in one call
    line_options = {"x": x_col, "y": y_col, "title": title, "xticks": xticks}
    return df_elbow.hvplot.line(**line_options)


In [199]:

# Cluster data using K-Means
def cluster_data_with_kmeans(model, data, cluster_col_name):
    """
    Fit a K-Means model and label every row of the data with its cluster.

    Inputs:
        model (object): The K-Means model to fit; it is fitted in place.
        data (DataFrame): The data to be clustered. It is not modified.
        cluster_col_name (string): Name of the column that receives the cluster labels.

    Outputs:
        DataFrame: A copy of `data` with the cluster labels added under `cluster_col_name`.
        labels: The cluster labels returned by `model.predict(data)`.
    """
    # Train on the data, then ask the fitted model to label those same rows
    model.fit(data)
    labels = model.predict(data)

    # Attach the labels to a copy so the caller's DataFrame stays untouched
    labelled = data.copy()
    labelled[cluster_col_name] = labels

    return labelled, labels

In [200]:
# Create a scatter plot using hvplot.scatter
def generate_scatter_plot(data, x_col, y_col, clusters, hover_cols, title):
    """
    Build an hvPlot scatter plot of the data, grouped by cluster.

    Inputs:
        data (DataFrame): The data to be plotted.
        x_col (string): Name of the column plotted on the x-axis.
        y_col (string): Name of the column plotted on the y-axis.
        clusters (string): Name of the column used to group the points (passed as `by`).
        hover_cols (list of strings): Columns shown in the hover tooltip.
        title (string): Title of the scatter plot.

    Outputs:
        scatter_plot: The scatter plot with the title applied through `.opts`.
    """
    # Build the grouped scatter first; the title is applied as a plot option afterwards
    base_plot = data.hvplot.scatter(x=x_col, y=y_col, by=clusters, hover_cols=hover_cols)
    return base_plot.opts(title=title)

---

# Main Code

In [201]:
# Load the crypto market CSV into a Pandas DataFrame, using the `coin_id` column as the index
df_market_data = pd.read_csv(
    "Resources/crypto_market_data.csv",
    index_col="coin_id")

# Display the first 10 rows as a sample
df_market_data.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [202]:
# Generate summary statistics (count, mean, std, min, quartiles, max) for every column
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [203]:
# Plot every column as a line, one point per coin, to get a first look at the DataFrame
df_market_data.hvplot.line(
    width=800,
    height=400,
    rot=90
)

---

### Prepare the Data

In [204]:
# Use the `StandardScaler` class from scikit-learn to standardize the data from the CSV file
# (the scaler is fitted and applied in a single `fit_transform` call)
market_data_scaled = StandardScaler().fit_transform(df_market_data)

# Display the first five rows
market_data_scaled[:5]

array([[ 0.50852937,  0.49319307,  0.77220043,  0.23545963, -0.0674951 ,
        -0.35595348, -0.25163688],
       [ 0.18544589,  0.93444504,  0.55869212, -0.05434093, -0.27348273,
        -0.11575947, -0.19935211],
       [ 0.02177396, -0.70633685, -0.02168042, -0.06103015,  0.00800452,
        -0.55024692, -0.28206051],
       [-0.04076438, -0.81092807,  0.24945797, -0.05038797, -0.37316402,
        -0.45825882, -0.29554614],
       [ 1.19303608,  2.00095907,  1.76061001,  0.54584206, -0.29120287,
        -0.49984776, -0.27031695]])

In [205]:
# Take the column labels and the coin_id index from the original data: df_market_data
column_names = df_market_data.columns.tolist()
crypto_names = df_market_data.index

# Wrap the scaled array in a DataFrame that carries those labels
df_market_data_scaled = pd.DataFrame(
    market_data_scaled,
    index=crypto_names,
    columns=column_names,
)

# Display sample data
df_market_data_scaled.head(5)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


---

### Find the Best Value for k Using the Original Data.

In [206]:
# Create a list with the k-values from 1 to 10 (`range(1, 11)` stops before 11)
k_values_origin_data = list(range(1, 11))

# Compute the inertia for each k on the scaled original data
inertia_values_origin_data = compute_inertia_values(k_values_origin_data, df_market_data_scaled)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [207]:
# Create a DataFrame (columns `k` and `inertia`) for plotting the Elbow curve of the original data
df_elbow_origin_data=elbow(k_values_origin_data, inertia_values_origin_data)

# Display the DataFrame
df_elbow_origin_data

Unnamed: 0,k,inertia
0,1,287.0
1,2,195.820218
2,3,123.190482
3,4,79.022435
4,5,63.858668
5,6,53.057788
6,7,44.406791
7,8,37.078233
8,9,32.832187
9,10,28.165433


In [208]:
# Create the Elbow Curve Plot for the original data, with one x-axis tick per evaluated k
elbow_plot_origin_data = plot_elbow_curve(df_elbow_origin_data, x_col="k", y_col="inertia", title="Elbow Curve Original Data", xticks=k_values_origin_data)

# Display the Elbow Curve Plot
elbow_plot_origin_data

#### Answer the following question: 

**Question:** What is the best value for `k`? 

**Answer:** There are two candidate elbows, the first at `k=3` and the second at `k=4`. The best value for `k` is 4, because that is where the inertia really starts to level off, and using 4 clusters results in a more specific categorization.

---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [209]:
# Initialize the K-Means model using the best value for k.
# The shared `random_state` seed is reused instead of repeating the literal, and
# `n_init` is set explicitly so scikit-learn does not warn about its changing default
# (10 is the default that was in effect, so the clustering is unchanged).
model_origin_data = KMeans(n_clusters=4, n_init=10, random_state=random_state)

In [210]:
# Cluster the scaled original data with the K-Means model:
#   df_market_origin_data_predict  -> df_market_data_scaled plus a 'clusters' column
#   kmeans_predictions_origin_data -> the predicted cluster labels
(
    df_market_origin_data_predict,
    kmeans_predictions_origin_data,
) = cluster_data_with_kmeans(model_origin_data, df_market_data_scaled, "clusters")


  super()._check_params_vs_input(X, default_n_init=10)


In [211]:
# Display the resulting array of cluster labels (one entry per coin)
kmeans_predictions_origin_data

array([2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 2, 0, 0, 1, 0, 0, 0, 0])

In [212]:
# Display the first five rows of the scaled data together with their cluster labels
df_market_origin_data_predict.head(5)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y,clusters
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637,2
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352,2
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061,0
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546,0
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317,2


In [213]:
# Create the scatter plot using hvPlot: 24h price change against 7d price change,
# coloured by the K-Means cluster label, with the crypto name in the hover tooltip
scatter_plot_origin_data = generate_scatter_plot(
    data=df_market_origin_data_predict,
    x_col="price_change_percentage_24h",
    y_col="price_change_percentage_7d",
    clusters="clusters",
    hover_cols=["coin_id"],
    title="Scatter Plot with K-Means Clustering (Original Data)",
)

# Display the plot
scatter_plot_origin_data

---

### Optimize Clusters with Principal Component Analysis.

In [214]:
# Create a PCA model instance that keeps three principal components (`n_components=3`)
pca = PCA(n_components=3)

In [215]:
# Fit the PCA model on the scaled data and project it onto the
# three principal components in a single `fit_transform` call
market_data_pca= pca.fit_transform(df_market_data_scaled)

# Review the first five rows
market_data_pca[:5]



array([[-0.60066733,  0.84276006,  0.46159457],
       [-0.45826071,  0.45846566,  0.95287678],
       [-0.43306981, -0.16812638, -0.64175193],
       [-0.47183495, -0.22266008, -0.47905316],
       [-1.15779997,  2.04120919,  1.85971527]])

In [216]:
# Retrieve the explained variance ratio to determine how much of the
# data's variance can be attributed to each principal component.
ex_var=pca.explained_variance_ratio_

# Display the array (one ratio per principal component)
ex_var

array([0.3719856 , 0.34700813, 0.17603793])

In [217]:
# Calculate the total explained variance of the retained principal components.
# Summing the whole array, rather than indexing three entries by hand, stays
# correct if `n_components` is ever changed.
total_ex_var = ex_var.sum()

# Display the total
total_ex_var

0.8950316570309841

#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** The total explained variance of the three principal components is approximately 89.50%. This means that these three principal components summarize a significant portion of the data's characteristics.

In [218]:
# Names for the three principal-component columns
new_column_names = ["PCA1", "PCA2", "PCA3"]

# Build the PCA DataFrame in one step, indexed by the crypto names
# taken from the original data (index named "coin_id")
df_market_data_pca = pd.DataFrame(
    market_data_pca,
    columns=new_column_names,
    index=df_market_data.index.rename("coin_id"),
)

# Display sample data
df_market_data_pca.head(5)

Unnamed: 0_level_0,PCA1,PCA2,PCA3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


---

### Find the Best Value for k Using the PCA Data

In [219]:
# Create a list with the k-values from 1 to 10 (`range(1, 11)` stops before 11)
k_values_pca_data = list(range(1, 11))

# Compute the inertia for each k on the PCA data
inertia_values_pca_data = compute_inertia_values(k_values_pca_data, df_market_data_pca)

  super()._check_params_vs_input(X, default_n_init=10)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [220]:
# Create a DataFrame (columns `k` and `inertia`) for plotting the Elbow curve of the PCA data
df_elbow_pca_data=elbow(k_values_pca_data, inertia_values_pca_data)

# Display the DataFrame
df_elbow_pca_data

Unnamed: 0,k,inertia
0,1,256.874086
1,2,165.901994
2,3,93.774626
3,4,49.665497
4,5,38.072792
5,6,27.720626
6,7,21.145312
7,8,17.208546
8,9,13.719376
9,10,10.559358


In [221]:
# Create the Elbow Curve Plot for the PCA data.
# The ticks come from the PCA k-values (not the original-data list), so the plot
# stays correct if the two ranges ever differ.
elbow_plot_pca_data = plot_elbow_curve(
    df_elbow_pca_data,
    x_col="k",
    y_col="inertia",
    title="Elbow Curve PCA Data",
    xticks=k_values_pca_data,
)

# Display the Elbow Curve Plot
elbow_plot_pca_data

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** 4


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** No, it is the same as the best k value found using the original data, and for the same reason: 4 clusters result in a more specific categorization.

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [222]:
# Initialize the K-Means model using the best value for k.
# The shared `random_state` seed is reused instead of repeating the literal, and
# `n_init` is set explicitly so scikit-learn does not warn about its changing default
# (10 is the default that was in effect, so the clustering is unchanged).
model_pca_data = KMeans(n_clusters=4, n_init=10, random_state=random_state)

In [223]:
# Cluster the PCA data with the K-Means model:
#   df_market_pca_data_predict  -> df_market_data_pca plus a 'clusters' column
#   kmeans_predictions_pca_data -> the predicted cluster labels
(
    df_market_pca_data_predict,
    kmeans_predictions_pca_data,
) = cluster_data_with_kmeans(model_pca_data, df_market_data_pca, "clusters")

  super()._check_params_vs_input(X, default_n_init=10)


In [224]:
# Display the resulting array of cluster labels for the PCA data (one entry per coin)
kmeans_predictions_pca_data 

array([2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 2, 0, 0, 1, 0, 0, 0, 0])

In [225]:
# Display the first five rows of the PCA data together with their cluster labels
df_market_pca_data_predict.head(5)

Unnamed: 0_level_0,PCA1,PCA2,PCA3,clusters
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,-0.600667,0.84276,0.461595,2
ethereum,-0.458261,0.458466,0.952877,2
tether,-0.43307,-0.168126,-0.641752,0
ripple,-0.471835,-0.22266,-0.479053,0
bitcoin-cash,-1.1578,2.041209,1.859715,2


In [226]:
# Create the scatter plot using hvPlot: PCA1 against PCA2, coloured by the
# K-Means cluster label, with the crypto name in the hover tooltip
scatter_plot_pca_data = generate_scatter_plot(
    data=df_market_pca_data_predict,
    x_col="PCA1",
    y_col="PCA2",
    clusters="clusters",
    hover_cols=["coin_id"],
    title="Scatter Plot with K-Means Clustering (PCA Data)",
)

# Display the plot
scatter_plot_pca_data

### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [227]:
# Composite plot: combine the original-data and PCA-data Elbow curves with `+` to contrast them
elbow_plot= elbow_plot_origin_data+elbow_plot_pca_data

# Display the plot
elbow_plot

In [228]:
# Composite plot: combine the original-data and PCA-data cluster scatter plots with `+` to contrast them
scatter_plot=scatter_plot_origin_data+scatter_plot_pca_data

# Display the plot
scatter_plot

#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** PCA identifies and emphasizes the dimensions that capture the most variance in the data. By reducing the dimensionality, the data points become concentrated along these dimensions, making the main clusters more compact and closer to each other in the PCA-transformed space. Smaller clusters with limited data points may exhibit more variance and appear further apart.