In [124]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px

In [44]:
# Load the data into a Pandas DataFrame
df_market_data = pd.read_csv(
    "Resources/crypto_market_data.csv",
    index_col="coin_id")

# Display sample data
df_market_data.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [45]:
# Generate summary statistics
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,39.0,39.0,39.0,39.0,39.0,39.0,39.0
mean,-0.00109,4.604278,-0.174902,-1.216084,-5.873395,150.767061,112.630111
std,1.655446,6.509199,7.475493,13.647696,31.885268,198.144148,166.32529
min,-4.56089,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60591,-0.035725,-4.40188,-9.776995,-26.066295,19.860765,0.272725
50%,-0.06341,3.29641,0.10974,-0.04237,-12.8889,83.5184,59.23821
75%,0.552555,7.87839,5.159615,3.82669,0.265955,186.38602,149.027735
max,4.84033,20.69459,18.41097,38.95974,82.86094,882.65105,701.37599


In [46]:
# Plot your data to see what's in your DataFrame
df_market_data.hvplot.scatter(
    width=800,
    height=400,
    rot=90
)

In [47]:
df_market_data.columns

Index(['price_change_percentage_24h', 'price_change_percentage_7d',
       'price_change_percentage_14d', 'price_change_percentage_30d',
       'price_change_percentage_60d', 'price_change_percentage_200d',
       'price_change_percentage_1y'],
      dtype='object')

---

### Prepare the Data

In [48]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
df_market_data_scaled = StandardScaler().fit_transform(df_market_data[['price_change_percentage_24h', 'price_change_percentage_7d',
       'price_change_percentage_14d', 'price_change_percentage_30d',
       'price_change_percentage_60d', 'price_change_percentage_200d',
       'price_change_percentage_1y']])

In [49]:
# Create a DataFrame with the scaled data

df_market_scaled=pd.DataFrame(df_market_data_scaled, 
                             columns=['price_change_percentage_24h', 'price_change_percentage_7d','price_change_percentage_14d', 'price_change_percentage_30d',
                                       'price_change_percentage_60d', 'price_change_percentage_200d','price_change_percentage_1y'])

# Copy the crypto names from the original data
df_market_scaled['Crypto_names']=df_market_data.index
# Set the coinid column as index
df_market_scaled=df_market_scaled.set_index('Crypto_names')

# Display sample data
df_market_scaled.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
Crypto_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.663962,0.466678,0.914753,0.659808,0.083293,-0.343829,-0.457504
ethereum,0.137698,0.899125,0.675345,0.100046,-0.2229,0.184097,-0.064989
tether,-0.128904,-0.708915,0.02457,0.087125,0.19552,-0.77087,-0.685901
ripple,-0.230771,-0.81142,0.328599,0.107681,-0.371072,-0.568688,-0.787141
bitcoin-cash,1.778941,1.944357,2.023063,1.259325,-0.24924,-0.660097,-0.597739
binancecoin,1.288379,1.284135,0.946165,0.094624,1.341059,0.024809,-0.261532
chainlink,-0.145806,2.504248,1.284163,-0.742406,-1.201691,1.29079,1.294329
cardano,0.002638,1.461236,0.776479,0.840407,-0.539317,0.581567,0.264756
litecoin,-0.038138,0.310952,1.011544,0.18058,-0.361132,-0.630245,-0.763156
bitcoin-cash-sv,0.566917,-0.203553,-0.229252,0.304741,-0.603706,-0.732877,-0.115114


---

### Find the Best Value for k Using the Original Data.

In [82]:
# Create a list with the number of k-values from 1 to 11
k=list(range(1,22))

In [83]:
# Create an empty list to store the inertia values

inertia=[]
# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(df_market_scaled)
    inertia.append(k_model.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [84]:
# Create a dictionary with the data to plot the Elbow curve
elbow = {"k": k, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow)

df_elbow

Unnamed: 0,k,inertia
0,1,273.0
1,2,205.07381
2,3,155.062331
3,4,123.199863
4,5,105.800219
5,6,90.424816
6,7,78.912122
7,8,67.658597
8,9,58.7786
9,10,47.470286


In [85]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_plot = df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

elbow_plot

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** 4

---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [86]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters=4, random_state=0)

In [87]:
# Fit the K-Means model using the scaled data
model.fit(df_market_scaled)

KMeans(n_clusters=4, random_state=0)

In [88]:
# Predict the clusters to group the cryptocurrencies using the scaled data
kmeans_predictions = model.predict(df_market_scaled)

# Print the resulting array of cluster values.
kmeans_predictions



array([2, 2, 0, 0, 2, 2, 2, 2, 0, 0, 3, 0, 0, 2, 0, 0, 0, 0, 2, 0, 3, 2,
       0, 0, 3, 0, 0, 3, 2, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3])

In [89]:
# Create a copy of the DataFrame
df_market_kmeans=df_market_scaled.copy()

In [90]:
# Add a new column to the DataFrame with the predicted clusters
df_market_kmeans['Prediction']=kmeans_predictions

# Display sample data
df_market_kmeans.head()




Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y,Prediction
Crypto_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bitcoin,0.663962,0.466678,0.914753,0.659808,0.083293,-0.343829,-0.457504,2
ethereum,0.137698,0.899125,0.675345,0.100046,-0.2229,0.184097,-0.064989,2
tether,-0.128904,-0.708915,0.02457,0.087125,0.19552,-0.77087,-0.685901,0
ripple,-0.230771,-0.81142,0.328599,0.107681,-0.371072,-0.568688,-0.787141,0
bitcoin-cash,1.778941,1.944357,2.023063,1.259325,-0.24924,-0.660097,-0.597739,2


In [92]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.
market_scaled_plot=df_market_kmeans.hvplot.scatter(
                    x="price_change_percentage_24h",
                    y="price_change_percentage_7d",
                    by="Prediction",
                    hover_cols=['Crypto_names']
                )

market_scaled_plot

---

### Optimize Clusters with Principal Component Analysis.

In [93]:
# Create a PCA model instance and set `n_components=3`.
pca = PCA(n_components=3)

In [94]:
# Use the PCA model with `fit_transform` to reduce to 
# three principal components.
df_market_pca = pca.fit_transform(df_market_data)
# View the first five rows of the DataFrame. 
df_market_pca[:5]

array([[ -99.48475664,   16.68097962,    6.58227536],
       [  20.62731616,   30.20365743,  -11.2411963 ],
       [-187.50869014,   -7.24171581,   15.4059625 ],
       [-168.37462328,   28.29048152,   -7.36190735],
       [-162.3107328 ,   -4.62491388,    3.09933669]])

In [95]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
pca.explained_variance_ratio_

array([0.93779743, 0.04562332, 0.01307869])

#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** About 99.98% of the total variance is condensed into the 2 PCA variables

In [96]:
# Create a new DataFrame with the PCA data.
pca_df = pd.DataFrame(
    df_market_pca,
    columns=["PCA1", "PCA2" , "PCA3"]
)
# Creating a DataFrame with the PCA data
pca_df["Crypto_names"]=df_market_data.index
# Copy the crypto names from the original data
pca_df=pca_df.set_index('Crypto_names')


# Display sample data
pca_df.head(10)




Unnamed: 0_level_0,PCA1,PCA2,PCA3
Crypto_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-99.484757,16.68098,6.582275
ethereum,20.627316,30.203657,-11.241196
tether,-187.50869,-7.241716,15.405962
ripple,-168.374623,28.290482,-7.361907
bitcoin-cash,-162.310733,-4.624914,3.099337
binancecoin,-21.790273,41.084118,38.202911
chainlink,327.847989,-9.710365,-51.956007
cardano,114.56329,37.03622,-24.472094
litecoin,-175.195155,18.036108,-5.201941
bitcoin-cash-sv,-123.300467,-77.049932,-3.393851


---

### Find the Best Value for k Using the PCA Data

In [99]:
# Create a list with the number of k-values from 1 to 11
k_pca=list(range(1,22))

In [100]:
# Create an empy list to store the inertia values

inertia=[]
# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_pca`
# 3. Append the model.inertia_ to the inertia list

for i in k_pca:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(pca_df)
    inertia.append(k_model.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [102]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {'kpca': k_pca , 'inertia': inertia}
# Create a DataFrame with the data to plot the Elbow curve
df_elbow_pca = pd.DataFrame(elbow_data)
df_elbow_pca.head()

Unnamed: 0,kpca,inertia
0,1,2583631.0
1,2,826158.3
2,3,428667.6
3,4,261117.5
4,5,191665.5


In [103]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_pca_plot = df_elbow_pca.hvplot.line(
    x="kpca", 
    y="inertia", 
    title="Elbow Curve*", 
    xticks=k
)
elbow_pca_plot

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** 2


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:**  yes

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [108]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters=5, random_state=0)

In [109]:
# Fit the K-Means model using the PCA data
model.fit(pca_df)

KMeans(n_clusters=5, random_state=0)

In [110]:
# Predict the clusters to group the cryptocurrencies using the PCA data
k_predict = model.predict(pca_df)

# Print the resulting array of cluster values.
k_predict



array([0, 3, 0, 0, 0, 3, 2, 4, 0, 0, 4, 0, 0, 3, 3, 3, 3, 0, 3, 0, 3, 0,
       0, 0, 4, 0, 0, 4, 3, 1, 0, 0, 0, 1, 2, 0, 3, 0, 2])

In [111]:
# Create a copy of the DataFrame with the PCA data
pca_prediction_df = pca_df.copy()
# Add a new column to the DataFrame with the predicted clusters
pca_prediction_df['pca_prediction']= k_predict

# Display sample data
pca_prediction_df.head()

Unnamed: 0_level_0,PCA1,PCA2,PCA3,pca_prediction
Crypto_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,-99.484757,16.68098,6.582275,0
ethereum,20.627316,30.203657,-11.241196,3
tether,-187.50869,-7.241716,15.405962,0
ripple,-168.374623,28.290482,-7.361907,0
bitcoin-cash,-162.310733,-4.624914,3.099337,0


In [112]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.
market_pca_plot=pca_prediction_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="pca_prediction",
    hover_cols=['Crypto_names']
)
market_pca_plot

In [115]:

two_clusters= pd.concat([pca_prediction_df,df_market_kmeans], axis=1)
two_clusters.head(10)

all_data=pd.concat([two_clusters,df_market_data], axis=1)
all_data.head(10)

all_data.to_csv('Resources/two_predictions.csv')



In [116]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.
cluster_originaldata=two_clusters.hvplot.scatter(
                    x="price_change_percentage_24h",
                    y="price_change_percentage_60d",
                    by="pca_prediction",
                    hover_cols=['Crypto_names']
                )

market_scaled_plot

### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [127]:

# Choose a new color scale from the Plotly Express library
color_scale = px.colors.qualitative.Set1

# Create scatter plot with custom color scale
market_pca_prediction_df = pca_prediction_df.reset_index()
fig = px.scatter_3d(market_pca_prediction_df, x='PCA1', y='PCA2', z='PCA3',
                    color='pca_prediction', color_discrete_sequence=color_scale,
                    hover_data=['Crypto_names'])

fig.show()

In [81]:
# Composite plot to contrast the clusters

elbow_plot + elbow_pca_plot


#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:**  I particularly did not like the two set grouping.  I change parameters to have more clusters in the Kmeans and I think it works best that way.

PD: the data had some outlayers that were affecting the clusters significantly. after cleaning the data I omited those two data points.