# Clustering Crypto

In [2]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [11]:
# Load the crypto_data.csv dataset.

file_path = "/Users/lukehg/Desktop/DA_class/week_18/cryptocurrencies/crypto_data.csv"
crypto_df = pd.read_csv(file_path)
crypto_df.head()
# YOUR CODE HERE

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [20]:
# Keep all the cryptocurrencies that are being traded.

crypto_df.drop(crypto_df.loc[crypto_df['IsTrading']==False].index, inplace=True)
crypto_df['IsTrading']
# YOUR CODE HERE

0       True
1       True
2       True
3       True
4       True
        ... 
1243    True
1244    True
1245    True
1246    True
1247    True
Name: IsTrading, Length: 1144, dtype: bool

In [19]:
# Keep all the cryptocurrencies that have a working algorithm.

crypto_df.drop(crypto_df.loc[crypto_df['Algorithm']==""].index, inplace=True)
crypto_df['Algorithm']
# YOUR CODE HERE

0            Scrypt
1               X11
2            Scrypt
3           SHA-256
4           SHA-256
           ...     
1243         Ethash
1244        SHA-256
1245    CryptoNight
1246       Equihash
1247         Scrypt
Name: Algorithm, Length: 1144, dtype: object

In [17]:
# Remove the "IsTrading" column. 

crypto2_df = crypto_df.drop(columns=['IsTrading'])
crypto2_df
# YOUR CODE HERE

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
1243,SERO,Super Zero,Ethash,PoW,,1000000000
1244,UOS,UOS,SHA-256,DPoI,,1000000000
1245,BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [18]:
# Remove rows that have at least 1 null value.
crypto2_df = crypto2_df.dropna()

crypto2_df
# YOUR CODE HERE

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
4,808,808,SHA-256,PoW/PoS,0.000000e+00,0
5,1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...,...
1238,ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [24]:
# Keep the rows where coins are mined.
crypto2_df.drop(crypto_df.loc[crypto_df['TotalCoinsMined'] == 0].index, inplace=True)

crypto2_df['TotalCoinsMined']
# YOUR CODE HERE

0       4.199995e+01
2       1.055185e+09
5       2.927942e+10
7       1.792718e+07
8       1.076842e+08
            ...     
1238    2.000000e+09
1242    1.493105e+07
1245    9.802226e+08
1246    7.296538e+06
1247    1.283270e+05
Name: TotalCoinsMined, Length: 533, dtype: float64

In [27]:
# Create a new DataFrame that holds only the cryptocurrencies names.

crypto_names_df = pd.DataFrame(crypto2_df, index=crypto2_df.index, columns=['CoinName'])
crypto_names_df
# YOUR CODE HERE

Unnamed: 0,CoinName
0,42 Coin
2,404Coin
5,EliteCoin
7,Bitcoin
8,Ethereum
...,...
1238,ZEPHYR
1242,Gapcoin
1245,Beldex
1246,Horizen


In [34]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.

crypto2_df = crypto2_df.drop(columns=['CoinName', 'Unnamed: 0'])

crypto2_df

# YOUR CODE HERE

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,42
2,Scrypt,PoW/PoS,1.055185e+09,532000000
5,X13,PoW/PoS,2.927942e+10,314159265359
7,SHA-256,PoW,1.792718e+07,21000000
8,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000
1242,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,CryptoNight,PoW,9.802226e+08,1400222610
1246,Equihash,PoW,7.296538e+06,21000000


In [35]:
# Use get_dummies() to create variables for text features.
x = pd.get_dummies(crypto2_df, columns=['Algorithm', 'ProofType'], dtype=float)

x

# YOUR CODE HERE

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,4.199995e+01,42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.055185e+09,532000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2.927942e+10,314159265359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.792718e+07,21000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.076842e+08,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,2.000000e+09,2000000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1242,1.493105e+07,250000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1245,9.802226e+08,1400222610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1246,7.296538e+06,21000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Standardize the data with StandardScaler().

X_scaled = StandardScaler().fit_transform(x)

X_scaled
# YOUR CODE HERE

array([[-0.11674788, -0.15286468, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.09358885, -0.14499604, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [ 0.52587231,  4.4937636 , -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       ...,
       [-0.09523411, -0.13215444, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11658774, -0.15255408, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11674507, -0.15284989, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ]])

### Deliverable 2: Reducing Data Dimensions Using PCA

In [45]:
# Using PCA to reduce dimension to three principal components.

pca = PCA(n_components=3)
crypto_pca = pca.fit_transform(X_scaled)

# YOUR CODE HERE

In [46]:
# Create a DataFrame with the three principal components.

df_pca_crypto = pd.DataFrame(crypto_pca, columns=['principal component 1', 'principal component 2', 'principal component 3'])

# YOUR CODE HERE

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-0.33371,0.977952,-0.541601
1,-0.317024,0.977992,-0.542051
2,2.323364,1.519185,-0.715651
3,-0.142111,-1.296245,0.124576
4,-0.155348,-1.982557,0.334265


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [50]:
# Create an elbow curve to find the best value for K.
# YOUR CODE HERE
inertia = []
k = list(range(1, 11))

for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_pca_crypto)
    inertia.append(km.inertia_)
    
# Create the elbow curve

elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

Running K-Means with `k=4`

In [60]:
# Initialize the K-Means model.
# YOUR CODE HERE
model = KMeans(n_clusters=3, random_state=3)

# Fit the model
# YOUR CODE HERE
model.fit(df_pca_crypto)

# Predict clusters
# YOUR CODE HERE

predictions = model.predict(df_pca_crypto)
df_pca_crypto["class"] = model.labels_

df_pca_crypto

Unnamed: 0,principal component 1,principal component 2,principal component 3,class
0,-0.333710,0.977952,-0.541601,0
1,-0.317024,0.977992,-0.542051,0
2,2.323364,1.519185,-0.715651,0
3,-0.142111,-1.296245,0.124576,0
4,-0.155348,-1.982557,0.334265,0
...,...,...,...,...
528,2.472486,0.960090,-0.295325,0
529,-0.331752,0.977825,-0.541621,0
530,0.325138,-2.253248,0.382551,0
531,-0.156523,-2.000693,0.615064,0


In [78]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
# YOUR CODE HERE
clustered_df = pd.DataFrame(crypto2_df, columns=["Algorithm", "ProofType", "TotalCoinsMined", "TotalCoinSupply"])
clustered_df['PC1'] = df_pca_crypto['principal component 1'].values
clustered_df['PC2'] = df_pca_crypto['principal component 2'].values
clustered_df['PC3'] = df_pca_crypto['principal component 3'].values

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
# YOUR CODE HERE
clustered_df['CoinName'] = crypto_names_df['CoinName']

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# YOUR CODE HERE
clustered_df["Class"] = model.labels_
clustered_df
# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df

(533, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,Class
0,Scrypt,PoW/PoS,4.199995e+01,42,-0.333710,0.977952,-0.541601,42 Coin,0
2,Scrypt,PoW/PoS,1.055185e+09,532000000,-0.317024,0.977992,-0.542051,404Coin,0
5,X13,PoW/PoS,2.927942e+10,314159265359,2.323364,1.519185,-0.715651,EliteCoin,0
7,SHA-256,PoW,1.792718e+07,21000000,-0.142111,-1.296245,0.124576,Bitcoin,0
8,Ethash,PoW,1.076842e+08,0,-0.155348,-1.982557,0.334265,Ethereum,0
...,...,...,...,...,...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000,2.472486,0.960090,-0.295325,ZEPHYR,0
1242,Scrypt,PoW/PoS,1.493105e+07,250000000,-0.331752,0.977825,-0.541621,Gapcoin,0
1245,CryptoNight,PoW,9.802226e+08,1400222610,0.325138,-2.253248,0.382551,Beldex,0
1246,Equihash,PoW,7.296538e+06,21000000,-0.156523,-2.000693,0.615064,Horizen,0


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [79]:
# Creating a 3D-Scatter with the PCA data and the clusters
# YOUR CODE HERE
fig= px.scatter_3d(clustered_df, x='PC1', y='PC2', z='PC3', color='Class', symbol='Class', size='Class', width=800)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [82]:
# Create a table with tradable cryptocurrencies.
# YOUR CODE HERE
clustered_df


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,Class
0,Scrypt,PoW/PoS,4.199995e+01,42,-0.333710,0.977952,-0.541601,42 Coin,0
2,Scrypt,PoW/PoS,1.055185e+09,532000000,-0.317024,0.977992,-0.542051,404Coin,0
5,X13,PoW/PoS,2.927942e+10,314159265359,2.323364,1.519185,-0.715651,EliteCoin,0
7,SHA-256,PoW,1.792718e+07,21000000,-0.142111,-1.296245,0.124576,Bitcoin,0
8,Ethash,PoW,1.076842e+08,0,-0.155348,-1.982557,0.334265,Ethereum,0
...,...,...,...,...,...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000,2.472486,0.960090,-0.295325,ZEPHYR,0
1242,Scrypt,PoW/PoS,1.493105e+07,250000000,-0.331752,0.977825,-0.541621,Gapcoin,0
1245,CryptoNight,PoW,9.802226e+08,1400222610,0.325138,-2.253248,0.382551,Beldex,0
1246,Equihash,PoW,7.296538e+06,21000000,-0.156523,-2.000693,0.615064,Horizen,0


In [19]:
# Print the total number of tradable cryptocurrencies.
# YOUR CODE HERE

There are 532 tradable cryptocurrencies.


In [20]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
# YOUR CODE HERE

array([[4.20000000e-11, 0.00000000e+00],
       [5.32000000e-04, 1.06585544e-03],
       [3.14159265e-01, 2.95755135e-02],
       ...,
       [1.40022261e-03, 9.90135079e-04],
       [2.10000000e-05, 7.37028150e-06],
       [1.00000000e-06, 1.29582282e-07]])

In [21]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
# YOUR CODE HERE

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
# YOUR CODE HERE

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
# YOUR CODE HERE

plot_df.head(10)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
42,4.2e-11,0.0,42 Coin,0
404,0.000532,0.001066,404Coin,0
1337,0.3141593,0.029576,EliteCoin,0
BTC,2.1e-05,1.8e-05,Bitcoin,1
ETH,0.0,0.000109,Ethereum,1
LTC,8.4e-05,6.4e-05,Litecoin,1
DASH,2.2e-05,9e-06,Dash,0
XMR,0.0,1.7e-05,Monero,1
ETC,0.00021,0.000115,Ethereum Classic,1
ZEC,2.1e-05,7e-06,ZCash,1


In [22]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
# YOUR CODE HERE
