# Clustering Crypto

In [237]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [238]:
# Load crypto_data.csv into the crypto_df DataFrame, using the first CSV column as the index.
file_path = Path("./crypto_data.csv")
crypto_df = pd.read_csv(filepath_or_buffer=file_path, index_col=0)
crypto_df.head(n=10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [239]:
# Inspect the column dtypes. In the recorded output TotalCoinSupply is loaded as object
# (strings) rather than a numeric dtype, while TotalCoinsMined is float64.
crypto_df.dtypes

CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [240]:
# Retain only the coins that are currently traded (IsTrading is a boolean column).
is_trading = crypto_df["IsTrading"]
df_crypto = crypto_df.loc[is_trading]
df_crypto.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [241]:
# Discard coins whose Algorithm value is missing.
has_algorithm = df_crypto["Algorithm"].notna()
df_crypto = df_crypto[has_algorithm]
df_crypto.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [242]:
# Remove the "IsTrading" column.
# drop() returns a new frame instead of mutating the filtered slice in place (which
# triggers SettingWithCopyWarning); errors="ignore" keeps the cell safe to re-run once
# the column is already gone.
df_crypto = df_crypto.drop(columns=["IsTrading"], errors="ignore")
df_crypto.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [243]:
# Discard every row that contains one or more null values.
df_crypto = df_crypto.dropna(axis="index", how="any")
df_crypto.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [244]:
# Retain only the coins with a strictly positive mined total.
mined_mask = df_crypto["TotalCoinsMined"].gt(0)
df_crypto = df_crypto[mined_mask]
df_crypto.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0


In [245]:
# Set the coin names aside in their own single-column DataFrame (same index as df_crypto).
crypto_names = df_crypto.loc[:, ["CoinName"]]
crypto_names.head()

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [246]:
# CoinName is not used as a clustering feature, so remove it from the feature frame.
df_crypto = df_crypto.drop(columns=["CoinName"])
df_crypto.head(20)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0
LTC,Scrypt,PoW,63039240.0,84000000
DASH,X11,PoW/PoS,9031294.0,22000000
XMR,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethash,PoW,113359700.0,210000000
ZEC,Equihash,PoW,7383056.0,21000000


In [247]:
# Reorder the rows by Algorithm, then list the distinct algorithm names.
df_crypto = df_crypto.sort_values("Algorithm")
df_crypto.loc[:, "Algorithm"].unique()

array(['1GB AES Pattern Search', '536', 'Argon2d', 'BLAKE256', 'Blake',
       'Blake2S', 'Blake2b', 'C11', 'Cloverhash', 'Counterparty',
       'CryptoNight', 'CryptoNight Heavy', 'CryptoNight-V7',
       'Cryptonight-GPU', 'DPoS', 'Dagger', 'Dagger-Hashimoto',
       'ECC 256K1', 'Equihash', 'Equihash+Scrypt', 'Ethash', 'Exosis',
       'Green Protocol', 'Groestl', 'HMQ1725', 'HybridScryptHash256',
       'IMesh', 'Jump Consistent Hash', 'Keccak', 'Leased POS', 'Lyra2RE',
       'Lyra2REv2', 'Lyra2Z', 'M7 POW', 'Multiple', 'NIST5', 'NeoScrypt',
       'Ouroboros', 'PHI1612', 'POS 2.0', 'POS 3.0', 'PoS',
       'Proof-of-Authority', 'Proof-of-BibleHash', 'QUAIT', 'QuBit',
       'Quark', 'SHA-256', 'SHA-256 + Hive', 'SHA-256D', 'SHA-512',
       'SHA3', 'Scrypt', 'Semux BFT consensus', 'Shabal256', 'Skein',
       'SkunkHash', 'SkunkHash v2 Raptor', 'Stanford Folding', 'TRC10',
       'Time Travel', 'Tribus', 'VBFT', 'VeChainThor Authority', 'X11',
       'X11GOST', 'X13', 'X14', 'X15

In [248]:
# Reorder the rows by ProofType, then list the distinct proof types.
# NOTE: the recorded output contains near-duplicate labels (e.g. 'PoW/PoS' vs 'PoW/PoS ',
# 'DPOS' vs 'DPoS'); each one is treated as its own category downstream.
df_crypto = df_crypto.sort_values("ProofType")
df_crypto.loc[:, "ProofType"].unique()

array(['DPOS', 'DPoS', 'HPoW', 'LPoS', 'POBh', 'PoA', 'PoC', 'PoS',
       'PoS/LPoS', 'PoS/PoW', 'PoS/PoW/PoT', 'PoST', 'PoW', 'PoW + Hive',
       'PoW and PoS', 'PoW/PoS', 'PoW/PoS ', 'PoW/PoW', 'PoW/nPoS', 'Pos',
       'Proof of Authority', 'Proof of Trust', 'TPoS',
       'Zero-Knowledge Proof', 'dPoW/PoW'], dtype=object)

In [249]:
# One-hot encode the two text features, Algorithm and ProofType, and store the result in X.
# TotalCoinSupply is loaded as strings (object dtype), so it is cast to float explicitly
# here instead of relying on the scaler to coerce it implicitly later on.
X = pd.get_dummies(
    df_crypto.astype({"TotalCoinSupply": float}),
    columns=["Algorithm", "ProofType"],
)
X.head(10)

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
AAC,1000000000.0,1000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SEM,1231147.0,100000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
EOS,1020545000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LSK,120012100.0,159918400,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ARK,108202100.0,125000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ALX,1000000000.0,1000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BTT,989988700000.0,990000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TAU,288090600.0,500000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OXY,1122382000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEPH,2000000000.0,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [250]:
# Standardize every column of X with StandardScaler and preview the first five rows.
scaler = StandardScaler()
crypto_scaled = scaler.fit_transform(X)
print(crypto_scaled[:5])

[[-0.09517967 -0.13809342 -0.0433963  -0.0433963  -0.0433963  -0.06142951
  -0.07530656 -0.0433963  -0.06142951 -0.06142951 -0.0433963  -0.0433963
  -0.19245009 -0.06142951 -0.09740465 -0.0433963  -0.11547005 -0.07530656
  -0.0433963  23.04343724 -0.15191091 -0.0433963  -0.13118084 -0.0433963
  -0.0433963  -0.08703883 -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.06142951 -0.0433963  -0.08703883 -0.08703883 -0.08703883 -0.0433963
  -0.13118084 -0.13840913 -0.13840913 -0.0433963  -0.06142951 -0.0433963
  -0.07530656 -0.18168574 -0.0433963  -0.0433963  -0.0433963  -0.07530656
  -0.15826614 -0.31491833 -0.0433963  -0.08703883 -0.07530656 -0.06142951
  -0.72111026 -0.0433963  -0.0433963  -0.06142951 -0.0433963  -0.0433963
  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.39879994 -0.0433963  -0.18168574 -0.0433963  -0.08703883 -0.08703883
  -0.10680283 23.04343724 -0.13118084 -0.0433963  -0.0433963  -0.0433963
  -0.0433963  -0.07530656 -0.43911856 -0.04339

### Deliverable 2: Reducing Data Dimensions Using PCA

In [251]:
# Reduce the scaled features to three principal components.
# random_state is fixed because PCA's default svd_solver="auto" may select the randomized
# solver, which would otherwise let the components vary from run to run.
pca = PCA(n_components=3, random_state=0)
crypto_pca = pca.fit_transform(crypto_scaled)

In [252]:
# Wrap the three principal components in pcs_df, with columns PC 1, PC 2 and PC 3,
# reusing the index of the df_crypto DataFrame.
pc_columns = ["PC 1", "PC 2", "PC 3"]
pcs_df = pd.DataFrame(crypto_pca, index=df_crypto.index, columns=pc_columns)
pcs_df.head()

Unnamed: 0,PC 1,PC 2,PC 3
AAC,-0.339801,3.515328,14.782316
SEM,3.953294,1.506027,0.125925
EOS,3.758935,1.66112,0.028384
LSK,3.749074,1.660659,0.028711
ARK,3.748682,1.660667,0.02872


### Deliverable 3: Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [253]:
# Containers for the elbow curve built from pcs_df: the candidate cluster counts
# k = 1..10 and the (initially empty) list of inertia values.
k = [*range(1, 11)]
inertia = []

In [254]:
# Looking for the best K: fit K-Means for each candidate k and record its inertia.
# The list is reset here so that re-running this cell does not append duplicate values
# (which would make inertia longer than k and break the elbow DataFrame).
inertia = []
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



In [255]:
# Collect the k / inertia pairs in a DataFrame and draw the elbow curve with hvPlot.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

Running K-Means with `k=4`

In [256]:
# Build the 4-cluster K-Means model and fit it on the principal components
# (fit() returns the fitted estimator itself).
model = KMeans(random_state=0, n_clusters=4).fit(pcs_df)

# Assign every row of pcs_df to a cluster and show the resulting labels.
predictions = model.predict(pcs_df)
print(predictions)

[3 0 0 0 0 0 2 0 0 0 0 0 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [257]:
# Build clustered_df by combining the df_crypto columns with the principal components
# in pcs_df, matching rows on their shared index (inner join).
clustered_df = df_crypto.merge(pcs_df, how="inner", left_index=True, right_index=True)

In [258]:
# Attach the cryptocurrency names to clustered_df as a CoinName column (aligned on the index).
clustered_df["CoinName"] = crypto_names["CoinName"]

In [259]:
# Add a Class column to clustered_df holding the K-Means cluster labels (model.labels_).
# The labels are a plain array assigned by position, so this relies on clustered_df having
# the same row order as the pcs_df frame the model was fitted on.
clustered_df["Class"] = model.labels_

In [260]:
# Print the shape of the clustered_df, then preview its first ten rows.
print(clustered_df.shape)
clustered_df.head(10)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
AAC,ECC 256K1,DPOS,1000000000.0,1000000000,-0.339801,3.515328,14.782316,Acute Angle Cloud,3
SEM,Semux BFT consensus,DPoS,1231147.0,100000000,3.953294,1.506027,0.125925,Semux,0
EOS,DPoS,DPoS,1020545000.0,0,3.758935,1.66112,0.028384,EOS,0
LSK,DPoS,DPoS,120012100.0,159918400,3.749074,1.660659,0.028711,Lisk,0
ARK,DPoS,DPoS,108202100.0,125000000,3.748682,1.660667,0.02872,ARK,0
ALX,DPoS,DPoS,1000000000.0,1000000000,3.765764,1.660714,0.028254,ALAX,0
BTT,TRC10,DPoS,989988700000.0,990000000000,34.050119,1.734365,-1.003699,BitTorrent,2
TAU,DPoS,DPoS,288090600.0,500000000,3.753533,1.660598,0.028599,Lamden Tau,0
OXY,DPoS,DPoS,1122382000.0,0,3.760178,1.661165,0.028344,Oxycoin,0
ZEPH,SHA-256,DPoS,2000000000.0,2000000000,2.488176,0.681499,0.012031,ZEPHYR,0


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [261]:
# Draw the clusters from clustered_df in 3D with Plotly Express scatter_3d(): one point per
# coin on the PC 1 / PC 2 / PC 3 axes, colored and marked by Class, with CoinName as the
# hover title and Algorithm as additional hover data.
fig = px.scatter_3d(
    clustered_df,
    x="PC 1", y="PC 2", z="PC 3",
    color="Class",
    symbol="Class",
    hover_name="CoinName",
    hover_data=["Algorithm"],
)
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()


In [262]:
# Show the tradable cryptocurrencies in a sortable, selectable hvPlot table.
table_columns = [
    "CoinName",
    "Algorithm",
    "ProofType",
    "TotalCoinsMined",
    "TotalCoinSupply",
    "Class",
]
clustered_df.hvplot.table(columns=table_columns, sortable=True, selectable=True)

In [263]:
# Print the total number of tradable cryptocurrencies, counted as the number of
# non-null CoinName entries in clustered_df.
clustered_df['CoinName'].count()

532

In [264]:
# Rescale the TotalCoinSupply and TotalCoinsMined columns to the range zero to one
# with MinMaxScaler().fit_transform (column order: supply first, then mined).
supply_and_mined = clustered_df[["TotalCoinSupply", "TotalCoinsMined"]]
X_minmax = MinMaxScaler().fit_transform(supply_and_mined)
X_minmax


array([[1.00000000e-03, 1.01011248e-03],
       [1.00000000e-04, 1.24355458e-06],
       [0.00000000e+00, 1.03086476e-03],
       ...,
       [7.65000000e-05, 7.63547361e-05],
       [1.00000000e-03, 1.01011248e-03],
       [2.00000000e-04, 1.17070111e-04]])

In [265]:
# Put the scaled values into a new DataFrame that reuses the clustered_df index.
index_values = clustered_df.index.tolist()
scaled_columns = ["TotalCoinSupply_scaled", "TotalCoinsMined_scaled"]
plot_df = pd.DataFrame(X_minmax, index=index_values, columns=scaled_columns)

# Carry over the CoinName and Class columns from clustered_df (aligned on the index).
for column in ("CoinName", "Class"):
    plot_df[column] = clustered_df[column]

plot_df.head(10)

Unnamed: 0,TotalCoinSupply_scaled,TotalCoinsMined_scaled,CoinName,Class
AAC,0.001,0.00101,Acute Angle Cloud,3
SEM,0.0001,1e-06,Semux,0
EOS,0.0,0.001031,EOS,0
LSK,0.00016,0.000121,Lisk,0
ARK,0.000125,0.000109,ARK,0
ALX,0.001,0.00101,ALAX,0
BTT,0.99,1.0,BitTorrent,2
TAU,0.0005,0.000291,Lamden Tau,0
OXY,0.0,0.001134,Oxycoin,0
ZEPH,0.002,0.00202,ZEPHYR,0


In [266]:
# Create a hvplot.scatter plot of the scaled TotalCoinsMined (x) against the scaled
# TotalCoinSupply (y), with one color per Class.
# hover_cols exposes CoinName in the tooltip so individual points can be identified.
plot_df.hvplot.scatter(
    x="TotalCoinsMined_scaled",
    y="TotalCoinSupply_scaled",
    by="Class",
    hover_cols=["CoinName"],
    xlabel="Total Coins Mined",
    ylabel="Total Coin Supply",
)
