# Clustering Crypto

In [208]:
# Initial imports
import requests
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import hvplot.pandas 
import numpy as np
import holoviews as hv
from holoviews import dim, opts
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [168]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [169]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.

In [170]:
# Alternatively, use the provided csv file:
file_path = Path("Resources/crypto_data.csv")
df = pd.read_csv(file_path)
# Create a DataFrame
df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1247,XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


### Data Preprocessing

In [171]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
cryptodf = df[['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply']]
cryptodf

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
1247,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [172]:
# Keep only cryptocurrencies that are trading
cryptodf = cryptodf.loc[cryptodf['IsTrading']==True]

cryptodf

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
1243,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,SHA-256,True,DPoI,,1000000000
1245,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [173]:
# Keep only cryptocurrencies with a working algorithm
cryptodf['Algorithm'].unique()



array(['Scrypt', 'X11', 'SHA-256', 'X13', 'Ethash', 'CryptoNight-V7',
       'Equihash', 'SHA-512', 'Multiple', 'X15', 'NIST5', 'Quark',
       'Groestl', 'PoS', 'NeoScrypt', 'SHA3', 'HybridScryptHash256',
       'Scrypt-n', 'PHI1612', 'Lyra2REv2', 'CryptoNight', 'Shabal256',
       'Counterparty', 'Blake', 'Momentum', 'Stanford Folding', 'QuBit',
       'XG Hash', 'M7 POW', 'Curve25519', 'Lyra2RE', 'QUAIT', 'vDPOS',
       'Blake2b', 'BLAKE256', '1GB AES Pattern Search', 'Dagger',
       'CryptoNight-Lite', 'X11GOST', 'SHA-256D', 'POS 3.0',
       'Progressive-n', 'DPoS', 'Lyra2Z', 'X14', 'Time Travel', 'Argon2',
       'Keccak', 'Blake2S', 'Dagger-Hashimoto', '536', 'Argon2d',
       'Cloverhash', 'Skein', 'SkunkHash v2 Raptor',
       'VeChainThor Authority', 'Ouroboros', 'POS 2.0', 'SkunkHash',
       'C11', 'Proof-of-BibleHash', 'SHA-256 + Hive',
       'Proof-of-Authority', 'XEVAN', 'VBFT', 'YescryptR16', 'IMesh',
       'Green Protocol', 'Semux BFT consensus', 'X16R', 'Tribus',


In [174]:
# Remove the "IsTrading" column
cryptodf1 = cryptodf.drop(columns=['IsTrading'])

In [175]:
cryptodf1

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,PoW,,611000
4,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
1243,Super Zero,Ethash,PoW,,1000000000
1244,UOS,SHA-256,DPoI,,1000000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [176]:
# Remove rows with at least 1 null value
cryptodf1.dropna(inplace=True)

cryptodf1

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
4,808,SHA-256,PoW/PoS,0.000000e+00,0
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
1238,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [177]:
# Remove rows with cryptocurrencies having no coins mined
cryptodf1 = cryptodf1.loc[cryptodf1['TotalCoinsMined']>=1]

cryptodf1

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
8,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
1238,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [178]:
# Drop rows where there are 'N/A' text values
cryptodf1.isna().sum()

CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64

In [179]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
cryptodf1coinname = cryptodf1['CoinName']

cryptodf1.drop(columns=['CoinName'],inplace=True)
cryptodf1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,42
2,Scrypt,PoW/PoS,1.055185e+09,532000000
5,X13,PoW/PoS,2.927942e+10,314159265359
7,SHA-256,PoW,1.792718e+07,21000000
8,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000
1242,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,CryptoNight,PoW,9.802226e+08,1400222610
1246,Equihash,PoW,7.296538e+06,21000000


In [180]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm


In [181]:
# Create dummy variables for text features
cryptodf2 = pd.get_dummies(cryptodf1[['Algorithm','ProofType']])
cryptodf2

cryptodf3 = pd.concat([cryptodf1[['TotalCoinsMined','TotalCoinSupply']],cryptodf2],join="inner",axis="columns")
                      

In [182]:
cryptodf3

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1242,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1245,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1246,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [183]:
# Standardize data
scaly = StandardScaler()
scalier = scaly.fit_transform(cryptodf3)

### Reducing Dimensions Using PCA

In [184]:
# Use PCA to reduce dimensions to 3 principal components
scalypca = PCA(n_components=3)

In [185]:
# Create a DataFrame with the principal components data
scalierpca = scalypca.fit_transform(scalier)
scalierpca

array([[-0.3130956 ,  1.11621131, -0.5104681 ],
       [-0.29643125,  1.11686151, -0.51068414],
       [ 2.31613136,  1.79745454, -0.56948032],
       ...,
       [ 0.30723668, -2.29899588,  0.36440135],
       [-0.16587486, -2.1552736 ,  0.27386083],
       [-0.28068424,  0.84822967, -0.25227863]])

In [186]:
scalierpcadf = pd.DataFrame(
    data=scalierpca, columns=["PCA 1", "PCA 2", "PCA 3"]
)
scalierpcadf

Unnamed: 0,PCA 1,PCA 2,PCA 3
0,-0.313096,1.116211,-0.510468
1,-0.296431,1.116862,-0.510684
2,2.316131,1.797455,-0.569480
3,-0.118643,-1.313043,0.193105
4,-0.140701,-2.037179,0.394109
...,...,...,...
527,2.487688,0.541535,-0.217855
528,-0.311143,1.116157,-0.510468
529,0.307237,-2.298996,0.364401
530,-0.165875,-2.155274,0.273861


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [187]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(scalierpca)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
scalyelbow = {"k": k, "inertia": inertia}
scalierelbow = pd.DataFrame(scalyelbow)
scalierelbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")



Running K-Means with `k=<your best value for k here>`

In [188]:
# Initialize the K-Means model
scalymodel = KMeans(n_clusters=4, random_state=0)

# Fit the model
scalymodel.fit(scalierpcadf)

# Predict clusters
scalypredictions = scalymodel.predict(scalierpcadf)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
scalierpcadf["class"] = scalymodel.labels_

scalierpcadf


Unnamed: 0,PCA 1,PCA 2,PCA 3,class
0,-0.313096,1.116211,-0.510468,0
1,-0.296431,1.116862,-0.510684,0
2,2.316131,1.797455,-0.569480,0
3,-0.118643,-1.313043,0.193105,3
4,-0.140701,-2.037179,0.394109,3
...,...,...,...,...
527,2.487688,0.541535,-0.217855,0
528,-0.311143,1.116157,-0.510468,0
529,0.307237,-2.298996,0.364401,3
530,-0.165875,-2.155274,0.273861,3


In [222]:
cryptodf4 = pd.concat([cryptodf1coinname,cryptodf1,scalierpcadf],join="inner",axis="columns")
cryptodf4

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PCA 1,PCA 2,PCA 3,class
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42,-0.313096,1.116211,-0.510468,0
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000,2.316131,1.797455,-0.569480,0
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359,-0.148819,-1.086200,-0.007497,3
7,Bitcoin,SHA-256,PoW,1.792718e+07,21000000,-0.135120,-2.254916,0.410816,3
8,Ethereum,Ethash,PoW,1.076842e+08,0,-0.139145,-2.037230,0.394111,3
...,...,...,...,...,...,...,...,...,...
516,RoyalCoin,X13,PoS,2.500124e+06,2500124,-0.120284,-1.823290,0.343396,3
522,GanjaCoin V2,X14,PoW/PoS,1.000000e+08,100000000,-0.230216,0.621891,-0.051884,0
523,TeamUP,PoS,PoS,1.781868e+07,301000000,-0.592701,3.635223,7.681732,1
525,LanaCoin,SHA-256D,PoW/PoS,1.082163e+09,7506000000,34.040611,2.358704,-0.811893,2


### Visualizing Results

#### 3D-Scatter with Clusters

In [227]:
threedee = px.scatter_3d(
    cryptodf4,
    x="PCA 1",
    y="PCA 2",
    z="PCA 3",
    color="class",
    hover_name="CoinName",
    hover_data=["Algorithm"]
)

threedee.show()

#### Table of Tradable Cryptocurrencies

In [210]:
# Table with tradable cryptos
cryptodf1coinname

0           42 Coin
2           404Coin
5         EliteCoin
7           Bitcoin
8          Ethereum
           ...     
1238         ZEPHYR
1242        Gapcoin
1245         Beldex
1246        Horizen
1247    BitcoinPlus
Name: CoinName, Length: 532, dtype: object

In [211]:
# Print the total number of tradable cryptocurrencies
cryptodf1coinname.count()

532

#### Scatter Plot with Tradable Cryptocurrencies

In [231]:
# Scale data to create the scatter plot
#scalycryptodf3 = scaly.fit_transform(cryptodf3[["TotalCoinsMined","TotalCoinSupply"]])
#scalycryptodf3columns = ["TotalCoinsMined","TotalCoinSupply"]
#scalycryptodf3.columns= scalycryptodf3columns
scalycryptodf3

array([[-0.11710817, -0.1528703 ],
       [-0.09396955, -0.145009  ],
       [ 0.52494561,  4.48942416],
       ...,
       [-0.09561336, -0.13217937],
       [-0.11694817, -0.15255998],
       [-0.11710536, -0.15285552]])

In [230]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
cryptodf4.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply",hover_cols=["CoinName"])