# Clustering Crypto

In [1]:
# Initial imports
import pandas as pd
# import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [2]:
# Additional dependencies for plotting in SageMaker
!pip install -U altair

Requirement already up-to-date: altair in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (4.0.1)
Requirement not upgraded as not directly required: jsonschema in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from altair) (2.6.0)
Requirement not upgraded as not directly required: jinja2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from altair) (2.10)
Requirement not upgraded as not directly required: entrypoints in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from altair) (0.2.3)
Requirement not upgraded as not directly required: numpy in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from altair) (1.14.3)
Requirement not upgraded as not directly required: pandas in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from altair) (0.24.2)
Requirement not upgraded as not directly required: toolz in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (fro

In [3]:
import altair as alt

### Data Preprocessing

In [4]:
# Read the cryptocurrency dataset, using the first CSV column as the row index
file_path = Path("crypto_data.csv")
crypto_df = pd.read_csv(filepath_or_buffer=file_path, index_col=0)

# Preview the first ten rows
crypto_df.head(n=10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [5]:
# Inspect the dtype pandas inferred for each column
crypto_df.dtypes

CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [6]:
# Count how many rows have IsTrading set to True vs. False
crypto_df['IsTrading'].value_counts()

True     1144
False     108
Name: IsTrading, dtype: int64

In [7]:
# Keep only cryptocurrencies that are currently trading.
# Filter with a boolean mask instead of dropping by index label: a label-based
# drop would also remove trading rows that happen to share an index label with
# a non-trading row. `.copy()` makes the result an independent frame so the
# in-place edits in later cells do not raise SettingWithCopyWarning.
crypto_df = crypto_df.loc[crypto_df['IsTrading']].copy()

In [8]:
# Re-check the IsTrading value counts to confirm which values remain
crypto_df['IsTrading'].value_counts()

True    1144
Name: IsTrading, dtype: int64

In [9]:
# Count the null values in each column (this only reports them; nothing is filtered here)
crypto_df.isnull().sum()

CoinName             0
Algorithm            0
IsTrading            0
ProofType            0
TotalCoinsMined    459
TotalCoinSupply      0
dtype: int64

In [10]:
# Drop the "IsTrading" column from crypto_df in place
crypto_df.drop(columns='IsTrading', inplace=True)

In [11]:
# Drop, in place, every row that contains one or more null values
crypto_df.dropna(axis=0, how='any', inplace=True)

In [12]:
# Remove rows for cryptocurrencies without coins mined (TotalCoinsMined == 0).
# A boolean mask is used instead of dropping by index label, so rows that merely
# share an index label with a zero-mined row are not removed along with it.
# `.copy()` keeps the result independent, so later in-place edits are safe.
crypto_df = crypto_df.loc[crypto_df['TotalCoinsMined'] != 0].copy()

In [13]:
# Keep the coin names aside before the column is removed from crypto_df
coin_names = crypto_df.loc[:, 'CoinName']

In [14]:
# The coin name is not a clustering feature, so drop the column in place
crypto_df.drop(columns='CoinName', inplace=True)

In [15]:
# Preview the first five rows of the remaining columns
crypto_df.head(n=5)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [16]:
# Cast TotalCoinSupply (loaded as object) to a floating-point column
crypto_df['TotalCoinSupply'] = crypto_df['TotalCoinSupply'].astype('float64')

In [17]:
# One-hot encode the text features on a copy of crypto_df; drop_first=True
# omits the first level of each encoded column
crypto_df_enc = pd.get_dummies(
    crypto_df.copy(),
    columns=['Algorithm', 'ProofType'],
    drop_first=True,
)

crypto_df_enc.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159300000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# (rows, columns) of the encoded feature table
crypto_df_enc.shape

(533, 98)

In [19]:
# Standardize data: fit the scaler on the encoded features, then transform them
crypto_df_enc_scaled = StandardScaler().fit(crypto_df_enc).transform(crypto_df_enc)


Data with input dtype uint8, float64 were all converted to float64 by StandardScaler.


Data with input dtype uint8, float64 were all converted to float64 by StandardScaler.



### Reducing Dimensions Using PCA

In [20]:
# Use PCA to reduce dimension to 3 principal components.
# random_state is fixed because PCA's default "auto" solver may choose the
# randomized SVD for data of this size, which would otherwise make the
# components (and every cluster derived from them) vary between runs.
pca = PCA(n_components=3, random_state=42)

crypto_pca = pca.fit_transform(crypto_df_enc_scaled)

In [21]:
# Wrap the principal components in a DataFrame that shares crypto_df's index
crypto_pca_df = pd.DataFrame(crypto_pca, index=crypto_df.index, columns=["PC1", "PC2", "PC3"])

crypto_pca_df.head()

Unnamed: 0,PC1,PC2,PC3
42,-0.323181,1.019952,-0.570899
404,-0.306497,1.019984,-0.571337
1337,2.300751,1.580843,-0.701533
BTC,-0.151349,-1.302064,0.196468
ETH,-0.166018,-1.976599,0.3553


### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [22]:
# Cluster on the principal components only, so a "cluster" column added to
# crypto_pca_df by a later cell is never fed back in as a feature on re-run.
pca_features = crypto_pca_df[["PC1", "PC2", "PC3"]]
list_k = list(range(1, 11))

# Calculate the inertia for the range of k values. A fixed random_state makes
# the K-Means initialization, and therefore the curve, reproducible.
inertia = [
    KMeans(n_clusters=k, random_state=42).fit(pca_features).inertia_
    for k in list_k
]

# Create the Elbow Curve using altair
elbow_data = {"k": list_k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

alt.Chart(df_elbow).mark_line().encode(
    x='k',
    y='inertia'
).properties(title="Elbow Curve")

### Running K-Means with `k=4`

In [23]:
# Cluster on the principal components only, so re-running this cell after the
# "cluster" column has been added does not feed the old labels back in as a
# feature.
pca_features = crypto_pca_df[["PC1", "PC2", "PC3"]]

# Initialize the K-Means model (seeded for reproducible labels)
model = KMeans(
    n_clusters=4,
    random_state=42
)

# Fit the model and predict clusters in one pass; the returned labels are the
# same values exposed as model.labels_
predictions = model.fit_predict(pca_features)

# Add clusters column
crypto_pca_df["cluster"] = predictions

In [24]:
# Place the cryptocurrency features and the PCA/cluster columns side by side
crypto_clustered_df = pd.concat(objs=[crypto_df, crypto_pca_df], axis="columns")

In [25]:
# Re-attach the coin names as the first column
crypto_clustered_df.insert(loc=0, column='CoinName', value=coin_names)

crypto_clustered_df.head(n=5)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,cluster
42,42 Coin,Scrypt,PoW/PoS,41.99995,42.0,-0.323181,1.019952,-0.570899,1
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.306497,1.019984,-0.571337,1
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159300000.0,2.300751,1.580843,-0.701533,1
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000.0,-0.151349,-1.302064,0.196468,0
ETH,Ethereum,Ethash,PoW,107684200.0,0.0,-0.166018,-1.976599,0.3553,0


### Visualizing Results

#### Scatter Plot of PC1 vs. PC2 with Clusters

In [37]:
# 3D plotly scatter of the PCA data, kept for reference but disabled: the
# triple-quoted string below is never executed.
'''
fig = px.scatter_3d(
    crypto_clustered_df,
    x="PC3",
    y="PC2",
    z="PC1",
    hover_name = coin_names,
    hover_data = ["Algorithm"],
    width=800,
)

fig.update_layout(legend=dict(x=0, y=1))
fig.show()
'''

# 2D altair scatter of PC1 vs PC2, colored by K-Means cluster.
# The color domain must list exactly the labels K-Means produces with
# n_clusters=4, i.e. the integers 0, 1, 2 and 3. The previous domain skipped
# cluster 2 and listed a non-existent cluster 4, so one cluster had no assigned
# color and one color was never used.
alt.Chart(crypto_clustered_df).mark_circle(size=60).encode(
    x='PC1',
    y='PC2',
    color=alt.Color(
        'cluster:O',
        scale=alt.Scale(
            domain=[0, 1, 2, 3],
            range=['red', 'green', 'yellow', 'black'],
        ),
    ),
    tooltip=['CoinName:O', 'Algorithm:O', 'TotalCoinsMined:N', 'TotalCoinSupply:N']
).interactive()

#### Table of Tradable Cryptocurrencies

In [42]:
# Table with tradable cryptos: lift the row/column display limits for this
# cell only and render the whole clustered DataFrame
display_opts = ('display.max_rows', None, 'display.max_columns', None)

with pd.option_context(*display_opts):
    display(crypto_clustered_df)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,cluster
42,42 Coin,Scrypt,PoW/PoS,41.99995,42.0,-0.323181,1.019952,-0.570899,1
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.306497,1.019984,-0.571337,1
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159300000.0,2.300751,1.580843,-0.701533,1
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000.0,-0.151349,-1.302064,0.196468,0
ETH,Ethereum,Ethash,PoW,107684200.0,0.0,-0.166018,-1.976599,0.3553,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000.0,-0.154783,-1.101582,0.028498,0
DASH,Dash,X11,PoW/PoS,9031294.0,22000000.0,-0.409748,1.17565,-0.551791,1
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0.0,-0.159501,-2.214903,0.483899,0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000.0,-0.164458,-1.976718,0.355291,0
ZEC,ZCash,Equihash,PoW,7383056.0,21000000.0,-0.149917,-1.959839,0.355087,0


In [43]:
# Report how many rows carry a cluster label, i.e. the number of tradable cryptos
tradable_count = crypto_clustered_df["cluster"].count()
print(f'There are {tradable_count} tradable cryptos')

There are 533 tradable cryptos


#### Scatter Plot with Tradable Cryptocurrencies

In [44]:
# Scatter of coins mined (x axis) against total coin supply (y axis)
crypto_clustered_df.plot.scatter(x='TotalCoinsMined', y='TotalCoinSupply')

<matplotlib.axes._subplots.AxesSubplot at 0x7f72e0409f98>