In [1]:
!pip install -U altair

Requirement already up-to-date: altair in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (4.1.0)
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import altair

In [3]:
# Loade the cryptocurrencies data
file_path = Path("Resources/crypto_data.csv")
crypto_df = pd.read_csv(file_path, index_col=0)
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [4]:
# Keep only cryptocurrencies that are on trading

crypto_df = crypto_df.loc[(crypto_df['IsTrading'] == True)]
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [5]:
# Keep only cryptocurrencies with a working algorithm

crypto_df = crypto_df.loc[crypto_df['Algorithm'] != 'N/A']
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [6]:
# Remove the "IsTrading" column

crypto_df = crypto_df.drop(['IsTrading'], axis = 1)
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
SERO,Super Zero,Ethash,PoW,,1000000000
UOS,UOS,SHA-256,DPoI,,1000000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [7]:
# Remove rows with at least 1 null value

crypto_df = crypto_df.dropna()
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [8]:
# Remove rows with cryptocurrencies without coins mined

crypto_df = crypto_df.loc[crypto_df['TotalCoinsMined'] != 0]
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [9]:
# Fetch the cryptocurrencies names prior to drop them from crypto_df

coinname = crypto_df.CoinName
coinname

42          42 Coin
404         404Coin
1337      EliteCoin
BTC         Bitcoin
ETH        Ethereum
           ...     
ZEPH         ZEPHYR
GAP         Gapcoin
BDX          Beldex
ZEN         Horizen
XBC     BitcoinPlus
Name: CoinName, Length: 533, dtype: object

In [10]:
# Remove the cryptocurrency name since it's not going to be used on the clustering algorithm

crypto_df = crypto_df.drop(['CoinName'], axis = 1)
crypto_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,314159265359
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Equihash,PoW,7.296538e+06,21000000


In [11]:
# Create dummies variables for text features

crypto_dummies = pd.get_dummies(data=crypto_df, columns=['Algorithm', 'ProofType'])
crypto_dummies

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BDX,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEN,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Standardize data

crypto_scaler = StandardScaler()
crypto_scaler.fit(crypto_dummies)

crypto_scaled = crypto_scaler.transform(crypto_dummies)
crypto_scaled

array([[-0.11674788, -0.15286468, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.09358885, -0.14499604, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [ 0.52587231,  4.4937636 , -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       ...,
       [-0.09523411, -0.13215444, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11658774, -0.15255408, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11674507, -0.15284989, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ]])

In [13]:
# Use PCA to reduce dimension to 3 principal components

pca = PCA(n_components=3)
crypto_pca = pca.fit_transform(crypto_scaled)

crypto_pca

array([[-0.32069538,  1.1013833 , -0.56799668],
       [-0.3039783 ,  1.10169536, -0.56835384],
       [ 2.31399687,  1.68008486, -0.68325207],
       ...,
       [ 0.32405514, -2.34147983,  0.36745313],
       [-0.14139928, -2.07384188,  0.51688291],
       [-0.2774858 ,  0.87030575, -0.27184924]])

In [14]:
# Create a DataFrame with the principal components data

pca_df = pd.DataFrame(crypto_pca, columns=["PC 1", "PC 2", "PC 3"], index=coinname.index)

pca_df

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.320695,1.101383,-0.567997
404,-0.303978,1.101695,-0.568354
1337,2.313997,1.680085,-0.683252
BTC,-0.144949,-1.332194,0.174209
ETH,-0.153572,-2.001602,0.390270
...,...,...,...
ZEPH,2.447848,0.749468,-0.086762
GAP,-0.318734,1.101286,-0.568016
BDX,0.324055,-2.341480,0.367453
ZEN,-0.141399,-2.073842,0.516883


In [15]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range ok k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pca_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using altair
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.head()

Unnamed: 0,k,inertia
0,1,3646.302441
1,2,2474.498868
2,3,1487.452626
3,4,589.805186
4,5,393.919069


In [16]:
# Altair Elbow curve

fig0 = altair.Chart(df_elbow).mark_circle(size=60).encode(
    x="k",
    y="inertia"
)

fig0.display()

In [17]:
def get_clusters(k, data):
    # Initialize the K-Means model
    model = KMeans(n_clusters=k, random_state=0)

    # Fit the model
    model.fit(data)

    # Predict clusters
    predictions = model.predict(data)

    # Create a new DataFrame including predicted clusters and cryptocurrencies features
    data["class"] = model.labels_

    return data

In [18]:
clusters_4 = get_clusters(4, pca_df)
clusters_4

Unnamed: 0,PC 1,PC 2,PC 3,class
42,-0.320695,1.101383,-0.567997,0
404,-0.303978,1.101695,-0.568354,0
1337,2.313997,1.680085,-0.683252,0
BTC,-0.144949,-1.332194,0.174209,1
ETH,-0.153572,-2.001602,0.390270,1
...,...,...,...,...
ZEPH,2.447848,0.749468,-0.086762,0
GAP,-0.318734,1.101286,-0.568016,0
BDX,0.324055,-2.341480,0.367453,1
ZEN,-0.141399,-2.073842,0.516883,1


In [19]:
clustered_df = pd.concat([coinname, crypto_df, clusters_4], axis=1)

clustered_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,class
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42,-0.320695,1.101383,-0.567997,0
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000,-0.303978,1.101695,-0.568354,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359,2.313997,1.680085,-0.683252,0
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000,-0.144949,-1.332194,0.174209,1
ETH,Ethereum,Ethash,PoW,1.076842e+08,0,-0.153572,-2.001602,0.390270,1
...,...,...,...,...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000,2.447848,0.749468,-0.086762,0
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000,-0.318734,1.101286,-0.568016,0
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610,0.324055,-2.341480,0.367453,1
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000,-0.141399,-2.073842,0.516883,1


In [20]:
# Create a Scatter with the PCA data and the clusters
# tool tips: "CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"

fig1 = altair.Chart(clustered_df).mark_circle(size=60).encode(
    x="PC 1",
    y="PC 2",
    color="class",
    tooltip = ["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
)

fig1.display()

In [21]:
# Table with tradable cryptos

crypto_table = clustered_df.drop(['PC 1', 'PC 2', 'PC 3'], axis = 1)

altair.Chart(crypto_table).mark_text().display(renderer='svg', theme='string', actions='dict')

In [22]:
# Print the total number of tradable cryptocurrencies

clustered_df.count()

CoinName           533
Algorithm          533
ProofType          533
TotalCoinsMined    533
TotalCoinSupply    533
PC 1               533
PC 2               533
PC 3               533
class              533
dtype: int64

In [23]:
# Scale data to create the scatter plot
col_names = ['CoinName', 'Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply', 'PC 1', 'PC 2', 'PC 3', 'class']

features = clustered_df[col_names]

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler

clustered_scaled = ColumnTransformer(
    [('somename', StandardScaler(), ['TotalCoinsMined', 'TotalCoinSupply'])], remainder='passthrough'
)

clustered_scaled = clustered_scaled.fit_transform(features)

col_names = ['TotalCoinsMined', 'TotalCoinSupply', 'CoinName', 'Algorithm', 'ProofType', 'PC 1', 'PC 2', 'PC 3', 'class']

clustered_scaled = pd.DataFrame(clustered_scaled, columns = col_names, index = clustered_df.index)
clustered_scaled

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,CoinName,Algorithm,ProofType,PC 1,PC 2,PC 3,class
42,-0.116748,-0.152865,42 Coin,Scrypt,PoW/PoS,-0.320695,1.10138,-0.567997,0
404,-0.0935889,-0.144996,404Coin,Scrypt,PoW/PoS,-0.303978,1.1017,-0.568354,0
1337,0.525872,4.49376,EliteCoin,X13,PoW/PoS,2.314,1.68008,-0.683252,0
BTC,-0.116354,-0.152554,Bitcoin,SHA-256,PoW,-0.144949,-1.33219,0.174209,1
ETH,-0.114384,-0.152865,Ethereum,Ethash,PoW,-0.153572,-2.0016,0.39027,1
...,...,...,...,...,...,...,...,...,...
ZEPH,-0.0728522,-0.123283,ZEPHYR,SHA-256,DPoS,2.44785,0.749468,-0.0867617,0
GAP,-0.11642,-0.149167,Gapcoin,Scrypt,PoW/PoS,-0.318734,1.10129,-0.568016,0
BDX,-0.0952341,-0.132154,Beldex,CryptoNight,PoW,0.324055,-2.34148,0.367453,1
ZEN,-0.116588,-0.152554,Horizen,Equihash,PoW,-0.141399,-2.07384,0.516883,1


In [24]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"

fig2 = altair.Chart(clustered_scaled).mark_circle(size=60).encode(
    x="TotalCoinsMined", 
    y="TotalCoinSupply",
    color="class",
    tooltip = ["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
)

fig2