# Clustering Crypto

In [1915]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from pathlib import Path

### Fetching Cryptocurrency Data

In [1916]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [1917]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
get_req = requests.get(url)
data = get_req.json()
data['Data']
raw_df = pd.DataFrame(data['Data'])
raw_df = raw_df.transpose()
raw_df = raw_df.reset_index()

In [1918]:
# Alternatively, use the provided csv file:
file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame
df = pd.read_csv(file_path)
df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055184902.04000,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.00000,0
...,...,...,...,...,...,...,...
1247,XBC,BitcoinPlus,Scrypt,True,PoS,128326.99634,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,21491213.46445,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


### Data Preprocessing

In [1919]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
df = df[['CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply']]

df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055184902.04000,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.00000,0
...,...,...,...,...,...,...
1247,BitcoinPlus,Scrypt,True,PoS,128326.99634,1000000
1248,DivotyCoin,Scrypt,False,PoW/PoS,21491213.46445,100000000
1249,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [1920]:
# Keep only cryptocurrencies that are trading
df = df.loc[df.IsTrading, :]
df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055184902.04000,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.00000,0
...,...,...,...,...,...,...
1243,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,SHA-256,True,DPoI,,1000000000
1245,Beldex,CryptoNight,True,PoW,980222595.00000,1400222610
1246,Horizen,Equihash,True,PoW,7296537.50000,21000000


In [1921]:
# Keep only cryptocurrencies with a working algorithm
df.Algorithm.unique() #to read/see algos

    #what? are working algorithms? There are no 'non working ones' or Nan or N/As
#df

array(['Scrypt', 'X11', 'SHA-256', 'X13', 'Ethash', 'CryptoNight-V7',
       'Equihash', 'SHA-512', 'Multiple', 'X15', 'NIST5', 'Quark',
       'Groestl', 'PoS', 'NeoScrypt', 'SHA3', 'HybridScryptHash256',
       'Scrypt-n', 'PHI1612', 'Lyra2REv2', 'CryptoNight', 'Shabal256',
       'Counterparty', 'Blake', 'Momentum', 'Stanford Folding', 'QuBit',
       'XG Hash', 'M7 POW', 'Curve25519', 'Lyra2RE', 'QUAIT', 'vDPOS',
       'Blake2b', 'BLAKE256', '1GB AES Pattern Search', 'Dagger',
       'CryptoNight-Lite', 'X11GOST', 'SHA-256D', 'POS 3.0',
       'Progressive-n', 'DPoS', 'Lyra2Z', 'X14', 'Time Travel', 'Argon2',
       'Keccak', 'Blake2S', 'Dagger-Hashimoto', '536', 'Argon2d',
       'Cloverhash', 'Skein', 'SkunkHash v2 Raptor',
       'VeChainThor Authority', 'Ouroboros', 'POS 2.0', 'SkunkHash',
       'C11', 'Proof-of-BibleHash', 'SHA-256 + Hive',
       'Proof-of-Authority', 'XEVAN', 'VBFT', 'YescryptR16', 'IMesh',
       'Green Protocol', 'Semux BFT consensus', 'X16R', 'Tribus',


In [1922]:
# Remove the "IsTrading" column
df.drop('IsTrading', inplace=True, axis=1)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1055184902.04000,532000000
3,SixEleven,SHA-256,PoW,,611000
4,808,SHA-256,PoW/PoS,0.00000,0
...,...,...,...,...,...
1243,Super Zero,Ethash,PoW,,1000000000
1244,UOS,SHA-256,DPoI,,1000000000
1245,Beldex,CryptoNight,PoW,980222595.00000,1400222610
1246,Horizen,Equihash,PoW,7296537.50000,21000000


In [1923]:
# Remove rows with at least 1 null value
df = df.dropna(axis='rows')
df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055184902.04000,532000000
4,808,SHA-256,PoW/PoS,0.00000,0
5,EliteCoin,X13,PoW/PoS,29279424622.50270,314159265359
7,Bitcoin,SHA-256,PoW,17927175.00000,21000000
...,...,...,...,...,...
1238,ZEPHYR,SHA-256,DPoS,1999999995.30560,2000000000
1242,Gapcoin,Scrypt,PoW/PoS,14931046.15466,250000000
1245,Beldex,CryptoNight,PoW,980222595.00000,1400222610
1246,Horizen,Equihash,PoW,7296537.50000,21000000


In [1924]:
# Remove rows with cryptocurrencies having no coins mined

#reformat sci notif if needed
#pd.options.display.float_format = '{:.5f}'.format

df.drop(df[df['TotalCoinsMined'] <= 0].index, inplace = True)
df = df.reset_index(drop=True) #reset index back to order etc
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,404Coin,Scrypt,PoW/PoS,1055184902.04000,532000000
2,EliteCoin,X13,PoW/PoS,29279424622.50270,314159265359
3,Bitcoin,SHA-256,PoW,17927175.00000,21000000
4,Ethereum,Ethash,PoW,107684222.68650,0
...,...,...,...,...,...
527,ZEPHYR,SHA-256,DPoS,1999999995.30560,2000000000
528,Gapcoin,Scrypt,PoW/PoS,14931046.15466,250000000
529,Beldex,CryptoNight,PoW,980222595.00000,1400222610
530,Horizen,Equihash,PoW,7296537.50000,21000000


In [1925]:
# Drop rows where there are 'N/A' text values
#CoinName Algorithm ProofType, are all text value columns
df = df[df.CoinName.isin(['N/A']) == False]
df = df[df.Algorithm.isin(['N/A']) == False]
df = df[df.ProofType.isin(['N/A']) == False]



#didnt drop any rows etc?
df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,404Coin,Scrypt,PoW/PoS,1055184902.04000,532000000
2,EliteCoin,X13,PoW/PoS,29279424622.50270,314159265359
3,Bitcoin,SHA-256,PoW,17927175.00000,21000000
4,Ethereum,Ethash,PoW,107684222.68650,0
...,...,...,...,...,...
527,ZEPHYR,SHA-256,DPoS,1999999995.30560,2000000000
528,Gapcoin,Scrypt,PoW/PoS,14931046.15466,250000000
529,Beldex,CryptoNight,PoW,980222595.00000,1400222610
530,Horizen,Equihash,PoW,7296537.50000,21000000


In [1926]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coin_name_df = pd.DataFrame(df['CoinName'])
coin_name_df

Unnamed: 0,CoinName
0,42 Coin
1,404Coin
2,EliteCoin
3,Bitcoin
4,Ethereum
...,...
527,ZEPHYR
528,Gapcoin
529,Beldex
530,Horizen


In [1927]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
df.drop('CoinName', inplace=True, axis=1)
df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,41.99995,42
1,Scrypt,PoW/PoS,1055184902.04000,532000000
2,X13,PoW/PoS,29279424622.50270,314159265359
3,SHA-256,PoW,17927175.00000,21000000
4,Ethash,PoW,107684222.68650,0
...,...,...,...,...
527,SHA-256,DPoS,1999999995.30560,2000000000
528,Scrypt,PoW/PoS,14931046.15466,250000000
529,CryptoNight,PoW,980222595.00000,1400222610
530,Equihash,PoW,7296537.50000,21000000


In [1928]:
# Create dummy variables for text features
df = pd.get_dummies(data=df, columns=['Algorithm', 'ProofType']) #are they the correct text features
df

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1055184902.04000,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,29279424622.50270,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,17927175.00000,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,107684222.68650,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527,1999999995.30560,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
528,14931046.15466,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
529,980222595.00000,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
530,7296537.50000,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1929]:
# Standardize data
df_standardized = StandardScaler().fit_transform(df)

### Reducing Dimensions Using PCA

In [1930]:
# Use PCA to reduce dimensions to 3 principal components
# Initialize PCA model
pca = PCA(n_components=3)

# Get two principal components for the data.
three_pca = pca.fit_transform(df_standardized)

three_pca

array([[-0.32996357,  1.03388286, -0.48376986],
       [-0.31331743,  1.03388582, -0.48407021],
       [ 2.31065222,  1.5612982 , -0.59333699],
       ...,
       [ 0.33066263, -2.29081035,  0.30689802],
       [-0.16132069, -2.32810577,  0.36777121],
       [-0.2806291 ,  0.90730475, -0.1873943 ]])

In [1931]:
# Create a DataFrame with the principal components data
df_pca = pd.DataFrame(
    three_pca, columns=["PC 1", "PC 2", "PC 3"]
)
df_pca.head()

Unnamed: 0,PC 1,PC 2,PC 3
0,-0.32996,1.03388,-0.48377
1,-0.31332,1.03389,-0.48407
2,2.31065,1.5613,-0.59334
3,-0.1455,-1.24917,0.15963
4,-0.15573,-1.93217,0.35476


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [1932]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_pca)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

Running K-Means with `k=<your best value for k here>`

In [1933]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)
#the amount of clusters appropriate does vary with different data

# Fit the model
model.fit(df_pca)

# Predict clusters
predictions = model.predict(df_pca)

# Create a new DataFrame including predicted clusters and cryptocurrencies features

df_pred_feats = pd.DataFrame(df_pca)
df_pred_feats['CoinName'] = coin_name_df #add coin names
df_pred_feats["class"] = model.labels_ #classes/features labels column
df_pred_feats["predictions"] = predictions #add predictions columns

#adding both coins minted and coin supply
df_pred_feats[['TotalCoinsMined', 'TotalCoinSupply']] = df[['TotalCoinsMined', 'TotalCoinSupply']] 


#new df with index as coin names.. 
#coins_df = df_pred_feats
#coins_df = coins_df.set_index('CoinName')

print(df_pred_feats.shape)
df_pred_feats.head(20)

(532, 8)


Unnamed: 0,PC 1,PC 2,PC 3,CoinName,class,predictions,TotalCoinsMined,TotalCoinSupply
0,-0.32996,1.03388,-0.48377,42 Coin,0,0,41.99995,42
1,-0.31332,1.03389,-0.48407,404Coin,0,0,1055184902.04,532000000
2,2.31065,1.5613,-0.59334,EliteCoin,0,0,29279424622.5027,314159265359
3,-0.1455,-1.24917,0.15963,Bitcoin,1,1,17927175.0,21000000
4,-0.15573,-1.93217,0.35476,Ethereum,1,1,107684222.6865,0
5,-0.15857,-1.08465,-0.00507,Litecoin,1,1,63039243.3,84000000
6,-0.41638,1.15799,-0.46534,Dash,0,0,9031294.37563,22000000
7,-0.16154,-2.26766,0.26171,Monero,1,1,17201143.14491,0
8,-0.15417,-1.93229,0.35474,Ethereum Classic,1,1,113359703.0,210000000
9,-0.16132,-2.32811,0.36777,ZCash,1,1,7383056.25,21000000


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [1936]:
# Scale data to create the scatter plot
mm_scaler = MinMaxScaler()
scaled_data = mm_scaler.fit_transform(df_pred_feats[['TotalCoinsMined','TotalCoinSupply']])

plot_df = pd.DataFrame(scaled_data, columns=[['TotalCoinsMined','TotalCoinSupply']], index=df_pred_feats.index)
plot_df['CoinName'] = df_pred_feats['CoinName']
plot_df

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,CoinName
0,0.00000,0.00000,42 Coin
1,0.00107,0.00053,404Coin
2,0.02958,0.31416,EliteCoin
3,0.00002,0.00002,Bitcoin
4,0.00011,0.00000,Ethereum
...,...,...,...
527,0.00202,0.00200,ZEPHYR
528,0.00002,0.00025,Gapcoin
529,0.00099,0.00140,Beldex
530,0.00001,0.00002,Horizen


In [1938]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
plot_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply")

#plot_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply", hover_cols=["CoinName"])

ValueError: Dimensions specified as a tuple must be a tuple consisting of the name and label not: ('TotalCoinsMined',)

In [None]:
plot_minted_supply = df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply", title='Total Minted Total Supply')
plot_minted_supply

#### Table of Tradable Cryptocurrencies

In [1941]:
# Table with tradable cryptos
plot_df #other data (algos, predictions, class, proof of type etc), is in other dataframes. #TotalCoinsMined	TotalCoinSupply	CoinName


Unnamed: 0,TotalCoinsMined,TotalCoinSupply,CoinName
0,0.00000,0.00000,42 Coin
1,0.00107,0.00053,404Coin
2,0.02958,0.31416,EliteCoin
3,0.00002,0.00002,Bitcoin
4,0.00011,0.00000,Ethereum
...,...,...,...
527,0.00202,0.00200,ZEPHYR
528,0.00002,0.00025,Gapcoin
529,0.00099,0.00140,Beldex
530,0.00001,0.00002,Horizen


In [1942]:
# Print the total number of tradable cryptocurrencies
print(f'The total number of tradable cryptocurrencies: {len(plot_df.index)}.')

The total number of tradable cryptocurrencies: 532.
