# Clustering Crypto

In [408]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Fetching Cryptocurrency Data

In [409]:
# CryptoCompare endpoint that returns the full coin list as JSON.
url = (
    "https://min-api.cryptocompare.com"
    "/data/all/coinlist"
)


In [410]:
# Download the coin list and decode the JSON payload.
# HINT: the DataFrame is built later from the 'Data' key of this response and then transposed.
#
# A timeout keeps the notebook from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON/KeyError in a later cell.
r = requests.get(url, timeout=30)
r.raise_for_status()
data = r.json()

In [411]:
# Alternative data source (not used here): the provided csv file.
#file_path = Path("Resources/crypto_data.csv")

# Inspect the top-level keys of the JSON response before building the DataFrame.
[*data.keys()]



In [412]:
# Keep only the per-coin records stored under the 'Data' key of the response.
data = data["Data"]

In [413]:
# Build the frame from the coin dictionary; each top-level key becomes a column.
crcl_df = pd.DataFrame.from_dict(data, orient="columns")

In [414]:
# Swap rows and columns of the frame, then preview the first rows.
crcl_df = crcl_df.transpose()
crcl_df.head(5)

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,AlgorithmType,Difficulty,BuiltOn,SmartContractAddress,DecimalPoints
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,42.0,0.0,0.0,0.0,blockchain,scrypt,0.000244,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300.0,0.0,0.0,0.0,token,,,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


### Data Preprocessing

In [415]:
# Keep only the columns needed for the analysis.
# Note: this dataset exposes the supply field as 'MaxSupply' (there is no
# 'TotalCoinSupply' column).
#
# The wanted columns are selected by name instead of dropping a long explicit
# list in place: the cell can then be re-run without a KeyError and is not
# affected when the API adds or removes unrelated fields.
keep_columns = [
    'CoinName',
    'Algorithm',
    'ProofType',
    'IsTrading',
    'TotalCoinsMined',
    'MaxSupply',
]
crcl_df = crcl_df[keep_columns]

In [416]:
# Preview the frame after the column reduction.
crcl_df.head(5)

Unnamed: 0,CoinName,Algorithm,ProofType,IsTrading,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,True,41.999952,42.0
300,300 token,,,True,300.0,300.0
365,365Coin,X11,PoW/PoS,True,0.0,-1.0
404,404Coin,Scrypt,PoW/PoS,True,0.0,-1.0
433,433 Token,,,False,,


In [417]:
# Retain only the rows whose 'IsTrading' flag equals True.
is_trading = crcl_df['IsTrading'].eq(True)
crcl_df = crcl_df[is_trading]

In [418]:
# Retain only the rows whose 'Algorithm' is not the placeholder text 'N/A'.
has_algorithm = crcl_df['Algorithm'].ne('N/A')
crcl_df = crcl_df[has_algorithm]

In [419]:
# 'IsTrading' has served its purpose as a filter; remove the column.
crcl_df = crcl_df.drop('IsTrading', axis=1)

In [420]:
# Count the null values per column (this cell only counts them; nothing is removed here).
crcl_df.isna().sum()


CoinName             0
Algorithm            0
ProofType            0
TotalCoinsMined    944
MaxSupply          944
dtype: int64

In [421]:
# Remove every row that has at least one null value, then re-count nulls to confirm.
crcl_df = crcl_df.dropna(axis=0, how='any')
crcl_df.isna().sum()

CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
MaxSupply          0
dtype: int64

In [422]:
# Discard the rows whose 'TotalCoinsMined' equals 0.
has_mined_coins = crcl_df['TotalCoinsMined'].ne(0)
crcl_df = crcl_df[has_mined_coins]


In [423]:
# Drop every row that contains the literal text 'N/A' in any column.
#
# Indexing with the element-wise mask alone (crcl_df[crcl_df != 'N/A']) keeps all
# rows and merely replaces the matching cells with NaN. Reducing the mask with
# .all(axis=1) gives one boolean per row, so offending rows are actually removed.
has_no_na_text = (crcl_df != 'N/A').all(axis=1)
crcl_df = crcl_df[has_no_na_text]

In [424]:
# Set the coin names aside in their own single-column frame (same index as crcl_df)
# before the column is removed from crcl_df.
coinName_df = crcl_df.reindex(columns=['CoinName'])

In [425]:
# 'CoinName' is not a clustering feature, so remove it and preview the result.
crcl_df = crcl_df.drop('CoinName', axis=1)
crcl_df.head(5)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,Scrypt,PoW/PoS,41.999952,42
NSR,PoS,PoS,6173272217.8311,0
TRI,X13,PoW/PoS,191620.842403,0
CMTC,Scrypt,PoW,872830.0,0
CHAT,Scrypt,PoW/PoS,1000000000.0,-1


In [426]:
# Encode the two text features as indicator (dummy) columns.
categorical_columns = ['Algorithm', 'ProofType']
crcl_dummies = pd.get_dummies(data=crcl_df, columns=categorical_columns)
crcl_dummies.head(5)

Unnamed: 0,TotalCoinsMined,MaxSupply,Algorithm_Autolykos,Algorithm_BEP-2,Algorithm_BEP-2 Token,Algorithm_BEP-20 Token,Algorithm_BEP2 Token,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake2B + SHA3,...,ProofType_PoW/PoSe,ProofType_PoW/nPoS,ProofType_ProgPoW/PoS,ProofType_Proof of Authority,ProofType_Proof-of-Work,ProofType_SPoS,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW,ProofType_dPoW/PoW
42,41.999952,42,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NSR,6173272217.8311,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRI,191620.842403,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CMTC,872830.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHAT,1000000000.0,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [427]:
# Standardize every feature column and print the first scaled row as a sanity check.
standard_scaler = StandardScaler()
crcl_scaled = standard_scaler.fit_transform(crcl_dummies)
print(crcl_scaled[:1])

[[-0.05716655 -0.0616752  -0.0571662  -0.09933993 -0.0571662  -0.93378328
  -0.0571662  -0.08097763 -0.0571662  -0.0571662  -0.09933993 -0.08097763
  -0.09933993 -0.0571662  -0.0571662  -0.0571662  -0.1927749  -0.08097763
  -0.0571662  -0.0571662  -0.0571662  -0.1927749  -0.0571662  -0.0571662
  -0.17378533 -0.0571662  -0.0571662  -0.0571662  -0.0571662  -0.08097763
  -0.0571662  -0.0571662  -0.0571662  -0.0571662  -0.0571662  -0.0571662
  -0.0571662  -0.12867125 -0.08097763 -0.0571662  -0.0571662  -0.0571662
  -0.08097763 -0.14118624 -0.0571662  -0.0571662  -0.08097763 -0.08097763
  -0.21028002 -0.08097763 -0.0571662  -0.0571662  -0.0571662   3.43389963
  -0.0571662  -0.0571662  -0.14118624 -0.0571662  -0.08097763 -0.114897
  -0.0571662  -0.14118624 -0.08097763 -0.0571662  -0.0571662  -0.0571662
  -0.0571662  -0.0571662  -0.0571662  -0.16357216 -0.0571662  -0.0571662
  -0.08097763 -0.08097763 -0.0571662  -0.20168779 -0.0571662  -0.0571662
  -0.0571662  -0.51320024  3.35857112 -0.05716

In [428]:
# Preview the unscaled feature frame again for reference.
crcl_df.head(5)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,Scrypt,PoW/PoS,41.999952,42
NSR,PoS,PoS,6173272217.8311,0
TRI,X13,PoW/PoS,191620.842403,0
CMTC,Scrypt,PoW,872830.0,0
CHAT,Scrypt,PoW/PoS,1000000000.0,-1


In [429]:
# Remove every observation whose scaled features contain a NaN.
#
# The same row mask is applied to crcl_df and coinName_df: the PCA frame built
# later reuses crcl_df.index, so filtering crcl_scaled alone would leave the
# frames misaligned (length mismatch or names attached to the wrong rows).
valid_rows = ~np.isnan(crcl_scaled).any(axis=1)
crcl_scaled = crcl_scaled[valid_rows]
crcl_df = crcl_df.loc[valid_rows]
coinName_df = coinName_df.loc[valid_rows]

# Confirm that no NaN is left in the scaled matrix (expected: 0).
np.isnan(crcl_scaled).sum()

0

### Reducing Dimensions Using PCA

In [430]:
# Project the scaled feature matrix onto 3 principal components.
pca = PCA(n_components=3)
crcl_pca = pca.fit_transform(X=crcl_scaled)

In [431]:
# Wrap the principal components in a DataFrame that shares crcl_df's index.
pc_names = ["PC 1", "PC 2", "PC 3"]
pca_df = pd.DataFrame(crcl_pca, index=crcl_df.index, columns=pc_names)
pca_df.head(5)

Unnamed: 0,PC 1,PC 2,PC 3
42,1.210695,0.016365,-0.002007
NSR,0.028795,-0.060582,-0.001279
TRI,0.5928,-0.019782,-0.001929
CMTC,2.130434,0.074707,-0.001574
CHAT,1.210695,0.016365,-0.002007


In [432]:
# Show the (rows, columns) size of the feature frame.
crcl_df.shape

(307, 4)

### Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [433]:
# Candidate cluster counts to evaluate.
k = [*range(1, 11)]

# Fit one K-Means model per candidate k on the PCA output and record its inertia.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(crcl_pca).inertia_
    for n_clusters in k
]

# Draw the Elbow Curve (inertia against k) with hvPlot.
df_elbow = pd.DataFrame({"k": k, "inertia": inertia})
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


Running K-Means with `k=5`

In [434]:
# Cluster the cryptocurrencies with K-Means (k=5) on the three principal components.
#
# The model is fitted on the explicit PC columns rather than on the whole pca_df:
# this cell adds a 'class' column to pca_df, so fitting on the full frame would
# feed the previous labels back in as a feature whenever the cell is re-run.
pc_columns = ["PC 1", "PC 2", "PC 3"]

# Initialize the K-Means model
model = KMeans(n_clusters=5, random_state=0)
# Fit the model and get the cluster label of every row in one step
predictions = model.fit_predict(pca_df[pc_columns])
# Attach the predicted clusters to the PCA frame
pca_df['class'] = predictions

# Combine the original features, the coin names, the principal components and the clusters
df_merged = crcl_df.join(coinName_df, how='outer')
df = pd.concat([df_merged, pca_df], axis=1, sort=False)

df.head()


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply,CoinName,PC 1,PC 2,PC 3,class
42,Scrypt,PoW/PoS,41.999952,42,42 Coin,1.210695,0.016365,-0.002007,2
NSR,PoS,PoS,6173272217.8311,0,NuShares,0.028795,-0.060582,-0.001279,0
TRI,X13,PoW/PoS,191620.842403,0,Triangles Coin,0.5928,-0.019782,-0.001929,2
CMTC,Scrypt,PoW,872830.0,0,CometCoin,2.130434,0.074707,-0.001574,2
CHAT,Scrypt,PoW/PoS,1000000000.0,-1,OpenChat,1.210695,0.016365,-0.002007,2


### Visualizing Results

#### 3D-Scatter with Clusters

In [443]:
# 3D scatter of the principal components, colored and marked by cluster.
scatter_3d_options = {
    "x": "PC 1",
    "y": "PC 2",
    "z": "PC 3",
    "color": "class",
    "symbol": "class",
    "hover_name": 'CoinName',
    "hover_data": ['Algorithm'],
}
fig = px.scatter_3d(df, **scatter_3d_options)

# Place the legend at x=0, y=1 of the figure.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

#### Table of Tradable Cryptocurrencies

In [436]:
# Columns to display in the table of tradable cryptos.
# This list must be a real assignment: the table cell reads `columns`, and while the
# list was only a comment that cell failed with a NameError on a fresh kernel.
columns = ['CoinName', 'Algorithm', 'ProofType', 'TotalCoinsMined', 'MaxSupply', 'class']


In [437]:
# Render the clustered cryptocurrencies as an hvPlot table restricted to the names in `columns`.
df.hvplot.table(columns=columns)

In [438]:
# Display the cluster label assigned to each cryptocurrency.
df.loc[:, 'class']


42       2
NSR      0
TRI      2
CMTC     2
CHAT     2
        ..
HPB      0
ZEN      2
ICX      0
KCASH    0
KMD      2
Name: class, Length: 307, dtype: int32

#### Scatter Plot with Tradable Cryptocurrencies

In [441]:
# Rescale the two supply columns with MinMaxScaler for the scatter plot.
scaled_columns = ['MaxSupply', 'TotalCoinsMined']
scaler = MinMaxScaler()
plotting = pd.DataFrame(
    scaler.fit_transform(df[scaled_columns]),
    index=df.index,
    columns=scaled_columns,
)

# Carry over the coin name and the cluster label for hover text and grouping.
for extra_column in ('CoinName', 'class'):
    plotting[extra_column] = df[extra_column]

plotting.head(5)

Unnamed: 0,MaxSupply,TotalCoinsMined,CoinName,class
42,2.047619e-12,5.243511e-10,42 Coin,2
NSR,4.761905e-14,5.244173e-10,NuShares,0
TRI,4.761905e-14,5.243511e-10,Triangles Coin,2
CMTC,4.761905e-14,5.243511e-10,CometCoin,2
CHAT,0.0,5.243618e-10,OpenChat,2


In [442]:
# Scatter plot with x="TotalCoinsMined" and y="MaxSupply", one series per cluster,
# showing the coin name on hover.
plotting.hvplot.scatter(
    by='class',
    x="TotalCoinsMined",
    y="MaxSupply",
    hover_cols=['CoinName'],
)
