 # Clustering Crypto

 ## Installing External Libraries

In [1]:
# Install the altair plotting library: https://altair-viz.github.io/
!pip install -U altair

Collecting altair
  Downloading altair-4.1.0-py3-none-any.whl (727 kB)
[K     |████████████████████████████████| 727 kB 8.6 MB/s eta 0:00:01     |█████▍                          | 122 kB 8.6 MB/s eta 0:00:01
Installing collected packages: altair
Successfully installed altair-4.1.0


In [2]:
# Initial imports
import requests
import pandas as pd
import altair as alt
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [3]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"
response = requests.get(url).json()

In [4]:
# Create a DataFrame
crypto_df = pd.DataFrame(response['Data']).T
crypto_df.head()

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,BlockReward,BlockTime,AssetLaunchDate,MaxSupply,MktCapPenalty,PlatformType,BuiltOn,SmartContractAddress,Difficulty,IsUsedInDefi
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,0.0,0.0,0000-00-00,0.0,0.0,,,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,0.0,0.0,2017-07-01,-1.0,0.0,token,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,,
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,,,,,,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,0.0,0.0,0000-00-00,0.0,0.0,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


In [5]:
# Alternatively, use the provided csv file:
# file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame
# crypto_df = pd.read_csv(file_path, index_col=0)
# crypto_df.head(10)

 ### Data Preprocessing

In [6]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
crypto_df=crypto_df[['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply']]
crypto_df.head(10)

KeyError: "['TotalCoinSupply'] not in index"

In [None]:
# Keep only cryptocurrencies that are trading
crypto_df = crypto_df[crypto_df["IsTrading"] == True]
print(crypto_df.shape)
crypto_df.head(10)

In [None]:
# Keep only cryptocurrencies with a working algorithm
crypto_df = crypto_df[crypto_df["Algorithm"] != "N/A"]
print(crypto_df.shape)
crypto_df.head(10)

In [None]:
# Remove the "IsTrading" column
crypto_df.drop("IsTrading", axis=1, inplace=True)
print(crypto_df.shape)
crypto_df.head(10)

In [None]:
# Remove rows with at least 1 null value
crypto_df = crypto_df.dropna(axis=0, how="any")
print(crypto_df.shape)
crypto_df.head(10)

In [7]:
# Remove rows with cryptocurrencies withouhaving no coins mined
crypto_df = crypto_df[crypto_df["TotalCoinsMined"] > 0]
print(crypto_df.shape)
crypto_df.head(10)

(408, 30)


Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,BlockReward,BlockTime,AssetLaunchDate,MaxSupply,MktCapPenalty,PlatformType,BuiltOn,SmartContractAddress,Difficulty,IsUsedInDefi
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,0.0,0,2017-07-01,-1.0,0,token,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,,
MAID,5293,/coins/maid/overview,/media/352247/maid.png,1430209540,MAID,MAID,MaidSafe Coin,MaidSafe Coin (MAID),MaidSafe is a fully decentralized platform on ...,Finished,...,0.0,0,2014-06-12,-1.0,0,blockchain,,,,
MONA,5296,/coins/mona/overview,/media/35309574/mona.png,1430209574,MONA,MONA,MonaCoin,MonaCoin (MONA),"Monacoin, conceived in December of 2013, is th...",,...,12.5,94,2014-01-01,-1.0,0,blockchain,,,3277770.0,
TRI,5341,/coins/tri/overview,/media/350568/tri.png,1430210244,TRI,TRI,Triangles Coin,Triangles Coin (TRI),Triangle is a PoW/PoS hybrid with a 33% annual...,,...,0.0,0,2014-10-11,0.0,0,,,,125.809,
DGD,18907,/coins/dgd/overview,/media/37305723/dgd.png,1461915042,DGD,DGD,Digix DAO,Digix DAO (DGD),Digix is an asset tokenization platform that p...,Finished,...,0.0,0,2016-04-28,2000000.0,0,token,ETH,0xe0b7927c4af23765cb51314a0e0521a9645f0e2a,,
BNT,22327,/coins/bnt/overview,/media/1383549/bnt.jpg,1467197288,BNT,BNT,Bancor Network Token,Bancor Network Token (BNT),The Bancor Protocol is a blockchain-based syst...,Finished,...,0.0,0,2017-02-13,-1.0,0,token,ETH,0x1f573d6fb3f13d689ff844b4ce37794d79a7ff1c,,
KMD,26132,/coins/kmd/overview,/media/35651353/komodo300x300.jpg,1472484166,KMD,KMD,Komodo,Komodo (KMD),Komodo is a privacy-centric cryptocurrency tha...,Finished,...,3.0,60,2016-09-13,200000000.0,0,blockchain,,,197528000.0,
ARDR,30173,/coins/ardr/overview,/media/351736/ardr.png,1476417509,ARDR,ARDR,Ardor,Ardor (ARDR),Ardor is a multichain blockchain platform with...,,...,0.0,0,2017-09-24,998999495.0,0,blockchain,,,,
GNT,33022,/coins/gnt/overview,/media/351995/golem_logo.png,1479426901,GNT,GNT,Golem Network Token,Golem Network Token (GNT),The Golem Network is a decentralized computati...,Finished,...,0.0,0,2016-11-17,-1.0,0,token,ETH,0xa74476443119A942dE498590Fe1f2454d7D4aC0d,,
MKR,41192,/coins/mkr/overview,/media/1382296/mkr.png,1485505585,MKR,MKR,Maker,Maker (MKR),Maker DAO is a decentralized autonomous organi...,,...,0.0,0,2015-08-15,-1.0,0,,ETH,0x9f8f72aa9304c8b593d555f12ef6589cc3a579a2,,


In [8]:
# Drop rows where there are 'N/A' text values
crypto_df = crypto_df[crypto_df.iloc[:] != 'N/A'].dropna()
crypto_df.head(10)

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,BlockReward,BlockTime,AssetLaunchDate,MaxSupply,MktCapPenalty,PlatformType,BuiltOn,SmartContractAddress,Difficulty,IsUsedInDefi


In [9]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coins_name = pd.DataFrame(crypto_df["CoinName"], index=crypto_df.index)
print(coins_name.shape)
coins_name.head()

(0, 1)


Unnamed: 0,CoinName


In [10]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df = crypto_df.drop("CoinName", axis=1)
print(crypto_df.shape)
crypto_df.head(10)

(0, 29)


Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,FullName,Description,AssetTokenStatus,Algorithm,...,BlockReward,BlockTime,AssetLaunchDate,MaxSupply,MktCapPenalty,PlatformType,BuiltOn,SmartContractAddress,Difficulty,IsUsedInDefi


In [11]:
# Create dummy variables for text features
X = pd.get_dummies(data=crypto_df, columns=["Algorithm", "ProofType"])
print(X.shape)
X.head(10)

(0, 27)


Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,FullName,Description,AssetTokenStatus,SortOrder,...,BlockReward,BlockTime,AssetLaunchDate,MaxSupply,MktCapPenalty,PlatformType,BuiltOn,SmartContractAddress,Difficulty,IsUsedInDefi


In [12]:
# Standardize data
X = StandardScaler().fit_transform(X)
X[:5]

ValueError: Found array with 0 sample(s) (shape=(0, 27)) while a minimum of 1 is required by StandardScaler.

 ### Reducing Dimensions Using PCA

In [13]:
# Use PCA to reduce dimension to 3 principal components
n_comp = 3
pca = PCA(n_components=n_comp)
principal_components = pca.fit_transform(X)
principal_components

ValueError: Found array with 0 sample(s) (shape=(0, 27)) while a minimum of 1 is required.

In [14]:
# Create a DataFrame with the principal components data
col_names = [f"PC {i}" for i in range(1, n_comp + 1)]
pcs_df = pd.DataFrame(principal_components, columns=col_names, index=crypto_df.index)
print(pcs_df.shape)
pcs_df.head(10)

NameError: name 'principal_components' is not defined

 ### Clustering Crytocurrencies Using K-Means

 #### Finde the Best Value for `k` Using the Elbow Curve

In [15]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
alt.Chart(df_elbow).mark_line().encode(x="k", y="inertia")


NameError: name 'pcs_df' is not defined

 Running K-Means with `k=4`

In [16]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
clustered_df = pd.concat([crypto_df, pcs_df], axis=1, sort=False)
clustered_df["CoinName"] = coins_name["CoinName"]
clustered_df["Class"] = model.labels_
print(clustered_df.shape)
clustered_df.head(10)


NameError: name 'pcs_df' is not defined

 ### Visualizing Results

 #### Scatter Plot for Clusters

In [17]:
# Scatter plot to visualize clusters using two principal components
alt.Chart(clustered_df).mark_circle(size=60).encode(
    x="PC 1",
    y="PC 2",
    color=alt.Color(
        "Class",
        scale=alt.Scale(domain=[0, 1, 2, 3], range=["red", "green", "blue", "orange"]),
    ),
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"],
).interactive()


NameError: name 'clustered_df' is not defined

 #### Scatter Plot with Tradable Cryptocurrencies

In [18]:
# Scale data to create the scatter plot
mm_scaler = MinMaxScaler()
plot_data = mm_scaler.fit_transform(
    clustered_df[["TotalCoinSupply", "TotalCoinsMined"]]
)
plot_df = pd.DataFrame(
    plot_data, columns=["TotalCoinSupply", "TotalCoinsMined"], index=clustered_df.index
)
plot_df["CoinName"] = clustered_df["CoinName"]
plot_df["Class"] = clustered_df["Class"]
plot_df.head()



NameError: name 'clustered_df' is not defined

In [19]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
alt.Chart(plot_df).mark_circle(size=60).encode(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    color=alt.Color(
        "Class",
        scale=alt.Scale(domain=[0, 1, 2, 3], range=["red", "green", "blue", "orange"]),
    ),
    tooltip=["CoinName", "TotalCoinsMined", "TotalCoinSupply"],
).interactive()


NameError: name 'plot_df' is not defined

 #### Table of Tradable Cryptocurrencies

In [20]:
# Table with tradable cryptos
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(
        clustered_df[
            [
                "CoinName",
                "Algorithm",
                "ProofType",
                "TotalCoinSupply",
                "TotalCoinsMined",
                "Class",
            ]
        ]
    )



NameError: name 'clustered_df' is not defined