# Clustering Crypto

In [79]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

### Deliverable 1: Preprocessing the Data for PCA

In [2]:
# Load the crypto_data.csv dataset.
file_path = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path)
crypto_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [3]:
# Dataframe shape
crypto_df.shape

(1252, 7)

In [4]:
# List dataframe info
crypto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1252 entries, 0 to 1251
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1252 non-null   object 
 1   CoinName         1252 non-null   object 
 2   Algorithm        1252 non-null   object 
 3   IsTrading        1252 non-null   bool   
 4   ProofType        1252 non-null   object 
 5   TotalCoinsMined  744 non-null    float64
 6   TotalCoinSupply  1252 non-null   object 
dtypes: bool(1), float64(1), object(5)
memory usage: 60.0+ KB


In [5]:
# Set index to the unamed:0 column
crypto_df.set_index("Unnamed: 0", inplace=True)

In [6]:
# Conditional statement for .loc
(crypto_df["IsTrading"] == True)

Unnamed: 0
42       True
365      True
404      True
611      True
808      True
        ...  
XBC      True
DVTC    False
GIOT    False
OPSC    False
PUNK    False
Name: IsTrading, Length: 1252, dtype: bool

In [7]:
# Use .loc to get all IsTrading == True
crypto_trading_df = crypto_df.loc[(crypto_df["IsTrading"] == True)]

crypto_trading_df.shape

(1144, 6)

In [8]:
# Drop "IsTrading" column
crypto_trading_df.drop(columns=["IsTrading"], inplace=True)
crypto_trading_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [9]:
crypto_trading_df.shape

(1144, 5)

In [10]:
# Find null values
for column in crypto_trading_df.columns:
    print(f"Columnn {column} has {crypto_trading_df[column].isnull().sum()} null values")

Columnn CoinName has 0 null values
Columnn Algorithm has 0 null values
Columnn ProofType has 0 null values
Columnn TotalCoinsMined has 459 null values
Columnn TotalCoinSupply has 0 null values


In [11]:
# Drop null rows
clean_crypto_df = crypto_trading_df.dropna()

In [12]:
clean_crypto_df.shape

(685, 5)

In [13]:
# Keep the rows where coins are mined.
clean_crypto_df = clean_crypto_df[(clean_crypto_df["TotalCoinsMined"] > 0)]
clean_crypto_df

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [14]:
# Create a new DataFrame that holds only the cryptocurrencies names.
crypto_names_df=clean_crypto_df.copy()
crypto_names_df=pd.DataFrame(crypto_names_df["CoinName"], index=crypto_names_df.index)
crypto_names_df

Unnamed: 0_level_0,CoinName
Unnamed: 0,Unnamed: 1_level_1
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [15]:
crypto_names_df.shape

(532, 1)

In [16]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
clean_crypto_df = clean_crypto_df.drop(columns=["CoinName"])
clean_crypto_df.head(10)

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0
LTC,Scrypt,PoW,63039240.0,84000000
DASH,X11,PoW/PoS,9031294.0,22000000
XMR,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethash,PoW,113359700.0,210000000
ZEC,Equihash,PoW,7383056.0,21000000


In [17]:
clean_crypto_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Algorithm        532 non-null    object 
 1   ProofType        532 non-null    object 
 2   TotalCoinsMined  532 non-null    float64
 3   TotalCoinSupply  532 non-null    object 
dtypes: float64(1), object(3)
memory usage: 20.8+ KB


In [18]:
# Recast TotalCoinSupply as a nmeric using pandas
clean_crypto_df["TotalCoinSupply"] = pd.to_numeric(clean_crypto_df["TotalCoinSupply"],errors='coerce')
clean_crypto_df.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,41.99995,42.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0
1337,X13,PoW/PoS,29279420000.0,314159300000.0
BTC,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethash,PoW,107684200.0,0.0


In [19]:
clean_crypto_df.dtypes

Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply    float64
dtype: object

In [20]:
# Save our clean_crypto_df as clean_crypto.csv
output_file_path = "Resources/clean_crypto.csv"
clean_crypto_df.to_csv(output_file_path,index=True)

In [21]:
# Use get_dummies() to create variables for text features.
X_encoded = pd.get_dummies(clean_crypto_df, columns=["Algorithm","ProofType"])
X_encoded.shape

(532, 98)

In [22]:
#Save X_encoded as csv
output_file_path = "Resources/X_encoded.csv"
X_encoded.to_csv(output_file_path,index=True)

In [23]:
X_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 98 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   TotalCoinsMined                   532 non-null    float64
 1   TotalCoinSupply                   532 non-null    float64
 2   Algorithm_1GB AES Pattern Search  532 non-null    uint8  
 3   Algorithm_536                     532 non-null    uint8  
 4   Algorithm_Argon2d                 532 non-null    uint8  
 5   Algorithm_BLAKE256                532 non-null    uint8  
 6   Algorithm_Blake                   532 non-null    uint8  
 7   Algorithm_Blake2S                 532 non-null    uint8  
 8   Algorithm_Blake2b                 532 non-null    uint8  
 9   Algorithm_C11                     532 non-null    uint8  
 10  Algorithm_Cloverhash              532 non-null    uint8  
 11  Algorithm_Counterparty            532 non-null    uint8  
 12  Algorithm_Cr

In [24]:
# Standardize the data with StandardScaler().
scaler = StandardScaler()

#train/fit our data
X_scaled = scaler.fit_transform(X_encoded)
X_scaled[:5]

X_encoded

Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42,4.199995e+01,4.200000e+01,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1.055185e+09,5.320000e+08,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,2.927942e+10,3.141593e+11,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,1.792718e+07,2.100000e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,1.076842e+08,0.000000e+00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2.000000e+09,2.000000e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.493105e+07,2.500000e+08,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BDX,9.802226e+08,1.400223e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEN,7.296538e+06,2.100000e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Deliverable 2: Reducing Data Dimensions Using PCA

In [26]:
# Using PCA to reduce dimension to three principal components.

# Initialize PCA model for 3 principal components
pca=PCA(n_components=3)

# Get 3 principal components for the X_scaled data where X is our feature matrix
X_pca=pca.fit_transform(X_scaled)

X_pca

array([[-0.34644617,  1.06540953, -0.57502066],
       [-0.32980064,  1.0654939 , -0.57543409],
       [ 2.3156011 ,  1.60946883, -0.66612485],
       ...,
       [ 0.32764012, -2.33690239,  0.42756782],
       [-0.15626268, -2.00557073,  0.42194253],
       [-0.2924619 ,  0.82968789, -0.27694141]])

In [27]:
# Create a DataFrame with the three principal components.
X_pca_df = pd.DataFrame(data=X_pca,
                       columns=["PC 1", "PC 2", "PC 3"],
                       index=X_encoded.index)

X_pca_df

Unnamed: 0_level_0,PC 1,PC 2,PC 3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-0.346446,1.065410,-0.575021
404,-0.329801,1.065494,-0.575434
1337,2.315601,1.609469,-0.666125
BTC,-0.148329,-1.297586,0.161358
ETH,-0.128509,-2.055625,0.396078
...,...,...,...
ZEPH,2.475722,0.902873,-0.134579
GAP,-0.344493,1.065279,-0.575045
BDX,0.327640,-2.336902,0.427568
ZEN,-0.156263,-2.005571,0.421943


In [31]:
# Varience ratio

pca.explained_variance_ratio_

array([0.027929  , 0.02139424, 0.02051146])

### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [34]:
# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(X_pca_df)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


Running K-Means with `k=4`

In [35]:
# Initialize the K-Means model.
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(X_pca_df)

# Predict clusters
predictions = model.predict(X_pca_df)

# Add the predicted class columns
X_pca_df["class"] = model.labels_
X_pca_df.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,-0.346446,1.06541,-0.575021,2
404,-0.329801,1.065494,-0.575434,2
1337,2.315601,1.609469,-0.666125,2
BTC,-0.148329,-1.297586,0.161358,0
ETH,-0.128509,-2.055625,0.396078,0


In [36]:
X_pca_df["class"].value_counts()

2    285
0    240
1      6
3      1
Name: class, dtype: int64

In [61]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
crypto_df = pd.read_csv("Resources/clean_crypto.csv")
crypto_df=crypto_df.set_index("Unnamed: 0")
clustered_df=pd.concat([crypto_df,X_pca_df],axis=1)

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_df["CoinName"]=crypto_names_df["CoinName"]

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,class,CoinName
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,Scrypt,PoW/PoS,41.99995,42.0,-0.346446,1.06541,-0.575021,2,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.329801,1.065494,-0.575434,2,404Coin
1337,X13,PoW/PoS,29279420000.0,314159300000.0,2.315601,1.609469,-0.666125,2,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000.0,-0.148329,-1.297586,0.161358,0,Bitcoin
ETH,Ethash,PoW,107684200.0,0.0,-0.128509,-2.055625,0.396078,0,Ethereum
LTC,Scrypt,PoW,63039240.0,84000000.0,-0.174189,-1.096824,-0.003279,0,Litecoin
DASH,X11,PoW/PoS,9031294.0,22000000.0,-0.394604,1.195955,-0.49601,2,Dash
XMR,CryptoNight-V7,PoW,17201140.0,0.0,-0.147832,-2.21527,0.393997,0,Monero
ETC,Ethash,PoW,113359700.0,210000000.0,-0.126952,-2.055737,0.39606,0,Ethereum Classic
ZEC,Equihash,PoW,7383056.0,21000000.0,-0.156262,-2.005571,0.421942,0,ZCash


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [67]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    symbol="class",
    width=800,
    hover_name=clustered_df["CoinName"],
    #hover_data=clustered_df["Algorithm"],
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [68]:
# Create a table with tradable cryptocurrencies.

clustered_df.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'class' ], sortable=True, selectable=True)

In [69]:
# Print the total number of tradable cryptocurrencies.
print(f"The total number of tradable cryptocurrencies: {clustered_df.shape[0]}")

The total number of tradable cryptocurrencies: 532


In [70]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
mm_scaler=MinMaxScaler()

plot_data = mm_scaler.fit_transform(clustered_df[["TotalCoinsMined","TotalCoinSupply"]])

plot_data[:5]

array([[0.00000000e+00, 4.20000000e-11],
       [1.06585544e-03, 5.32000000e-04],
       [2.95755135e-02, 3.14159265e-01],
       [1.81084216e-05, 2.10000000e-05],
       [1.08773140e-04, 0.00000000e+00]])

In [71]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
plot_df = pd.DataFrame(plot_data, columns=["TotalCoinsMined","TotalCoinSupply"], index=clustered_df.index)

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
plot_df["CoinName"]= clustered_df["CoinName"]

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
plot_df["class"]=clustered_df["class"]

plot_df.head(10)

Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply,CoinName,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,0.0,4.2e-11,42 Coin,2
404,0.001066,0.000532,404Coin,2
1337,0.029576,0.3141593,EliteCoin,2
BTC,1.8e-05,2.1e-05,Bitcoin,0
ETH,0.000109,0.0,Ethereum,0
LTC,6.4e-05,8.4e-05,Litecoin,0
DASH,9e-06,2.2e-05,Dash,2
XMR,1.7e-05,0.0,Monero,0
ETC,0.000115,0.00021,Ethereum Classic,0
ZEC,7e-06,2.1e-05,ZCash,0


In [89]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
plot_df.hvplot.scatter(
x="TotalCoinsMined", y="TotalCoinSupply", hovers_color=["CoinName"], by="class")

#hvplot.save(plot,"plot.html")

