# Clustering Crypto

In [29]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [30]:
# Load the crypto_data.csv dataset.
file_path = 'crypto_data.csv'
crypto_df = pd.read_csv(file_path, index_col=0)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [31]:
# Keep all the cryptocurrencies that are being traded.
crypto_df_cleaned = crypto_df[crypto_df['IsTrading']==True]
crypto_df_cleaned.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [32]:
# drop the 'IsTrading' column because all values are =
crypto_df_cleaned.drop(columns='IsTrading', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [33]:
# check for null values
crypto_df.isnull().sum()

CoinName             0
Algorithm            0
IsTrading            0
ProofType            0
TotalCoinsMined    508
TotalCoinSupply      0
dtype: int64

In [34]:
# check for overview information
crypto_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1144 entries, 42 to XBC
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         1144 non-null   object 
 1   Algorithm        1144 non-null   object 
 2   ProofType        1144 non-null   object 
 3   TotalCoinsMined  685 non-null    float64
 4   TotalCoinSupply  1144 non-null   object 
dtypes: float64(1), object(4)
memory usage: 53.6+ KB


In [35]:
# remove rows with at least 1 null value - should return 685 rows because of TotalCoinsMined
crypto_df_cleaned.dropna(inplace=True)
# crypto_df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [36]:
# keep only coins that have been mined
crypto_df_cleaned = crypto_df_cleaned[crypto_df_cleaned['TotalCoinsMined'] > 0]
mined = crypto_df_cleaned.TotalCoinsMined.unique().tolist()
mined = sorted(mined)
mined

[41.99995383,
 88.0,
 1177.0,
 42579.476901,
 84300.0,
 88213.0,
 128326.99633965,
 140777.753365,
 181919.2435974,
 200911.79151896,
 308179.0,
 329200.01639,
 406091.925,
 419275.38,
 485214.0,
 500000.0,
 595429.0,
 616448.0,
 619478.0,
 636462.55983338,
 657636.34549789,
 715659.44237941,
 795447.0,
 814671.0,
 845637.81347436,
 894026.0,
 978145.0,
 1039116.65144562,
 1042012.45227735,
 1104157.42169891,
 1104344.22938102,
 1120385.00502,
 1140734.91680375,
 1142732.14912776,
 1148324.0,
 1170292.5,
 1182153.5,
 1195525.0,
 1207310.0,
 1231147.0,
 1288862.0,
 1377917.0,
 1416663.06600024,
 1431851.00002479,
 1467841.0,
 1513704.0,
 1578281.31341127,
 1618033.0,
 1876146.443596,
 1934701.67524713,
 1939889.0,
 2022464.886973,
 2149688.0,
 2167827.1,
 2232901.0,
 2278150.0,
 2449577.41533168,
 2500124.0,
 2504486.227718,
 2526078.47525448,
 2581970.0,
 2622886.0,
 2689812.0,
 2716264.95302831,
 2922613.96424908,
 3115258.0,
 3220616.279225,
 3304487.74735637,
 3315789.0,
 3332922.5,

In [37]:
# create DF with only coin names
# coin_names_df = crypto_df['CoinName'] # this works but takes some extra steps
coin_names_df = crypto_df_cleaned.drop(columns=['Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply']) # easier way to keep column name and index
coin_names_df.head()

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [38]:
# drop 'CoinName' - irrelevant for unsupervised learning
crypto_df_cleaned.drop(columns=['CoinName'], inplace=True)
crypto_df_cleaned.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [40]:
crypto_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Algorithm        532 non-null    object 
 1   ProofType        532 non-null    object 
 2   TotalCoinsMined  532 non-null    float64
 3   TotalCoinSupply  532 non-null    object 
dtypes: float64(1), object(3)
memory usage: 20.8+ KB


In [41]:
# use get_dummies() to create columns for object features
X = pd.get_dummies(crypto_df_cleaned, columns=['Algorithm', 'ProofType'])

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
X['TotalCoinSupply'] = pd.to_numeric(X['TotalCoinSupply'])

In [45]:
# standardize data with standard scalar
X_scaled = StandardScaler().fit_transform(X)
X_scaled

array([[-0.11710817, -0.1528703 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.09396955, -0.145009  , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [ 0.52494561,  4.48942416, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       ...,
       [-0.09561336, -0.13217937, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11694817, -0.15255998, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11710536, -0.15285552, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ]])

### Deliverable 2: Reducing Data Dimensions Using PCA

In [51]:
# use PCA algorithm to reduce number of fetures to 3 principal components
pca = PCA(n_components=3)

# fit scaled data and transform with PCA
X_pca = pca.fit_transform(X_scaled)

In [54]:
# create PCA dataframe with the new data
# set index to coin names index from cyrpto_cleaned_df
pcs_df = pd.DataFrame(X_pca, columns=['PC 1', 'PC 2', 'PC 3'], index=crypto_df_cleaned.index)
pcs_df.head()

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.335775,0.987139,-0.533148
404,-0.319103,0.987165,-0.53353
1337,2.314772,1.583868,-0.584166
BTC,-0.139257,-1.340601,0.165375
ETH,-0.152787,-2.04385,0.353917


In [61]:
# check explained variance in the pca components
pca.explained_variance_ratio_ # [0.02793106, 0.02139633, 0.02048752] = 0.06981491404751769

array([0.02793106, 0.02139633, 0.02048752])

### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [63]:
# Create elbow curve to find best value for K means analysis

# list for inertia values
inertia = []
# range of k values to test
k = list(range(1,11))

# calculate inertia for each of the K values by looping through KMeans
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    # fit data to KMeans
    km.fit(pcs_df)
    # get inertia from KMeans calculation and add to inertia list
    inertia.append(km.inertia_)


# create elbow curve with data
# dictionary of elbow data
elbow_data = {'k':k, 'inertia':inertia}
# df from dictionary
df_elbow = pd.DataFrame(elbow_data)
# plot data
df_elbow.hvplot.line(x='k', y='inertia', xticks=k, title='Elbow Curve') # 4 and 5 are good candidates for clusters

  f"KMeans is known to have a memory leak on Windows "


Running K-Means with `k=4`

In [64]:
# Initialize the K-Means model. Using cluster chosen from elbow curve
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)

In [67]:
# Create new DF with predicted clusters and original features
clustered_df = pd.concat([crypto_df_cleaned, pcs_df, coin_names_df], axis=1, join='inner')
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName
42,Scrypt,PoW/PoS,41.99995,42,-0.335775,0.987139,-0.533148,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.319103,0.987165,-0.53353,404Coin
1337,X13,PoW/PoS,29279420000.0,314159265359,2.314772,1.583868,-0.584166,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000,-0.139257,-1.340601,0.165375,Bitcoin
ETH,Ethash,PoW,107684200.0,0,-0.152787,-2.04385,0.353917,Ethereum


In [69]:
# add column for predictions 
clustered_df['Class'] = model.labels_

In [71]:
# check concat and new column
print(clustered_df.shape)
clustered_df.head()

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.335775,0.987139,-0.533148,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.319103,0.987165,-0.53353,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.314772,1.583868,-0.584166,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.139257,-1.340601,0.165375,Bitcoin,1
ETH,Ethash,PoW,107684200.0,0,-0.152787,-2.04385,0.353917,Ethereum,1


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [82]:
# Create 3d scatter plot with PCA data dn clusters
fig = px.scatter_3d(
    clustered_df, 
    x='PC 1', y='PC 2', z='PC 3', 
    color='Class', symbol='Class', width=1000,
    hover_name='CoinName', hover_data=['Algorithm'])

# add legend to the cd-scatter
fig.update_layout(legend=dict(x=0, y=1))

# plot figure
fig.show()

In [92]:
# Create hvplot table with all tradeable cryptocurrencies
# get column names in list format
cols = clustered_df.columns.to_list()

# create hvplot.table
cluster_hv_table = clustered_df.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'Class'], sortable=True, selectable=True)
cluster_hv_table

In [94]:
# Print total number of tradeable cryptocurrencies
print(f"There are {clustered_df['CoinName'].count()} tradeable cryptocurrencies.")

There are 532 tradeable cryptocurrencies.


In [113]:
clustered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Algorithm        532 non-null    object 
 1   ProofType        532 non-null    object 
 2   TotalCoinsMined  532 non-null    float64
 3   TotalCoinSupply  532 non-null    object 
 4   PC 1             532 non-null    float64
 5   PC 2             532 non-null    float64
 6   PC 3             532 non-null    float64
 7   CoinName         532 non-null    object 
 8   Class            532 non-null    int32  
dtypes: float64(4), int32(1), object(4)
memory usage: 39.5+ KB


In [114]:
# scale total coin supply and total coins mined with MinMaxScaler()
# make of copy df with only the columns we want 
clustered_MinMaxScaled_df = clustered_df.filter(['TotalCoinSupply', 'TotalCoinsMined', 'CoinName', 'Class'], axis=1)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
42,42,41.99995,42 Coin,0
404,532000000,1055185000.0,404Coin,0
1337,314159265359,29279420000.0,EliteCoin,0
BTC,21000000,17927180.0,Bitcoin,1
ETH,0,107684200.0,Ethereum,1


In [115]:
# convert TotalCoinSupply to numeric data
clustered_MinMaxScaled_df['TotalCoinSupply'] = pd.to_numeric(clustered_MinMaxScaled_df['TotalCoinSupply'])

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TotalCoinSupply  532 non-null    float64
 1   TotalCoinsMined  532 non-null    float64
 2   CoinName         532 non-null    object 
 3   Class            532 non-null    int32  
dtypes: float64(2), int32(1), object(1)
memory usage: 18.7+ KB


In [119]:
# scale coin supply and coins mined with MinMaxScalar()
# initialize MinMaxScaler()
mms = MinMaxScaler()

# scale the data 
clustered_MinMaxScaled_df[['TotalCoinSupply', 'TotalCoinsMined']] = mms.fit_transform(clustered_MinMaxScaled_df[['TotalCoinSupply', 'TotalCoinsMined']])
clustered_MinMaxScaled_df.head(10)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
42,4.2e-11,0.0,42 Coin,0
404,0.000532,0.001066,404Coin,0
1337,0.3141593,0.029576,EliteCoin,0
BTC,2.1e-05,1.8e-05,Bitcoin,1
ETH,0.0,0.000109,Ethereum,1
LTC,8.4e-05,6.4e-05,Litecoin,1
DASH,2.2e-05,9e-06,Dash,0
XMR,0.0,1.7e-05,Monero,1
ETC,0.00021,0.000115,Ethereum Classic,1
ZEC,2.1e-05,7e-06,ZCash,1


In [121]:
# Create hvplot.scatter plot
clustered_MinMaxScaled_df.hvplot.scatter(x='TotalCoinsMined', y='TotalCoinSupply', hover_cols=['CoinName'], by='Class')