In [275]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas

In [276]:
# Read the raw CSV and promote its first column (parsed as "Unnamed: 0")
# to the row index in a single chained expression.
file_path = "./crypto_data.csv"
crypto_df = pd.read_csv(file_path).set_index("Unnamed: 0")

### Data Pre-Processing

In [277]:
# Keep only coins that are being traded: every row whose IsTrading flag
# compares equal to False is dropped by index label.
not_trading = crypto_df["IsTrading"] == False  # noqa: E712 (element-wise comparison)
crypto_df = crypto_df.drop(index=crypto_df.loc[not_trading].index)

In [278]:
# The trading flag has already been used as a row filter; remove the column.
crypto_df = crypto_df.drop(columns="IsTrading")

In [279]:
# Discard every coin (row) that has a missing value in any column.
crypto_df = crypto_df.dropna(axis="index", how="any")

In [280]:
# Drop coins whose TotalCoinsMined is exactly zero.
# NOTE(review): only exact zeros are removed; negative totals, if the data
# contains any, are kept -- confirm that this is intended.
zero_mined = crypto_df["TotalCoinsMined"] == 0
crypto_df = crypto_df.drop(index=crypto_df.loc[zero_mined].index)

crypto_df


Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [281]:
# Keep the coin names in their own single-column DataFrame, indexed exactly
# like crypto_df, before the column is removed from the feature table.
coins_name = crypto_df[["CoinName"]].copy()

coins_name


Unnamed: 0_level_0,CoinName
Unnamed: 0,Unnamed: 1_level_1
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [282]:
# CoinName is a label rather than a feature; remove it from crypto_df.
crypto_df = crypto_df.drop(columns="CoinName")

In [283]:
# One-hot encode the text columns Algorithm and ProofType; the encoded
# table is stored as the feature matrix X.
categorical_columns = ["Algorithm", "ProofType"]
X = pd.get_dummies(crypto_df, columns=categorical_columns)

In [284]:
# Standardize every column of X using a StandardScaler with default settings.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Reducing Data Dimensions Using PCA

In [285]:
# Project the standardized features onto 3 principal components.
# random_state is pinned because PCA's default svd_solver="auto" may choose a
# randomized SVD depending on the input shape and scikit-learn version, which
# would make the components vary between runs. The seed (0) matches the one
# used by the KMeans models in this notebook.
pca = PCA(n_components=3, random_state=0)
X_pca = pca.fit_transform(X_scaled)

X_pca


array([[-0.34690837,  0.97780829, -0.59869516],
       [-0.33037868,  0.9779745 , -0.5991718 ],
       [ 2.2832144 ,  1.65446572, -0.67440148],
       ...,
       [ 0.32739244, -2.25180938,  0.44095503],
       [-0.18940212, -1.72903384,  0.48658402],
       [-0.29787893,  0.61737026, -0.19567087]])

In [286]:
# Wrap the PCA output in a DataFrame named pcs_df that shares crypto_df's index.
column_names = [f"PC {n}" for n in range(1, 4)]  # "PC 1", "PC 2", "PC 3"
pcs_df = pd.DataFrame(X_pca, index=crypto_df.index, columns=column_names)

pcs_df

Unnamed: 0_level_0,PC 1,PC 2,PC 3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-0.346908,0.977808,-0.598695
404,-0.330379,0.977975,-0.599172
1337,2.283214,1.654466,-0.674401
BTC,-0.145106,-1.253471,0.200828
ETH,-0.137424,-2.042671,0.380796
...,...,...,...
ZEPH,2.568534,0.857065,-0.326931
GAP,-0.344970,0.977696,-0.598709
BDX,0.327392,-2.251809,0.440955
ZEN,-0.189402,-1.729034,0.486584


### Clustering Cryptocurrencies Using K-Means

In [287]:
# Initial imports
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [288]:
# Elbow curve to choose k: fit KMeans for k = 1..10 on the principal
# components and record the inertia of each fitted model.
# n_init is passed explicitly because its default differs between
# scikit-learn releases (10 in older versions, "auto" from 1.4), so relying
# on the default makes the curve version-dependent and triggers a
# FutureWarning on some releases.
k = list(range(1, 11))
inertia = [
    KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(pcs_df).inertia_
    for n_clusters in k
]

# Collect the results in a DataFrame and draw the curve with hvPlot.
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [289]:
# Fit the final KMeans model with 4 clusters on the principal components and
# assign a cluster label to every coin.
# n_init is passed explicitly because its default differs between
# scikit-learn releases (10 in older versions, "auto" from 1.4); pinning it
# keeps the cluster assignment reproducible across environments.
model = KMeans(n_clusters=4, n_init=10, random_state=0)
predictions = model.fit(pcs_df).predict(pcs_df)

In [290]:
# Assemble clustered_df by joining, on the shared index and in this order:
# the coin features, the principal components, the coin names and the
# cluster label (column "Class").
predictions = pd.DataFrame(predictions, index=pcs_df.index, columns=["Class"])
clustered_df = crypto_df.join(pcs_df).join(coins_name).join(predictions)
clustered_df

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,Scrypt,PoW/PoS,4.199995e+01,42,-0.346908,0.977808,-0.598695,42 Coin,0
404,Scrypt,PoW/PoS,1.055185e+09,532000000,-0.330379,0.977975,-0.599172,404Coin,0
1337,X13,PoW/PoS,2.927942e+10,314159265359,2.283214,1.654466,-0.674401,EliteCoin,0
BTC,SHA-256,PoW,1.792718e+07,21000000,-0.145106,-1.253471,0.200828,Bitcoin,1
ETH,Ethash,PoW,1.076842e+08,0,-0.137424,-2.042671,0.380796,Ethereum,1
...,...,...,...,...,...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000,2.568534,0.857065,-0.326931,ZEPHYR,0
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000,-0.344970,0.977696,-0.598709,Gapcoin,0
BDX,CryptoNight,PoW,9.802226e+08,1400222610,0.327392,-2.251809,0.440955,Beldex,1
ZEN,Equihash,PoW,7.296538e+06,21000000,-0.189402,-1.729034,0.486584,Horizen,1


### Visualizing Results

In [300]:
# 3D scatter of the clustered coins: x="PC 1", y="PC 2", z="PC 3".
# Colour and marker symbol both come from "Class"; hovering a point shows the
# coin name plus its algorithm.
scatter_options = {
    "x": "PC 1",
    "y": "PC 2",
    "z": "PC 3",
    "color": "Class",
    "symbol": "Class",
    "width": 600,
    "hover_name": "CoinName",
    "hover_data": ["Algorithm"],
}
fig = px.scatter_3d(clustered_df, **scatter_options)
# Anchor the legend at the top-left corner of the figure.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [304]:
# Interactive table of the clustered coins, limited to the descriptive
# columns and the assigned cluster class.
table_columns = [
    "CoinName",
    "Algorithm",
    "ProofType",
    "TotalCoinSupply",
    "TotalCoinsMined",
    "Class",
]
clustered_df.hvplot.table(columns=table_columns, width=600)

In [306]:
# Scatter plot of TotalCoinsMined (x) against TotalCoinSupply (y), one series
# per cluster class, with the coin name shown on hover.
clustered_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    by="Class",
    hover_cols=["CoinName"],
)