# Clustering Crypto

In [689]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from pathlib import Path

### Fetching Cryptocurrency Data

In [690]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [691]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.

#data = requests.get(url) # works
#data.json()
#text = json.loads(data)

In [692]:
# Alternatively, use the provided csv file:
file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame
df = pd.read_csv(file_path)
df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055184902.04000,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.00000,0
...,...,...,...,...,...,...,...
1247,XBC,BitcoinPlus,Scrypt,True,PoS,128326.99634,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,21491213.46445,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


### Data Preprocessing

In [693]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
df = df[['CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply']]
#or drop unnamed

df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055184902.04000,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.00000,0
...,...,...,...,...,...,...
1247,BitcoinPlus,Scrypt,True,PoS,128326.99634,1000000
1248,DivotyCoin,Scrypt,False,PoW/PoS,21491213.46445,100000000
1249,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [694]:
# Keep only cryptocurrencies that are trading
df = df.loc[df.IsTrading, :]
df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055184902.04000,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.00000,0
...,...,...,...,...,...,...
1243,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,SHA-256,True,DPoI,,1000000000
1245,Beldex,CryptoNight,True,PoW,980222595.00000,1400222610
1246,Horizen,Equihash,True,PoW,7296537.50000,21000000


In [695]:
# Keep only cryptocurrencies with a working algorithm
df.Algorithm.unique() #to read/see algos

#what?
df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055184902.04000,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.00000,0
...,...,...,...,...,...,...
1243,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,SHA-256,True,DPoI,,1000000000
1245,Beldex,CryptoNight,True,PoW,980222595.00000,1400222610
1246,Horizen,Equihash,True,PoW,7296537.50000,21000000


In [696]:
# Remove the "IsTrading" column
df.drop('IsTrading', inplace=True, axis=1)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1055184902.04000,532000000
3,SixEleven,SHA-256,PoW,,611000
4,808,SHA-256,PoW/PoS,0.00000,0
...,...,...,...,...,...
1243,Super Zero,Ethash,PoW,,1000000000
1244,UOS,SHA-256,DPoI,,1000000000
1245,Beldex,CryptoNight,PoW,980222595.00000,1400222610
1246,Horizen,Equihash,PoW,7296537.50000,21000000


In [697]:
# Remove rows with at least 1 null value
#data['duration']=data['duration'].replace(0, np.nan) #replace 0 with NaN
df = df.dropna(axis='rows')
df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055184902.04000,532000000
4,808,SHA-256,PoW/PoS,0.00000,0
5,EliteCoin,X13,PoW/PoS,29279424622.50270,314159265359
7,Bitcoin,SHA-256,PoW,17927175.00000,21000000
...,...,...,...,...,...
1238,ZEPHYR,SHA-256,DPoS,1999999995.30560,2000000000
1242,Gapcoin,Scrypt,PoW/PoS,14931046.15466,250000000
1245,Beldex,CryptoNight,PoW,980222595.00000,1400222610
1246,Horizen,Equihash,PoW,7296537.50000,21000000


In [698]:
# Remove rows with cryptocurrencies having no coins mined
#replace 0 with nan or drop 0 

#reformat sci notif
#pd.options.display.float_format = '{:.5f}'.format

#df['TotalCoinsMined'] = df['TotalCoinsMined'].round().fillna(0)

df.drop(df[df['TotalCoinsMined'] <= 0].index, inplace = True)
df = df.reset_index(drop=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,404Coin,Scrypt,PoW/PoS,1055184902.04000,532000000
2,EliteCoin,X13,PoW/PoS,29279424622.50270,314159265359
3,Bitcoin,SHA-256,PoW,17927175.00000,21000000
4,Ethereum,Ethash,PoW,107684222.68650,0
...,...,...,...,...,...
527,ZEPHYR,SHA-256,DPoS,1999999995.30560,2000000000
528,Gapcoin,Scrypt,PoW/PoS,14931046.15466,250000000
529,Beldex,CryptoNight,PoW,980222595.00000,1400222610
530,Horizen,Equihash,PoW,7296537.50000,21000000


In [699]:
# Drop rows where there are 'N/A' text values

#df.drop(df[df[['CoinName', 'Algorithm', 'ProofType']] == 'N/A'], inplace = True)

#df[df[['CoinName', 'Algorithm', 'ProofType']].str.contains('N/A')==False ]

#df = df.drop(df[df[['CoinName', 'Algorithm', 'ProofType']] == 'N/A'].index, inplace = True)

df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,404Coin,Scrypt,PoW/PoS,1055184902.04000,532000000
2,EliteCoin,X13,PoW/PoS,29279424622.50270,314159265359
3,Bitcoin,SHA-256,PoW,17927175.00000,21000000
4,Ethereum,Ethash,PoW,107684222.68650,0
...,...,...,...,...,...
527,ZEPHYR,SHA-256,DPoS,1999999995.30560,2000000000
528,Gapcoin,Scrypt,PoW/PoS,14931046.15466,250000000
529,Beldex,CryptoNight,PoW,980222595.00000,1400222610
530,Horizen,Equihash,PoW,7296537.50000,21000000


In [700]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coin_name_df = pd.DataFrame(df['CoinName'])
coin_name_df

Unnamed: 0,CoinName
0,42 Coin
1,404Coin
2,EliteCoin
3,Bitcoin
4,Ethereum
...,...
527,ZEPHYR
528,Gapcoin
529,Beldex
530,Horizen


In [701]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
df.drop('CoinName', inplace=True, axis=1)
df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,41.99995,42
1,Scrypt,PoW/PoS,1055184902.04000,532000000
2,X13,PoW/PoS,29279424622.50270,314159265359
3,SHA-256,PoW,17927175.00000,21000000
4,Ethash,PoW,107684222.68650,0
...,...,...,...,...
527,SHA-256,DPoS,1999999995.30560,2000000000
528,Scrypt,PoW/PoS,14931046.15466,250000000
529,CryptoNight,PoW,980222595.00000,1400222610
530,Equihash,PoW,7296537.50000,21000000


In [702]:
# Create dummy variables for text features
df = pd.get_dummies(data=df, columns=['Algorithm', 'ProofType'])

df

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1055184902.04000,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,29279424622.50270,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,17927175.00000,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,107684222.68650,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527,1999999995.30560,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
528,14931046.15466,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
529,980222595.00000,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
530,7296537.50000,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [703]:
# Standardize data
df_scaled = StandardScaler().fit_transform(df)

### Reducing Dimensions Using PCA

In [704]:
# Use PCA to reduce dimensions to 3 principal components
# Initialize PCA model
pca = PCA(n_components=3)

# Get two principal components for the iris data.
three_pca = pca.fit_transform(df_scaled)

In [720]:
# Create a DataFrame with the principal components data
df_pca = pd.DataFrame(
    data=three_pca, columns=["PC 1", "PC 2", "PC 3"]
)
df_pca.head()

Unnamed: 0,PC 1,PC 2,PC 3
0,-0.33524,1.05147,-0.55221
1,-0.31856,1.05159,-0.55258
2,2.29155,1.53919,-0.64788
3,-0.15128,-1.35607,0.17044
4,-0.15368,-1.99691,0.35894


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [706]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_pca)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

Running K-Means with `k=<your best value for k here>`

In [725]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(df_pca)

# Predict clusters
predictions = model.predict(df_pca)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
df_pred_feats = pd.DataFrame(df)
df_pred_feats["class"] = model.labels_
df_pred_feats["predictions"] = predictions
#df_pred_feats = pd.DataFrame({"class": model.labels_, "predictions": predictions})

df_pred_feats.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW,class,predictions
0,41.99995,42,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,1055184902.04,532000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,29279424622.5027,314159265359,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,17927175.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,107684222.6865,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [726]:
# Scale data to create the scatter plot
clusters = df_pred_feats 
#scale?

In [727]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
clusters.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply", by='class')

#### Table of Tradable Cryptocurrencies

In [728]:
# Table with tradable cryptos
df_pred_feats
coin_name_df
#

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW,class,predictions
0,41.99995,42,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,1055184902.04000,532000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,29279424622.50270,314159265359,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,17927175.00000,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,107684222.68650,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527,1999999995.30560,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
528,14931046.15466,250000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
529,980222595.00000,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
530,7296537.50000,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [733]:
# Print the total number of tradable cryptocurrencies
print(f'The total number of tradable cryptocurrencies: "{len(df_pred_feats.index)}".')

The total number of tradable cryptocurrencies: "532".
