# Clustering Crypto

In [40]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Deliverable 1: Preprocessing the Data for PCA

In [41]:
# Load the crypto_data.csv dataset.
file_path ="../Resources/crypto_data.csv"

crypto_df1 = pd.read_csv(file_path,
# Added name to index col in Excel. 
# Otherwise it added col 1 as "Unnamed: 0" 
# and added additional numeric index col.
                         index_col="ind"
                        )

# Also, col "TotalCoinSupply" had non-numeric data.
# I fixed in Excel before importing. 
# Retained col "TotalCoinSupply_orig" for ref.
# I consider this part of preprocessing.
crypto_df1.head(10)

# Why the sort order changed after adding new columns, I do not know.

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply,TotalCoinSupply_orig,sort_orig
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42.0,42.0,1
365,365Coin,X11,True,PoW/PoS,,2300000000.0,2300000000.0,2
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000.0,532000000.0,3
611,SixEleven,SHA-256,True,PoW,,611000.0,611000.0,4
808,808,SHA-256,True,PoW/PoS,0.0,0.0,0.0,5
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314000000000.0,314000000000.0,6
2015,2015 coin,X11,True,PoW/PoS,,0.0,0.0,7
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000.0,21000000.0,8
ETH,Ethereum,Ethash,True,PoW,107684200.0,0.0,0.0,9
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000.0,84000000.0,10


In [42]:
# Current col names and row count.
crypto_df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1252 entries, 42 to PUNK
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CoinName              1252 non-null   object 
 1   Algorithm             1252 non-null   object 
 2   IsTrading             1252 non-null   bool   
 3   ProofType             1252 non-null   object 
 4   TotalCoinsMined       744 non-null    float64
 5   TotalCoinSupply       1252 non-null   float64
 6   TotalCoinSupply_orig  1252 non-null   object 
 7   sort_orig             1252 non-null   int64  
dtypes: bool(1), float64(2), int64(1), object(4)
memory usage: 79.5+ KB


In [43]:
# Verify that the col only has True and False values.
crypto_df1['IsTrading'].unique()

array([ True, False])

In [44]:
# Keep all the cryptocurrencies that are being traded.
crypto_df2 = crypto_df1[crypto_df1['IsTrading'] == True]

In [45]:
# Verify values are updated.
crypto_df2['IsTrading'].unique()

array([ True])

In [46]:
# Current col names and row count.
crypto_df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1144 entries, 42 to XBC
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CoinName              1144 non-null   object 
 1   Algorithm             1144 non-null   object 
 2   IsTrading             1144 non-null   bool   
 3   ProofType             1144 non-null   object 
 4   TotalCoinsMined       685 non-null    float64
 5   TotalCoinSupply       1144 non-null   float64
 6   TotalCoinSupply_orig  1144 non-null   object 
 7   sort_orig             1144 non-null   int64  
dtypes: bool(1), float64(2), int64(1), object(4)
memory usage: 72.6+ KB


In [47]:
# Keep all the cryptocurrencies that have a working algorithm.
# YOUR CODE HERE

# ###### This statement isn't explained, and it's not 
# in the instructions. I don't understand what's expected.
# What parameter would we use for "working algorithm???"
# I'm skipping it.

In [48]:
# Remove the "IsTrading" column. 
crypto_df3 = crypto_df2.drop(columns=["IsTrading", 
                                      "TotalCoinSupply_orig", 
                                      "sort_orig"], 
                             axis=1)

In [49]:
# Current col names and row count.
crypto_df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1144 entries, 42 to XBC
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         1144 non-null   object 
 1   Algorithm        1144 non-null   object 
 2   ProofType        1144 non-null   object 
 3   TotalCoinsMined  685 non-null    float64
 4   TotalCoinSupply  1144 non-null   float64
dtypes: float64(2), object(3)
memory usage: 53.6+ KB


In [50]:
# Remove rows that have at least 1 null value.
crypto_df4 = crypto_df3.dropna()

In [51]:
# Current col names and row count.
crypto_df4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 685 entries, 42 to XBC
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         685 non-null    object 
 1   Algorithm        685 non-null    object 
 2   ProofType        685 non-null    object 
 3   TotalCoinsMined  685 non-null    float64
 4   TotalCoinSupply  685 non-null    float64
dtypes: float64(2), object(3)
memory usage: 32.1+ KB


In [52]:
# Keep the rows where coins are mined.
crypto_df5 = crypto_df4[crypto_df4['TotalCoinsMined'] > 0]

In [53]:
# Current col names and row count.
crypto_df5.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         532 non-null    object 
 1   Algorithm        532 non-null    object 
 2   ProofType        532 non-null    object 
 3   TotalCoinsMined  532 non-null    float64
 4   TotalCoinSupply  532 non-null    float64
dtypes: float64(2), object(3)
memory usage: 24.9+ KB


In [54]:
# Verify no zero values.
crypto_df5.describe()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply
count,532.0,532.0
mean,5340235000.0,10343170000.0
std,45645250000.0,67731090000.0
min,41.99995,0.0
25%,8359849.0,21000000.0
50%,24743970.0,78417600.0
75%,186725000.0,500000000.0
max,990000000000.0,1000000000000.0


In [55]:
# Create a new DataFrame that holds only the cryptocurrencies names.
crypto_df_coinname = crypto_df5[['CoinName']].copy()

In [56]:
# Verify df contains only required cols.
crypto_df_coinname.head()

Unnamed: 0_level_0,CoinName
ind,Unnamed: 1_level_1
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [57]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
crypto_df6 = crypto_df5.drop(columns=["CoinName"], axis=1)

In [58]:
# Current col names and row count.
crypto_df6.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Algorithm        532 non-null    object 
 1   ProofType        532 non-null    object 
 2   TotalCoinsMined  532 non-null    float64
 3   TotalCoinSupply  532 non-null    float64
dtypes: float64(2), object(2)
memory usage: 20.8+ KB


In [59]:
# Compare to instructions for Deliverable 1.
crypto_df6.head(10)

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,41.99995,42.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0
1337,X13,PoW/PoS,29279420000.0,314000000000.0
BTC,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethash,PoW,107684200.0,0.0
LTC,Scrypt,PoW,63039240.0,84000000.0
DASH,X11,PoW/PoS,9031294.0,22000000.0
XMR,CryptoNight-V7,PoW,17201140.0,0.0
ETC,Ethash,PoW,113359700.0,210000000.0
ZEC,Equihash,PoW,7383056.0,21000000.0


In [60]:
# Use get_dummies() to create variables for text features.
X1 = pd.get_dummies(crypto_df6, columns=[
    "Algorithm",
    "ProofType"
])

In [61]:
# Current col names and row count.
X1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 98 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   TotalCoinsMined                   532 non-null    float64
 1   TotalCoinSupply                   532 non-null    float64
 2   Algorithm_1GB AES Pattern Search  532 non-null    uint8  
 3   Algorithm_536                     532 non-null    uint8  
 4   Algorithm_Argon2d                 532 non-null    uint8  
 5   Algorithm_BLAKE256                532 non-null    uint8  
 6   Algorithm_Blake                   532 non-null    uint8  
 7   Algorithm_Blake2S                 532 non-null    uint8  
 8   Algorithm_Blake2b                 532 non-null    uint8  
 9   Algorithm_C11                     532 non-null    uint8  
 10  Algorithm_Cloverhash              532 non-null    uint8  
 11  Algorithm_Counterparty            532 non-null    uint8  
 12  Algorithm_Cr

In [62]:
# Standardize the data with StandardScaler().
X2 = StandardScaler().fit_transform(X1)

In [63]:
X2.shape

(532, 98)

### Deliverable 2: Reducing Data Dimensions Using PCA

In [64]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
X3 = pca.fit_transform(X2)

In [65]:
X3

array([[-0.33099627,  1.01050516, -0.52118525],
       [-0.31432791,  1.01080367, -0.52146029],
       [ 2.30872071,  1.70988676, -0.53351465],
       ...,
       [ 0.32781462, -2.2576194 ,  0.38977669],
       [-0.15168085, -2.11984549,  0.4300205 ],
       [-0.28708374,  0.8110956 , -0.28865468]])

In [66]:
# Create a DataFrame with the three principal components.
# Transform PCA data to a df
pcs_df = pd.DataFrame(
    # (Different from instructions. I don't use spaces in col names unless forced to.
    data = X3, columns = ["PC1", "PC2", "PC3"],
    index=crypto_df6.index
)
pcs_df.head(10)

Unnamed: 0_level_0,PC1,PC2,PC3
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-0.330996,1.010505,-0.521185
404,-0.314328,1.010804,-0.52146
1337,2.308721,1.709887,-0.533515
BTC,-0.144442,-1.343681,0.15095
ETH,-0.155638,-2.043498,0.304137
LTC,-0.162014,-1.138035,-0.016486
DASH,-0.395952,1.246703,-0.452033
XMR,-0.161112,-2.182366,0.442156
ETC,-0.154079,-2.043574,0.304128
ZEC,-0.15168,-2.119845,0.43002


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [67]:
# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 5))

Running K-Means with `k=4`

In [68]:
# Initialize the K-Means model.
# YOUR CODE HERE
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    
# Fit the model
# YOUR CODE HERE
    km.fit(pcs_df)
    inertia.append(km.inertia_)
    
# Predict clusters
# YOUR CODE HERE
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", 
                     title="Elbow Curve", 
                     xticks=k)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



In [69]:
# Initialize the K-Means model.
# YOUR CODE HERE
# Fit the model
# YOUR CODE HERE
# Predict clusters
# YOUR CODE HERE

def get_clusters(k, data):
    # Create a copy of the DataFrame
    data = data.copy()

    # Initialize the K-Means model
    model = KMeans(n_clusters=k, random_state=0)

    # Fit the model
    model.fit(data)

    # Predict clusters
    predictions = model.predict(data)

    # Create return DataFrame with predicted clusters
    data["Class"] = model.labels_

    return data

k1 = get_clusters(4, pcs_df)
k1


Unnamed: 0_level_0,PC1,PC2,PC3,Class
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,-0.330996,1.010505,-0.521185,0
404,-0.314328,1.010804,-0.521460,0
1337,2.308721,1.709887,-0.533515,0
BTC,-0.144442,-1.343681,0.150950,3
ETH,-0.155638,-2.043498,0.304137,3
...,...,...,...,...
ZEPH,2.468549,0.647778,-0.086526,0
GAP,-0.329040,1.010419,-0.521198,0
BDX,0.327815,-2.257619,0.389777,3
ZEN,-0.151681,-2.119845,0.430020,3


In [70]:
# Create a new DataFrame including predicted clusters 
# and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
# YOUR CODE HERE
clustered_df1 = pd.concat([crypto_df6, pcs_df])

#  Add a new column, "CoinName" to the clustered_df DataFrame 
# that holds the names of the cryptocurrencies. 
# YOUR CODE HERE
clustered_df2 = clustered_df1.join(crypto_df_coinname)
                                   
#  Add a new column, "Class" to the clustered_df DataFrame 
#  that holds the predictions.
# YOUR CODE HERE
clustered_df3 = pd.concat([clustered_df2, k1])


# Print the shape of the clustered_df
print(clustered_df3.shape)
clustered_df3.head(10)

(1596, 9)


Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,Class
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1337,X13,PoW/PoS,29279420000.0,314000000000.0,,,,EliteCoin,
1337,,,,,2.308721,1.709887,-0.533515,EliteCoin,
1CR,Scrypt,PoW,88213.0,92000000000.0,,,,1Credit,
1CR,,,,,0.489387,-1.172365,-0.019836,1Credit,
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,,,,404Coin,
404,,,,,-0.314328,1.010804,-0.52146,404Coin,
42,Scrypt,PoW/PoS,41.99995,42.0,,,,42 Coin,
42,,,,,-0.330996,1.010505,-0.521185,42 Coin,
8BIT,Scrypt,PoW/PoS,1467841.0,0.0,,,,8BIT Coin,
8BIT,,,,,-0.330978,1.010506,-0.521186,8BIT Coin,


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [71]:
# Creating a 3D-Scatter with the PCA data and the clusters
# YOUR CODE HERE
fig = px.scatter_3d(
    clustered_df3,
    x="PC1",
    y="PC2",
    z="PC3",
    color="Class",
    symbol="Class",
    width=800,
    hover_name="CoinName",
    hover_data=["Algorithm"]
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [None]:
# Create a table with tradable cryptocurrencies.

# ###### I don't know the parameters to use to identify
# "tradable" cryptocurrencies.
clustered_df3.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'Class'], sortable=True, selectable=True)

In [None]:
# Print the total number of tradable cryptocurrencies.
print(len(clustered_df3))

In [None]:
# Scaling data to create the scatter plot with 
# tradable cryptocurrencies.
# YOUR CODE HERE


# ###### Not enough time to finish this Challenge.
# I might be able to re-visit later.







In [None]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
# YOUR CODE HERE

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
# YOUR CODE HERE

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
# YOUR CODE HERE

plot_df.head(10)

In [None]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
# YOUR CODE HERE
