# Clustering Crypto

In [31]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Deliverable 1: Preprocessing the Data for PCA

#### _crypto_data.csv originally started with an unnamed column. I was getting errors while trying to import it. After some troubleshooting, I named it "mystery" in Excel. Then I was able to import it as the index.

In [32]:
# Load the crypto_data.csv dataset.
file_path ="../Resources/crypto_data.csv"
# Added name to index col in Excel.
crypto_df1 = pd.read_csv(file_path, index_col="mystery")
crypto_df1.head()

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
mystery,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [3]:
# Current col names and row count.
crypto_df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1252 entries, 42 to PUNK
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         1252 non-null   object 
 1   Algorithm        1252 non-null   object 
 2   IsTrading        1252 non-null   bool   
 3   ProofType        1252 non-null   object 
 4   TotalCoinsMined  744 non-null    float64
 5   TotalCoinSupply  1252 non-null   object 
dtypes: bool(1), float64(1), object(4)
memory usage: 59.9+ KB


In [4]:
# Verify that the col only has True and False values.
crypto_df1['IsTrading'].unique()

array([ True, False])

In [5]:
# Keep all the cryptocurrencies that are being traded.
crypto_df2 = crypto_df1[crypto_df1['IsTrading'] == True]

In [6]:
# Verify values are updated.
crypto_df2['IsTrading'].unique()

array([ True])

In [7]:
# Current col names and row count.
crypto_df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1144 entries, 42 to XBC
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         1144 non-null   object 
 1   Algorithm        1144 non-null   object 
 2   IsTrading        1144 non-null   bool   
 3   ProofType        1144 non-null   object 
 4   TotalCoinsMined  685 non-null    float64
 5   TotalCoinSupply  1144 non-null   object 
dtypes: bool(1), float64(1), object(4)
memory usage: 54.7+ KB


In [8]:
# Keep all the cryptocurrencies that have a working algorithm.
# YOUR CODE HERE

This statement isn't explained, and it's not 
in the instructions. I don't know what's expected.
I'm skipping it.

In [9]:
# Remove the "IsTrading" column. 
crypto_df3 = crypto_df2.drop(columns=["IsTrading"], axis=1)

In [10]:
# Current col names and row count.
crypto_df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1144 entries, 42 to XBC
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         1144 non-null   object 
 1   Algorithm        1144 non-null   object 
 2   ProofType        1144 non-null   object 
 3   TotalCoinsMined  685 non-null    float64
 4   TotalCoinSupply  1144 non-null   object 
dtypes: float64(1), object(4)
memory usage: 53.6+ KB


In [11]:
# Remove rows that have at least 1 null value.
crypto_df4 = crypto_df3.dropna()

In [12]:
# Current col names and row count.
crypto_df4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 685 entries, 42 to XBC
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         685 non-null    object 
 1   Algorithm        685 non-null    object 
 2   ProofType        685 non-null    object 
 3   TotalCoinsMined  685 non-null    float64
 4   TotalCoinSupply  685 non-null    object 
dtypes: float64(1), object(4)
memory usage: 32.1+ KB


In [13]:
# Keep the rows where coins are mined.
crypto_df5 = crypto_df4[crypto_df4['TotalCoinsMined'] > 0]

In [14]:
# Current col names and row count.
crypto_df5.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         532 non-null    object 
 1   Algorithm        532 non-null    object 
 2   ProofType        532 non-null    object 
 3   TotalCoinsMined  532 non-null    float64
 4   TotalCoinSupply  532 non-null    object 
dtypes: float64(1), object(4)
memory usage: 24.9+ KB


In [15]:
# Verify no zero values.
crypto_df5.describe()

Unnamed: 0,TotalCoinsMined
count,532.0
mean,5340457000.0
std,45645690000.0
min,41.99995
25%,8359849.0
50%,24743970.0
75%,186725000.0
max,989989000000.0


In [16]:
# Create a new DataFrame that holds only the cryptocurrencies names.
crypto_df_coinname = crypto_df5[['CoinName']].copy()

In [17]:
# Verify df contains only required cols.
crypto_df_coinname.head()

Unnamed: 0_level_0,CoinName
mystery,Unnamed: 1_level_1
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [18]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
crypto_df6 = crypto_df5.drop(columns=["CoinName"], axis=1)

In [19]:
# Current col names and row count.
crypto_df6.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Algorithm        532 non-null    object 
 1   ProofType        532 non-null    object 
 2   TotalCoinsMined  532 non-null    float64
 3   TotalCoinSupply  532 non-null    object 
dtypes: float64(1), object(3)
memory usage: 20.8+ KB


In [20]:
# Compare to instructions for Deliverable 1.
crypto_df6.head(10)

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
mystery,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,41.99995,42.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0
1337,X13,PoW/PoS,29279420000.0,314159000000.0
BTC,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethash,PoW,107684200.0,0.0
LTC,Scrypt,PoW,63039240.0,84000000.0
DASH,X11,PoW/PoS,9031294.0,22000000.0
XMR,CryptoNight-V7,PoW,17201140.0,0.0
ETC,Ethash,PoW,113359700.0,210000000.0
ZEC,Equihash,PoW,7383056.0,21000000.0


In [21]:
# Use get_dummies() to create variables for text features.
X1 = pd.get_dummies(crypto_df6, columns=[
    "Algorithm",
    "ProofType"
])

In [22]:
# Current col names and row count.
X1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 98 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   TotalCoinsMined                   532 non-null    float64
 1   TotalCoinSupply                   532 non-null    object 
 2   Algorithm_1GB AES Pattern Search  532 non-null    uint8  
 3   Algorithm_536                     532 non-null    uint8  
 4   Algorithm_Argon2d                 532 non-null    uint8  
 5   Algorithm_BLAKE256                532 non-null    uint8  
 6   Algorithm_Blake                   532 non-null    uint8  
 7   Algorithm_Blake2S                 532 non-null    uint8  
 8   Algorithm_Blake2b                 532 non-null    uint8  
 9   Algorithm_C11                     532 non-null    uint8  
 10  Algorithm_Cloverhash              532 non-null    uint8  
 11  Algorithm_Counterparty            532 non-null    uint8  
 12  Algorithm_Cr

In [57]:
# Standardize the data with StandardScaler().
X2 = StandardScaler().fit_transform(X1)


In [58]:
X2.dtype

dtype('float64')

In [59]:
X2.shape

(532, 98)

In [60]:
X2.size

52136

In [61]:
X2.view

<function ndarray.view>

### Deliverable 2: Reducing Data Dimensions Using PCA

In [62]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
X3 = pca.fit_transform(X2)

In [63]:
X3

array([[-0.33000592,  0.98891678, -0.59329991],
       [-0.31332278,  0.98853736, -0.59372205],
       [ 2.31799094,  1.46640742, -0.6826739 ],
       ...,
       [ 0.32404176, -2.14233753,  0.45161529],
       [-0.14362667, -2.08327672,  0.37187161],
       [-0.28830217,  0.74968137, -0.27792608]])

In [86]:
# Create a DataFrame with the three principal components.
# Transform PCA data to a df
pcs_df = pd.DataFrame(
    # (Different from instructions. I don't use spaces in col names unless forced to.
    data = X3, columns = ["PC1", "PC2", "PC3"],
    index=crypto_df6.index
)
pcs_df.head(10)

Unnamed: 0_level_0,PC1,PC2,PC3
mystery,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-0.330006,0.988917,-0.5933
404,-0.313323,0.988537,-0.593722
1337,2.317991,1.466407,-0.682674
BTC,-0.145685,-1.213853,0.167308
ETH,-0.162282,-1.965664,0.45079
LTC,-0.166444,-1.023535,-0.004298
DASH,-0.396652,1.022213,-0.498802
XMR,-0.166262,-2.110359,0.44462
ETC,-0.160722,-1.965805,0.45077
ZEC,-0.143626,-2.083277,0.371872


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [87]:
# Create an elbow curve to find the best value for K.
# YOUR CODE HERE
inertia = []
k = list(range(1, 5))

Running K-Means with `k=4`

In [88]:
# Initialize the K-Means model.
# YOUR CODE HERE
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    
# Fit the model
# YOUR CODE HERE
    km.fit(pcs_df)
    inertia.append(km.inertia_)
    
# Predict clusters
# YOUR CODE HERE
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", 
                     title="Elbow Curve", 
                     xticks=k)

  "KMeans is known to have a memory leak on Windows "


In [97]:
# Initialize the K-Means model.
# YOUR CODE HERE
# Fit the model
# YOUR CODE HERE
# Predict clusters
# YOUR CODE HERE

def get_clusters(k, data):
    # Create a copy of the DataFrame
    data = data.copy()

    # Initialize the K-Means model
    model = KMeans(n_clusters=k, random_state=0)

    # Fit the model
    model.fit(data)

    # Predict clusters
    predictions = model.predict(data)

    # Create return DataFrame with predicted clusters
    data["Class"] = model.labels_

    return data

k1 = get_clusters(4, pcs_df)
k1


Unnamed: 0_level_0,PC1,PC2,PC3,Class
mystery,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,-0.330006,0.988917,-0.593300,0
404,-0.313323,0.988537,-0.593722,0
1337,2.317991,1.466407,-0.682674,0
BTC,-0.145685,-1.213853,0.167308,1
ETH,-0.162282,-1.965664,0.450790,1
...,...,...,...,...
ZEPH,2.459466,1.167605,-0.079457,0
GAP,-0.328049,0.988748,-0.593326,0
BDX,0.324042,-2.142338,0.451615,1
ZEN,-0.143627,-2.083277,0.371872,1


In [99]:
# Create a new DataFrame including predicted clusters 
# and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
# YOUR CODE HERE
clustered_df1 = pd.concat([crypto_df6, pcs_df])

#  Add a new column, "CoinName" to the clustered_df DataFrame 
# that holds the names of the cryptocurrencies. 
# YOUR CODE HERE
clustered_df2 = clustered_df1.join(crypto_df_coinname)
                                   
#  Add a new column, "Class" to the clustered_df DataFrame 
#  that holds the predictions.
# YOUR CODE HERE
clustered_df3 = pd.concat([clustered_df2, k1])


# Print the shape of the clustered_df
print(clustered_df3.shape)
clustered_df3.head(10)

(1596, 9)


Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,Class
mystery,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1337,X13,PoW/PoS,29279420000.0,314159000000.0,,,,EliteCoin,
1337,,,,,2.317991,1.466407,-0.682674,EliteCoin,
1CR,Scrypt,PoW,88213.0,92000000000.0,,,,1Credit,
1CR,,,,,0.485284,-1.085361,-0.011754,1Credit,
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,,,,404Coin,
404,,,,,-0.313323,0.988537,-0.593722,404Coin,
42,Scrypt,PoW/PoS,41.99995,42.0,,,,42 Coin,
42,,,,,-0.330006,0.988917,-0.5933,42 Coin,
8BIT,Scrypt,PoW/PoS,1467841.0,0.0,,,,8BIT Coin,
8BIT,,,,,-0.329988,0.988917,-0.5933,8BIT Coin,


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [None]:
# Creating a 3D-Scatter with the PCA data and the clusters
# YOUR CODE HERE


In [None]:
# Create a table with tradable cryptocurrencies.
# YOUR CODE HERE

In [None]:
# Print the total number of tradable cryptocurrencies.
# YOUR CODE HERE

In [None]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
# YOUR CODE HERE

In [None]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
# YOUR CODE HERE

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
# YOUR CODE HERE

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
# YOUR CODE HERE

plot_df.head(10)

In [None]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
# YOUR CODE HERE
