# Cryptocurrency Clusters Machine Learning

## Data Preparation

In [27]:
# Dependencies
import pandas as pd
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [28]:
# Load the raw cryptocurrency listing from the local Resources folder.
# Data source: https://min-api.cryptocompare.com/data/all/coinlist
file = Path("Resources", "crypto_data.csv")
df = pd.read_csv(file)
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [29]:
# Keep only the rows whose IsTrading flag is not False, then remove the
# IsTrading column itself from the dataframe.
df = df.loc[df["IsTrading"] != False].drop(columns="IsTrading")
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


In [30]:
# Drop every row (axis="index") in which any column holds a null value.
df = df.dropna(axis="index", how="any")
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,808,SHA-256,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [31]:
# Filter for cryptocurrencies that have been mined (the total coins mined should be greater than zero)
# The condition is evaluated on TotalCoinsMined, not TotalCoinSupply: the supply
# column describes how many coins may exist, not how many have been mined so far.
df = df[df["TotalCoinsMined"] > 0]
df.head(5)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
9,LTC,Litecoin,Scrypt,PoW,63039240.0,84000000


In [32]:
# Remove the CoinName column from the dataframe ...
df = df.drop(columns="CoinName")
# ... and then remove whichever column is now first in the frame
# (shown as "Unnamed: 0" in the previews above).
df = df.drop(columns=df.columns[0])
df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,42
2,Scrypt,PoW/PoS,1.055185e+09,532000000
5,X13,PoW/PoS,2.927942e+10,314159265359
7,SHA-256,PoW,1.792718e+07,21000000
9,Scrypt,PoW,6.303924e+07,84000000
...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000
1242,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,CryptoNight,PoW,9.802226e+08,1400222610
1246,Equihash,PoW,7.296538e+06,21000000


In [33]:
# One-hot encode the two categorical columns (Algorithm and ProofType) with
# pandas dummy variables; each generated column is prefixed with its source
# column name.
# Examine the number of rows and columns of your dataset now. How did they change?
df = pd.get_dummies(
    data=df,
    columns=["Algorithm", "ProofType"],
    prefix=["Algorithm", "ProofType"],
)
df

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,6.303924e+07,84000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1242,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1245,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1246,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Changes:
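The number of rows remained the same after adding dummy variables, but the number of columns increased by 101, because `pd.get_dummies` replaces `Algorithm` and `ProofType` with one indicator column for each distinct value.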

In [35]:
# Format all data as numeric
# DataFrame.apply returns a new frame instead of modifying df in place, so the
# converted result has to be assigned back; otherwise any column still stored
# as text keeps its original dtype.
df = df.apply(pd.to_numeric)
df

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,6.303924e+07,84000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1242,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1245,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1246,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Standardize every column so that features with large magnitudes do not
# dominate the outcome: fit the scaler on the frame, then transform that
# same frame.
scaled_data = StandardScaler().fit(df).transform(df)
scaled_data

array([[-0.09782131, -0.03965512, -0.03965258, ..., -0.03965258,
        -0.03965258, -0.03965258],
       [-0.07228807, -0.03965497, -0.03965258, ..., -0.03965258,
        -0.03965258, -0.03965258],
       [ 0.61067897, -0.03956909, -0.03965258, ..., -0.03965258,
        -0.03965258, -0.03965258],
       ...,
       [-0.074102  , -0.03965474, -0.03965258, ..., -0.03965258,
        -0.03965258, -0.03965258],
       [-0.09764475, -0.03965511, -0.03965258, ..., -0.03965258,
        -0.03965258, -0.03965258],
       [-0.09781821, -0.03965512, -0.03965258, ..., -0.03965258,
        -0.03965258, -0.03965258]])

## Dimensionality Reduction

In [None]:
# Perform dimensionality reduction with PCA (preserve 90% of the explained variance)

# (Rather than specify the number of principal components when you instantiate the PCA model,
# it is possible to state the desired explained variance. For example, say that a dataset has 100 features.
# Using PCA(n_components=0.99) creates a model that will preserve approximately 99% of the explained variance,
# whether that means reducing the dataset to 80 principal components or 3. For this project,
# preserve 90% of the explained variance in dimensionality reduction.)

### Changes:

In [None]:
# Further reduce the dataset dimensions with t-SNE
# Run t-SNE on the principal components: the output of the PCA transformation


In [None]:
# Create a scatter plot of the t-SNE output (observe whether there are distinct clusters or not)


## Cluster Analysis with k-Means

In [None]:
# Create an elbow plot to identify the best number of clusters


In [None]:
# Use a for-loop to determine the inertia for each k from 1 through 10
# Determine, if possible, where the elbow of the plot is, and at which value of k it appears


## Recommendation