In [1]:
%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path

## Data Preparation

### Read data into pandas

In [58]:
# Load the raw cryptocurrency table from the CSV file in the working directory.
csv_path = Path('crypto_data.csv')
orig_data_df = pd.read_csv(csv_path)
# Show (rows, columns) of the freshly loaded data.
orig_data_df.shape

(1252, 7)

### Discard non-traded currencies; then drop 'IsTrading' column

In [59]:
# Work on an independent copy: a plain assignment would only bind a second name
# to the same DataFrame, so any in-place cleaning step applied to `working_df`
# would also modify the raw data held in `orig_data_df`.
working_df = orig_data_df.copy()
working_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [60]:
# Keep every row whose 'IsTrading' flag is not False. Building a new frame from a
# boolean mask (instead of dropping rows with inplace=True) leaves the object that
# `working_df` previously referred to untouched, so the raw data is preserved.
is_traded = working_df['IsTrading'].ne(False)
working_df = working_df[is_traded]
working_df.shape

(1144, 7)

In [61]:
# The rows have already been filtered on 'IsTrading'; remove the column itself.
working_df = working_df.drop('IsTrading', axis=1)
working_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


### Remove all rows that have at least one null value

In [64]:
# Keep only the rows in which every column holds a value (no nulls anywhere).
has_all_values = working_df.notna().all(axis=1)
working_df = working_df[has_all_values]
working_df.shape

(685, 6)

### Filter for mined currencies. Total coins mined should be greater than zero.

In [65]:
# Retain only currencies with a strictly positive number of mined coins.
mined_mask = working_df['TotalCoinsMined'].gt(0)
working_df2 = working_df.loc[mined_mask]
working_df2

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
5,1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
8,ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...,...
1238,ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


### Delete 'CoinName' from the working dataframe.
* Open question (TODO): do we need a way to identify the currencies within the model?

In [66]:
# Remove the 'CoinName' column from the working frame.
working_df2 = working_df2.drop('CoinName', axis=1)
working_df2

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,4.199995e+01,42
2,404,Scrypt,PoW/PoS,1.055185e+09,532000000
5,1337,X13,PoW/PoS,2.927942e+10,314159265359
7,BTC,SHA-256,PoW,1.792718e+07,21000000
8,ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
1238,ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
1242,GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,BDX,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Equihash,PoW,7.296538e+06,21000000


In [40]:
# Count the distinct values in each text column; one-hot encoding is expected
# to add one new column per distinct value.
alg_types, proof_types = (
    working_df2[col].nunique() for col in ('Algorithm', 'ProofType')
)

print(f'The number of algorithm types are: {alg_types}; the number of ProofTypes are: {proof_types}')

The number of algorithm types are: 71; the number of ProofTypes are: 25


### Convert the remaining text-valued features (i.e., 'Algorithm' & 'ProofType') into numeric columns
* use pandas to create dummy variables

In [42]:
# One-hot encode the two categorical text columns; the other columns pass through.
# NOTE(review): the resulting column list contains near-duplicate labels (e.g.
# 'ProofType_PoW/PoS' vs 'ProofType_PoW/PoS ', 'ProofType_DPOS' vs 'ProofType_DPoS'),
# so the raw labels may need normalising before encoding - confirm before relying
# on the category counts.
categorical_cols = ['ProofType', 'Algorithm']
dummy_df = pd.get_dummies(working_df2, columns=categorical_cols)

Index(['Unnamed: 0', 'TotalCoinsMined', 'TotalCoinSupply', 'ProofType_DPOS',
       'ProofType_DPoS', 'ProofType_HPoW', 'ProofType_LPoS', 'ProofType_POBh',
       'ProofType_PoA', 'ProofType_PoC', 'ProofType_PoS', 'ProofType_PoS/LPoS',
       'ProofType_PoS/PoW', 'ProofType_PoS/PoW/PoT', 'ProofType_PoST',
       'ProofType_PoW', 'ProofType_PoW + Hive', 'ProofType_PoW and PoS',
       'ProofType_PoW/PoS', 'ProofType_PoW/PoS ', 'ProofType_PoW/PoW',
       'ProofType_PoW/nPoS', 'ProofType_Pos', 'ProofType_Proof of Authority',
       'ProofType_Proof of Trust', 'ProofType_TPoS',
       'ProofType_Zero-Knowledge Proof', 'ProofType_dPoW/PoW',
       'Algorithm_1GB AES Pattern Search', 'Algorithm_536',
       'Algorithm_Argon2d', 'Algorithm_BLAKE256', 'Algorithm_Blake',
       'Algorithm_Blake2S', 'Algorithm_Blake2b', 'Algorithm_C11',
       'Algorithm_Cloverhash', 'Algorithm_Counterparty',
       'Algorithm_CryptoNight', 'Algorithm_CryptoNight Heavy',
       'Algorithm_CryptoNight-V7', 'Algo

In [67]:
# Preview the first rows of the one-hot encoded frame.
dummy_df.head()

Unnamed: 0.1,Unnamed: 0,TotalCoinsMined,TotalCoinSupply,ProofType_DPOS,ProofType_DPoS,ProofType_HPoW,ProofType_LPoS,ProofType_POBh,ProofType_PoA,ProofType_PoC,...,Algorithm_Tribus,Algorithm_VBFT,Algorithm_VeChainThor Authority,Algorithm_X11,Algorithm_X11GOST,Algorithm_X13,Algorithm_X14,Algorithm_X15,Algorithm_X16R,Algorithm_XEVAN
0,42,41.99995,42,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,404,1055185000.0,532000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1337,29279420000.0,314159265359,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,BTC,17927180.0,21000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,ETH,107684200.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Examine number of rows & columns in dataset... 
Q: How did they change?
* A: The number of rows remained the same, and the number of columns now equals the 3 unchanged columns plus one column for each of the 71 algorithm types plus one column for each of the 25 proof types (3 + 71 + 25 = 99).

In [45]:
# Take an independent copy rather than a second name for the same object, so that
# later edits to `crypto_df` cannot alter `dummy_df`.
crypto_df = dummy_df.copy()

### Standardize dataset so columns with larger values do not unduly influence the outcome

In [51]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [47]:
# 'Unnamed: 0' holds text identifiers (e.g. 'BTC', 'ETH') and was causing errors.
# One-hot encoding it seems extraneous because the values are presumably just
# abbreviations of the coin names; the row index can identify each currency
# later if needed.
# Derive the result from `dummy_df` (the frame `crypto_df` was taken from) instead
# of reassigning `crypto_df` from itself, so re-running this cell does not raise
# KeyError on an already-dropped column.
crypto_df = dummy_df.drop(columns=['Unnamed: 0'])
crypto_df

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,ProofType_DPOS,ProofType_DPoS,ProofType_HPoW,ProofType_LPoS,ProofType_POBh,ProofType_PoA,ProofType_PoC,ProofType_PoS,...,Algorithm_Tribus,Algorithm_VBFT,Algorithm_VeChainThor Authority,Algorithm_X11,Algorithm_X11GOST,Algorithm_X13,Algorithm_X14,Algorithm_X15,Algorithm_X16R,Algorithm_XEVAN
0,4.199995e+01,42,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,2.000000e+09,2000000000,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1242,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1245,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1246,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
# Fit a standard scaler on the feature frame, then apply it to the same data.
scaler = StandardScaler()
scaler.fit(crypto_df)
crypto_scaled = scaler.transform(crypto_df)
crypto_scaled

array([[-0.11710817, -0.1528703 , -0.0433963 , ..., -0.08703883,
        -0.08703883, -0.10680283],
       [-0.09396955, -0.145009  , -0.0433963 , ..., -0.08703883,
        -0.08703883, -0.10680283],
       [ 0.52494561,  4.48942416, -0.0433963 , ..., -0.08703883,
        -0.08703883, -0.10680283],
       ...,
       [-0.09561336, -0.13217937, -0.0433963 , ..., -0.08703883,
        -0.08703883, -0.10680283],
       [-0.11694817, -0.15255998, -0.0433963 , ..., -0.08703883,
        -0.08703883, -0.10680283],
       [-0.11710536, -0.15285552, -0.0433963 , ..., -0.08703883,
        -0.08703883, -0.10680283]])

## Dimensionality Reduction

### Perform dimensional reduction with PCA.
* use explained variance set to 90%
* 'PCA(n_components=0.90)'

In [52]:
# Build a PCA model that keeps enough components to explain 90% of the variance,
# then fit it on the scaled data and project the data onto those components.
variance_to_keep = .90
pca = PCA(n_components=variance_to_keep)
crypto_pca = pca.fit_transform(crypto_scaled)

In [57]:
# Dimensions of the PCA output: (rows, retained components).
crypto_pca.shape


(532, 74)

#### Q: How did the number of features change after reduction?
* A: The number of columns in the data was reduced from 98 to 74.

### Further reduce dataset with t-SNE & visually inspect results
* run t-SNE on the output from the PCA transformation
* create scatterplot of t-SNE output


#### Are there distinct clusters in the t-SNE scatterplot?

## Cluster Analysis with k-Means
* create an elbow plot to identify the best number of clusters
* use a for-loop to determine the inertia for each 'k' from 1 through 10
* determine, if possible, where the elbow of the plot is, and at which value of 'k' it appears

## Recommendation
* Based on your findings, make a brief (1-2 sentences) recommendation to your clients
* Can the cryptocurrencies be clustered together? If so, into how many clusters?