In [1]:
import pandas as pd
from path import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

### Data Preparation

In [2]:
# Location of the input CSV; a relative path, so it is resolved against the
# notebook's current working directory.
# Alternate location noted by the author (presumably relative to the directory
# that contains the repository -- confirm before using):
#   Cryptocurrency-Clusters_Unsupervised-ML/crypto_data.csv
file = Path('crypto_data.csv')

In [3]:
# Load the CSV, using its first column (index_col=0) as the row index
df = pd.read_csv(file, index_col=0)
# Preview the first 10 rows
df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [4]:
# List all algorithms in use (approach 1): distinct values of the Algorithm column
df['Algorithm'].unique()

array(['Scrypt', 'X11', 'SHA-256', 'X13', 'Ethash', 'CryptoNight-V7',
       'Equihash', 'SHA-512', 'Multiple', 'X15', 'NIST5', 'Quark',
       'Groestl', 'PoS', 'NeoScrypt', 'SHA3', 'HybridScryptHash256',
       'Scrypt-n', 'PHI1612', 'Lyra2REv2', 'CryptoNight', 'Shabal256',
       'Counterparty', 'Blake', 'Momentum', 'Stanford Folding', 'QuBit',
       'XG Hash', 'M7 POW', 'Curve25519', 'Lyra2RE', 'QUAIT', 'vDPOS',
       'Blake2b', 'BLAKE256', '1GB AES Pattern Search', 'Dagger',
       'CryptoNight-Lite', 'X11GOST', 'ScryptOG', 'SHA-256D', 'POS 3.0',
       'Progressive-n', 'DPoS', 'Lyra2Z', 'X14', 'Time Travel', 'Argon2',
       'Keccak', 'Blake2S', 'Dagger-Hashimoto', '536', 'Argon2d',
       'Cloverhash', 'Skein', 'SkunkHash v2 Raptor',
       'VeChainThor Authority', 'Ouroboros', 'POS 2.0', 'SkunkHash',
       'C11', 'Proof-of-BibleHash', 'SHA-256 + Hive',
       'Proof-of-Authority', 'XEVAN', 'VBFT', 'YescryptR16', 'IMesh',
       'X16S', 'Green Protocol', 'Semux BFT consensus'

In [5]:
# List all algorithms in use (approach 2): a Python set keeps one entry per distinct value
algorithms = set(df['Algorithm'].values)
print(algorithms)

{'Rainforest', 'Proof-of-BibleHash', 'Equihash', 'Argon2d', 'vDPOS', 'Exosis', 'IMesh', 'Ouroboros', 'Progressive-n', 'Jump Consistent Hash', 'Proof-of-Authority', 'HybridScryptHash256', 'X16S', 'SHA-256', 'X11', 'Wild Keccak', 'Argon2', 'CryptoNight-lite', 'Semux BFT consensus', 'POS 3.0', 'CryptoNight-V7', 'Cryptonight-GPU', 'Stanford Folding', 'SHA3', 'Groestl', 'HMQ1725', 'QuBit', 'Keccak', 'M00N', 'Dagger-Hashimoto', 'CryptoNight Heavy', 'Lyra2Z', 'Equihash1927', 'Avesta hash', 'SHA-512', 'SHA-256 + Hive', 'CryptoNight Heavy X', 'PHI2', 'Slatechain', 'Equihash+Scrypt', 'PHI1612', 'NeoScrypt', 'CryptoNight-Lite', 'Momentum', 'Dagger', '1GB AES Pattern Search', 'Ethash', 'Green Protocol', 'T-Inside', 'X11GOST', 'VeChainThor Authority', 'Zhash', 'X14', 'Lyra2REv2', 'Multiple', 'X16R', 'DPoS', 'Curve25519', 'Time Travel', 'ECC 256K1', 'SkunkHash v2 Raptor', 'XG Hash', 'SHA3-256', 'C11', 'PoS', 'Blake2S', 'Tribus', 'BLAKE256', 'QUAIT', 'M7 POW', 'QuarkTX', 'Scrypt', 'X15', 'Skein', 'PO

In [6]:
# Count the cryptocurrencies that are currently being traded vs. not being traded
df['IsTrading'].value_counts()

True     1144
False     108
Name: IsTrading, dtype: int64

In [7]:
# Keep only the rows whose IsTrading flag is True, then confirm no False rows remain
df = df[df['IsTrading'].eq(True)]
df['IsTrading'].value_counts()

True    1144
Name: IsTrading, dtype: int64

In [8]:
# Collect the names of the coins that are still being traded and show them
currency = df['CoinName'].values
print(currency)

['42 Coin' '365Coin' '404Coin' ... 'Beldex' 'Horizen' 'BitcoinPlus']


In [9]:
# Delete the IsTrading column; it is no longer necessary because every
# remaining row has IsTrading == True after the filter applied earlier.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so a
# second execution would otherwise raise KeyError (the column is already gone).
df = df.drop(columns='IsTrading', errors='ignore')
df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
SERO,Super Zero,Ethash,PoW,,1000000000
UOS,UOS,SHA-256,DPoI,,1000000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [10]:
# Delete all rows with null values; the result is stored in df1 and df itself is not modified
df1 = df.dropna()
df1

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [11]:
# Drop every row containing at least one missing value (same result as the
# default dropna()), then report the remaining (rows, columns)
df1 = df.dropna(how='any', axis='index')
df1.shape

(685, 5)

In [12]:
# Order TotalCoinsMined ascending so negative and zero values surface first.
# This reads from `df` (the frame before null rows were dropped), so missing
# values are still present and are placed at the end of the sorted output.
df['TotalCoinsMined'].sort_values()

FIII   -5.917978e+09
LBTC    0.000000e+00
RIPO    0.000000e+00
BASH    0.000000e+00
CSH     0.000000e+00
            ...     
XQN              NaN
NETC             NaN
VPRC             NaN
SERO             NaN
UOS              NaN
Name: TotalCoinsMined, Length: 1144, dtype: float64

In [13]:
# Select only rows with positive TotalCoinsMined values


In [None]:
# Delete the CoinName column from the original dataframe


In [None]:
# Create dummy variables for columns with string values


In [None]:
# Standardize the data


In [None]:
# print the scaled data


In [None]:
# Identify the numbers of rows and columns in the scaled data


### Reduce dataset dimensions with PCA

In [None]:
# Reduce dimensions with PCA


In [None]:
# The sum of the explained variance of the principal components


In [None]:
# Reduce dimensions with t-SNE


In [None]:
# Plot t-SNE output


In [None]:
# Identify clusters with k-means


In [None]:
# Create an elbow plot
