# Cryptocurrency Clusters Machine Learning

## Data Preparation

In [1]:
# Dependencies
import pandas as pd
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [5]:
# Load the cryptocurrency dataset (crypto_data.csv) into a pandas DataFrame.
# Data source: https://min-api.cryptocompare.com/data/all/coinlist
file = Path("Resources/crypto_data.csv")
df = pd.read_csv(filepath_or_buffer=file)
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [6]:
# Keep only the cryptocurrencies that are being traded, then remove the
# IsTrading column, which carries no further information after the filter.
# The comparison is written as `!= False` so rows are kept unless the flag
# is explicitly False.
df = df.loc[df["IsTrading"] != False].drop(columns=["IsTrading"])
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


In [8]:
# Drop every row (axis=0) in which any column holds a null value.
df = df.dropna(axis=0, how="any")
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,808,SHA-256,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [12]:
# Filter for cryptocurrencies that have been mined: keep only the rows whose
# TotalCoinsMined is strictly greater than zero.
# The filter must use TotalCoinsMined (coins actually mined so far), not
# TotalCoinSupply (the supply figure): a coin can list a positive supply while
# having nothing mined yet, and vice versa.
# The cast to float keeps the comparison numeric even if the column was read
# in as text.
df = df[df["TotalCoinsMined"].astype(float) > 0]
# Preview the first five rows of the filtered data.
df.head(5)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
9,LTC,Litecoin,Scrypt,PoW,63039240.0,84000000


In [None]:
# Delete the CoinName from the original dataframe


In [None]:
# Convert Algorithm and ProofType into numerical data
# Use Pandas to create dummy variables
# Examine the number of rows and columns of your dataset now. How did they change?


### Changes:

In [None]:
# Standardize the dataset so columns that contain larger values do not unduly influence the outcome


## Dimensionality Reduction

In [None]:
# Perform dimensionality reduction with PCA (preserve 90% of the explained variance)

# (Rather than specify the number of principal components when you instantiate the PCA model,
# it is possible to state the desired explained variance. For example, say that a dataset has 100 features.
# Using PCA(n_components=0.99) creates a model that will preserve approximately 99% of the explained variance,
# whether that means reducing the dataset to 80 principal components or 3. For this project,
# preserve 90% of the explained variance in dimensionality reduction.)

### Changes:

In [None]:
# Further reduce the dataset dimensions with t-SNE
# Run t-SNE on the principal components: the output of the PCA transformation


In [None]:
# Create a scatter plot of the t-SNE output (observe whether there are distinct clusters or not)


## Cluster Analysis with k-Means

In [None]:
# Create an elbow plot to identify the best number of clusters


In [None]:
# Use a for-loop to determine the inertia for each k from 1 through 10
# Determine, if possible, where the elbow of the plot is, and at which value of k it appears


## Recommendation