In [100]:
# Initial imports
import pandas as pd
from pathlib import Path


In [101]:
#Load data
file = Path('crypto_data.csv')
crypto_df = pd.read_csv(file)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [102]:
 #List dataframe data types
crypto_df.dtypes

Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [103]:
trading_df = crypto_df.loc[crypto_df['IsTrading'] == True]
trading_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [104]:
#Remove the IsTrading Column
trading_df = trading_df.drop(columns=["IsTrading"])
trading_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


In [105]:
#Find null values
for c in trading_df.columns:
    print(f"Column {c} has {trading_df[c].isnull().sum()} null values")

Column Unnamed: 0 has 0 null values
Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 459 null values
Column TotalCoinSupply has 0 null values


In [106]:
#Drop rows where TotalCoinsMinded has null values
trading_df = trading_df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

In [107]:
#Check null values again
for c in trading_df.columns:
    print(f"Column {c} has {trading_df[c].isnull().sum()} null values")

Column Unnamed: 0 has 0 null values
Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 0 null values
Column TotalCoinSupply has 0 null values


In [108]:
#Find duplicate entries
print(f"Duplicate entries: {trading_df.duplicated().sum()}")

Duplicate entries: 0


In [109]:
#Find only coins that have been mined > 0 times
trading_df = trading_df.loc[crypto_df['TotalCoinsMined'] > 0]
trading_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,PoW,107684200.0,0


In [110]:
#Drop CoinName
trading_df = trading_df.drop(columns=["CoinName"])
trading_df.head()

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42
2,404,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethash,PoW,107684200.0,0


In [112]:
#Convert algorithm and prooftype to numeric
df = pd.get_dummies(trading_df, drop_first = True)
df.head()

Unnamed: 0,TotalCoinsMined,Unnamed: 0_1CR,Unnamed: 0_404,Unnamed: 0_42,Unnamed: 0_8BIT,Unnamed: 0_AAC,Unnamed: 0_ABJ,Unnamed: 0_ABS,Unnamed: 0_ABY,Unnamed: 0_AC3,...,TotalCoinSupply_91388946,TotalCoinSupply_92000000000,TotalCoinSupply_9354000,TotalCoinSupply_9507271,TotalCoinSupply_9736000,TotalCoinSupply_98000000,TotalCoinSupply_98100000000,TotalCoinSupply_990000000000,TotalCoinSupply_999481516,TotalCoinSupply_9999999
0,41.99995,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1055185000.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,29279420000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,17927180.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Using pd.get_dummies() on the df created several additional columns (features) to split out the non-numeric values. These will need to be removed.

In [132]:
#Standardize/scale the dataset
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[['TotalCoinsMined']])
df.columns


Index(['TotalCoinsMined', 'Unnamed: 0_1CR', 'Unnamed: 0_404', 'Unnamed: 0_42',
       'Unnamed: 0_8BIT', 'Unnamed: 0_AAC', 'Unnamed: 0_ABJ', 'Unnamed: 0_ABS',
       'Unnamed: 0_ABY', 'Unnamed: 0_AC3',
       ...
       'TotalCoinSupply_91388946', 'TotalCoinSupply_92000000000',
       'TotalCoinSupply_9354000', 'TotalCoinSupply_9507271',
       'TotalCoinSupply_9736000', 'TotalCoinSupply_98000000',
       'TotalCoinSupply_98100000000', 'TotalCoinSupply_990000000000',
       'TotalCoinSupply_999481516', 'TotalCoinSupply_9999999'],
      dtype='object', length=905)

In [136]:

# Create a DataFrame with the transformed data
new_df = pd.DataFrame(scaled_data)
df["ScaledCoinsMined"] = new_df[0]
df = df.drop(columns=["TotalCoinsMined"])
df.head()


Unnamed: 0,Unnamed: 0_1CR,Unnamed: 0_404,Unnamed: 0_42,Unnamed: 0_8BIT,Unnamed: 0_AAC,Unnamed: 0_ABJ,Unnamed: 0_ABS,Unnamed: 0_ABY,Unnamed: 0_AC3,Unnamed: 0_ACC,...,TotalCoinSupply_92000000000,TotalCoinSupply_9354000,TotalCoinSupply_9507271,TotalCoinSupply_9736000,TotalCoinSupply_98000000,TotalCoinSupply_98100000000,TotalCoinSupply_990000000000,TotalCoinSupply_999481516,TotalCoinSupply_9999999,ScaledCoinsMined
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.117108
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.524946
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.115726
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.116731
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.114622


In [137]:
#Dimensionality reduction with PCA
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
