In [177]:
# Initial imports
import pandas as pd
from pathlib import Path


In [178]:
#Load data
file = Path('crypto_data.csv')
crypto_df = pd.read_csv(file)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [179]:
 #List dataframe data types
crypto_df.dtypes

Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [180]:
trading_df = crypto_df.loc[crypto_df['IsTrading'] == True]
trading_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [181]:
#Remove the IsTrading Column
trading_df = trading_df.drop(columns=["IsTrading"])
trading_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


In [182]:
#Find null values
for c in trading_df.columns:
    print(f"Column {c} has {trading_df[c].isnull().sum()} null values")

Column Unnamed: 0 has 0 null values
Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 459 null values
Column TotalCoinSupply has 0 null values


In [183]:
#Drop rows where TotalCoinsMinded has null values
trading_df = trading_df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

In [184]:
#Check null values again
for c in trading_df.columns:
    print(f"Column {c} has {trading_df[c].isnull().sum()} null values")

Column Unnamed: 0 has 0 null values
Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 0 null values
Column TotalCoinSupply has 0 null values


In [185]:
#Find duplicate entries
print(f"Duplicate entries: {trading_df.duplicated().sum()}")

Duplicate entries: 0


In [186]:
#Find only coins that have been mined > 0 times
trading_df = trading_df.loc[crypto_df['TotalCoinsMined'] > 0]
trading_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,PoW,107684200.0,0


In [187]:
#Drop CoinName
trading_df = trading_df.drop(columns = ["CoinName"])
trading_df.head()

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42
2,404,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethash,PoW,107684200.0,0


In [188]:
#Convert algorithm and prooftype to numeric
df = pd.get_dummies(trading_df, drop_first = True)
df.head()

Unnamed: 0,TotalCoinsMined,Unnamed: 0_1CR,Unnamed: 0_404,Unnamed: 0_42,Unnamed: 0_8BIT,Unnamed: 0_AAC,Unnamed: 0_ABJ,Unnamed: 0_ABS,Unnamed: 0_ABY,Unnamed: 0_AC3,...,TotalCoinSupply_91388946,TotalCoinSupply_92000000000,TotalCoinSupply_9354000,TotalCoinSupply_9507271,TotalCoinSupply_9736000,TotalCoinSupply_98000000,TotalCoinSupply_98100000000,TotalCoinSupply_990000000000,TotalCoinSupply_999481516,TotalCoinSupply_9999999
0,41.99995,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1055185000.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,29279420000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,17927180.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Using pd.get_dummies() on the df created several additional columns (features) to split out the non-numeric values. These will need to be removed.

In [189]:
#Standardize/scale the dataset
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[['TotalCoinsMined']])
#df.columns


In [190]:

# Create a DataFrame with the transformed data
new_df = pd.DataFrame(scaled_data)
df["ScaledCoinsMined"] = new_df[0]
df = df.drop(columns = ["TotalCoinsMined"])
df.head()

Unnamed: 0,Unnamed: 0_1CR,Unnamed: 0_404,Unnamed: 0_42,Unnamed: 0_8BIT,Unnamed: 0_AAC,Unnamed: 0_ABJ,Unnamed: 0_ABS,Unnamed: 0_ABY,Unnamed: 0_AC3,Unnamed: 0_ACC,...,TotalCoinSupply_92000000000,TotalCoinSupply_9354000,TotalCoinSupply_9507271,TotalCoinSupply_9736000,TotalCoinSupply_98000000,TotalCoinSupply_98100000000,TotalCoinSupply_990000000000,TotalCoinSupply_999481516,TotalCoinSupply_9999999,ScaledCoinsMined
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.117108
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.524946
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.115726
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.116731
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.114622


In [191]:
#Drop NaN from 
df = df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
df.head()


Unnamed: 0,Unnamed: 0_1CR,Unnamed: 0_404,Unnamed: 0_42,Unnamed: 0_8BIT,Unnamed: 0_AAC,Unnamed: 0_ABJ,Unnamed: 0_ABS,Unnamed: 0_ABY,Unnamed: 0_AC3,Unnamed: 0_ACC,...,TotalCoinSupply_92000000000,TotalCoinSupply_9354000,TotalCoinSupply_9507271,TotalCoinSupply_9736000,TotalCoinSupply_98000000,TotalCoinSupply_98100000000,TotalCoinSupply_990000000000,TotalCoinSupply_999481516,TotalCoinSupply_9999999,ScaledCoinsMined
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.117108
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.524946
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.115726
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.116731
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.114622


In [192]:
df.reset_index(drop = True)


Unnamed: 0,Unnamed: 0_1CR,Unnamed: 0_404,Unnamed: 0_42,Unnamed: 0_8BIT,Unnamed: 0_AAC,Unnamed: 0_ABJ,Unnamed: 0_ABS,Unnamed: 0_ABY,Unnamed: 0_AC3,Unnamed: 0_ACC,...,TotalCoinSupply_92000000000,TotalCoinSupply_9354000,TotalCoinSupply_9507271,TotalCoinSupply_9736000,TotalCoinSupply_98000000,TotalCoinSupply_98100000000,TotalCoinSupply_990000000000,TotalCoinSupply_999481516,TotalCoinSupply_9999999,ScaledCoinsMined
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.117108
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.524946
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.115726
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.116731
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.114622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.117035
201,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.095180
202,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.114915
203,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,21.591857


In [193]:
#Dimensionality reduction with PCA
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
pca = PCA(n_components=0.90)

#Get principal components from pca
df_pca_fit = pca.fit_transform(df)

In [194]:
df_pca = pd.DataFrame(data = df_pca_fit)
df_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,103,104,105,106,107,108,109,110,111,112
0,-0.182916,0.603811,-0.793648,-0.357388,0.069294,-0.154007,0.009431,-0.017663,0.026587,-0.058051,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,7.216975e-21
1,0.45911,0.583356,-0.801878,-0.34864,0.070816,-0.151806,-0.006469,-0.017902,0.023249,-0.04849,...,0.02212424,0.05074614,-0.007475727,0.01300631,0.009620461,-0.01894785,0.03590671,-0.002287839,-0.0168313,-0.788325
2,-0.155406,0.911972,0.121997,-0.264279,0.034555,0.028959,0.660881,0.17297,-0.585173,0.367806,...,-2.217142e-16,-4.836007e-16,4.909606e-16,-2.2846070000000003e-17,5.3281950000000006e-17,-4.0478950000000004e-17,-6.982618e-16,-5.866764e-17,-7.421140000000001e-17,7.536446e-15
3,-0.210908,-0.539931,0.844908,-0.462287,0.584308,0.158866,-0.488141,0.675025,0.039806,0.058184,...,1.226635e-16,3.373246e-16,2.6295980000000003e-17,-1.625291e-16,1.544026e-16,2.766061e-16,2.207943e-16,-3.706354e-16,1.260367e-16,-2.345318e-15
4,-0.20386,-0.391811,0.401706,0.281969,-0.349816,0.954825,0.174062,-0.029771,0.068026,-0.299212,...,3.8945650000000004e-17,4.299355e-16,3.548041e-16,7.359808e-17,1.8859510000000002e-17,4.084694e-16,7.359808e-18,-1.799895e-16,1.047239e-16,-7.163817e-16


In [199]:
#Check explained variance
#pca.explained_variance_ratio_
pca.explained_variance_ratio_.cumsum() #Equals 0.90054597

array([0.42600936, 0.50039072, 0.54867197, 0.58223884, 0.6063158 ,
       0.62576676, 0.63958724, 0.64906311, 0.65721079, 0.66394979,
       0.67017784, 0.67617053, 0.68172618, 0.68702158, 0.69193011,
       0.6966946 , 0.70113867, 0.70550588, 0.7095482 , 0.71339582,
       0.71712583, 0.72051819, 0.72388867, 0.72723134, 0.73056481,
       0.73384929, 0.73706401, 0.74007569, 0.74299732, 0.74583765,
       0.74862071, 0.75125943, 0.75377221, 0.75628499, 0.75879776,
       0.76131054, 0.76382331, 0.76633609, 0.76884307, 0.77133844,
       0.77381281, 0.7762534 , 0.77868899, 0.78106087, 0.78340968,
       0.78572716, 0.78802607, 0.79026581, 0.79244591, 0.79462198,
       0.79659306, 0.79835977, 0.80003495, 0.80171014, 0.80338532,
       0.8050605 , 0.80673569, 0.80841087, 0.81008605, 0.81176124,
       0.81343642, 0.81511161, 0.81678679, 0.81846197, 0.82013716,
       0.82181234, 0.82348752, 0.82516271, 0.82683789, 0.82851307,
       0.83018826, 0.83186344, 0.83353863, 0.83521381, 0.83688

In [None]:
#Run t-SNE to further reduce