In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import os
import pathlib
from sklearn.decomposition import PCA


In [8]:
feature = 'volatility_pcnt'
feature_list = ['symbol','pr_count','issues_count','commit_add_sum','commit_del_sum','commit_count',
                'star_count','close','high','low','open','usd_market_cap','usd_volume','volatility_pcnt']
coins = ['BTC', 'BCH', 'BCD', 'BTG', 'DASH', 'DCR', 'DOGE', 'ETN', 'LTC', 'PIVX', 'XLM', 'XMR', 
                'XRB', 'XRP', 'ZEC','ADA', 'ARK', 'BTS', 'DGB', 'DRGN', 'EOS', 'FCT', 'GNT', 'LSK', 
                'NEO', 'OMG', 'QTUM', 'REP', 'RHOC', 'SNT', 'STEEM', 'STRAT', 'WAVES','ZRX','ETH','SC','BCN',
               'XVG','ZCL']
labels = ['payment','utility','payment_utility','asset_utility','unknown']

In [9]:
dfs = []
hist_symbol = []
feat_cols = []
for path in pathlib.Path("github_data").iterdir():
    filename = path.name
    symbol = filename.split('.')[0]
    #name=('{}_'+feature).format(symbol)
    sdf = pd.read_csv(path,
                          usecols=[feature_list.index(feature)],
                          header=0,
                          #names=[('{}_'+feature).format(symbol)])
                          names=[symbol])
    #feat_cols.append(name)
    feat_cols.append(symbol)
    dfs.append(sdf)
    hist_symbol.append(symbol)
hist_symbol = set(hist_symbol)

In [10]:
for df in dfs:
    symbol = df.columns.values[0].split("_",1)[0]
    df[symbol+'_return_pcnt'] = (df[symbol+"_close"] - df[symbol+"_open"]) / df[symbol+"_close"]
    df[symbol+'_volatility_pcnt'] = (df[symbol+"_high"] - df[symbol+"_low"]) / ((df[symbol+"_high"] + df[symbol+"_low"]) / 2)

KeyError: 'ADA_close'

In [None]:
feature_df = pd.concat(dfs, axis=1)
feature_df.describe()

In [None]:
feature_df = feature_df.fillna(0)

## Perform PCA on _feature_df_  for both the first 2 principal components and 3 components

In [None]:
pca = PCA(n_components=3)
pca_result = pca.fit_transform(feature_df[feat_cols].values)

feature_df['pca-one'] = pca_result[:,0]
feature_df['pca-two'] = pca_result[:,1] 
feature_df['pca-three'] = pca_result[:,2]

print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

## t-SNE on 2 and 3 components

In [None]:
import time
from sklearn.manifold import TSNE

time_start = time.time()
tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(feature_df[feat_cols].values)

print ('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

## Map symbols to coin function labels before plotting

In [None]:
feature_df.rename(columns={'ADA':'utility','BTC':'payment', 'BCH':'payment', 'BCD':'payment', 'BTG':'payment', 'DASH':'payment', 'DCR':'payment', 
    'DOGE':'payment', 'ETN':'payment', 'LTC':'payment', 'PIVX':'payment', 'XLM':'payment', 'XMR':'payment', 
    'XRB':'payment', 'XRP':'payment', 'ZEC':'payment','ARK':'utility', 'BTS':'utility', 
    'DGB':'utility', 'DRGN':'utility', 'EOS':'utility', 'FCT':'utility', 'GNT':'utility', 'LSK':'utility', 
    'NEO':'utility', 'OMG':'utility', 'QTUM':'utility', 'REP':'utility', 'RHOC':'utility', 'SNT':'utility', 
    'STEEM':'utility', 'STRAT':'utility', 'WAVES':'utility','ZRX':'utility','ETH':'payment_utility',
    'SC':'asset_utility','BCN':'unknown','XVG':'unknown', 'ZCL':'unknown'},
                 inplace=True)
#print(feature_df)

In [None]:
#print(feature_df.columns.values)
#for i in range(0,38):
    #print(feature_df.columns[i])

## Plot PCA for both the 1st 2 principal components and 3 components

In [None]:
labels = ['payment','utility','payment_utility','asset_utility','unknown']
# label-to-color dictionary
label_color_dict = {'utility':'red','payment':'green','payment_utility':'blue','asset_utility':'magenta','unknown':'yellow'}
# Color vector creation
cvec = [label_color_dict[label] for label in labels]

In [None]:
# Create the scatter plot
plt.figure(figsize=(8,8))
plt.scatter(pca_result[:,0], pca_result[:,1],
            c=cvec, edgecolor='', alpha=0.5)
plt.xlabel("pca-one")
plt.ylabel("pca-two")
plt.title("First and Second Principal Components Colored by Coin Function")
#plt.legend(loc='best')
print(label_color_dict)
   
plt.show()

3 Components

In [None]:
#from mpl_toolkits.mplot3d import Axes3D 
#from matplotlib import pyplot

#fig = pyplot.figure(figsize=(7,7))
#ax = Axes3D(fig)

#ax.scatter(pca_result[:,0], pca_result[:,1], pca_result[:,2],c=cvec, edgecolor='', alpha=0.5)
#ax.scatter(pca_result[:,0], pca_result[:,1], pca_result[:,2])
#ax.set_xlabel("pca-one")
#ax.set_ylabel("pca-two")
#ax.set_zlabel("pca-three")
#plt.title("First Three Principal Components Colored by Coin Function")

#pyplot.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D 
from matplotlib import pyplot

fig = pyplot.figure(figsize=(7,7))
ax = Axes3D(fig)
for label, color in label_color_dict.items():
    ax.scatter(pca_result[:,0], pca_result[:,1], pca_result[:,2], 
            color=color, label=label)

ax.legend(loc='best')
ax.set_xlabel("pca-one")
ax.set_ylabel("pca-two")
ax.set_zlabel("pca-three")
plt.title("First Three Principal Components Colored by Coin Function")

## Plot t-SNE 

Visualise the two dimensions by creating a scatter plot and coloring each sample by its respective label

In [None]:
fig, ax = plt.subplots(1,1)
for label, color in label_color_dict.items():
    ax.scatter(tsne_results[:,0], tsne_results[:,1],  
            color=color, label=label)

ax.legend(loc='best')
plt.xlabel("x-tsne")
plt.ylabel("y-tsne")
plt.title("tSNE dimensions colored by coin function")

3 Components

In [None]:
#fig, ax = plt.subplots(1,1)
fig = pyplot.figure(figsize=(7,7))
ax = Axes3D(fig)
for label, color in label_color_dict.items():
    ax.scatter(tsne_results[:,0], tsne_results[:,1], tsne_results[:,2], 
            color=color, label=label)

ax.legend(loc='best')
plt.xlabel("x-tsne")
plt.ylabel("y-tsne")
ax.set_zlabel("z-tsne")
plt.title("tSNE dimensions colored by coin function")