# Visualizing crypto market structure
Adapted from https://scikit-learn.org/stable/auto_examples/applications/plot_stock_market.html#sphx-glr-auto-examples-applications-plot-stock-market-py

This example employs several unsupervised learning techniques to extract
the crypto market structure from variations in historical token prices.

The quantity that we use is the day-over-day percentage change in price:
tokens that are linked tend to fluctuate in relation to each other.


In [2]:
import time
import json

import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import numpy as np
import pandas as pd
import requests
from sklearn import covariance, cluster, manifold

### We examine the constituents of the CoinDesk Market Index ([CoinDesk](https://www.coindesk.com/indices/cmis))

### Price data comes from CoinGecko ([CoinGecko](https://www.coingecko.com/))

In [10]:
# Index constituents, with every column header lower-cased so the 'symbol'
# column can be used as the join key below.
constituents = pd.read_csv("constituents_coindesk_market_index.csv").rename(columns=str.lower)

# Coin list exported from CoinGecko, stored locally as JSON records.
with open('coingecko_list.json') as f:
    cg_data = json.load(f)

# Both tables carry a 'name' column; rename CoinGecko's so neither is lost
# (or suffixed) by the merge.
cg_names = pd.DataFrame.from_records(cg_data).rename(columns={'name': 'cg_name'})

# Left join: keep every index constituent, attach CoinGecko fields by symbol.
constituents = pd.merge(constituents, cg_names, how="left", on='symbol')
constituents.to_csv("cg_constituents.csv")

In [29]:
# Reload the constituents table, using the first CSV column as the index.
# NOTE(review): the previous cell writes "cg_constituents.csv", not this file —
# presumably "cg_constituents_2.csv" is a hand-curated copy; confirm its origin.
constituents = pd.read_csv("cg_constituents_2.csv", index_col=0)

In [30]:
# Display the constituents table (index metadata plus CoinGecko id / cg_name).
constituents

Unnamed: 0,symbol,name,dacs sector,dacs industry group,dacs industry,reconstitution weight,id,cg_name
0,1inch,1inch Network,DeFi,Exchanges,Exchanges (Other),0.032945,1inch,1inch
1,aave,Aave,DeFi,Credit Platform,Lending / Borrowing,0.108995,aave,Aave
2,aca,Acala Token,Smart Contract Platform,Layer 1,Layer 1,0.006482,acala,Acala
3,ach,Alchemy Pay,Currency,BaaS,Payments,0.011424,alchemy-pay,Alchemy Pay
4,acs,Access Protocol,Culture & Entertainment,Media,Social,0.007609,access-protocol,Access Protocol
...,...,...,...,...,...,...,...,...
279,xyo,XYO,Computing,Oracle,Oracle,0.006356,xyo-network,XYO Network
280,yfi,yearn.finance,DeFi,Yield,Yield,0.019512,yearn-finance,yearn.finance
281,ygg,Yield Guild Games,Culture & Entertainment,Metaverse,Metaverse Platform,0.009636,yield-guild-games,Yield Guild Games
282,zec,Zcash,Currency,Private,Private,0.035552,zcash,Zcash


In [31]:
# Sanity check: rows whose symbol already appeared earlier in the table.
# An empty frame means every symbol is unique.
constituents.loc[constituents["symbol"].duplicated()]

Unnamed: 0,symbol,name,dacs sector,dacs industry group,dacs industry,reconstitution weight,id,cg_name


## Load historical prices from Coin Gecko
## Dec 2022 - Dec 2023

In [None]:
# Window of price history requested from CoinGecko for every constituent.
start = pd.Timestamp("2022-12-01T12:00:00")
end = pd.Timestamp("2023-12-01T12:00:00")

results = []
for _, row in constituents.iterrows():
    # Pause before every request — presumably to stay under the public API
    # rate limit; the delay makes this cell very slow for the full index.
    time.sleep(21)
    token_id = row['id']
    name = row['name']
    print(f"token_id {token_id}")
    r = requests.get(
        f"https://api.coingecko.com/api/v3/coins/{token_id}/market_chart/range"
        f"?vs_currency=USD&from={start.timestamp()}&to={end.timestamp()}"
    )
    print(f"getting data for name {name}")
    resp = r.json()
    data = resp.get('prices')
    if (data is None) or (len(data) == 0):
        # Best effort: keep the token in the table with an all-NaN series so
        # it is reported (and dropped) later instead of aborting the loop.
        print(f"no data for ticker {name}")
        date_range = pd.date_range(start, end)
        # The placeholder must use the same 'price' column as the success
        # branch; the later pivot reads values='price'.
        # NOTE(review): these placeholder timestamps start at 12:00, whereas
        # the fetched rows display as whole dates — confirm the two line up.
        prices = pd.DataFrame(
            {'date': date_range, 'price': np.full(len(date_range), np.nan)}
        )
    else:
        # Each record is a [timestamp, price] pair; timestamps are converted
        # from milliseconds since the Unix epoch.
        prices = pd.DataFrame(data)
        prices.columns = ["date", 'price']
        prices["date"] = pd.to_datetime(prices.date, unit='ms', origin='unix')
    prices = prices.sort_values(by='date')
    prices["name"] = name
    results.append(prices)

# Long-format table: one row per (date, token name) with its price.
combined_data = pd.concat(results)

In [34]:
# Persist the long-format (date, price, name) table so the slow download
# loop above does not have to be repeated.
combined_data.to_csv('cg_price_data_total.csv')

In [36]:
# Reshape to a wide table: one row per date, one column per token name,
# cells holding the price.
dataset = combined_data.pivot(index='date', columns='name', values='price')

In [50]:
# Display the wide price table (dates x token names).
dataset

name,0x,1inch Network,API3,ARPA,Aave,Aavegotchi,Acala Token,Adventure Gold,Aergo,Akash Network,...,VeChain,Vulcan Forged PYR,WOO Network,Waves,XRP,XYO,Yield Guild Games,Zcash,iExec RLC,yearn.finance
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-12-02,0.194349,0.512787,1.475673,0.027956,63.589064,1.052975,0.113451,0.274698,0.116292,0.252062,...,0.019083,3.207968,0.124526,2.357454,0.398586,0.004324,0.224919,44.932667,1.086012,6681.304682
2022-12-03,0.193552,0.505770,1.507737,0.028853,65.527824,1.025667,0.155059,0.276957,0.116840,0.239293,...,0.019441,3.196139,0.125819,2.424383,0.398616,0.004322,0.242841,46.120687,1.099833,6819.861450
2022-12-04,0.190112,0.489838,1.463525,0.028155,62.874587,1.018137,0.137897,0.270616,0.114971,0.228143,...,0.019056,3.156363,0.122130,2.329474,0.389865,0.004268,0.235330,45.626442,1.076376,6640.284090
2022-12-05,0.197506,0.500704,1.476821,0.028863,64.518508,1.039546,0.136052,0.275633,0.114717,0.229625,...,0.019364,3.185474,0.123227,2.365142,0.392702,0.004415,0.239984,45.949654,1.097304,7338.261697
2022-12-06,0.191915,0.485993,1.451414,0.028394,64.355277,1.042089,0.132244,0.319110,0.112539,0.222067,...,0.019219,3.150201,0.122647,2.401339,0.390779,0.004299,0.230107,46.337661,1.065158,7166.505976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-27,0.401705,0.355181,1.461461,0.051936,98.052293,0.919328,0.059087,0.926657,0.174821,1.878847,...,0.022410,6.353866,0.224329,2.137117,0.616851,0.003949,0.379643,29.523909,1.527754,8533.514564
2023-11-28,0.391707,0.343751,1.410261,0.049584,96.196181,0.871273,0.063865,0.843920,0.166301,1.683940,...,0.021510,6.059879,0.214906,2.095527,0.604080,0.003885,0.362481,28.808138,1.460926,8443.978749
2023-11-29,0.396652,0.342631,1.475183,0.049815,96.854285,0.864136,0.063692,0.825368,0.174062,1.711161,...,0.021614,5.946000,0.221407,2.105248,0.611266,0.003871,0.375681,28.890945,1.479162,8468.222241
2023-11-30,0.395284,0.341082,1.443302,0.049426,96.300476,0.829303,0.061066,0.822433,0.183276,1.627297,...,0.021721,6.123255,0.212792,2.089071,0.609145,0.005699,0.376468,28.909134,1.571104,8253.033046


In [41]:
# Tokens whose price series has at least one missing value.
[token for token, has_gap in dataset.isna().any().items() if has_gap]

['Access Protocol',
 'Arbitrum',
 'Blur',
 'Echelon Prime',
 'Flare',
 'Pepe',
 'Sei',
 'Sui',
 'dYdX']

In [42]:
# Keep only the tokens with a complete price history (no missing values).
dataset = dataset.loc[:, dataset.notna().all()]

In [51]:
# (number of dates, number of tokens) remaining after dropping incomplete series.
dataset.shape

(365, 175)

In [74]:
# Build Coinbase product ids such as "BTC-USD" from the token symbols.
# Column names were lower-cased when the constituents were loaded, so the
# column is 'symbol' (the original 'Symbol' raises KeyError).
constituents['ticker'] = constituents['symbol'].str.upper() + "-USD"

In [196]:
# Distinct DACS sectors present in the index. The column name is lower-case
# because every header was lower-cased when the constituents were loaded.
constituents['dacs sector'].unique()

array(['DeFi', 'Smart Contract Platform', 'Currency',
       'Culture & Entertainment', 'Computing', 'Digitization'],
      dtype=object)

## Query Coinbase

In [169]:
def get_prices(ticker: str, start: pd.Timestamp, end: pd.Timestamp) -> pd.DataFrame:
    """Fetch daily closing prices for one Coinbase product.

    Parameters
    ----------
    ticker : str
        Coinbase product id, e.g. ``"BTC-USD"``.
    start, end : pd.Timestamp
        Bounds of the requested window; sent to the API in ISO 8601 form.

    Returns
    -------
    pd.DataFrame
        Columns ``date``, ``ticker`` and ``close``, sorted by ``date``.
        If the response is not a non-empty list (for example an error
        payload), the frame has one row per day of
        ``pd.date_range(start, end)`` with ``close`` set to NaN.
    """
    # granularity=86400 seconds requests one candle per day.
    r = requests.get(
        f"https://api.pro.coinbase.com/products/{ticker}/candles"
        f"?start={start.isoformat()}&end={end.isoformat()}&granularity=86400"
    )
    resp = r.json()
    if isinstance(resp, list) and len(resp) > 0:
        print(f"found data from {ticker}")
        prices = pd.DataFrame(resp)
        # Coinbase documents each candle as [time, low, high, open, close,
        # volume]; the labels must follow that order even though only
        # 'close' is returned.
        prices.columns = ['time', 'low', 'high', 'open', 'close', 'volume']
        # 'time' is the bucket start in seconds since the Unix epoch.
        prices['date'] = pd.to_datetime(prices['time'], origin="unix", unit='s')
    else:
        # Best effort: return an all-NaN series so the caller can still
        # build a rectangular table and spot the gap afterwards.
        print(f"no data for ticker {ticker}")
        date_range = pd.date_range(start, end)
        prices = pd.DataFrame(
            {'date': date_range, 'close': np.full(len(date_range), np.nan)}
        )
    prices = prices.sort_values(by='date')
    prices["ticker"] = ticker
    return prices[['date', 'ticker', 'close']]

# The year is fetched in two halves; the second starts one day after `mid`
# so the two batches never share a date.
start = pd.Timestamp("2022-12-01T00:00:00")
mid = pd.Timestamp("2023-06-01T00:00:00")
end = pd.Timestamp("2023-12-01T00:00:00")

prices = []
for ticker in constituents.ticker:
    print(f"get prices for: {ticker}")
    first_half = get_prices(ticker, start, mid)
    second_half = get_prices(ticker, mid + pd.Timedelta(hours=24), end)
    prices.append(pd.concat([first_half, second_half]))

# Stack all tickers, then reshape to dates x tickers holding the close price.
dataset = pd.concat(prices).pivot(index='date', columns='ticker', values='close')
# dataset = dataset.dropna(axis=1, how='all')


In [172]:
# dataset.to_csv("coinbase_prices.csv")
# Reload the cached table with its date index restored. Without index_col the
# dates come back as an ordinary string column, which breaks the later
# `pct_change()`.
# NOTE(review): assumes the file was produced by the commented-out `to_csv`
# above, which writes the date index as the first column — confirm.
dataset = pd.read_csv("coinbase_prices.csv", index_col=0, parse_dates=True)

  values = values.astype(str)


In [8]:
# Columns of the price table that contain at least one missing value.
dataset.columns[dataset.isna().any()]

Index(['ACA-USD', 'ACS-USD', 'AKT-USD', 'ALI-USD', 'ALPHA-USD', 'ANT-USD',
       'ARB-USD', 'ASTR-USD', 'AUDIO-USD', 'AXL-USD', 'BLUR-USD', 'BOND-USD',
       'CHR-USD', 'CQT-USD', 'DYDX-USD', 'EGLD-USD', 'ELON-USD', 'ETHW-USD',
       'FLR-USD', 'FTM-USD', 'FXS-USD', 'GALA-USD', 'GHST-USD', 'GLMR-USD',
       'GMX-USD', 'HNT-USD', 'ICX-USD', 'KAVA-USD', 'LIT-USD', 'LSK-USD',
       'LUNA-USD', 'LUNC-USD', 'MAGIC-USD', 'MOVR-USD', 'OMG-USD', 'OSMO-USD',
       'PEPE-USD', 'PRIME-USD', 'QRDO-USD', 'QTUM-USD', 'REN-USD', 'RPL-USD',
       'RUNE-USD', 'SC-USD', 'SEI-USD', 'SLP-USD', 'STG-USD', 'SUI-USD',
       'T-USD', 'TVK-USD', 'VET-USD', 'WAVES-USD', 'WOO-USD', 'XRP-USD',
       'YGG-USD'],
      dtype='object')


## Learning a graph structure

We use sparse inverse covariance estimation to find which quotes are
correlated conditionally on the others. Specifically, sparse inverse
covariance gives us a graph, that is a list of connections. For each
symbol, the symbols that it is connected to are those useful to explain
its fluctuations.



In [181]:
# Day-over-day percentage returns; the first row is dropped because it has no
# previous day to compare with.
# Series with missing prices are excluded first (same rule as the CoinGecko
# table): NaN returns would otherwise reach the scikit-learn estimators fitted
# on `rets.values`, which reject NaN input. This is a no-op when `dataset` is
# already complete.
rets = dataset.dropna(axis=1, how='any').pct_change().iloc[1:]
cov = rets.cov()
corrs = rets.corr()

ticker,1INCH-USD,AAVE-USD,ACH-USD,ADA-USD,AERGO-USD,AGLD-USD,ALCX-USD,ALGO-USD,ALICE-USD,AMP-USD,...,TRU-USD,UMA-USD,UNFI-USD,UNI-USD,XLM-USD,XTZ-USD,XYO-USD,YFI-USD,ZEC-USD,ZRX-USD
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-12-01,0.510,63.65,0.010053,0.3145,0.1134,0.2757,18.01,0.2408,1.324,0.00354,...,0.0384,1.714,4.65,5.918,0.087632,1.004,0.00432,6699.09,44.92,0.193194
2022-12-02,0.504,65.70,0.010003,0.3191,0.1153,0.2791,18.79,0.2441,1.363,0.00353,...,0.0395,1.715,4.68,6.345,0.088096,1.029,0.00433,6800.23,46.12,0.192361
2022-12-03,0.487,62.76,0.009683,0.3196,0.1134,0.2720,18.04,0.2377,1.323,0.00349,...,0.0385,1.687,4.51,6.092,0.086310,0.993,0.00429,6627.07,45.51,0.189207
2022-12-04,0.497,64.40,0.009818,0.3224,0.1121,0.2762,18.64,0.2394,1.357,0.00351,...,0.0390,1.719,4.63,6.261,0.087659,1.005,0.00445,7323.88,45.85,0.195823
2022-12-05,0.484,64.36,0.009586,0.3195,0.1105,0.3223,18.34,0.2364,1.372,0.00350,...,0.0386,1.709,4.80,6.172,0.086940,1.006,0.00428,7152.54,46.43,0.191428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-27,0.345,97.23,0.019895,0.3777,0.1655,0.8480,15.74,0.1317,1.188,0.00241,...,0.0605,1.812,12.39,6.290,0.116499,0.808,0.00387,8436.94,28.77,0.391225
2023-11-28,0.343,96.97,0.020228,0.3859,0.1725,0.8322,15.86,0.1330,1.143,0.00247,...,0.0568,1.945,12.84,6.175,0.118565,0.816,0.00388,8481.35,29.01,0.396697
2023-11-29,0.342,96.47,0.019862,0.3817,0.1859,0.8228,15.55,0.1342,1.109,0.00255,...,0.0574,1.939,11.90,6.014,0.118537,0.825,0.00590,8289.87,29.09,0.396890
2023-11-30,0.341,98.29,0.019865,0.3760,0.1900,0.8270,15.75,0.1337,1.090,0.00249,...,0.0576,1.929,11.59,5.960,0.118309,0.832,0.00686,8256.77,29.47,0.382034


In [60]:
# Return matrix as a plain array: one row per day, one column per token.
X = rets.to_numpy()

# Plain empirical covariance estimate of the returns (no sparsity penalty).
emp_cov = covariance.EmpiricalCovariance().fit(X)
emp_cov.covariance_

array([[0.00299053, 0.00097328, 0.00115571, ..., 0.00090266, 0.00097764,
        0.00117272],
       [0.00097328, 0.00137237, 0.00095949, ..., 0.00082572, 0.00086922,
        0.00081005],
       [0.00115571, 0.00095949, 0.00303881, ..., 0.00093523, 0.0010176 ,
        0.00088684],
       ...,
       [0.00090266, 0.00082572, 0.00093523, ..., 0.00108894, 0.00085371,
        0.00072687],
       [0.00097764, 0.00086922, 0.0010176 , ..., 0.00085371, 0.00183007,
        0.00089169],
       [0.00117272, 0.00081005, 0.00088684, ..., 0.00072687, 0.00089169,
        0.00189216]])

In [58]:
# pandas covariance of the returns — presumably shown for comparison with
# `emp_cov.covariance_`.
rets.cov()

name,0x,1inch Network,API3,ARPA,Aave,Aavegotchi,Acala Token,Adventure Gold,Aergo,Akash Network,...,VeChain,Vulcan Forged PYR,WOO Network,Waves,XRP,XYO,Yield Guild Games,Zcash,iExec RLC,yearn.finance
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0x,0.002999,0.000976,0.001159,0.000988,0.000898,0.000584,0.001085,0.001080,0.000870,0.000751,...,0.000963,0.001094,0.001345,0.001051,0.000696,0.000653,0.001589,0.000905,0.000980,0.001176
1inch Network,0.000976,0.001376,0.000962,0.000902,0.000856,0.000487,0.000898,0.000900,0.000624,0.000732,...,0.000854,0.000872,0.001094,0.000937,0.000704,0.000576,0.001285,0.000828,0.000872,0.000812
API3,0.001159,0.000962,0.003047,0.001036,0.000986,0.000431,0.001019,0.001398,0.000608,0.000867,...,0.000921,0.000833,0.001375,0.001067,0.000707,0.000733,0.001808,0.000938,0.001020,0.000889
ARPA,0.000988,0.000902,0.001036,0.003260,0.000842,0.000507,0.000807,0.001331,0.000675,0.000685,...,0.000878,0.000858,0.001150,0.001096,0.000645,0.000577,0.001455,0.000855,0.001021,0.000759
Aave,0.000898,0.000856,0.000986,0.000842,0.001467,0.000416,0.000844,0.000990,0.000582,0.000715,...,0.000876,0.000831,0.001131,0.000832,0.000617,0.000628,0.001341,0.000838,0.000869,0.000814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XYO,0.000653,0.000576,0.000733,0.000577,0.000628,0.000253,0.000570,0.000663,0.000537,0.000501,...,0.000569,0.000502,0.000667,0.000629,0.000389,0.004145,0.001051,0.000674,0.000724,0.000524
Yield Guild Games,0.001589,0.001285,0.001808,0.001455,0.001341,0.000925,0.001521,0.001927,0.000984,0.001154,...,0.001320,0.001557,0.001572,0.001384,0.000762,0.001051,0.006298,0.001186,0.001245,0.001175
Zcash,0.000905,0.000828,0.000938,0.000855,0.000838,0.000451,0.000811,0.001019,0.000552,0.000800,...,0.000817,0.000701,0.001048,0.000867,0.000797,0.000674,0.001186,0.001092,0.000856,0.000729
iExec RLC,0.000980,0.000872,0.001020,0.001021,0.000869,0.000414,0.001030,0.001227,0.000589,0.000790,...,0.000850,0.000853,0.001112,0.000832,0.000587,0.000724,0.001245,0.000856,0.001835,0.000894


In [61]:
# Same empirical covariance wrapped in a DataFrame for tabular display
# (rows/columns are positional, not labelled with token names).
pd.DataFrame(emp_cov.covariance_)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
0,0.002991,0.000973,0.001156,0.000986,0.000895,0.000582,0.001082,0.001077,0.000868,0.000749,...,0.000960,0.001091,0.001341,0.001048,0.000694,0.000651,0.001584,0.000903,0.000978,0.001173
1,0.000973,0.001372,0.000959,0.000900,0.000854,0.000486,0.000896,0.000898,0.000622,0.000730,...,0.000851,0.000869,0.001091,0.000934,0.000702,0.000574,0.001282,0.000826,0.000869,0.000810
2,0.001156,0.000959,0.003039,0.001033,0.000983,0.000430,0.001016,0.001394,0.000606,0.000865,...,0.000919,0.000831,0.001371,0.001064,0.000705,0.000731,0.001803,0.000935,0.001018,0.000887
3,0.000986,0.000900,0.001033,0.003251,0.000840,0.000505,0.000805,0.001327,0.000673,0.000683,...,0.000876,0.000855,0.001147,0.001093,0.000644,0.000575,0.001451,0.000853,0.001019,0.000756
4,0.000895,0.000854,0.000983,0.000840,0.001463,0.000415,0.000842,0.000988,0.000580,0.000713,...,0.000874,0.000829,0.001128,0.000830,0.000615,0.000626,0.001337,0.000836,0.000866,0.000811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,0.000651,0.000574,0.000731,0.000575,0.000626,0.000252,0.000569,0.000662,0.000536,0.000499,...,0.000567,0.000500,0.000665,0.000627,0.000388,0.004134,0.001048,0.000672,0.000722,0.000523
171,0.001584,0.001282,0.001803,0.001451,0.001337,0.000923,0.001516,0.001922,0.000982,0.001150,...,0.001316,0.001553,0.001568,0.001380,0.000760,0.001048,0.006280,0.001183,0.001241,0.001172
172,0.000903,0.000826,0.000935,0.000853,0.000836,0.000450,0.000809,0.001016,0.000551,0.000797,...,0.000815,0.000699,0.001045,0.000865,0.000795,0.000672,0.001183,0.001089,0.000854,0.000727
173,0.000978,0.000869,0.001018,0.001019,0.000866,0.000413,0.001027,0.001223,0.000588,0.000788,...,0.000847,0.000850,0.001109,0.000830,0.000585,0.000722,0.001241,0.000854,0.001830,0.000892


In [63]:
# Raw covariance array estimated by EmpiricalCovariance.
emp_cov.covariance_

array([[0.00299053, 0.00097328, 0.00115571, ..., 0.00090266, 0.00097764,
        0.00117272],
       [0.00097328, 0.00137237, 0.00095949, ..., 0.00082572, 0.00086922,
        0.00081005],
       [0.00115571, 0.00095949, 0.00303881, ..., 0.00093523, 0.0010176 ,
        0.00088684],
       ...,
       [0.00090266, 0.00082572, 0.00093523, ..., 0.00108894, 0.00085371,
        0.00072687],
       [0.00097764, 0.00086922, 0.0010176 , ..., 0.00085371, 0.00183007,
        0.00089169],
       [0.00117272, 0.00081005, 0.00088684, ..., 0.00072687, 0.00089169,
        0.00189216]])

In [None]:
# Candidate L1 penalties, from roughly 0.03 up to 10.
alphas = np.logspace(-1.5, 1, num=10)
edge_model = covariance.GraphicalLassoCV(alphas=alphas)

# Standardize each return series to unit variance so the model works on
# correlations rather than covariances; this is more efficient for structure
# recovery. It also matters for the penalty grid above: raw daily-return
# covariances are of order 1e-3, far below the smallest alpha.
# A new array is used so `X` itself stays unscaled for the other cells.
X_std = X / X.std(axis=0)

edge_model.fit(X_std)

In [71]:
# pandas covariance of the returns as a raw array.
cov.values

array([[0.00299877, 0.00097596, 0.00115889, ..., 0.00090514, 0.00098033,
        0.00117595],
       [0.00097596, 0.00137615, 0.00096213, ..., 0.000828  , 0.00087161,
        0.00081228],
       [0.00115889, 0.00096213, 0.00304718, ..., 0.0009378 , 0.00102041,
        0.00088928],
       ...,
       [0.00090514, 0.000828  , 0.0009378 , ..., 0.00109194, 0.00085606,
        0.00072887],
       [0.00098033, 0.00087161, 0.00102041, ..., 0.00085606, 0.00183511,
        0.00089414],
       [0.00117595, 0.00081228, 0.00088928, ..., 0.00072887, 0.00089414,
        0.00189737]])

## Clustering using affinity propagation

We use clustering to group together tokens that behave similarly. Here,
amongst the [various clustering techniques](https://scikit-learn.org/stable/modules/clustering.html)
available in scikit-learn, we use
[affinity propagation](https://scikit-learn.org/stable/modules/clustering.html#affinity-propagation)
as it does not enforce equal-size clusters, and it can automatically choose the
number of clusters from the data.

Note that this gives us a different indication than the graph, as the
graph reflects conditional relations between variables, while the
clustering reflects marginal properties: variables clustered together can
be considered as having a similar impact at the level of the full stock
market.



In [76]:
# Alternative inputs tried for the clustering:
# _, labels = cluster.affinity_propagation(edge_model.covariance_, random_state=0)
# _, labels = cluster.affinity_propagation(cov.values, random_state=0)

# Cluster the tokens, using the empirical covariance as the similarity matrix.
_, labels = cluster.affinity_propagation(emp_cov.covariance_, random_state=0)
n_labels = labels.max()

# Labels run from 0 to n_labels; print them 1-based with the member names.
for i in range(n_labels + 1):
    members = cov.index.values[labels == i]
    print(f"Cluster {i + 1}: {', '.join(members)}")

Cluster 1: 0x, ARPA, Aave, Aavegotchi, Aergo, Akash Network, Alchemy Pay, Algorand, Alpha Coin, Amp, Ampleforth Governance Token, Ankr, Aptos, Artificial Liquid Intelligence, Astar, Audius, Avalanche, Axie Infinity, Badger DAO, Balancer, Bancor, Band Protocol, BarnBridge, Basic Attention Token, Biconomy, Bitcoin, Bitcoin Cash, Bluzelle, COTI, Cardano, Cartesi, Celer Network, Chainlink, Chiliz, Chromia, Civic, Coin98, Compound, Convex Finance, Cosmos, Covalent, Cronos, Dash, Decentraland, Dogecoin, Dogelon Mars, EOS, Enjin Coin, Enzyme, Ethereum Classic, Ethereum Name Service, Ethernity Chain, Fantom, Fetch.ai, Filecoin, Flow, Frax Share, GMT, GMX, Galxe, Gitcoin, Gods Unchained, Golem, Hashflow, Hedera, Helium, Highstreet, ICON, IDEX, Illuvium, Immutable X, Injective, Internet Computer, IoTeX, JasmyCoin, Kava, Kusama, Kyber Network Crystal, Lido DAO, Liquity, Lisk, Litecoin, Litentry, Livepeer, Loopring, MAGIC, Mask Network, Mina, Mines of Dalarnia, Moonbeam, Moonriver, MultiversX, My 

## Embedding in 2D space

For visualization purposes, we need to lay out the different symbols on a
2D canvas. For this we use `manifold` techniques to retrieve 2D
embedding.
We use a dense eigen_solver to achieve reproducibility (ARPACK is initialized
with random vectors that we don't control). In addition, we use a large
number of neighbors to capture the large-scale structure.



In [79]:
# Low-dimensional embedding for visualization: find a position for every
# node (token) on a 2D plane.
node_position_model = manifold.LocallyLinearEmbedding(
    n_neighbors=10, n_components=2, eigen_solver="dense"
)

# Tokens are the samples here, hence the transpose of the (days x tokens)
# return matrix; the result is transposed back to shape (2, n_tokens).
node_coords = node_position_model.fit_transform(X.T)
embedding = node_coords.T

In [75]:
# (2 coordinates, number of tokens).
embedding.shape

(2, 175)

## Visualization

The outputs of the 3 models are combined in a 2D graph where nodes
represent the tokens and edges the links between them:

- cluster labels are used to define the color of the nodes
- the inverse covariance (precision) matrix is used to display the strength of the edges
- the 2D embedding is used to position the nodes in the plane

This example has a fair amount of visualization-related code, as
visualization is crucial here to display the graph. One of the challenges
is to position the labels while minimizing overlap. For this we use a
heuristic based on the direction of the nearest neighbor along each
axis.



In [None]:
plt.figure(1, facecolor="w", figsize=(10, 8))
plt.clf()
ax = plt.axes([0.0, 0.0, 1.0, 1.0])
plt.axis("off")

# Plot the graph of partial correlations
# partial_correlations = edge_model.precision_.copy()
partial_correlations = emp_cov.precision_.copy()
# Rescale the precision matrix so that its diagonal becomes 1.
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
# Keep each pair once (strict upper triangle) and only links above 0.02.
non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02

# Plot the nodes using the coordinates of our embedding
plt.scatter(
    embedding[0], embedding[1], s=100 * d**2, c=labels, cmap=plt.cm.nipy_spectral
)

# Plot the edges
start_idx, end_idx = np.where(non_zero)
# a sequence of (*line0*, *line1*, *line2*), where::
#            linen = (x0, y0), (x1, y1), ... (xm, ym)
segments = [
    [embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx)
]
values = np.abs(partial_correlations[non_zero])
lc = LineCollection(
    segments, zorder=0, cmap=plt.cm.hot_r, norm=plt.Normalize(0, 0.7 * values.max())
)
lc.set_array(values)
lc.set_linewidths(15 * values)
ax.add_collection(lc)

# Add a label to each node. The challenge here is that we want to
# position the labels to avoid overlap with other labels
names = cov.index.values
# Divisor mapping a cluster label onto the colormap's [0, 1] range; clamped
# to 1 so a single-cluster result does not divide by zero.
color_scale = float(max(n_labels, 1))
for index, (name, label, (x, y)) in enumerate(zip(names, labels, embedding.T)):
    # Offsets to every other node; the node's own entry is set to 1 so it is
    # never picked as its own nearest neighbour.
    dx = x - embedding[0]
    dx[index] = 1
    dy = y - embedding[1]
    dy[index] = 1
    # Horizontal offset to the node closest in y, and vice versa: the label
    # is pushed away from that neighbour.
    this_dx = dx[np.argmin(np.abs(dy))]
    this_dy = dy[np.argmin(np.abs(dx))]
    if this_dx > 0:
        horizontalalignment = "left"
        x = x + 0.002
    else:
        horizontalalignment = "right"
        x = x - 0.002
    if this_dy > 0:
        verticalalignment = "bottom"
        y = y + 0.002
    else:
        verticalalignment = "top"
        y = y - 0.002
    plt.text(
        x,
        y,
        name,
        size=10,
        horizontalalignment=horizontalalignment,
        verticalalignment=verticalalignment,
        bbox=dict(
            facecolor="w",
            edgecolor=plt.cm.nipy_spectral(label / color_scale),
            alpha=0.6,
        ),
    )

# Pad the axis limits by a fraction of the data range so labels fit.
# np.ptp() is used because the ndarray.ptp() method was removed in NumPy 2.0.
x_range = np.ptp(embedding[0])
y_range = np.ptp(embedding[1])
plt.xlim(
    embedding[0].min() - 0.15 * x_range,
    embedding[0].max() + 0.10 * x_range,
)
plt.ylim(
    embedding[1].min() - 0.03 * y_range,
    embedding[1].max() + 0.03 * y_range,
)

plt.show()