# Visualizing crypto market structure
Adapted from https://scikit-learn.org/stable/auto_examples/applications/plot_stock_market.html#sphx-glr-auto-examples-applications-plot-stock-market-py

This example employs several unsupervised learning techniques to extract
the crypto market structure from variations in historical prices.

The quantity that we use is the daily variation in quote price: quotes
that are linked tend to fluctuate in relation to each other during a day.


In [13]:
import time
import json

import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import numpy as np
import pandas as pd
import requests
from sklearn import covariance, cluster, manifold

### We examine the CoinDesk Market Index
[CoinDesk](https://www.coindesk.com/indices/cmis)

### Price data from CoinGecko
[CoinGecko](https://www.coingecko.com/)

In [10]:
# Load the index constituents and lower-case every column header.
constituents = pd.read_csv("constituents_coindesk_market_index.csv").rename(columns=str.lower)

# Load the CoinGecko coin list; its 'name' column is renamed to 'cg_name'
# before merging so it does not collide with the constituents' own 'name'.
with open('coingecko_list.json') as f:
    cg_data = json.load(f)

cg_names = pd.DataFrame.from_records(cg_data).rename(columns={'name': 'cg_name'})

# Left join on the ticker symbol: every constituent row is kept, and a symbol
# that matches several CoinGecko entries yields several rows.
constituents = pd.merge(constituents, cg_names, how="left", on='symbol')
constituents.to_csv("cg_constituents.csv")

In [29]:
# Load the constituent table used from here on; the first CSV column becomes the index.
# NOTE(review): this reads "cg_constituents_2.csv", not the "cg_constituents.csv"
# written by the merge cell — presumably a hand-curated copy; confirm.
constituents = pd.read_csv("cg_constituents_2.csv", index_col=0)

In [30]:
# Display the constituent table.
constituents

Unnamed: 0,symbol,name,dacs sector,dacs industry group,dacs industry,reconstitution weight,id,cg_name
0,1inch,1inch Network,DeFi,Exchanges,Exchanges (Other),0.032945,1inch,1inch
1,aave,Aave,DeFi,Credit Platform,Lending / Borrowing,0.108995,aave,Aave
2,aca,Acala Token,Smart Contract Platform,Layer 1,Layer 1,0.006482,acala,Acala
3,ach,Alchemy Pay,Currency,BaaS,Payments,0.011424,alchemy-pay,Alchemy Pay
4,acs,Access Protocol,Culture & Entertainment,Media,Social,0.007609,access-protocol,Access Protocol
...,...,...,...,...,...,...,...,...
279,xyo,XYO,Computing,Oracle,Oracle,0.006356,xyo-network,XYO Network
280,yfi,yearn.finance,DeFi,Yield,Yield,0.019512,yearn-finance,yearn.finance
281,ygg,Yield Guild Games,Culture & Entertainment,Metaverse,Metaverse Platform,0.009636,yield-guild-games,Yield Guild Games
282,zec,Zcash,Currency,Private,Private,0.035552,zcash,Zcash


In [31]:
# Rows whose symbol repeats an earlier row (an empty result means every symbol is unique).
constituents[constituents.symbol.duplicated()]

Unnamed: 0,symbol,name,dacs sector,dacs industry group,dacs industry,reconstitution weight,id,cg_name


In [None]:
# Download the CoinGecko price history of every index constituent between
# `start` and `end` and stack the per-token frames into one long table with
# columns date / price / name.
start = pd.Timestamp("2022-12-01T12:00:00")
end = pd.Timestamp("2023-12-01T12:00:00")

results = []
for _, row in constituents.iterrows():
    # Pause before every request — presumably to stay under the public API
    # rate limit; confirm the required delay.
    time.sleep(21)
    token_id = row['id']
    name = row['name']
    print(f"token_id {token_id}")
    r = requests.get(f"https://api.coingecko.com/api/v3/coins/{token_id}/market_chart/range?vs_currency=USD&from={start.timestamp()}&to={end.timestamp()}")
    print(f"getting data for name {name}")
    resp = r.json()
    # Anything that is not a dict carrying a non-empty 'prices' list is
    # treated as "no data" instead of failing on `.get`.
    data = resp.get('prices') if isinstance(resp, dict) else None
    if (data is None) or (len(data) == 0):
        print(f"no data for ticker {name}")
        # Placeholder series of NaNs. The value column is named 'price' (not
        # 'close') so every frame appended to `results` has the same schema.
        # NOTE(review): these placeholder dates carry the 12:00 time-of-day of
        # `start`; confirm they line up with the API's timestamps before
        # relying on them in the later pivot.
        date_range = pd.date_range(start, end)
        prices = pd.DataFrame({'date': date_range, 'price': np.full(date_range.shape[0], np.nan)})
    else:
        prices = pd.DataFrame(data, columns=["date", 'price'])
        # API timestamps are milliseconds since the Unix epoch; only these
        # (not the placeholder datetimes above) need converting.
        prices["date"] = pd.to_datetime(prices.date, unit='ms', origin='unix')
    prices = prices.sort_values(by='date')
    prices["name"] = name
    results.append(prices)

combined_data = pd.concat(results)

In [34]:
# Persist the downloaded long-format price table to CSV.
combined_data.to_csv('cg_price_data_total.csv')

In [35]:
# Preview the first rows of the long-format price table.
combined_data.head()

Unnamed: 0,date,price,name
0,2022-12-02,0.512787,1inch Network
1,2022-12-03,0.50577,1inch Network
2,2022-12-04,0.489838,1inch Network
3,2022-12-05,0.500704,1inch Network
4,2022-12-06,0.485993,1inch Network


In [36]:
# Reshape long -> wide: one row per date, one column per asset name, cell = price.
dataset = combined_data.pivot(index='date', columns='name', values='price')

In [37]:
# Display the wide price table.
dataset

name,0x,1inch Network,API3,ARPA,Aave,Aavegotchi,Acala Token,Access Protocol,Adventure Gold,Aergo,...,Vulcan Forged PYR,WOO Network,Waves,XRP,XYO,Yield Guild Games,Zcash,dYdX,iExec RLC,yearn.finance
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-12-02,0.194349,0.512787,1.475673,0.027956,63.589064,1.052975,0.113451,,0.274698,0.116292,...,3.207968,0.124526,2.357454,0.398586,0.004324,0.224919,44.932667,,1.086012,6681.304682
2022-12-03,0.193552,0.505770,1.507737,0.028853,65.527824,1.025667,0.155059,,0.276957,0.116840,...,3.196139,0.125819,2.424383,0.398616,0.004322,0.242841,46.120687,,1.099833,6819.861450
2022-12-04,0.190112,0.489838,1.463525,0.028155,62.874587,1.018137,0.137897,,0.270616,0.114971,...,3.156363,0.122130,2.329474,0.389865,0.004268,0.235330,45.626442,,1.076376,6640.284090
2022-12-05,0.197506,0.500704,1.476821,0.028863,64.518508,1.039546,0.136052,,0.275633,0.114717,...,3.185474,0.123227,2.365142,0.392702,0.004415,0.239984,45.949654,,1.097304,7338.261697
2022-12-06,0.191915,0.485993,1.451414,0.028394,64.355277,1.042089,0.132244,,0.319110,0.112539,...,3.150201,0.122647,2.401339,0.390779,0.004299,0.230107,46.337661,,1.065158,7166.505976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-27,0.401705,0.355181,1.461461,0.051936,98.052293,0.919328,0.059087,0.002301,0.926657,0.174821,...,6.353866,0.224329,2.137117,0.616851,0.003949,0.379643,29.523909,3.210250,1.527754,8533.514564
2023-11-28,0.391707,0.343751,1.410261,0.049584,96.196181,0.871273,0.063865,0.002209,0.843920,0.166301,...,6.059879,0.214906,2.095527,0.604080,0.003885,0.362481,28.808138,3.210250,1.460926,8443.978749
2023-11-29,0.396652,0.342631,1.475183,0.049815,96.854285,0.864136,0.063692,0.002344,0.825368,0.174062,...,5.946000,0.221407,2.105248,0.611266,0.003871,0.375681,28.890945,3.214087,1.479162,8468.222241
2023-11-30,0.395284,0.341082,1.443302,0.049426,96.300476,0.829303,0.061066,0.002260,0.822433,0.183276,...,6.123255,0.212792,2.089071,0.609145,0.005699,0.376468,28.909134,3.248654,1.571104,8253.033046


In [41]:
# Names of the assets whose price series contains at least one missing value.
dataset.columns[dataset.isna().any()].tolist()

['Access Protocol',
 'Arbitrum',
 'Blur',
 'Echelon Prime',
 'Flare',
 'Pepe',
 'Sei',
 'Sui',
 'dYdX']

In [42]:
# Keep only assets with a complete history: drop every column that contains a NaN.
dataset = dataset.dropna(axis=1, how='any')

In [43]:
# (rows, columns) of the price table.
dataset.shape

(365, 175)

In [17]:
# Probe a single CoinGecko id to inspect the raw payload it returns for the
# one-year window.
token_id = "ada-the-dog"
start = pd.Timestamp("2022-12-01T12:00:00")
end = pd.Timestamp("2023-12-01T12:00:00")

r = requests.get(
    f"https://api.coingecko.com/api/v3/coins/{token_id}/market_chart/range"
    f"?vs_currency=USD&from={start.timestamp()}&to={end.timestamp()}"
)
resp = r.json()
print(resp)

{'prices': [], 'market_caps': [], 'total_volumes': []}


In [74]:
# Build the Coinbase product id ("<SYMBOL>-USD") for every constituent.
# The raw index CSV spells the column "Symbol", while the frames produced by
# the loading cells use lower-cased headers; accept either so this cell does
# not raise KeyError depending on which frame is currently loaded.
symbol_col = 'symbol' if 'symbol' in constituents.columns else 'Symbol'
constituents['ticker'] = constituents[symbol_col].str.upper() + "-USD"

In [18]:
# Rows of the constituent table whose name is "Cardano".
constituents[constituents.name == "Cardano"]

Unnamed: 0,symbol,name,dacs sector,dacs industry group,dacs industry,reconstitution weight,id,cg_name
7,ada,Cardano,Smart Contract Platform,Layer 1,Layer 1,1.52029,ada-the-dog,ADA the Dog
8,ada,Cardano,Smart Contract Platform,Layer 1,Layer 1,1.52029,binance-peg-cardano,Binance-Peg Cardano
9,ada,Cardano,Smart Contract Platform,Layer 1,Layer 1,1.52029,cardano,Cardano


In [196]:
# Distinct DACS sectors present in the constituent table.
# The header is "DACS Sector" in the raw CSV and "dacs sector" once the
# columns have been lower-cased; accept either spelling.
sector_col = 'dacs sector' if 'dacs sector' in constituents.columns else 'DACS Sector'
constituents[sector_col].unique()

array(['DeFi', 'Smart Contract Platform', 'Currency',
       'Culture & Entertainment', 'Computing', 'Digitization'],
      dtype=object)

In [90]:
# Render the Coinbase candles URL for the current `ticker`, `start` and `end`
# to inspect it; no request is made here.
# NOTE(review): `ticker` is not assigned in any earlier visible cell — it
# presumably comes from kernel state; confirm.
f"https://api.pro.coinbase.com/products/{ticker}/candles?start={start}&end={end}&granularity=86400"

'https://api.pro.coinbase.com/products/1INCH-USD/candles?start=2022-12-01 12:00:00&end=2023-06-01 12:00:00&granularity=86400'

In [169]:
def get_prices(ticker: str, start: pd.Timestamp, end: pd.Timestamp) -> pd.DataFrame:
    """Fetch daily Coinbase candles for one product and return its closes.

    Parameters
    ----------
    ticker : str
        Coinbase product id, interpolated into the request URL.
    start, end : pd.Timestamp
        Bounds of the requested window, sent in ISO format.

    Returns
    -------
    pd.DataFrame
        Columns ``date``, ``ticker`` and ``close``, sorted by ``date``.
        When the response is not a non-empty list, ``close`` is NaN for
        every day of ``pd.date_range(start, end)``.
    """
    url = (
        f"https://api.pro.coinbase.com/products/{ticker}/candles"
        f"?start={start.isoformat()}&end={end.isoformat()}&granularity=86400"
    )
    payload = requests.get(url).json()

    has_candles = isinstance(payload, list) and len(payload) > 0
    if not has_candles:
        # Anything other than a non-empty list is treated as "no data".
        print(f"no data for ticker {ticker}")
        days = pd.date_range(start, end)
        frame = pd.DataFrame({'date': days, 'close': np.full((days.shape[0],), np.nan)})
    else:
        print(f"found data from {ticker}")
        frame = pd.DataFrame(payload)
        frame.columns = ['time', 'open', 'high', 'low', 'close', 'volume']
        # 'time' is parsed as seconds since the Unix epoch.
        frame['date'] = pd.to_datetime(frame['time'], origin="unix", unit='s')

    frame = frame.sort_values(by='date')
    frame["ticker"] = ticker
    return frame[['date', 'ticker', 'close']]


In [33]:
# Check the string that Timestamp.isoformat() produces for the start of the window.
pd.Timestamp("2022-12-01T12:00:00").isoformat()

'2022-12-01T12:00:00'

In [24]:
# Manual mapping used by the CoinGecko re-download loop: key -> CoinGecko coin
# id (the value is what gets interpolated into the request URL; the key is
# stored in the resulting frame's "ticker" column).
# NOTE(review): a few keys ("legld", "solarflare", "terraport", "salmonation")
# and the "waves" value do not look like the index tickers / expected ids —
# presumably picked up from symbol collisions; confirm before relying on them.
missing_names = {
    "ali": "alethea-artificial-liquid-intelligence-token",
    "alpha": "alphacoin",
    "ant": "aragon",
    "arb": "arbitrum",
    "astr": "astar",
    "audio": "audius",
    "axl": "axelar",
    "blur": "blur",
    "bond": "barnbridge",
    # 'chromia',
    "cqt": "covalent",
    "dydx": "dydx",
    "legld": "salsa-liquid-multiversx",
    "elon": "dogelon-mars",
    # 'ethereumpow',
    "solarflare": "flare",
    "ftm": "fantom",
    "fxs": "frax-share",
    "gala": "gala",
    "ghst": "aavegotchi",
    "glmr": "moonbeam",
    "gmx": "gmx",
    "hnt": "helium",
    "icx": "icon",
    "kava": "kava",
    "lit": "litentry",
    "lsk": "lisk",
    "lunc": "terra-luna",
    "terraport": "terra",
    "magic": "magic",
    "movr": "moonriver",
    "omg": "omisego",
    "osmo": "osmosis",
    "pepe": "newpepe",
    "prime": "echelon-prime",
    "qrdo": "qredo",
    "qtum": "qtum",
    "ren": "republic-protocol",
    "rpl": "rocket-pool",
    "rune": "thorchain",
    "sc": "siacoin",
    "sei": "sei-network",
    "slp": "smooth-love-potion",
    "stg": "stargate-finance",
    "salmonation": "sui",
    "t": "threshold",
    "tvk": "the-virtua-kolect",
    "vet": "vechain",
    "waves": "layground-waves-floor-index",
    "xrp": "ripple",
    "ygg": "yield-guild-games",
}

In [None]:
# Download CoinGecko prices for every entry of `missing_names`
# (key -> CoinGecko coin id) over a fixed epoch-second window, and stack the
# per-token frames into one long table with columns date / price / ticker.
# NOTE(review): unlike the main download loop there is no pause between
# requests here — confirm this stays within the API rate limit.
results = []
for ticker, name in missing_names.items():
    r = requests.get(f"https://api.coingecko.com/api/v3/coins/{name}/market_chart/range?vs_currency=USD&from=1669896000&to=1701432000")
    print(f"getting data for name {name}")
    resp = r.json()
    # The market_chart/range payload keeps the series under 'prices' (there is
    # no 'data' key), as [timestamp, price] pairs.
    data = resp.get('prices') if isinstance(resp, dict) else None
    if not data:
        # Missing or empty series: skip the token instead of failing when
        # naming the columns of an empty frame.
        print(f"no data for name {name}")
        continue
    prices = pd.DataFrame(data, columns=["date", 'price'])
    # Timestamps are milliseconds since the Unix epoch.
    prices["date"] = pd.to_datetime(prices.date, unit='ms', origin='unix')
    prices["ticker"] = ticker
    results.append(prices)

# pd.concat raises on an empty list, so fall back to an empty table with the
# expected columns when nothing could be downloaded.
combined_data = pd.concat(results) if results else pd.DataFrame(columns=["date", "price", "ticker"])


In [33]:
# Scratch check: build a table from the 'data' entry of the first result.
# NOTE(review): the download loop appends DataFrames with 'date'/'price'
# columns, which have no 'data' entry — `results` presumably held different
# objects when this ran; confirm.
a = pd.DataFrame(results[0]['data'])

In [35]:
# Name the two columns of the scratch table.
a.columns = ["date", "price"]

In [38]:
# Parse the date column as milliseconds since the Unix epoch (result is only displayed).
pd.to_datetime(a.date, unit='ms', origin='unix')

0     2022-12-02
1     2022-12-03
2     2022-12-04
3     2022-12-05
4     2022-12-06
         ...    
360   2023-11-27
361   2023-11-28
362   2023-11-29
363   2023-11-30
364   2023-12-01
Name: date, Length: 365, dtype: datetime64[ns]

In [None]:
# Request the daily Coinbase candles for the current `ticker`, `start` and
# `end`, and display the decoded JSON response for inspection.
r = requests.get(f"https://api.pro.coinbase.com/products/{ticker}/candles?start={start.isoformat()}&end={end.isoformat()}&granularity=86400")
resp = r.json()
resp

In [None]:
# Download one year of daily Coinbase closes for every constituent ticker.
# Each ticker is fetched in two windows (start..mid, then the day after
# mid..end) — presumably to keep each request within the API's per-call
# candle limit; confirm.
start = pd.Timestamp("2022-12-01T00:00:00")
mid = pd.Timestamp("2023-06-01T00:00:00")
end = pd.Timestamp("2023-12-01T00:00:00")

prices = []
for ticker in constituents.ticker:
    print(f"get prices for: {ticker}")
    halves = [
        get_prices(ticker, start, mid),
        get_prices(ticker, mid + pd.Timedelta(hours=24), end),
    ]
    prices.append(pd.concat(halves))

# Long -> wide: one row per date, one column per ticker, cell = close.
dataset = pd.concat(prices).pivot(index='date', columns='ticker', values='close')
#dataset = dataset.dropna(axis=1, how='all')

In [172]:
# Save the wide Coinbase close-price table to CSV.
dataset.to_csv("coinbase_prices.csv")

  values = values.astype(str)


In [4]:
# Reload the cached Coinbase closes. The file is written from a frame that is
# indexed by date, so the first CSV column is restored as a parsed datetime
# index instead of being read back as an ordinary string column.
dataset = pd.read_csv("coinbase_prices.csv", index_col=0, parse_dates=True)

In [8]:
# Columns of the reloaded table that contain at least one missing value.
dataset.columns[dataset.isna().any()]

Index(['ACA-USD', 'ACS-USD', 'AKT-USD', 'ALI-USD', 'ALPHA-USD', 'ANT-USD',
       'ARB-USD', 'ASTR-USD', 'AUDIO-USD', 'AXL-USD', 'BLUR-USD', 'BOND-USD',
       'CHR-USD', 'CQT-USD', 'DYDX-USD', 'EGLD-USD', 'ELON-USD', 'ETHW-USD',
       'FLR-USD', 'FTM-USD', 'FXS-USD', 'GALA-USD', 'GHST-USD', 'GLMR-USD',
       'GMX-USD', 'HNT-USD', 'ICX-USD', 'KAVA-USD', 'LIT-USD', 'LSK-USD',
       'LUNA-USD', 'LUNC-USD', 'MAGIC-USD', 'MOVR-USD', 'OMG-USD', 'OSMO-USD',
       'PEPE-USD', 'PRIME-USD', 'QRDO-USD', 'QTUM-USD', 'REN-USD', 'RPL-USD',
       'RUNE-USD', 'SC-USD', 'SEI-USD', 'SLP-USD', 'STG-USD', 'SUI-USD',
       'T-USD', 'TVK-USD', 'VET-USD', 'WAVES-USD', 'WOO-USD', 'XRP-USD',
       'YGG-USD'],
      dtype='object')

In [21]:
# Build a table from the records held in `resp`.
# NOTE(review): presumably `resp` is a Coinbase products listing here, fetched
# in a cell that is not visible — confirm; other cells assign price/candle
# payloads to the same name.
prods = pd.DataFrame.from_records(resp)

In [31]:
# Display the products table.
prods

Unnamed: 0,id,base_currency,quote_currency,quote_increment,base_increment,display_name,min_market_funds,margin_enabled,post_only,limit_only,cancel_only,status,status_message,trading_disabled,fx_stablecoin,max_slippage_percentage,auction_mode,high_bid_limit_percentage
0,ANKR-USD,ANKR,USD,0.00001,1,ANKR/USD,1,False,False,False,False,online,,False,False,0.03000000,False,
1,FIDA-EUR,FIDA,EUR,0.0001,0.01,FIDA/EUR,1,False,False,False,False,delisted,,True,False,0.03000000,False,
2,ADA-USDC,ADA,USDC,0.001,0.01,ADA/USDC,1,False,False,False,False,delisted,,True,False,0.03000000,False,
3,MIR-EUR,MIR,EUR,0.001,0.01,MIR/EUR,0.84,False,False,False,False,delisted,,True,False,0.03000000,False,
4,LSETH-ETH,LSETH,ETH,0.00001,0.00001,LSETH/ETH,0.002,False,False,True,False,online,,False,True,0.01000000,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603,UNI-USD,UNI,USD,0.001,0.000001,UNI/USD,1,False,False,False,False,online,,False,False,0.03000000,False,
604,GNO-USDT,GNO,USDT,0.01,0.0001,GNO/USDT,1,False,False,False,False,delisted,,True,False,0.03000000,False,
605,RBN-USD,RBN,USD,0.00001,0.01,RBN/USD,1,False,False,False,False,online,,False,False,0.03000000,False,
606,UPI-USD,UPI,USD,0.00001,1,UPI/USD,1,False,False,False,False,delisted,,True,False,0.03000000,False,


In [32]:
# Products whose display name contains "WOO".
prods[prods.display_name.str.contains('WOO')]

Unnamed: 0,id,base_currency,quote_currency,quote_increment,base_increment,display_name,min_market_funds,margin_enabled,post_only,limit_only,cancel_only,status,status_message,trading_disabled,fx_stablecoin,max_slippage_percentage,auction_mode,high_bid_limit_percentage


In [177]:
# Row-over-row fractional change of every column, skipping the first row.
# NOTE(review): `dataset1` is not defined in any visible cell — confirm where it comes from.
dataset1.pct_change().iloc[1:]

ticker,1INCH-USD,AAVE-USD,ACH-USD,ADA-USD,AERGO-USD,AGLD-USD,ALCX-USD,ALGO-USD,ALICE-USD,AMP-USD,...,TRU-USD,UMA-USD,UNFI-USD,UNI-USD,XLM-USD,XTZ-USD,XYO-USD,YFI-USD,ZEC-USD,ZRX-USD
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-12-02,-0.011765,0.032207,-0.004974,0.014626,0.016755,0.012332,0.043309,0.013704,0.029456,-0.002825,...,0.028646,0.000583,0.006452,0.072153,0.005295,0.024900,0.002315,0.015098,0.026714,-0.004312
2022-12-03,-0.033730,-0.044749,-0.031990,0.001567,-0.016479,-0.025439,-0.039915,-0.026219,-0.029347,-0.011331,...,-0.025316,-0.016327,-0.036325,-0.039874,-0.020273,-0.034985,-0.009238,-0.025464,-0.013226,-0.016396
2022-12-04,0.020534,0.026131,0.013942,0.008761,-0.011464,0.015441,0.033259,0.007152,0.025699,0.005731,...,0.012987,0.018969,0.026608,0.027741,0.015630,0.012085,0.037296,0.105146,0.007471,0.034967
2022-12-05,-0.026157,-0.000621,-0.023630,-0.008995,-0.014273,0.166908,-0.016094,-0.012531,0.011054,-0.002849,...,-0.010256,-0.005817,0.036717,-0.014215,-0.008202,0.000995,-0.038202,-0.023395,0.012650,-0.022444
2022-12-06,-0.018595,-0.006526,0.004277,-0.003756,0.009955,0.001551,-0.004362,-0.011421,-0.014577,0.014286,...,0.005181,0.143944,0.002083,0.001944,-0.010559,0.008946,-0.007009,0.001931,-0.006031,0.004158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-27,-0.028169,-0.006844,-0.029086,-0.025542,-0.059659,-0.084332,-0.014402,-0.031618,-0.030204,-0.012295,...,0.011706,-0.012534,-0.154266,0.016812,-0.028940,-0.039239,-0.020253,-0.014398,-0.025737,-0.024603
2023-11-28,-0.005797,-0.002674,0.016738,0.021710,0.042296,-0.018632,0.007624,0.009871,-0.037879,0.024896,...,-0.061157,0.073400,0.036320,-0.018283,0.017734,0.009901,0.002584,0.005264,0.008342,0.013987
2023-11-29,-0.002915,-0.005156,-0.018094,-0.010884,0.077681,-0.011295,-0.019546,0.009023,-0.029746,0.032389,...,0.010563,-0.003085,-0.073209,-0.026073,-0.000236,0.011029,0.520619,-0.022577,0.002758,0.000487
2023-11-30,-0.002924,0.018866,0.000151,-0.014933,0.022055,0.005105,0.012862,-0.003726,-0.017133,-0.023529,...,0.003484,-0.005157,-0.026050,-0.008979,-0.001923,0.008485,0.162712,-0.003993,0.013063,-0.037431


In [2]:
# Preview the first rows of the constituent table.
constituents.head()

Unnamed: 0,Symbol,Name,DACS Sector,DACS Industry Group,DACS Industry,Reconstitution Weight
0,1inch,1inch Network,DeFi,Exchanges,Exchanges (Other),0.032945
1,aave,Aave,DeFi,Credit Platform,Lending / Borrowing,0.108995
2,aca,Acala Token,Smart Contract Platform,Layer 1,Layer 1,0.006482
3,ach,Alchemy Pay,Currency,BaaS,Payments,0.011424
4,acs,Access Protocol,Culture & Entertainment,Media,Social,0.007609



## Learning a graph structure

We use sparse inverse covariance estimation to find which quotes are
correlated conditionally on the others. Specifically, sparse inverse
covariance gives us a graph, that is a list of connections. For each
symbol, the symbols that it is connected to are those useful to explain
its fluctuations.



In [181]:
# Display `dataset1`.
# NOTE(review): `dataset1` is not defined in any visible cell — confirm where it comes from.
dataset1

ticker,1INCH-USD,AAVE-USD,ACH-USD,ADA-USD,AERGO-USD,AGLD-USD,ALCX-USD,ALGO-USD,ALICE-USD,AMP-USD,...,TRU-USD,UMA-USD,UNFI-USD,UNI-USD,XLM-USD,XTZ-USD,XYO-USD,YFI-USD,ZEC-USD,ZRX-USD
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-12-01,0.510,63.65,0.010053,0.3145,0.1134,0.2757,18.01,0.2408,1.324,0.00354,...,0.0384,1.714,4.65,5.918,0.087632,1.004,0.00432,6699.09,44.92,0.193194
2022-12-02,0.504,65.70,0.010003,0.3191,0.1153,0.2791,18.79,0.2441,1.363,0.00353,...,0.0395,1.715,4.68,6.345,0.088096,1.029,0.00433,6800.23,46.12,0.192361
2022-12-03,0.487,62.76,0.009683,0.3196,0.1134,0.2720,18.04,0.2377,1.323,0.00349,...,0.0385,1.687,4.51,6.092,0.086310,0.993,0.00429,6627.07,45.51,0.189207
2022-12-04,0.497,64.40,0.009818,0.3224,0.1121,0.2762,18.64,0.2394,1.357,0.00351,...,0.0390,1.719,4.63,6.261,0.087659,1.005,0.00445,7323.88,45.85,0.195823
2022-12-05,0.484,64.36,0.009586,0.3195,0.1105,0.3223,18.34,0.2364,1.372,0.00350,...,0.0386,1.709,4.80,6.172,0.086940,1.006,0.00428,7152.54,46.43,0.191428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-27,0.345,97.23,0.019895,0.3777,0.1655,0.8480,15.74,0.1317,1.188,0.00241,...,0.0605,1.812,12.39,6.290,0.116499,0.808,0.00387,8436.94,28.77,0.391225
2023-11-28,0.343,96.97,0.020228,0.3859,0.1725,0.8322,15.86,0.1330,1.143,0.00247,...,0.0568,1.945,12.84,6.175,0.118565,0.816,0.00388,8481.35,29.01,0.396697
2023-11-29,0.342,96.47,0.019862,0.3817,0.1859,0.8228,15.55,0.1342,1.109,0.00255,...,0.0574,1.939,11.90,6.014,0.118537,0.825,0.00590,8289.87,29.09,0.396890
2023-11-30,0.341,98.29,0.019865,0.3760,0.1900,0.8270,15.75,0.1337,1.090,0.00249,...,0.0576,1.929,11.59,5.960,0.118309,0.832,0.00686,8256.77,29.47,0.382034


In [182]:
# Fit an empirical covariance estimator on the values of `dataset1` and
# display the estimated covariance matrix.
# NOTE(review): `dataset1` is not defined in any visible cell; if it holds
# price levels rather than returns, this is a covariance of prices — confirm
# that is intended.
X = dataset1.values 
emp_cov = covariance.EmpiricalCovariance()

# `cov` is bound to whatever fit() returns; its covariance_ attribute is shown.
cov = emp_cov.fit(X)
cov.covariance_



array([[1.19662539e-02, 6.22788128e-01, 5.36003167e-04, ...,
        1.07459506e+02, 6.97347534e-01, 1.95976255e-03],
       [6.22788128e-01, 1.51263491e+02, 3.71860883e-02, ...,
        1.12312601e+04, 2.65015843e+01, 5.96867397e-01],
       [5.36003167e-04, 3.71860883e-02, 8.52180868e-05, ...,
        8.75178591e+00, 9.16463645e-03, 2.36777436e-04],
       ...,
       [1.07459506e+02, 1.12312601e+04, 8.75178591e+00, ...,
        2.10935401e+06, 4.39697223e+03, 6.56726911e+01],
       [6.97347534e-01, 2.65015843e+01, 9.16463645e-03, ...,
        4.39697223e+03, 5.22123191e+01, 2.61586639e-02],
       [1.95976255e-03, 5.96867397e-01, 2.36777436e-04, ...,
        6.56726911e+01, 2.61586639e-02, 3.90810946e-03]])

In [44]:
# Returns: fractional change between consecutive rows of the price table,
# skipping the first row; then the pairwise covariance of the columns.
rets = dataset.pct_change().iloc[1:]
cov = rets.cov()

In [None]:
# Learn a sparse graph structure with a cross-validated graphical lasso.
alphas = np.logspace(-1.5, 1, num=10)
edge_model = covariance.GraphicalLassoCV(alphas=alphas)

# standardize the time series: using correlations rather than covariance
# former is more efficient for structure recovery
# `variation` from the adapted example is never defined in this notebook; the
# returns computed in the covariance cell (`rets`, one row per day and one
# column per asset) are used instead. A NumPy copy is taken so the in-place
# scaling below does not modify `rets`.
X = rets.to_numpy(copy=True)
X /= X.std(axis=0)


edge_model.fit(X)

## Clustering using affinity propagation

We use clustering to group together quotes that behave similarly. Here,
amongst the various clustering techniques available in scikit-learn,
we use affinity propagation (`cluster.affinity_propagation`) as it does
not enforce equal-size clusters, and it can choose automatically the
number of clusters from the data.

Note that this gives us a different indication than the graph, as the
graph reflects conditional relations between variables, while the
clustering reflects marginal properties: variables clustered together can
be considered as having a similar impact at the level of the full stock
market.



In [45]:
# Cluster the assets with affinity propagation, passing the return covariance
# table as the similarity matrix, then list the members of each cluster.
#_, labels = cluster.affinity_propagation(edge_model.covariance_, random_state=0)
_, labels = cluster.affinity_propagation(cov, random_state=0)
n_labels = labels.max()

asset_names = cov.index.values
for cluster_id in range(n_labels + 1):
    members = ', '.join(asset_names[labels == cluster_id])
    print(f"Cluster {cluster_id + 1}: {members}")

Cluster 1: 0x, ARPA, Aave, Aavegotchi, Aergo, Akash Network, Alchemy Pay, Algorand, Alpha Coin, Amp, Ampleforth Governance Token, Ankr, Aptos, Artificial Liquid Intelligence, Astar, Audius, Avalanche, Axie Infinity, Badger DAO, Balancer, Bancor, Band Protocol, BarnBridge, Basic Attention Token, Biconomy, Bitcoin, Bitcoin Cash, Bluzelle, COTI, Cardano, Cartesi, Celer Network, Chainlink, Chiliz, Chromia, Civic, Coin98, Compound, Convex Finance, Cosmos, Covalent, Cronos, Dash, Decentraland, Dogecoin, Dogelon Mars, EOS, Enjin Coin, Enzyme, Ethereum Classic, Ethereum Name Service, Ethernity Chain, Fantom, Fetch.ai, Filecoin, Flow, Frax Share, GMT, GMX, Galxe, Gitcoin, Gods Unchained, Golem, Hashflow, Hedera, Helium, Highstreet, ICON, IDEX, Illuvium, Immutable X, Injective, Internet Computer, IoTeX, JasmyCoin, Kava, Kusama, Kyber Network Crystal, Lido DAO, Liquity, Lisk, Litecoin, Litentry, Livepeer, Loopring, MAGIC, Mask Network, Mina, Mines of Dalarnia, Moonbeam, Moonriver, MultiversX, My 

In [193]:
# Display the covariance table.
cov

ticker,1INCH-USD,AAVE-USD,ACH-USD,ADA-USD,AERGO-USD,AGLD-USD,ALCX-USD,ALGO-USD,ALICE-USD,AMP-USD,...,TRU-USD,UMA-USD,UNFI-USD,UNI-USD,XLM-USD,XTZ-USD,XYO-USD,YFI-USD,ZEC-USD,ZRX-USD
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1INCH-USD,0.011999,0.624494,0.000537,0.004638,0.001231,-0.007304,0.359878,0.005949,0.039204,0.000116,...,0.001328,0.027547,-0.034584,0.073282,-0.001075,0.018066,0.000108,1.077539e+02,0.699258,0.001965
AAVE-USD,0.624494,151.677912,0.037288,0.452713,0.239238,1.055477,22.537626,0.253925,2.022751,0.005015,...,0.094622,1.680298,13.269469,5.297721,0.046068,1.088445,0.006587,1.126203e+04,26.574191,0.598503
ACH-USD,0.000537,0.037288,0.000085,0.000319,0.000090,0.000165,0.020700,0.000178,0.001945,0.000004,...,0.000143,0.001822,-0.000730,0.002281,-0.000013,0.001003,0.000003,8.775763e+00,0.009190,0.000237
ADA-USD,0.004638,0.452713,0.000319,0.002943,0.000901,-0.000111,0.151855,0.002049,0.016127,0.000039,...,0.000586,0.014165,0.011940,0.029624,-0.000162,0.007525,0.000043,5.982823e+01,0.219449,0.002145
AERGO-USD,0.001231,0.239238,0.000090,0.000901,0.000486,0.001540,0.044317,0.000503,0.004051,0.000010,...,0.000199,0.003697,0.024381,0.008835,0.000040,0.002057,0.000012,2.392500e+01,0.047387,0.001233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XTZ-USD,0.018066,1.088445,0.001003,0.007525,0.002057,-0.008394,0.572445,0.008780,0.060722,0.000177,...,0.002337,0.043944,-0.051084,0.117488,-0.001255,0.029559,0.000162,1.853410e+02,0.982486,0.003591
XYO-USD,0.000108,0.006587,0.000003,0.000043,0.000012,-0.000076,0.003216,0.000057,0.000367,0.000001,...,0.000010,0.000241,-0.000221,0.000743,-0.000011,0.000162,0.000001,9.004531e-01,0.006801,0.000017
YFI-USD,107.753915,11262.030654,8.775763,59.828233,23.925001,28.362326,3722.076263,45.631626,355.573546,0.972934,...,20.522608,305.675793,367.858069,743.411035,-2.092490,185.340958,0.900453,2.115133e+06,4409.018733,65.852616
ZEC-USD,0.699258,26.574191,0.009190,0.219449,0.047387,-0.667286,18.805195,0.384900,2.252644,0.007239,...,0.039401,1.361224,-3.615688,4.717113,-0.079505,0.982486,0.006801,4.409019e+03,52.355367,0.026230


In [189]:
# Ticker -> company name mapping.
# NOTE(review): these are equity tickers, presumably left over from the
# scikit-learn stock-market example this notebook adapts — confirm whether
# the cell is still needed.
symbol_dict = dict(
    SNY="Sanofi-Aventis",
    NVS="Novartis",
    KMB="Kimberly-Clark",
    R="Ryder",
    GD="General Dynamics",
    RTN="Raytheon",
    CVS="CVS",
    CAT="Caterpillar",
    DD="DuPont de Nemours",
)

# Sort the (symbol, name) pairs by symbol, then split the resulting 2-column
# array into two parallel arrays.
symbols, names = np.array(sorted(symbol_dict.items())).transpose()

In [191]:
# Display the names array.
names

array(['Caterpillar', 'CVS', 'DuPont de Nemours', 'General Dynamics',
       'Kimberly-Clark', 'Novartis', 'Ryder', 'Raytheon',
       'Sanofi-Aventis'], dtype='<U17')

## Embedding in 2D space

For visualization purposes, we need to lay out the different symbols on a
2D canvas. For this we use `manifold` techniques to retrieve 2D
embedding.
We use a dense eigen_solver to achieve reproducibility (arpack is initialized
with random vectors that we don't control). In addition, we use a large
number of neighbors to capture the large-scale structure.



In [46]:
# Lay the nodes (one per asset) out on a 2D plane for plotting, using a
# locally linear embedding of the covariance matrix.

X = cov.values

node_position_model = manifold.LocallyLinearEmbedding(
    n_neighbors=6, n_components=2, eigen_solver="dense"
)

# Transpose the fitted coordinates so that embedding[0] and embedding[1] are
# the two coordinate rows.
coords = node_position_model.fit_transform(X.T)
embedding = coords.T

## Visualization

The outputs of the 3 models are combined in a 2D graph where nodes
represent the assets and edges the conditional dependencies between them:

- cluster labels are used to define the color of the nodes
- the sparse covariance model is used to display the strength of the edges
- the 2D embedding is used to position the nodes in the plane

This example has a fair amount of visualization-related code, as
visualization is crucial here to display the graph. One of the challenges
is to position the labels minimizing overlap. For this we use a
heuristic based on the direction of the nearest neighbor along each
axis.



In [None]:
# Draw the market graph: nodes are placed by the 2D embedding and coloured by
# cluster label, edges come from the sparse precision matrix.
plt.figure(1, facecolor="w", figsize=(10, 8))
plt.clf()
ax = plt.axes([0.0, 0.0, 1.0, 1.0])
plt.axis("off")

# Plot the graph of partial correlations: rescale the precision matrix by
# 1/sqrt(diagonal) along both axes.
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
# Only the strict upper triangle is kept so each pair yields at most one edge.
non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02

# Plot the nodes using the coordinates of our embedding
plt.scatter(
    embedding[0], embedding[1], s=100 * d**2, c=labels, cmap=plt.cm.nipy_spectral
)

# Plot the edges
start_idx, end_idx = np.where(non_zero)
# a sequence of (*line0*, *line1*, *line2*), where::
#            linen = (x0, y0), (x1, y1), ... (xm, ym)
segments = [
    [embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx)
]
values = np.abs(partial_correlations[non_zero])
# With no edge above the threshold `values` is empty and values.max() would
# raise, so the edge collection is only drawn when there is something to draw.
if values.size > 0:
    lc = LineCollection(
        segments, zorder=0, cmap=plt.cm.hot_r, norm=plt.Normalize(0, 0.7 * values.max())
    )
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)

# Label each node with the asset it represents. The labels and the embedding
# are both computed from `cov`, so its index gives the matching names (the
# `names` array defined earlier holds unrelated equity names).
node_names = cov.index
# Avoid dividing by zero in the colour lookup when there is a single cluster.
label_scale = float(n_labels) if n_labels > 0 else 1.0

# Add a label to each node. The challenge here is that we want to
# position the labels to avoid overlap with other labels
for index, (name, label, (x, y)) in enumerate(zip(node_names, labels, embedding.T)):
    dx = x - embedding[0]
    dx[index] = 1
    dy = y - embedding[1]
    dy[index] = 1
    # Offset direction is taken from the node that is nearest along the other axis.
    this_dx = dx[np.argmin(np.abs(dy))]
    this_dy = dy[np.argmin(np.abs(dx))]
    if this_dx > 0:
        horizontalalignment = "left"
        x = x + 0.002
    else:
        horizontalalignment = "right"
        x = x - 0.002
    if this_dy > 0:
        verticalalignment = "bottom"
        y = y + 0.002
    else:
        verticalalignment = "top"
        y = y - 0.002
    plt.text(
        x,
        y,
        name,
        size=10,
        horizontalalignment=horizontalalignment,
        verticalalignment=verticalalignment,
        bbox=dict(
            facecolor="w",
            edgecolor=plt.cm.nipy_spectral(label / label_scale),
            alpha=0.6,
        ),
    )

# np.ptp() replaces the ndarray.ptp() method, which NumPy 2.0 removed.
x_span = np.ptp(embedding[0])
y_span = np.ptp(embedding[1])
plt.xlim(
    embedding[0].min() - 0.15 * x_span,
    embedding[0].max() + 0.10 * x_span,
)
plt.ylim(
    embedding[1].min() - 0.03 * y_span,
    embedding[1].max() + 0.03 * y_span,
)

plt.show()