# Calibration of the Average Oracle eigenvalues

2021-10-20

author: Emanuele Sorgente
Note: the computation takes about a minute.

In [2]:
import os
# To speed up multiprocessing. Force numpy to work on a single thread
os.environ["OMP_NUM_THREADS"] = "1"

from multiprocessing import Pool

In [3]:
import numpy as np
import pandas as pd

In [4]:
# fastcluster is available on pip. It is only needed by clustThr (Filter2 in selectData), so you may not need it if you drop that filter.
import fastcluster
from scipy.cluster.hierarchy import fcluster

def clustThr(ret,thr=0.95):
    '''
    Keep one representative stock per cluster of highly correlated stocks.

    input
        ret: returns (stocks by rows, days by columns)
        thr: correlation threshold; the clustering is cut at distance 1-thr

    output
        row indices of ret, one per cluster (first stock found with each label)
    '''
    n_stocks = ret.shape[0]

    # Pairwise correlations of the strict upper triangle, as a flat vector
    upper = np.triu_indices(n_stocks,1)
    corr_pairs = np.corrcoef(ret)[upper]

    # Cluster on the distance 1-correlation and cut the tree at 1-thr
    tree = fastcluster.single(1-corr_pairs)
    labels = fcluster(tree,1-thr,criterion='distance')

    # Position of the first stock carrying each distinct cluster label
    _,first_idx = np.unique(labels,return_index=True)
    return first_idx

In [5]:
def get_sortest_eig(C):
    '''
    Eigen-decomposition of a symmetric matrix, ordered by increasing eigenvalue.

    input
        C: correlation matrix

    output:
        l: eigenvalues, in ascending order
        v: eigenvectors (column k goes with eigenvalue k)
    '''
    eigvals,eigvecs = np.linalg.eigh(C)

    # Apply the same permutation to the values and to the eigenvector columns
    rank = np.argsort(eigvals)
    return eigvals[rank],eigvecs[:,rank]

In [6]:
def selectData(t,dtin,dtout,N):
    '''
    Draw one random (in-sample, out-of-sample) pair of return matrices
    from the global return matrix R (days by rows, stocks by columns).

    input
        t: index of today
        dtin: in-sample window size
        dtout: out-of-sample window size
        N: number of stocks
    output
        rin: in-sample returns (stocks by rows, days by columns)
        rout: out-of-sample returns (stocks by rows, days by columns)
        (None,None) is returned when fewer than N stocks pass the filters

    Note: In case you do not need some of the following filters, comment them out
    '''

    # Rows t-dtin ... t+dtout-1 of R, keeping only the stocks without NaN there
    window = R.iloc[t-dtin:t+dtout].dropna(axis=1).values.T

    # The first dtin days are in-sample, the remaining ones out-of-sample
    rin = window[:,:dtin]
    rout = window[:,dtin:]

    #Filter1: Remove stocks with 20% or more of zero returns in-sample
    zero_frac = (rin==0).mean(axis=1)
    keep = zero_frac<0.2
    rin,rout = rin[keep],rout[keep]

    #Filter2: Keep a single stock out of each group that is linked in-sample by
    #correlations above 0.95 (probably errors in our dataset)
    keep = clustThr(rin,thr=0.95)
    rin,rout = rin[keep],rout[keep]

    # Not enough stocks left to draw N of them
    n_left = rin.shape[0]
    if N>n_left:
        return None,None

    #Randomly select N stocks among the remaining ones
    picked = np.random.choice(range(n_left),size=N,replace=False)
    return rin[picked],rout[picked]


In [7]:
def get_Oracle(x):
    '''
    Oracle eigenvalues for one random calibration sample.

    input
        x: tuple of (t,dtin,dtout,N)
            t: index of today
            dtin: in-sample window size
            dtout: out-of-sample window size
            N: number of stocks

    output:
        oracle eigenvalues, or None when selectData could not provide N stocks
    '''

    t,dtin,dtout,N = x

    # in-sample and out-of-sample returns
    rin,rout = selectData(t,dtin,dtout,N)
    if rin is None:
        return None

    # eigenvectors of the in-sample correlation matrix (eigenvalues are not needed)
    _,v_in = get_sortest_eig(np.corrcoef(rin))

    # out-of-sample correlation matrix
    C_out = np.corrcoef(rout)

    # oracle: diagonal of the out-of-sample correlation in the in-sample eigenbasis
    return (v_in.T @ C_out @ v_in).diagonal()

In [8]:

def _reseed_worker():
    # Forked workers start with a copy of the parent's NumPy global RNG state.
    # Reseed from OS entropy so each worker draws different random stock subsets.
    np.random.seed()


def compute_AO(R,N,dtin,dtout,Nsamples,ncpu=None):
    '''
    Average-Oracle eigenvalues: mean of the oracle eigenvalues over random samples.

    input
        R: return matrix (days by rows); only its number of rows is used here.
           The workers read the returns from the global R through selectData,
           so pass that same object.
        N: number of stocks
        dtin: in-sample window size
        dtout: out-of-sample window size
        Nsamples: number of random (day, stock subset) samples to draw
        ncpu: number of worker processes (None: multiprocessing's default)

    output
        AO: vector of N Average-Oracle eigenvalues, in the order of get_Oracle's
            output. If no sample is valid (get_Oracle returned None every time)
            a vector of N NaN is returned.

    The result is also printed.
    '''

    #Available days for the calibration
    n_days = R.shape[0]
    avail_t = np.arange(dtin,n_days-dtout)

    # random selection of the inputs
    days = np.random.choice(avail_t,size=Nsamples,replace=True)
    conf_input = [(t,dtin,dtout,N) for t in days]

    # The with-block releases the pool even if a worker raises
    with Pool(processes=ncpu,initializer=_reseed_worker) as p:
        AO = p.map(get_Oracle,conf_input,chunksize=1000)

    # Drop the samples for which fewer than N stocks were available
    AO = [i for i in AO if i is not None]

    if AO:
        # Average-Oracle eigenvalues
        AO = np.mean(AO,axis=0)
    else:
        # Keep the output shape so callers can still take len(AO)
        AO = np.full(N,np.nan)

    print(AO)
    return AO

# you can now save the AO values

In [9]:
# Return matrix [Global variable] (Stocks by columns, days by rows)
# The file values x are mapped to exp(x)-1 (presumably the file stores log-returns -- TODO confirm)
R = pd.read_csv('/home/damien/work/data/finance/index/US/sp1000.csv.gz',index_col=0)
R = np.exp(R)-1
R.index = pd.DatetimeIndex(R.index)
# Select the calibration window: calendar years 1995 to 2004 included
R = R.loc[(R.index.year>=1995) & (R.index.year<=2004)]

# Total number of days
Tmax = len(R)

In [10]:
# Numbers of stocks to calibrate: 10, 11, ..., 499 (or choose any fixed value)
Ns = np.arange(10,500)

In [None]:
import pickle

# Calibration settings
dtin=50          # in-sample window size
dtout=252        # out-of-sample window size
Nsamples=10000   # random samples drawn for each N

file_AO="AO_Tin50.pkl"

# One Average-Oracle vector per value of N, in the order of Ns
AOs=[]
for N in Ns:
    print(N)
    AOs.append(compute_AO(R,N,dtin,dtout,Nsamples))
    # Checkpoint the results obtained so far after every N.
    # The with-block closes the file each time instead of leaking one handle per iteration.
    with open(file_AO,"wb") as fh:
        pickle.dump(AOs,fh)

In [13]:
# Export one set of eigenvalues to CSV as a single column named 'lambdas'.
# NOTE(review): AO_500 is not defined in any cell shown here -- presumably the
# compute_AO output for N=500; confirm where it comes from before running this cell.
pd.DataFrame({'lambdas':AO_500}).to_csv("AO_500.csv")

In [37]:
# Pad every Average-Oracle vector with trailing NaN up to length max(Ns),
# so that all of them have the same length
AO_full=[]

for AO in AOs:
    nan_pads = np.full(np.max(Ns)-len(AO),np.nan)
    AO_full.append(np.concatenate((AO,nan_pads)))

In [40]:
# Stack the padded vectors: one row per entry of AOs
AO_full_matrix = np.asarray(AO_full).reshape((len(AOs),len(AO_full[0])))

In [46]:
# One column per value of N (labelled with Ns), one row per eigenvalue index
AO_DF = pd.DataFrame(AO_full_matrix.T,columns=Ns)
AO_DF

Unnamed: 0,10,11,12,13,14,15,16,17,18,19,...,490,491,492,493,494,495,496,497,498,499
0,0.820475,0.817424,0.816211,0.816463,0.811535,0.813291,0.811721,0.811449,0.811400,0.810021,...,0.746617,0.744508,0.741566,0.747134,0.740722,0.740720,0.740036,0.740070,0.740063,0.734928
1,0.841253,0.838007,0.835823,0.833212,0.829061,0.828064,0.827962,0.826869,0.826280,0.826592,...,0.739699,0.740400,0.735789,0.736660,0.738019,0.737577,0.730659,0.737430,0.734723,0.732586
2,0.855915,0.849662,0.847851,0.844485,0.839345,0.838152,0.836575,0.835624,0.834108,0.832393,...,0.739699,0.740417,0.737118,0.737114,0.736031,0.734879,0.732536,0.734294,0.731328,0.733988
3,0.870841,0.861650,0.855793,0.853798,0.847331,0.846879,0.846126,0.842045,0.840965,0.840090,...,0.740085,0.741232,0.737760,0.739927,0.740059,0.736902,0.734936,0.735580,0.734075,0.733928
4,0.879421,0.872243,0.868440,0.864347,0.856170,0.853822,0.851418,0.848851,0.846612,0.843562,...,0.742042,0.742051,0.740727,0.739811,0.737475,0.738573,0.733779,0.737467,0.734766,0.734845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,,,,,,,,,,,...,,,,,,95.177875,6.109013,3.326051,2.545245,1.820739
495,,,,,,,,,,,...,,,,,,,96.024118,6.130092,3.343982,2.538789
496,,,,,,,,,,,...,,,,,,,,95.831079,6.160580,3.340269
497,,,,,,,,,,,...,,,,,,,,,96.693067,6.075418


In [47]:
# Save the NaN-padded table of Average-Oracle eigenvalues (one column per N)
AO_DF.to_csv("AO_Tin50.csv.gz")

In [None]:
def filterCorrelation_AO(C, l_AO):
    '''
    Replace the eigenvalues of a correlation matrix, keeping its eigenvectors.

    input
        C: correlation matrix (N by N)
        l_AO: replacement eigenvalues, ordered like the output of
              get_sortest_eig (ascending eigenvalues of C). Either a vector
              of length N or an N by N matrix holding them on its diagonal.

    output
        C_AO: v @ diag(l_AO) @ v.T, where the columns of v are the eigenvectors of C
    '''
    l_AO = np.asarray(l_AO)
    _,v = get_sortest_eig(C)

    if l_AO.ndim==1:
        # With a vector, v @ l_AO is a matrix-vector product and the result would
        # collapse to a vector. Scale column k of v by l_AO[k] instead, which
        # equals v @ diag(l_AO).
        return (v*l_AO) @ v.T

    # l_AO already given as a matrix
    return v @ l_AO @ v.T


def filterCovariance_AO(Sigma, l_AO):
    '''
    Apply filterCorrelation_AO to the correlation part of a covariance matrix.

    input
        Sigma: covariance matrix
        l_AO: eigenvalues passed on to filterCorrelation_AO

    output
        Sigma_AO: filtered correlation matrix rescaled by the original
                  standard deviations
    '''
    # Standard deviations and the matrix of their pairwise products
    vol = np.sqrt(np.diag(Sigma))
    scale = np.outer(vol,vol)

    # Covariance -> correlation, filter, then back to covariance
    return filterCorrelation_AO(Sigma/scale, l_AO)*scale