# Implementation step by step of the cluster based shrinkage correlation matrix

In [46]:
import numpy as np
import pandas as pd
import numpy.linalg as LA
from sklearn.cluster import KMeans

In [30]:
X_matrix = pd.read_csv('./data/sp_500_returns.csv', index_col='Date')

In [31]:
X_matrix.head()

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABC,ABMD,ABT,ACN,ADBE,ADI,...,WYNN,XEL,XLNX,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-04,-0.00795,-0.017578,0.023444,0.005374,0.003415,-0.007671,-0.003879,-0.000378,-0.01015,-0.001388,...,-0.017197,-0.006584,-0.010635,0.000233,-0.000283,0.033015,0.006829,-0.013534,-0.007559,0.022063
2012-01-05,0.022382,0.087475,0.006363,0.011102,0.007592,0.008835,-0.002301,-0.019082,0.007072,0.004447,...,-0.010446,0.004418,0.00888,-0.003022,0.000848,-0.014247,0.007631,0.01184,-0.00141,0.028005
2012-01-06,0.010811,0.023766,0.004637,0.010454,0.002598,-0.007116,-0.009047,-0.001733,0.008427,-0.006641,...,-0.035186,-0.002933,-0.006486,-0.007463,-0.005932,-0.016797,0.007237,0.0026,-0.008192,0.001135
2012-01-09,0.026203,0.021429,-0.00028,-0.001586,-0.000518,0.003859,-0.000179,-0.001736,-0.006616,0.019499,...,-0.002805,0.000735,0.021759,0.004464,-0.004831,0.003973,-0.003843,0.00741,-0.004557,0.013605
2012-01-10,0.01876,0.0,0.000839,0.00358,0.011408,-0.002746,0.004655,0.017201,0.023484,0.004372,...,0.008346,0.008082,0.001825,0.002573,0.014278,0.012267,0.006206,0.020964,0.009156,0.016219


## Obtaining $R$

In [32]:
R_matrix = np.corrcoef(X_matrix.T)

In [33]:
R_matrix

array([[1.        , 0.29372562, 0.29238002, ..., 0.44798073, 0.46050623,
        0.41458942],
       [0.29372562, 1.        , 0.25589812, ..., 0.35511135, 0.2528106 ,
        0.42792454],
       [0.29238002, 0.25589812, 1.        , ..., 0.30030208, 0.27456568,
        0.34156475],
       ...,
       [0.44798073, 0.35511135, 0.30030208, ..., 1.        , 0.33233173,
        0.43178304],
       [0.46050623, 0.2528106 , 0.27456568, ..., 0.33233173, 1.        ,
        0.35792117],
       [0.41458942, 0.42792454, 0.34156475, ..., 0.43178304, 0.35792117,
        1.        ]])

## Getting the distance matrix $D$

In [34]:
D_matrix = 1 - R_matrix

In [35]:
D_matrix

array([[2.22044605e-16, 7.06274376e-01, 7.07619977e-01, ...,
        5.52019273e-01, 5.39493770e-01, 5.85410575e-01],
       [7.06274376e-01, 0.00000000e+00, 7.44101881e-01, ...,
        6.44888655e-01, 7.47189397e-01, 5.72075464e-01],
       [7.07619977e-01, 7.44101881e-01, 0.00000000e+00, ...,
        6.99697919e-01, 7.25434322e-01, 6.58435246e-01],
       ...,
       [5.52019273e-01, 6.44888655e-01, 6.99697919e-01, ...,
        0.00000000e+00, 6.67668273e-01, 5.68216961e-01],
       [5.39493770e-01, 7.47189397e-01, 7.25434322e-01, ...,
        6.67668273e-01, 0.00000000e+00, 6.42078832e-01],
       [5.85410575e-01, 5.72075464e-01, 6.58435246e-01, ...,
        5.68216961e-01, 6.42078832e-01, 2.22044605e-16]])

## Determining the optimal number of clusters through the eigenvalues of $R$

In [36]:
N = X_matrix.shape[0]
T = X_matrix.shape[1]

In [37]:
q = N/T

In [38]:
lambda_plus = (1 + np.sqrt(q))**2
lambda_plus

11.162500473138333

Test if the equality in (9) holds:

In [39]:
1+ 2*(np.sqrt(N/T))+ N/T

11.162500473138333

Obtaining the eigenvectors of $R$

In [43]:
lambdas_i = LA.eigvals(R_matrix)

In [44]:
K = np.sum(lambdas_i > lambda_plus)

In [45]:
K

3

## Implementing the clustering algorithm

Notar que se está utilizando la inicialización propuesta por Begusic.

In [50]:
C_kmeans = KMeans(n_clusters=K, random_state=0, init='k-means++').fit(X_matrix.T)

In [52]:
C_kmeans.labels_

array([2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 2, 2, 2, 0,
       2, 0, 1, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 0, 2, 2, 0, 1, 1, 2, 1,
       2, 2, 1, 0, 1, 0, 0, 2, 0, 2, 2, 0, 1, 2, 2, 0, 0, 0, 2, 1, 2, 2,
       2, 2, 2, 0, 1, 0, 1, 2, 0, 1, 1, 0, 1, 0, 2, 0, 2, 0, 1, 2, 2, 2,
       1, 1, 1, 0, 2, 1, 2, 0, 1, 2, 1, 0, 2, 0, 1, 1, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 1, 0, 1, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 1, 1, 0, 2,
       1, 0, 1, 1, 2, 0, 0, 2, 2, 2, 2, 1, 2, 1, 2, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 2, 1, 2, 0, 1, 0, 2, 2, 0, 2, 2, 1, 2, 2, 2, 0, 0, 0, 0,
       1, 2, 2, 0, 2, 1, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 0, 0,
       2, 0, 0, 2, 2, 2, 0, 1, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       0, 0, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 1, 2, 0, 1, 0, 0, 2, 1, 0, 0,
       1, 1, 0, 2, 2, 2, 2, 2, 0, 1, 1, 0, 1, 2, 2, 1, 0, 0, 0, 0, 2, 1,
       0, 2, 1, 2, 2, 2, 1, 2, 0, 0, 0, 1, 2, 0, 2, 2, 2, 1, 0, 0, 2, 1,
       0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 1, 1, 2, 1, 2,

## Creating the S matrix

In [None]:
def get_s_matrix(K, R_matrix, C_kmeans):
    labels = np.unique(C_kmeans.labels_)
    

In [69]:
labels = np.unique(C_kmeans.labels_)

In [76]:
c_ps = [np.sum(C_kmeans.labels_ == label) for label in labels]

In [64]:
S_matrix = np.zeros((K, K))
S_matrix

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [80]:
for i in range(K):
    for j in range(K):
        if i == j:
            constant = 1/(c_ps[i] * (c_ps[i]-1))
            print(constant)
        else:
            constant = 1/(2*c_ps[i]*c_ps[j])
            print(constant)
        

4.856254856254856e-05
3.306878306878307e-05
1.6456029489204843e-05
3.306878306878307e-05
9.157509157509158e-05
2.2568269013766644e-05
1.6456029489204843e-05
2.2568269013766644e-05
2.2568269013766644e-05


In [70]:
labels[0]

0

In [73]:
np.sum(C_kmeans.labels_ == 0)

144

In [59]:
np.unique(C_kmeans.labels_)

array([0, 1, 2], dtype=int32)