In [2]:
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
import pandas as pd
import numpy as np


In [3]:
def make_corr_matrix(groups: list, corrs: list)->np.ndarray:
    """Build a block-diagonal correlation matrix.

    Each entry of ``groups`` is the size of one block of variables; every
    off-diagonal entry inside block ``i`` is set to ``corrs[i]``. Entries
    between different blocks stay at 0 and the main diagonal is set to 1.

    Parameters
    ----------
    groups : list
        Block sizes, in the order the blocks appear along the diagonal.
    corrs : list
        Within-block correlation for each block; ``corrs[i]`` is used for
        ``groups[i]``. Entries beyond ``len(groups)`` are ignored.

    Returns
    -------
    np.ndarray
        Square array of shape ``(sum(groups), sum(groups))``.

    Raises
    ------
    IndexError
        If ``corrs`` has fewer entries than ``groups``.
    """
    total = sum(groups)
    mat_holder = np.zeros(shape=(total, total))
    start = 0
    for index, group in enumerate(groups):
        stop = start + group
        # Fill the whole square block in one slice assignment; the first
        # block needs no special case because its start offset is simply 0.
        mat_holder[start:stop, start:stop] = corrs[index]
        start = stop
    # The block fill also wrote corrs[i] on the diagonal; reset it to 1.
    np.fill_diagonal(mat_holder, 1.0)
    return mat_holder

In [17]:
# Three blocks of 5, 2 and 3 variables with within-block correlations 0.5, 0.8 and 0.7.
corr_matrix = make_corr_matrix(groups=[5, 2, 3], corrs=[0.5, 0.8, 0.7])

In [5]:
# Number of observations (rows) to draw when simulating the sample.
num_obs = 300

In [18]:
# Draw num_obs samples from a zero-mean multivariate normal whose covariance is corr_matrix.
# A seeded Generator makes the draw reproducible after Restart & Run All.
# NOTE(review): corr_matrix is reassigned further down to the sample correlation;
# re-running this cell after that one would sample from the estimate instead.
daily_returns = np.random.default_rng(seed=0).multivariate_normal(
    mean=np.zeros(corr_matrix.shape[0]),
    cov=corr_matrix,
    size=num_obs,
)

In [11]:
# Display the simulated sample (one row per observation).
# NOTE(review): this cell's execution count (In [11]) is lower than that of the cell
# that generates daily_returns (In [18]), so the saved output may be stale.
daily_returns

array([[-0.54595891,  1.58839105, -0.39157113, ..., -0.77583979,
        -1.05367518, -0.6995698 ],
       [-0.4349153 , -1.19989567, -0.31710805, ..., -0.54708636,
        -1.89051816, -0.37257508],
       [ 0.079308  ,  1.23748031,  0.12371209, ...,  0.75989623,
        -0.06262072,  0.63824463],
       ...,
       [ 0.45298218,  0.52452412,  0.50984958, ..., -0.52504802,
        -1.23840126, -0.22314827],
       [-0.42727682,  1.06228966, -0.41830433, ...,  1.24566159,
         0.47199675,  0.89813745],
       [-0.60721759, -0.53151234,  1.32468138, ..., -0.78239125,
        -0.44648184, -0.18391569]])

In [31]:
# Check the sample dimensions: (observations, variables).
daily_returns.shape

(300, 10)

In [19]:
# Estimate the correlation matrix from the sample, treating columns as variables.
# This rebinds corr_matrix, replacing the matrix used to generate the data.
corr_matrix = np.corrcoef(daily_returns, rowvar=False)

In [12]:
# Display the correlation matrix estimated from the sample.
# NOTE(review): this cell's execution count (In [12]) is lower than that of the cell
# that computes the sample correlation (In [19]), so the saved output may be stale.
corr_matrix

array([[ 1.        ,  0.10132382,  0.09699756,  0.08191097,  0.0653825 ,
        -0.03966415,  0.02056098, -0.0295454 ,  0.00604266,  0.0686173 ],
       [ 0.10132382,  1.        ,  0.07235393,  0.00177305,  0.08709511,
        -0.13592496, -0.03470837,  0.01747125, -0.02454694, -0.03016468],
       [ 0.09699756,  0.07235393,  1.        ,  0.06948882,  0.12793192,
         0.13153147,  0.07440259, -0.01049844,  0.09860462,  0.02080205],
       [ 0.08191097,  0.00177305,  0.06948882,  1.        ,  0.05440259,
        -0.05882895,  0.06544377,  0.03126454,  0.06895593,  0.04183491],
       [ 0.0653825 ,  0.08709511,  0.12793192,  0.05440259,  1.        ,
        -0.04954097, -0.00338218, -0.03083727,  0.03939755,  0.00700091],
       [-0.03966415, -0.13592496,  0.13153147, -0.05882895, -0.04954097,
         1.        ,  0.3750248 , -0.08371248, -0.05602752, -0.04422681],
       [ 0.02056098, -0.03470837,  0.07440259,  0.06544377, -0.00338218,
         0.3750248 ,  1.        , -0.15909873

In [20]:
# Correlation-to-distance transform d_ij = sqrt(2 * (1 - rho_ij)), rounded to 7 decimals
# (presumably to remove floating-point noise such as near-zero diagonal entries — confirm).
D_matrix = np.around(np.sqrt(2*(1- corr_matrix)), decimals=7)

In [16]:
# Display the distance matrix derived from the correlations.
# NOTE(review): this cell's execution count (In [16]) is lower than that of the cell
# that computes D_matrix (In [20]), so the saved output may be stale.
D_matrix

array([[0.       , 1.3406537, 1.3438768, 1.3550565, 1.3671997, 1.4419876,
        1.3995992, 1.4349532, 1.4099343, 1.3648316],
       [1.3406537, 0.       , 1.3620911, 1.4129593, 1.3512253, 1.5072657,
        1.4385467, 1.4018051, 1.4314656, 1.4353847],
       [1.3438768, 1.3620911, 0.       , 1.3641929, 1.3206575, 1.3179291,
        1.3605862, 1.4216177, 1.3426804, 1.399427 ],
       [1.3550565, 1.4129593, 1.3641929, 0.       , 1.3752072, 1.4552175,
        1.3671549, 1.3919306, 1.3645835, 1.3843158],
       [1.3671997, 1.3512253, 1.3206575, 1.3752072, 0.       , 1.4488209,
        1.4166031, 1.4358532, 1.3860754, 1.4092545],
       [1.4419876, 1.5072657, 1.3179291, 1.4552175, 1.4488209, 0.       ,
        1.1180118, 1.4722177, 1.4532911, 1.4451483],
       [1.3995992, 1.4385467, 1.3605862, 1.3671549, 1.4166031, 1.1180118,
        0.       , 1.5225628, 1.4916898, 1.4367182],
       [1.4349532, 1.4018051, 1.4216177, 1.3919306, 1.4358532, 1.4722177,
        1.5225628, 0.       , 0.81084

## K-medoides con disimilaridades

In [21]:
# K-medoids (PAM) with 3 clusters, fitted directly on the precomputed distance matrix.
dis_k_medioids = KMedoids(
    n_clusters=3,
    metric='precomputed',
    method='pam',
    init='k-medoids++',
    random_state=0,
).fit(D_matrix)

In [23]:
# Cluster label assigned to each row of D_matrix by the precomputed-distance K-medoids fit.
dis_k_medioids.labels_

array([2, 2, 2, 2, 2, 0, 0, 1, 1, 1])

## K-medoides con matriz de observaciones + MDS

In [32]:
# Metric MDS on a precomputed dissimilarity matrix.
# random_state is fixed so the embedding is reproducible, matching the
# random_state=0 used by the clustering estimators in this notebook.
# NOTE(review): n_components=300 is far larger than the number of points embedded
# (the saved run shows 10 rows) — confirm this is intentional.
mds_embedding_distancias = MDS(
    n_components=300,
    normalized_stress='auto',
    dissimilarity='precomputed',
    metric=True,
    random_state=0,
)

In [34]:
# Fit the MDS model on the precomputed distance matrix and keep the embedded
# coordinates (one row per row of D_matrix).
mds_coordinates = mds_embedding_distancias.fit_transform(D_matrix)

In [36]:
# Check the embedding dimensions: (points, MDS components).
mds_coordinates.shape

(10, 300)

In [38]:
# K-medoids (PAM) with 3 clusters on the MDS coordinates, using Euclidean distance.
dis_k_medioids_mds = KMedoids(
    n_clusters=3,
    metric='euclidean',
    method='pam',
    init='k-medoids++',
    random_state=0,
).fit(mds_coordinates)

In [39]:
# Cluster label assigned to each embedded point by K-medoids on the MDS coordinates.
dis_k_medioids_mds.labels_

array([2, 2, 2, 2, 2, 0, 0, 1, 1, 1])

## K-means + MDS

In [41]:
# K-means with 3 clusters on the same MDS coordinates, for comparison with K-medoids.
k_means_mds = KMeans(
    n_clusters=3,
    init="k-means++",
    random_state=0,
).fit(mds_coordinates)



In [43]:
# Cluster label assigned to each embedded point by K-means on the MDS coordinates.
k_means_mds.labels_

array([2, 2, 2, 2, 2, 0, 0, 1, 1, 1], dtype=int32)