In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import kneed
import scipy.spatial.distance as ssd
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.metrics import calinski_harabasz_score
from hcaa_implementation import hcaa_alocation
import rie_estimator
import csestimator


In [5]:
precios_americano = pd.read_csv(
    "./sp_500_original_clean.csv",
    index_col="Date",
    parse_dates=True,
)

precios_europeo = pd.read_csv(
    "./european_market_original_clean.csv",
    index_col="Date",
    parse_dates=True,
)

retornos_american = (np.log(precios_americano) - np.log(precios_americano.shift(1))).iloc[1:]
retornos_europeo = (np.log(precios_europeo) - np.log(precios_europeo.shift(1))).iloc[1:]


In [27]:
def get_optimal_k_eigen(corr_matrix, N, T):
    eigenvals = np.linalg.eigvals(corr_matrix)
    count = (eigenvals > 1 + 2 * np.sqrt(N / T) + N / T).sum()
    return count

def get_optimal_k_calenski(dataset, bottom_range, top_range, corr_function):
    corr_mat = corr_function(dataset.T)
    D_matrix = np.sqrt(2 * (1 - corr_mat))
    D_matrix = np.around(D_matrix, decimals=7)
    D_condensed = ssd.squareform(D_matrix)
    Z = linkage(D_condensed, "ward", optimal_ordering=True)
    indices = []
    for i in range(bottom_range, top_range):
        labels = fcluster(Z, i, criterion="maxclust")
        indices.append(calinski_harabasz_score(dataset.T, labels))
    # pd.Series(indices).plot()
    print(
        kneed.KneeLocator(
            range(bottom_range, top_range),
            indices,
            curve="convex",
            direction="decreasing",
        ).knee
    )
    return kneed.KneeLocator(
        range(bottom_range, top_range), indices, curve="convex", direction="decreasing"
    ).knee


def get_optimal_k_calenski_rie(dataset, bottom_range, top_range, corr_function):
    corr_mat = corr_function(dataset)
    D_matrix = np.sqrt(2 * (1 - corr_mat))
    D_matrix = np.around(D_matrix, decimals=7)
    D_condensed = ssd.squareform(D_matrix)
    Z = linkage(D_condensed, "ward", optimal_ordering=True)
    indices = []
    for i in range(bottom_range, top_range):
        labels = fcluster(Z, i, criterion="maxclust")
        indices.append(calinski_harabasz_score(dataset.T, labels))
    print(
        kneed.KneeLocator(
            range(bottom_range, top_range),
            indices,
            curve="convex",
            direction="decreasing",
        ).knee
    )
    return kneed.KneeLocator(
        range(bottom_range, top_range), indices, curve="convex", direction="decreasing"
    ).knee


## Determinación de número óptimo de grupos

### Valores Propios

In [54]:
rie_estimator_american = rie_estimator.get_rie(retornos_american, True, True)
pearson_estimator_american = np.corrcoef(retornos_american.values.T)
eca_estimator_american = csestimator.get_shrinkage_est(retornos_american, alpha = 0.5)

k_optimal_eigen_american_rie = get_optimal_k_eigen(rie_estimator_american, retornos_american.shape[0], retornos_american.shape[1])
k_optimal_eigen_american_pearson = get_optimal_k_eigen(pearson_estimator_american, retornos_american.shape[0], retornos_american.shape[1])
k_optimal_eigen_american_eca = get_optimal_k_eigen(eca_estimator_american, retornos_american.shape[0], retornos_american.shape[1])

print(k_optimal_eigen_american_rie)
print(k_optimal_eigen_american_pearson)
print(k_optimal_eigen_american_eca)


3
3
4


In [55]:
rie_estimator_european = rie_estimator.get_rie(retornos_europeo, False, True)
pearson_estimator_european = np.corrcoef(retornos_europeo.values.T)
eca_estimator_european = csestimator.get_shrinkage_est(retornos_europeo, alpha = 0.5)

k_optimal_eigen_european = get_optimal_k_eigen(rie_estimator_european, retornos_europeo.shape[0], retornos_europeo.shape[1])
k_optimal_eigen_european_pearson = get_optimal_k_eigen(pearson_estimator_european, retornos_europeo.shape[0], retornos_europeo.shape[1])
k_optimal_eigen_european_eca = get_optimal_k_eigen(eca_estimator_european, retornos_europeo.shape[0], retornos_europeo.shape[1])


print(k_optimal_eigen_european)
print(k_optimal_eigen_european_pearson)
print(k_optimal_eigen_european_eca)

1
1
1


### Calenski

In [56]:
k_optimal_calenski_american_rie = get_optimal_k_calenski_rie(retornos_american, 2, 50, rie_estimator.get_rie)
k_optimal_calenski_american_pearson = get_optimal_k_calenski(retornos_american, 2, 50, np.corrcoef)
k_optimal_calenski_american_eca = get_optimal_k_calenski(retornos_american, 2, 50, np.corrcoef)

2
2
2


In [57]:
k_optimal_calenski_european_rie = get_optimal_k_calenski_rie(retornos_europeo, 2, 50, rie_estimator.get_rie)
k_optimal_calenski_european_pearson = get_optimal_k_calenski(retornos_europeo, 2, 50, np.corrcoef)
k_optimal_calenski_european_eca = get_optimal_k_calenski(retornos_europeo, 2, 50, np.corrcoef)

10
11
11
