In [None]:
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import math as m
from scipy.sparse import csr_matrix
import scipy.sparse as ss
import sklearn.cluster as sl
import networkx as nx
# The source code for signet can be pulled from GitHub
from signet.cluster import Cluster 
# signet.utils can also be pulled from GitHub
from signet.utils import sqrtinvdiag, invdiag, cut, merge, objscore
from sklearn.metrics.cluster import adjusted_rand_score

# Sample Code
- Based on the paper "Correlation Matrix Clustering for Statistical Arbitrage Portfolios" (Cartea, Cucuringu, and Jin, 2023)

In [None]:
#some clustering algorithms I implemented

I = 1j  # imaginary unit; HermitianClustering multiplies its input matrix by this
from sklearn.cluster import KMeans 

def HermitianClustering(n_clusters, matrix): #Hermitian Clustering
    """Cluster the columns of ``matrix`` from the spectrum of ``matrix * 1j``.

    The matrix is multiplied by the imaginary unit and eigendecomposed. The
    ``n_clusters`` eigenvectors whose eigenvalues have the largest real part
    are kept; their real and imaginary parts are placed side by side to form
    the feature matrix handed to KMeans (fixed ``random_state=10``).

    Parameters
    ----------
    n_clusters : int
        Number of eigenvectors to keep and number of KMeans clusters.
    matrix : pandas.DataFrame
        Square matrix whose column labels identify the items being clustered.
        NOTE(review): presumably skew-symmetric, so that ``matrix * 1j`` is
        Hermitian with real eigenvalues -- TODO confirm against callers.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'index' (column labels of ``matrix``) and 'cluster'.
    """
    rotated = matrix * I
    spectrum, vectors = np.linalg.eig(rotated)

    # Only the real part of each eigenvalue is used for ranking.
    order = np.argsort(spectrum.real)
    leading = vectors[:, order[-n_clusters:]]

    # Real and imaginary components become separate real-valued features.
    features = np.concatenate((leading.real, leading.imag), axis=1)

    model = KMeans(n_clusters=n_clusters, random_state=10)
    model.fit(features)

    rows = list(zip(matrix.columns, model.labels_))
    return pd.DataFrame(rows, columns=['index', 'cluster'])

from sklearn.cluster import SpectralClustering #Spectral Clustering

def clustering(df,num,random=0): #Spectral Clustering
    """Spectral clustering on the absolute value of an adjacency matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Square adjacency matrix with matching row and column labels.
    num : int
        Number of clusters.
    random : int, default 0
        Passed to ``SpectralClustering`` as ``random_state``.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'index' (column labels of ``df``) and 'cluster'.
    """
    weights = df.abs()

    # Round-trip through networkx, exactly as the affinity was built before.
    graph = nx.from_pandas_adjacency(weights)
    affinity = nx.to_pandas_adjacency(graph)

    model = SpectralClustering(
        num,
        assign_labels='discretize',
        affinity='precomputed',
        n_init=200,
        random_state=random,
    )
    labels = model.fit(affinity).labels_

    rows = list(zip(weights.columns, labels))
    return pd.DataFrame(rows, columns=['index', 'cluster'])
def split_by_cluster(data, df):#
    """Split a square matrix into one sub-matrix per cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Cluster assignment with columns 'index' (labels) and 'cluster'.
    df : pandas.DataFrame
        Matrix whose rows and columns are both addressed by the labels found
        in ``data['index']``.

    Returns
    -------
    list of pandas.DataFrame
        One block per distinct cluster id, in the sorted order produced by
        ``np.unique``; each block keeps only the rows and columns of the
        labels assigned to that cluster.
    """
    blocks = []
    for label in np.unique(data['cluster']):
        members = data.loc[data['cluster'] == label, 'index']
        # Select the cluster's columns first, then the matching rows.
        blocks.append(df[members].loc[members])
    return blocks


def SPONGE_Clustering(df, num,method='regular'): # SPONGE
    """Cluster a signed matrix with signet's SPONGE or SPONGE_sym.

    The matrix is split into a non-negative part (entries >= 0) and the
    magnitude of its non-positive part (entries <= 0); both are passed as
    sparse matrices to ``signet.cluster.Cluster``.

    Parameters
    ----------
    df : pandas.DataFrame
        Square signed matrix; column labels identify the items clustered.
    num : int
        Number of clusters (passed as ``k``).
    method : {'regular', 'sym'}, default 'regular'
        'regular' calls ``Cluster.SPONGE``; 'sym' calls ``Cluster.SPONGE_sym``.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'index' (column labels of ``df``) and 'cluster'.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported values.
    """
    # Fail fast: previously an unknown method fell through both branches and
    # surfaced as an UnboundLocalError on `predictions`.
    if method not in ('regular', 'sym'):
        raise ValueError(
            "method must be 'regular' or 'sym', got {!r}".format(method)
        )

    # Entries failing the mask become NaN and are then zero-filled.
    df_pos = df[df>=0].fillna(0)
    df_neg = -df[df<=0].fillna(0)
    c = Cluster((csr_matrix(df_pos.values), csr_matrix(df_neg.values)))
    if method == 'regular':
        predictions = c.SPONGE(k=num, tau_p=1, tau_n=1, eigens=None, mi=None)
    else:
        predictions = c.SPONGE_sym(k=num, tau_p=1, tau_n=1, eigens=None, mi=None)
    result = pd.DataFrame(df.columns)
    result.columns = ['index']
    result['cluster'] = predictions
    return result

def laplacian_rw(c, k=2, eigens=None, mi=None, random_state=None): # Laplacian random walk clustering
    """Cluster a signed graph from a random-walk style signed Laplacian.

    Builds ``eye - (invdiag(c.D_p) * c.p - invdiag(c.D_n) * c.n)``, takes the
    ``eigens`` eigenpairs with the smallest real part, divides each
    eigenvector by its eigenvalue, and runs KMeans on the real part.

    Parameters
    ----------
    c : signet.cluster.Cluster
        Object exposing ``p``, ``n``, ``D_p``, ``D_n`` and ``size``.
        NOTE(review): presumably the positive/negative adjacency matrices and
        their degree matrices -- confirm against signet.
    k : int, default 2
        Number of clusters.
    eigens : int, optional
        Number of eigenpairs to compute; defaults to ``k``.
    mi : int, optional
        Maximum number of eigensolver iterations; defaults to ``c.size``.
    random_state : int or None, default None
        Forwarded to KMeans only. ``None`` keeps the previous unseeded
        behaviour; pass an int for a reproducible KMeans step.

    Returns
    -------
    numpy.ndarray
        Cluster label for each node (``KMeans.labels_``).
    """
    if eigens is None:
        eigens = k
    if mi is None:
        mi = c.size

    eye = ss.eye(c.size, format="csc")

    d = invdiag(c.D_p)
    matrix = d * c.p
    d = invdiag(c.D_n)
    matrix = matrix - (d * c.n)
    matrix = eye - matrix

    # The general (non-symmetric) solver is always used here: the former
    # `symmetric` flag was unconditionally reset to False before it was read,
    # so the eigsh branch could never run.
    (w, v) = ss.linalg.eigs(matrix, eigens, maxiter=mi, which='SR')

    # Weight eigenvectors by the inverse of their eigenvalues, since
    # eigenvectors with smaller eigenvalues are more likely to be informative.
    # NOTE(review): an eigenvalue of exactly zero would produce inf/nan here.
    v = v / w
    v = np.atleast_2d(v)
    v = np.real(v)

    x = sl.KMeans(n_clusters=k, random_state=random_state).fit(v)

    return x.labels_

def Laplacian_Clustering(df, num,method='sym'): #vanilla laplacian clustering
    """Cluster a signed matrix with a signed-Laplacian spectral method.

    The matrix is split into a non-negative part (entries >= 0) and the
    magnitude of its non-positive part (entries <= 0); both are passed as
    sparse matrices to ``signet.cluster.Cluster``.

    Parameters
    ----------
    df : pandas.DataFrame
        Square signed matrix; column labels identify the items clustered.
    num : int
        Number of clusters (passed as ``k``).
    method : {'sym', 'rw'}, default 'sym'
        'sym' calls ``Cluster.spectral_cluster_laplacian`` with
        ``normalisation='sym_sep'``; 'rw' calls ``laplacian_rw``.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'index' (column labels of ``df``) and 'cluster'.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported values.
    """
    # Fail fast: previously an unknown method fell through both branches and
    # surfaced as an UnboundLocalError on `predictions`.
    if method not in ('sym', 'rw'):
        raise ValueError(
            "method must be 'sym' or 'rw', got {!r}".format(method)
        )

    # Entries failing the mask become NaN and are then zero-filled.
    df_pos = df[df>=0].fillna(0)
    df_neg = -df[df<=0].fillna(0)
    c = Cluster((csr_matrix(df_pos.values), csr_matrix(df_neg.values)))
    if method == 'sym':
        predictions = c.spectral_cluster_laplacian(k=num, normalisation='sym_sep', eigens=None, mi=None)
    else:
        predictions = laplacian_rw(c, k=num, eigens=None, mi=None)
    result = pd.DataFrame(df.columns)
    result.columns = ['index']
    result['cluster'] = predictions
    return result

def Signed_Spectral_Clustering(df, num):
    """Cluster a signed matrix with signet's adjacency-based spectral method.

    Parameters
    ----------
    df : pandas.DataFrame
        Square signed matrix; column labels identify the items clustered.
    num : int
        Number of clusters (passed as ``k``).

    Returns
    -------
    pandas.DataFrame
        Two columns: 'index' (column labels of ``df``) and 'cluster'.
    """
    # Non-negative entries form the positive part; the magnitudes of the
    # non-positive entries form the negative part. Masked-out cells become 0.
    positive = csr_matrix(df[df >= 0].fillna(0).values)
    negative = csr_matrix((-df[df <= 0].fillna(0)).values)

    signed_graph = Cluster((positive, negative))
    labels = signed_graph.spectral_cluster_adjacency(k=num, normalisation='sym')

    return pd.DataFrame({'index': df.columns}).assign(cluster=labels)



In [None]:
#Code for plotting the cluster vs. sector membership
# NOTE(review): `cluster_SPONGE` is not defined in any cell shown here. This
# cell assumes an earlier cell produced it with 'index', 'cluster' and a
# numeric 'industry' column (codes 1-12) -- TODO confirm.

# seaborn is used below but is missing from the notebook's import cell.
import seaborn as sns

merged = cluster_SPONGE.set_index('index').sort_values('cluster')
indust = {1:'Non-durables',2: 'Durables',3:"Manufacturing",4:"Energy",5:'Chemicals',6:'Business Equipments',
         7:'Telecom',8:'Utilities',9:'Shops',10:'Healthcare',11:'Finance',12:'Other'}
# Bracket assignment always targets the column; attribute-style assignment
# only does so when the column already exists.
merged['industry'] = merged['industry'].map(indust)
sns.set_style("ticks")
# NOTE(review): the returned palette is discarded, so this call presumably has
# no effect on the figure (the scatter uses cmap='viridis') -- confirm intent.
sns.color_palette("tab10")
plt.figure(figsize=(10, 7))
plt.scatter(merged.index,merged['industry'],marker='|',c = merged['cluster'],linewidths=1.5,cmap='viridis')

# Compare neighbouring rows by position. `merged` is indexed by label, so
# integer lookups such as merged['cluster'][i] are label-based (or rely on a
# deprecated positional fallback) and are not safe here.
cluster_ids = merged['cluster'].to_numpy()
for i in range(1, len(cluster_ids)):
    # if the cluster has changed, add a black dashed separator
    if cluster_ids[i] != cluster_ids[i - 1]:
        plt.plot([merged.index[i], merged.index[i-1]], [-0.5, 12], 'k--', linewidth=1)
plt.gcf().set_dpi(1000)
plt.title('Comparing Clusters and Sectors of Stocks 2019-2022')
# NOTE(review): "pdf" does not look like a font family name; matplotlib will
# presumably fall back to its default font -- confirm the intended font.
plt.yticks(fontsize=12,fontname="pdf")
plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)