<a href="https://colab.research.google.com/github/kenwkliu/ideas/blob/master/colab/StockClustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

%load_ext google.colab.data_table 
%matplotlib inline

In [None]:
# Load the table of highly correlated HK stock pairs from GitHub.
# Later cells read its 'stockA', 'stockB' and 'corr' columns.
PAIRS_CSV_URL = 'https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/hkCorrelatedPairs.csv'
pairsDf = pd.read_csv(PAIRS_CSV_URL)
pairsDf

In [None]:
# Load the historical adjusted close prices (one column per stock).
# The first CSV column is used as the row index -- presumably the trading date; confirm against the file.
PRICES_CSV_URL = 'https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/researchHKStocksAdjClosePx.csv'
researchData = pd.read_csv(PRICES_CSV_URL, index_col=0)
researchData

In [None]:
# Summary statistics (central tendency, dispersion and shape) for every price column,
# transposed so each stock is one row, and rounded to 4 decimal places
summaryStats = researchData.describe()
summaryStats.transpose().round(4)

In [None]:
# Cluster based on the stock return and volatility.
# This cell builds the two per-stock features and standardizes them.

TRADING_DAYS = 252  # number of trading days used to annualize daily figures

# Daily percentage change of every price column, computed once and reused for both features
dailyReturns = researchData.pct_change()

returns = pd.DataFrame({
    # Annualized return: mean daily return scaled up by the number of trading days
    'returns': dailyReturns.mean() * TRADING_DAYS,
    # Annualized volatility: daily standard deviation scaled by sqrt(trading days)
    'volatility': dailyReturns.std() * np.sqrt(TRADING_DAYS),
})

# Standardize both features to a similar scale so neither dominates the clustering.
# fit_transform() fits the scaler itself, so no separate fit() call is needed beforehand;
# `scale` is left fitted for any later use.
scale = StandardScaler()
scaledReturns = pd.DataFrame(scale.fit_transform(returns), columns=returns.columns, index=returns.index)
scaledReturns

In [None]:
# Use K-Means to cluster the stocks and the elbow method to find a suitable k
from sklearn.cluster import KMeans
from sklearn import metrics

K = range(1, 15)
distortions = []

# Drop stocks with a missing feature before fitting.
# The cleaned frame is bound back to the same name because later cells fit on scaledReturns too.
scaledReturns = scaledReturns.dropna()

# Fit one model per candidate k and record its inertia
# (sum of squared distances of samples to their closest cluster centre).
for k in K:
    # Fixed random_state and explicit n_init make the curve identical on every run
    # and independent of the scikit-learn version's default for n_init.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(scaledReturns)
    distortions.append(kmeans.inertia_)

# Plot inertia against k; the "elbow" is where adding clusters stops helping much
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Distortion')
ax.set_title('Elbow Method')
ax.grid(True)
plt.show()

In [None]:
# Pick the number of clusters
c = 8

# Fit the model. The fixed random_state keeps the cluster ids stable between runs,
# so tables derived from the labels in later cells are reproducible.
k_means = KMeans(n_clusters=c, n_init=10, random_state=42)
prediction = k_means.fit_predict(scaledReturns)  # cluster id per stock; same values as k_means.labels_

# Plot the results
centroids = k_means.cluster_centers_
fig, ax = plt.subplots(figsize=(18, 10))

# One point per stock, coloured by its cluster id
scatter = ax.scatter(scaledReturns.iloc[:, 0], scaledReturns.iloc[:, 1], c=k_means.labels_, cmap="rainbow")

# Write each stock's name next to its point. scatter(label=...) only sets a single
# legend entry for the whole collection, so it cannot label individual points.
for stock, (x, y) in zip(scaledReturns.index, scaledReturns.iloc[:, :2].to_numpy()):
    ax.annotate(str(stock), (x, y), xytext=(3, 3), textcoords='offset points', fontsize=8)

# Green squares mark the cluster centres
ax.plot(centroids[:, 0], centroids[:, 1], 'sg', markersize=10)

ax.set_title('k-Means Cluster Analysis Results')
# The plotted values are the standardized features, not raw percentages
ax.set_xlabel('Annualized Return (standardized)')
ax.set_ylabel('Annualized Volatility (standardized)')

fig.colorbar(scatter, ax=ax, label='Cluster')
plt.show()

In [None]:
# Plot the number of stocks in each cluster
clustered_series_all = pd.Series(index=scaledReturns.index, data=k_means.labels_.flatten())

# Keep only stocks with a real cluster id. K-Means labels every sample with a cluster
# index, so nothing is -1 here and this keeps all rows; the filter only matters if the
# labels come from an algorithm that marks noise as -1.
clustered_series = clustered_series_all[clustered_series_all != -1]

# value_counts() orders by frequency, so sort by cluster id and plot against the ids
# themselves. Otherwise the bar position would be the size rank, not the cluster number.
clusterCounts = clustered_series.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(clusterCounts.index, clusterCounts.values)
ax.set_yticks(clusterCounts.index)  # one tick per cluster id
ax.set_title('Clusters')

ax.set_xlabel('Stocks per Cluster')
ax.set_ylabel('Cluster Number')

plt.show()

In [None]:
# Tabulate every stock next to the cluster it was assigned to.
# to_frame() names the value column 0 and reset_index() names the former index 'index';
# both are renamed to readable column names.
clusterDf = (
    clustered_series
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'stock', 0: 'cluster'})
)
clusterDf

In [None]:
# Merge the cluster info back to the stock pairs
cols = ['stockA', 'stockB', 'corr', 'cluster_A', 'cluster_B']

# Look up stockA's cluster first, then stockB's. The second merge meets the 'stock' and
# 'cluster' columns left by the first, and the suffixes turn them into *_A / *_B.
# Left joins keep every pair; a stock without a cluster gets NaN.
withClusterA = pairsDf.merge(clusterDf, how='left', left_on='stockA', right_on='stock')
withBothClusters = withClusterA.merge(clusterDf, how='left', left_on='stockB', right_on='stock', suffixes=('_A', '_B'))

# assign() returns a new frame, which avoids the SettingWithCopyWarning raised when a
# column is added to a column-subset slice. NaN never equals NaN, so a pair with a
# missing cluster on either side is reported as sameCluster == False.
clusterPairsDf = withBothClusters[cols].assign(
    sameCluster=lambda df: df['cluster_A'] == df['cluster_B']
)

clusterPairsDf

In [None]:
 # Time-series based clustering: https://tslearn.readthedocs.io/en/stable/user_guide/clustering.html

!pip install tslearn
from tslearn.clustering import TimeSeriesKMeans

In [None]:
from sklearn.preprocessing import Normalizer

# Remove every row (date) on which at least one stock has no price.
# NOTE: this modifies researchData itself, so later cells see the reduced frame.
researchData.dropna(inplace=True)

# Transpose so that each row is one stock's full price series; the stock names are
# dropped from the index because only the numeric values are needed here.
pricesByStock = researchData.T.reset_index(drop=True)

# Normalize the price series: Normalizer rescales each row (with its default settings,
# to unit L2 norm), putting the series of differently priced stocks on a comparable scale.
normalizer = Normalizer()
researchDataPriceNormalized = normalizer.fit_transform(pricesByStock)
researchDataPriceNormalized

In [None]:
# Cluster the normalized price series with time-series K-Means, comparing series by
# "Dynamic Time Warping" (dtw) distance. Reuses the cluster count `c` chosen earlier.
# random_state fixes the initialisation so the labels are reproducible across runs.
tskm = TimeSeriesKMeans(n_clusters=c, metric="dtw", random_state=42)
tskm.fit(researchDataPriceNormalized)

# Cluster id assigned to each input series, in the same order as the rows that were fitted
tskm_labels = tskm.labels_
tskm_labels

In [None]:
# Pair every stock (the columns of researchData) with its time-series cluster label,
# naming the column at construction time instead of renaming it afterwards
stock_names = researchData.columns
tskmDf = pd.DataFrame({'cluster': tskm_labels}, index=stock_names)
tskmDf