<a href="https://colab.research.google.com/github/kenwkliu/ideas/blob/master/colab/StockClustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

%load_ext google.colab.data_table 
%matplotlib inline

In [None]:
# Load the stock info workbook (Excel file hosted on GitHub) into a DataFrame.
STOCKS_INFO_URL = (
    'https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/hkStocksQuotes.xlsx'
)
stocksInfo = pd.read_excel(STOCKS_INFO_URL)

# Bare last expression so the notebook renders the table.
stocksInfo

In [None]:
# Read the stock historical adjusted close prices; the first CSV column is used as the index.
RESEARCH_PX_URL = (
    'https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/researchHKStocksAdjClosePx.csv'
)
researchData = pd.read_csv(RESEARCH_PX_URL, index_col=0)

# Bare last expression so the notebook renders the table.
researchData

In [None]:
# Cluster based on the stock performance and volatility

# Factor used to scale the mean (x factor) and standard deviation (x sqrt(factor))
# of the row-to-row percentage changes.
# NOTE(review): 252 is the more common trading-days-per-year convention - confirm 266 is intended.
ANNUALIZATION_FACTOR = 266

# Percentage change between consecutive rows of researchData, computed once and reused
# for both features below.
pctChanges = researchData.pct_change()

# One row per column of researchData, with two features:
#   returns    - mean percentage change scaled by the factor
#   volatility - standard deviation of the percentage change scaled by sqrt(factor)
returns = pd.DataFrame({
    'returns': pctChanges.mean() * ANNUALIZATION_FACTOR,
    'volatility': pctChanges.std() * np.sqrt(ANNUALIZATION_FACTOR),
})

# Standardize both features. fit_transform() fits the scaler and transforms in one
# pass, so a separate .fit() beforehand is redundant; `scale` is still left fitted.
scale = StandardScaler()
scaledReturns = pd.DataFrame(
    scale.fit_transform(returns),
    columns=returns.columns,
    index=returns.index,
)
scaledReturns

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics

# Elbow method: fit k-means for a range of cluster counts and plot the inertia of each fit.

# Drop rows with missing values before clustering. Rebinding (rather than inplace=True)
# avoids mutating the frame object that an earlier cell displayed; later cells still
# see the NaN-free frame under the same name.
scaledReturns = scaledReturns.dropna()

# Candidate cluster counts 1..14, capped at the number of samples because KMeans
# raises when n_clusters exceeds n_samples.
K = range(1, min(15, len(scaledReturns) + 1))
distortions = []

#Fit the method
for k in K:
    # Fixed random_state makes the curve reproducible across runs; n_init is pinned
    # explicitly because its default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(scaledReturns)
    # inertia_ is the within-cluster sum of squared distances for this fit.
    distortions.append(kmeans.inertia_)

#Plot the results
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Distortion')
ax.set_title('Elbow Method')
ax.grid(True)
plt.show()

In [None]:
# Number of clusters to fit.
c = 6

#Fit the model
# Fixed random_state keeps cluster labels stable between runs; n_init is pinned
# explicitly because its default differs between scikit-learn versions.
k_means = KMeans(n_clusters=c, n_init=10, random_state=42)
# fit_predict fits the model and returns the cluster label of every row.
prediction = k_means.fit_predict(scaledReturns)

#Plot the results
centroids = k_means.cluster_centers_
fig, ax = plt.subplots(figsize=(18, 10))

# Column 0 is the scaled 'returns' feature and column 1 the scaled 'volatility' feature;
# points are coloured by their cluster label.
scatter = ax.scatter(
    scaledReturns.iloc[:, 0],
    scaledReturns.iloc[:, 1],
    c=k_means.labels_,
    cmap="rainbow",
)
# Cluster centres drawn as green squares on the same axes.
ax.plot(centroids[:, 0], centroids[:, 1], 'sg', markersize=10)

ax.set_title('k-Means Cluster Analysis Results')
# The plotted values are StandardScaler outputs, not raw returns / volatility.
ax.set_xlabel('Mean Return (standardized)')
ax.set_ylabel('Volatility (standardized)')

fig.colorbar(scatter, ax=ax)
plt.show()

In [None]:
# Cluster label of every stock, indexed like scaledReturns.
clustered_series_all = pd.Series(index=scaledReturns.index, data=k_means.labels_.flatten())
# Exclude the label -1. KMeans labels run from 0 to n_clusters-1, so this filter is a
# no-op here; it is kept so the cell still behaves sensibly if the clustering step is
# swapped for an algorithm that emits -1.
clustered_series = clustered_series_all[clustered_series_all != -1]

# value_counts() orders by frequency; sort by cluster id so each bar is drawn at the
# position of the cluster it actually belongs to.
cluster_sizes = clustered_series.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(cluster_sizes.index, cluster_sizes.values)
ax.set_yticks(cluster_sizes.index)
ax.set_title('Clusters')

ax.set_xlabel('Stocks per Cluster')
ax.set_ylabel('Cluster Number')

plt.show()

In [None]:
# Turn the label Series into a one-column DataFrame; the default column name (0)
# is renamed to 'cluster'.
clusterDf = clustered_series.to_frame().rename(columns={0: 'cluster'})
clusterDf