# Clustering analysis

## 1. Loading the essential packages

In [None]:
from pandas import Series,DataFrame
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
import numpy
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np

## 2. Import the dataset and obtain the preprocessed data

### 2.1 Get the dataset for a certain year
Data for 2020 is used first as an example.

In [None]:
# Load every submission row, then keep only reports whose period ended in the target year.
target_year = 2020
df = pd.read_excel("all_submission_files.xlsx")
in_target_year = df["report_end_date"].dt.year == target_year
data = df[in_target_year]
data.head(6)

### 2.2 Summarize the dataset
Get the number of stocks and the total amount of each company’s investment in stocks in 2020.

In [None]:
# One row per filer (cik): the sum of `value` and the count of rows with a value
# (the count is used as the number of stocks held).
data_summary = data.groupby('cik').agg({'value': ['sum', 'count']})
index_name = data_summary.index
# Materialise the underlying array once instead of re-evaluating
# `data_summary.values` twice per row inside a Python loop.
summary_values = data_summary.values
total_value = list(summary_values[:, 0])    # column 0 -> 'sum'
number_stocks = list(summary_values[:, 1])  # column 1 -> 'count'
data_final = DataFrame({'cik': index_name, 'number_stocks': number_stocks, 'total_value': total_value})
# Remove filers whose holdings sum to exactly zero (the values are log-transformed later).
data_final = data_final[data_final['total_value'] != 0]
data_final.head(n=6)

### 2.3 Adjust the data scale
Take the natural logarithm of total_value and number_stocks.

In [None]:
# Work on a copy so data_final keeps the untransformed values.
data_log = data_final.copy()
for column in ('number_stocks', 'total_value'):
    # Replace each feature with its natural logarithm.
    data_log[column] = numpy.log(data_log[column])
data_log.head(6)

## 3. Visualize the data

### 3.1 Plot the dataset (log) 

In [None]:
# Label only the extreme points: log total value above 15 or below 11.
is_extreme = (data_log.total_value > 15) | (data_log.total_value < 11)
data_label = data_log[is_extreme]
# sns.lmplot is a figure-level function that always creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty extra figure behind and
# its size is ignored. Size the plot through height/aspect instead
# (height 13, width = height * aspect = 12, matching the intended 12 x 13 figure).
grid = sns.lmplot(x='number_stocks', y='total_value', data=data_log,
                  fit_reg=True, height=13, aspect=12 / 13)
fig = plt.gcf()  # keep `fig` bound, now to the figure that actually holds the plot
# Annotate each extreme point with its cik on the regression plot's axes.
for cik, x, y in zip(data_label['cik'], data_label['number_stocks'], data_label['total_value']):
    plt.annotate(cik, (x, y))

### 3.2 Get the suitable number of clusters 

In [None]:
# Features used for clustering: everything in data_log except the identifier.
data_analysis = data_log.drop(columns="cik")
# Try k = 1..10; with fewer than 11 samples the upper bound shrinks to the sample count.
n = min(11, len(data_analysis))
candidate_ks = range(1, n)
wcss = []
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0).fit(data_analysis)
    # inertia_ is the within-cluster sum of squares for this k.
    wcss.append(kmeans.inertia_)
plt.figure()
plt.plot(candidate_ks, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')

### 3.3 Clustering analysis (two components and 6 clusters)

In [None]:
# Project the features onto two principal components for plotting and clustering.
pca = PCA(n_components=2)
data_cluster = pca.fit_transform(data_analysis)
# Seed KMeans (same random_state / n_init as the elbow analysis) so the cluster
# numbering is reproducible; later cells pick clusters by their label index.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=0)
label = kmeans.fit_predict(data_cluster)
centroids = kmeans.cluster_centers_
u_labels = np.unique(label)
plt.figure()
# One scatter series per cluster so each gets its own colour and legend entry.
for cluster_id in u_labels:
    members = data_cluster[label == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], label=cluster_id)
# Overlay the cluster centres.
plt.scatter(centroids[:, 0], centroids[:, 1], s=80, color="black")
plt.legend()


### 3.4 Obtain the CIKs of each cluster

In [None]:
def get_cik(label_n):
    """Return the cik of every row assigned to cluster ``label_n``.

    Reads the notebook globals ``label`` (one cluster label per row, in the
    same row order as ``data_log``) and ``data_log`` (which holds the ``cik``
    column). The cluster mask is applied directly to the ``cik`` column
    instead of inverse-transforming the PCA coordinates and matching on a
    rounded ``total_value``: that lookup returned the wrong cik whenever two
    filers shared the same total value and could raise ``IndexError`` when
    rounding after the round trip did not reproduce the stored value.

    Args:
        label_n: Cluster label to select.

    Returns:
        A list of cik values, in row order; empty if no row has that label.
    """
    # Positional (not index-based) selection: data_log's index has gaps after
    # rows were dropped, while `label` is a plain positional array.
    cik_values = data_log['cik'].values
    return list(cik_values[label == label_n])

In [None]:
# cik_all[j] holds the ciks of the cluster whose label is u_labels[j].
cik_all = [get_cik(cluster_label) for cluster_label in u_labels]
len(cik_all)

In [None]:
cik_all[3] # all ciks of the fourth cluster (the fourth entry of u_labels)