# 分析各分群組成 (隨機森林特徵重要性)

In [2]:
import numpy as np
import pandas as pd
from pyecharts.charts import Bar
from pyecharts import options as opts
from src.describe import summary_cluster, one_hot_encode_clusters, random_forest_train, render_importance_rank

設定cluster csv路徑並讀取資料

In [3]:
# Location of the clustering-result CSV analysed by this notebook.
cluster_path = './cluster-result/1714460704_som_sklearn/cluster_20.csv'
# Load it into a DataFrame; later cells pass `cluster` to summary_cluster()
# and one_hot_encode_clusters().
# NOTE(review): the column schema is defined by the upstream clustering step
# and is not visible here.
cluster = pd.read_csv(cluster_path)

根據cluster進行劃分

In [4]:

# Build a summary chart from the loaded clustering result and show it inline.
# NOTE(review): what the chart displays is decided by src.describe.summary_cluster,
# which is not visible here — presumably the size of each cluster; confirm.
chart = summary_cluster(cluster)
chart.render_notebook()

# Unsupervised to Supervised

將各分群的資料獨立並進行binary encoding

In [5]:
# Prepare the input for the one-model-per-cluster training in the next cell.
# The notebook only relies on len() of the result and on passing it, together
# with a cluster index, to random_forest_train().
# NOTE(review): presumably one binary "belongs to cluster k" target per cluster;
# the actual structure is defined in src.describe — confirm there.
one_hot_encoded_clusters = one_hot_encode_clusters(cluster)

每個分群進行隨機森林建模 (Binary classification)

In [6]:
# Per-cluster results collected by the loop below:
#   kappas          - kappa value returned by random_forest_train, in cluster order
#   importance_bars - chart returned by render_importance_rank, in cluster order
#   importance_data - cluster index -> plain-list copy of the 'Feature' and
#                     'Importance' columns of the returned importance table
kappas, importance_bars, importance_data = [], [], {}

for cluster_index in range(len(one_hot_encoded_clusters)):
	# Train the model for this cluster.
	# NOTE(review): the "Cluster k | Accuracy | Kappa" lines in the cell output
	# are presumably printed inside random_forest_train — nothing prints here.
	feature_importance_df, kappa = random_forest_train(one_hot_encoded_clusters, cluster_index)
	importance_bar = render_importance_rank(feature_importance_df, cluster_index)

	kappas.append(kappa)
	importance_bars.append(importance_bar)
	importance_data[cluster_index] = dict(
		feature=list(feature_importance_df['Feature']),
		importance=list(feature_importance_df['Importance']),
	)
	


Cluster 0 | Accuracy: 0.9983466666666667 | Kappa: 0.9762221099155827
Cluster 1 | Accuracy: 0.99308 | Kappa: 0.9560594919056267
Cluster 2 | Accuracy: 0.99216 | Kappa: 0.9587622900648735
Cluster 3 | Accuracy: 0.9952 | Kappa: 0.9698604848541569
Cluster 4 | Accuracy: 0.9960133333333333 | Kappa: 0.9211933898118843
Cluster 5 | Accuracy: 0.9920933333333334 | Kappa: 0.922042942901254
Cluster 6 | Accuracy: 0.991 | Kappa: 0.9081551199773341
Cluster 7 | Accuracy: 0.9931066666666667 | Kappa: 0.9030893830051544
Cluster 8 | Accuracy: 0.9907333333333334 | Kappa: 0.9304035686694723
Cluster 9 | Accuracy: 0.9897866666666667 | Kappa: 0.8623189992720809
Cluster 10 | Accuracy: 0.9912 | Kappa: 0.8671583443176386
Cluster 11 | Accuracy: 0.9924666666666667 | Kappa: 0.9203918849657383
Cluster 12 | Accuracy: 0.9924266666666667 | Kappa: 0.9630654552008812
Cluster 13 | Accuracy: 0.9928 | Kappa: 0.9646921990492988
Cluster 14 | Accuracy: 0.99228 | Kappa: 0.9048429426687901
Cluster 15 | Accuracy: 0.9923866666666666 |

畫出每個分群隨機森林的kappa

In [7]:
# Pair every cluster index with its kappa and order the pairs by kappa
# (ascending), so the weakest classifiers appear first on the chart.
xy_pairs = list(zip(range(len(one_hot_encoded_clusters)), kappas))
sorted_pairs = sorted(xy_pairs, key=lambda pair: pair[1])

# Split the ordered pairs back into parallel x (cluster index) / y (kappa) tuples.
xaxis_sorted, yaxis_sorted = zip(*sorted_pairs)

chart = (
	Bar()
	.add_xaxis(xaxis_sorted)
	.add_yaxis("Kappa", yaxis_sorted)
	.set_global_opts(title_opts=opts.TitleOpts(title="Cluster Random Forest Kappa"))
)
chart.render_notebook()


畫出每個分群的feature importance (Bar and Heatmap)

In [8]:
from pyecharts.charts import Page, HeatMap

# Heatmap of feature importance (in percent): one column per cluster, one row
# per feature. Rows are labelled with the feature list of cluster 0.
x = list(importance_data.keys())
y = importance_data[0]['feature']

# Row index of every feature name. Cells are placed by feature NAME rather than
# by position in each cluster's list, so the heatmap stays correct even if the
# importance tables are ordered differently per cluster (e.g. sorted by
# importance). A feature missing from cluster 0's list raises KeyError instead
# of silently landing on the wrong row.
feature_row = {feature: row for row, feature in enumerate(y)}

# Each cell is [x index, y index, value]. The cluster key doubles as the x
# index because importance_data is keyed 0..n-1 in insertion order.
# The loop variable is deliberately NOT called `cluster`: that name holds the
# DataFrame loaded at the top of the notebook and must not be overwritten.
value = []
for cluster_id in x:
	entry = importance_data[cluster_id]
	for feature, imp in zip(entry['feature'], entry['importance']):
		value.append([cluster_id, feature_row[feature], imp * 100])

# Upper bound of the colour scale, floored at 0. Stored under its own name so
# the builtin max() is not shadowed for the rest of the kernel session.
max_importance = max([0] + [cell[2] for cell in value])

c = (
	HeatMap()
	.add_xaxis(x)
	.add_yaxis("Feature Importance", y, value)
	.set_global_opts(
		title_opts=opts.TitleOpts(title="HeatMap"),
		visualmap_opts=opts.VisualMapOpts(
			max_=max_importance,
			range_color=["#2b83ba", "#abdda4", "#ffffbf", "#fdae61", "#d7191c"]
		),
	)
)
c.render_notebook()


In [9]:

# Stack every per-cluster feature-importance bar chart on a single page.
page = Page(layout=Page.SimplePageLayout)

# Unpack the whole list instead of hard-coding indices 0-15: the number of
# charts follows len(one_hot_encoded_clusters), so a fixed list raises
# IndexError with fewer than 16 clusters and silently drops charts with more.
page.add(*importance_bars)
page.render_notebook()