In [1]:
import pickle
# 导入向量化后的数据集
np_vectorized_tfidf = pickle.load(open('model/vectorized_generated_corpus_doc2vec.pkl', 'rb'))

In [2]:
# 读入正确标签（Labels）
labels = []
# 读入数据集
with open('sixTypes-GeneratedLabels.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        labels.append(line.strip('\n'))

In [3]:
# 利用 RandomizedSearchCV 搜索不同的 维度 和 gamma 值来找到最优解（利用 V-measure 评估）

# Import required libraries
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import KernelPCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import v_measure_score
from scipy.stats import uniform

# Define the parameter values that want to search
param_grid = {
    'kpca__n_components': range(2, 11),
    'kpca__gamma': uniform(0, 10),
    'kmeans__n_clusters': range(6, 7)
}

# Define a scoring function that takes in y_true and y_pred as arguments and returns the v_measure_score
def v_measure_scorer(estimator, X, y_true):
    y_pred = estimator.predict(X)
    return v_measure_score(y_true, y_pred)

# Create a randomize search object
kpca = KernelPCA(kernel='rbf', random_state=9)
kmeans = KMeans(init='k-means++', random_state=9)
model = Pipeline([('kpca', kpca), ('kmeans', kmeans)])
random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=10, cv=5, random_state=9, scoring=v_measure_scorer, verbose=3)

# Fit the random search object to the data
random_search.fit(np_vectorized_tfidf, labels)

# Get the best estimator
best_estimator = random_search.best_estimator_
print('best_estimator', best_estimator)

# Use the best estimator to make predictions
y_pred = best_estimator.predict(np_vectorized_tfidf)

# Use V-measure to evaluate the clustering results
score = v_measure_score(labels, y_pred)
print('V-measure:', score)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END kmeans__n_clusters=6, kpca__gamma=0.10374153885699955, kpca__n_components=7;, score=0.016 total time=   0.3s
[CV 2/5] END kmeans__n_clusters=6, kpca__gamma=0.10374153885699955, kpca__n_components=7;, score=0.356 total time=   0.0s
[CV 3/5] END kmeans__n_clusters=6, kpca__gamma=0.10374153885699955, kpca__n_components=7;, score=0.266 total time=   0.1s
[CV 4/5] END kmeans__n_clusters=6, kpca__gamma=0.10374153885699955, kpca__n_components=7;, score=0.007 total time=   0.1s
[CV 5/5] END kmeans__n_clusters=6, kpca__gamma=0.10374153885699955, kpca__n_components=7;, score=0.006 total time=   0.1s
[CV 1/5] END kmeans__n_clusters=6, kpca__gamma=4.957732931341461, kpca__n_components=8;, score=0.016 total time=   0.0s
[CV 2/5] END kmeans__n_clusters=6, kpca__gamma=4.957732931341461, kpca__n_components=8;, score=0.379 total time=   0.1s
[CV 3/5] END kmeans__n_clusters=6, kpca__gamma=4.957732931341461, kpca__n_components=8;, 

In [4]:
# 选用最优解维数用 KPCA 降维
kpca = KernelPCA(n_components=8, kernel='rbf', gamma=4.957732931341461, random_state=9)
decomposition_data = kpca.fit_transform(np_vectorized_tfidf)

In [5]:
# 选用 k 值做 K-means 算法聚类
y_pred = KMeans(n_clusters=6, init='k-means++', random_state=9).fit_predict(decomposition_data)

In [6]:
print(y_pred[:10])

[3 1 0 3 1 0 0 0 3 1]


In [7]:
from sklearn import metrics

In [8]:
print('%.2f' % metrics.calinski_harabasz_score(decomposition_data, y_pred))
km = KMeans(n_clusters=6, random_state=9)
y_pred = km.fit_predict(decomposition_data)
print('%.4f' % metrics.silhouette_score(decomposition_data, km.labels_, metric='euclidean'))
print('%.4f' % metrics.homogeneity_score(labels, y_pred))
print('%.4f' % metrics.completeness_score(labels, y_pred))
print('%.4f' % metrics.v_measure_score(labels, y_pred))
print('%.4f' % metrics.adjusted_rand_score(labels, y_pred))
print('%.4f' % metrics.adjusted_mutual_info_score(labels, y_pred))
print('%.4f' % metrics.fowlkes_mallows_score(labels, y_pred))

1092.05
0.3447
0.1940
0.2127
0.2030
0.0891
0.1980
0.2629


In [35]:
# 利用 Calinski Harabasz Score 评估当前 K-Means 聚类效果（利用 TFIDF 表征）
metrics.calinski_harabasz_score(decomposition_data, y_pred)

2523.0732557850483

In [36]:
# 利用 Silhouette Score（轮廓系数）评估当前 K-Means 聚类效果（利用 TFIDF 表征）
km = KMeans(n_clusters=6, random_state=9)
y_pred = km.fit_predict(decomposition_data)
metrics.silhouette_score(decomposition_data, km.labels_, metric='euclidean')

0.49615210375981617

In [37]:
# 读入正确标签（Labels）
labels = []
# 读入数据集
with open('sixTypes-GeneratedLabels.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        labels.append(line.strip('\n'))

In [38]:
labels[:10]

['CP', 'CP', 'CP', 'CP', 'CP', 'CP', 'CP', 'CP', 'CP', 'CP']

In [39]:
# 利用 Homogeneity 评估当前 K-Means 聚类效果（利用 TFIDF 表征）
metrics.homogeneity_score(labels, y_pred)

0.5201727046123753

In [40]:
# 利用 Completeness 评估当前 K-Means 聚类效果（利用 TFIDF 表征）
metrics.completeness_score(labels, y_pred)

0.6141185157433506

In [41]:
# 利用 V-measure 评估当前 K-Means 聚类效果（利用 TFIDF 表征）
metrics.v_measure_score(labels, y_pred)

0.5632551562667902

In [42]:
# 利用 Adjusted Rand Index（调整兰德系数）评估当前 K-Means 聚类效果（利用 TFIDF 表征）
metrics.adjusted_rand_score(labels, y_pred)

0.3474011625813265

In [43]:
# 利用 Adjusted Mutual Information Score（调整互信息）评估当前 K-Means 聚类效果（利用 TFIDF 表征）
metrics.adjusted_mutual_info_score(labels, y_pred)

0.5604570770108419

In [44]:
# 利用 Fowlkes-Mallows scores 评估当前 K-Means 聚类效果（利用 TFIDF 表征）
metrics.fowlkes_mallows_score(labels, y_pred)

0.4977517378698821