In [14]:
#文本聚类
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import jieba.posseg as pseg

In [5]:
#中文分词
def jieba_cut(comment):
    word_list = []
    seg_list = pseg.cut(comment)
    for word in seg_list:
        if word.flag in ['a','ag','an']:
            word_list.append(word.word)
    return word_list

In [7]:
#读取数据文件
fn = open('comment.txt')
comment_list = fn.readlines()
fn.close()

In [12]:
# word to vector
stop_words = ['...','。','，','？','！','+',' ','、','：','；','（','）','.','-']
vectorizer = TfidfVectorizer(stop_words=stop_words, tokenizer=jieba_cut, use_idf=True)
#创建词向量模型
X = vectorizer.fit_transform(comment_list)
#将评论关键字列表转换为词向量空间模型

In [13]:
#K 均值聚类
model_kmeans = KMeans(n_clusters=3)
model_kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [18]:
#聚类结果汇总
cluster_labels = model_kmeans.labels_
word_vectors = vectorizer.get_feature_names()
word_values = X.toarray()
comment_matrix = np.hstack((word_values, cluster_labels.reshape(word_values.shape[0],1)))
word_vectors.append('cluster_labels')
comment_pd = pd.DataFrame(comment_matrix, columns=word_vectors)
print(comment_pd.head(1))

    一般   不厚   不爽   不贵   不错   不高   便宜   具体   准确    凹       ...          蓝   诚意  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0       ...        0.0  0.0   

     贵    足   透亮   重要    难         高   麻烦  cluster_labels  
0  0.0  0.0  0.0  0.0  0.0  0.392535  0.0             0.0  

[1 rows x 64 columns]


In [19]:
#聚类结果分析
comment_cluster1 = comment_pd[comment_pd['cluster_labels']==2].drop('cluster_labels',axis=1)
word_importance = np.sum(comment_cluster1,axis=0)
print(word_importance.sort_values(ascending=False)[:5])

好     4.890181
流畅    0.808409
清晰    0.685972
着急    0.641769
慢     0.508659
dtype: float64
