In [1]:
from time import time
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans

In [2]:
# Newsgroup categories to load; only documents from these four groups are clustered.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Single-argument print(...) produces the same output as a print statement on
# Python 2 and is also valid on Python 3, matching the call form used below.
print(u'加载的20新闻数据中的数据类别为: %s' % (categories,))

# Fetch the selected categories, using ./datas as the download/cache folder.
# The shuffle is seeded (random_state=42) so the document order is repeatable.
dataset = fetch_20newsgroups(data_home='datas', subset='all', categories=categories,
                             shuffle=True, random_state=42)
print("%d条数据；%d个新闻类别" % (len(dataset.data), len(dataset.target_names)))

# Ground-truth category index of every document. Later cells use it to choose
# the number of clusters and to score the clusterings; it is never used for fitting.
labels = dataset.target

加载的20新闻数据中的数据类别为: ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
3387条数据；4个新闻类别


In [3]:
# Number of clusters to search for: one per distinct ground-truth label.
target_cluster_k = len(np.unique(labels))
# Size of the feature space handed to the vectorizers (1 << 20 == 2 ** 20 == 1048576).
features = 1 << 20
# Number of components kept by the truncated SVD step.
components = 5
# Batch size for MiniBatchKMeans (its init_size is derived from this value).
mini_batch_km_batchsize = 1000

In [4]:

# Variant 1: hashed term counts (no per-row normalisation here), re-weighted by
# TF-IDF and L2-normalised afterwards by `tt`.
# NOTE(review): `non_negative` is reported as deprecated/removed in newer
# scikit-learn releases -- confirm against the installed version before upgrading.
hasher1 = HashingVectorizer(
    n_features=features,
    stop_words='english',
    token_pattern=u'(?u)\\b\\w\\w+\\b',
    binary=False,
    norm=None,
    non_negative=True
)
tt = TfidfTransformer(use_idf=True, norm='l2')
hashing_tfidf_pipeline = make_pipeline(hasher1, tt)

# Variant 2: hashing only, with L2 row normalisation done by the hasher itself.
hasher2 = HashingVectorizer(
    n_features=features,
    stop_words='english',
    token_pattern=u'(?u)\\b\\w\\w+\\b',
    binary=False,
    norm='l2',
    non_negative=False
)
hashing_pipeline = make_pipeline(hasher2)

# Variant 3: vocabulary-based TF-IDF; drops terms present in more than half of
# the documents (max_df=0.5) or in fewer than two documents (min_df=2).
tv = TfidfVectorizer(
    use_idf=True,
    stop_words='english',
    min_df=2,
    max_df=0.5,
    max_features=features
)
tfidf_pipeline = make_pipeline(tv)

# (display name, pipeline, can_inverse). The flag tells the evaluation loop
# whether it may ask the first pipeline step for its feature names in order to
# print the top terms per cluster; only the vocabulary-based variant sets it.
vectorizers = [
    ('hashing&tf-idf', hashing_tfidf_pipeline, False),
    ('hasing', hashing_pipeline, False),
    ('tf-idf', tfidf_pipeline, True)
]

In [5]:
# Dimensionality reduction: project the vectorized documents onto `components`
# SVD components, then rescale every row to unit L2 length.
# random_state is fixed (same seed as the dataset shuffle) because the SVD
# solver is randomized; without it each run yields a different projection and
# therefore different clusters and scores.
svd = TruncatedSVD(n_components=components, random_state=42)
# copy=False lets the normalizer rescale the SVD output in place.
normalizer = Normalizer(norm='l2', copy=False)
sn = make_pipeline(svd, normalizer)

In [6]:

# Both estimators use k-means++ seeding with 5 restarts (n_init=5).
# random_state is fixed (same seed as the dataset shuffle) so that the random
# initialisation -- and hence the reported scores and centroids -- is repeatable.
mbkm = MiniBatchKMeans(n_clusters=target_cluster_k, init='k-means++', n_init=5,
                       init_size=10 * mini_batch_km_batchsize, batch_size=mini_batch_km_batchsize,
                       random_state=42)

km = KMeans(n_clusters=target_cluster_k, init='k-means++', max_iter=100, n_init=5,
            random_state=42)

# (display name, estimator) pairs iterated by the evaluation loop.
cluster_als = [('Mini-Batch-KMeans', mbkm), ('KMeans', km)]

In [7]:

# Evaluate every (vectorizer, clustering algorithm) combination:
#   1. vectorize the raw documents,
#   2. reduce and L2-normalise them with the SVD pipeline `sn`,
#   3. fit each clustering algorithm and report timing, label-based scores,
#      the silhouette coefficient and the centroids,
#   4. when the vectorizer exposes feature names, print the 10 highest-weighted
#      terms of every centroid.
# All output goes through single-argument print(...), which prints the same
# text as a print statement on Python 2 and is also valid on Python 3.
for vectorizer_name, vectorizer, can_inverse in vectorizers:
    print("============================================")
    print("采用'%s'的方式将文本数据转换为特征矩阵" % vectorizer_name)

    # Step 1: raw documents -> feature matrix.
    t0 = time()
    X_vectorized = vectorizer.fit_transform(dataset.data)
    print("转换消耗时间:%.3fs" % (time() - t0))
    print("样本数量:%d,特征属性数量:%d" % X_vectorized.shape)

    # Step 2: SVD projection followed by row normalisation. `sn` is refitted
    # for every vectorizer, so `svd` always matches the current feature space.
    t0 = time()
    X = sn.fit_transform(X_vectorized)
    print("SVD分解及归一化消耗时间:%.3fs" % (time() - t0))
    print("降维&归一化操作后，样本数量:%d,特征属性数量:%d" % X.shape)

    for cluster_name, cluster_al in cluster_als:
        print("")
        print("使用算法%s对数据进行建模操作" % cluster_name)

        # Step 3: fit and score.
        t0 = time()
        cluster_al.fit(X)
        print("模型构建消耗时间:%.3fs" % (time() - t0))

        predicted = cluster_al.labels_
        print("%s算法效果评估相关系数" % cluster_name)
        print(u"均一性/同质性: %0.3f" % metrics.homogeneity_score(labels, predicted))
        print("完整性: %0.3f" % metrics.completeness_score(labels, predicted))
        print("V-measure: %0.3f" % metrics.v_measure_score(labels, predicted))
        print("Adjusted Rand-Index(ARI): %.3f" % metrics.adjusted_rand_score(labels, predicted))
        # The silhouette is computed on a random subsample of 1000 documents;
        # seeding the subsample keeps the reported value repeatable.
        print("轮廓系数: %0.3f" % metrics.silhouette_score(X, predicted, sample_size=1000,
                                                       random_state=42))
        # Wrapped in a 1-tuple so the array is formatted as a single %s value.
        print("聚类中心点为: %s" % (cluster_al.cluster_centers_,))

        if can_inverse:
            print("获取文本转换特征矩阵中，各个分类考虑特征属性的前10个feature特征（10个单词）：")

            # Step 4: map the centroids back from SVD space to term space.
            original_space_centroids = svd.inverse_transform(cluster_al.cluster_centers_)

            # Term indices of every centroid, sorted by descending weight.
            order_centroids = original_space_centroids.argsort()[:, ::-1]

            # Pipeline.steps is an ordered list of (name, estimator) pairs, so
            # indexing it is well-defined on both Python 2 and 3 (indexing
            # named_steps.items() fails on Python 3).
            terms = vectorizer.steps[0][1].get_feature_names()

            for i in range(target_cluster_k):
                # Every term is preceded by two spaces, reproducing the layout
                # of the former trailing-comma print statements.
                top_terms = u"".join(u"  %s" % terms[ind] for ind in order_centroids[i, :10])
                print(u"类别%d:%s" % (i, top_terms))
    print("")
    print("")
print("==================算法完成======================")

采用'hashing&tf-idf'的方式将文本数据转换为特征矩阵
转换消耗时间:2.151s
样本数量:3387,特征属性数量:1048576
SVD分解及归一化消耗时间:7.753s
降维&归一化操作后，样本数量:3387,特征属性数量:5

使用算法Mini-Batch-KMeans对数据进行建模操作
模型构建消耗时间:0.044s
Mini-Batch-KMeans算法效果评估相关系数
均一性/同质性: 0.553
完整性: 0.594
V-measure: 0.573
Adjusted Rand-Index(ARI): 0.554
轮廓系数: 0.389
聚类中心点为: [[ 0.81965113  0.23784285 -0.11635794 -0.15431643 -0.17500972]
 [ 0.7198625  -0.53206199 -0.05900797  0.12853194  0.03868587]
 [ 0.76368709 -0.2896739   0.00599186 -0.36158345  0.33797136]
 [ 0.67635775  0.28099513  0.52946783  0.16911658  0.07248387]]

使用算法KMeans对数据进行建模操作
模型构建消耗时间:0.057s
KMeans算法效果评估相关系数
均一性/同质性: 0.563
完整性: 0.593
V-measure: 0.578
Adjusted Rand-Index(ARI): 0.574
轮廓系数: 0.395
聚类中心点为: [[ 0.75261424 -0.29979432  0.00810426 -0.37995294  0.35494373]
 [ 0.73036399 -0.52309952 -0.05496925  0.1201719   0.04725732]
 [ 0.86294126  0.20612099 -0.02661772 -0.20769458 -0.22264159]
 [ 0.58541422  0.33279894  0.16698756  0.27796689  0.21453899]]


采用'hasing'的方式将文本数据转换为特征矩阵
转换消耗时间:1.931s
样本数量:3387