In [11]:
import numpy as np
import os
import sys
import pickle
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim import corpora, models, matutils

from gensim.models import doc2vec, Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd

In [9]:
class SingelPassCluster(object):
    '''
    Single-pass clustering of documents.

    Documents are turned into gensim TF-IDF sparse vectors and compared with
    cosine similarity (matutils.cossim). Each document is visited exactly once:
    it joins the most similar existing cluster when that similarity exceeds a
    threshold, otherwise it starts a new cluster.
    '''

    def tfidf_vec(self, corpus, pivot=10, slope=0.25):
        '''
        Build a dictionary and a TF-IDF model from `corpus` and vectorise it.

        Side effects: stores the vocabulary size in `self.dict_size` (read later
        by `single_pass`) and prints it.

        :param corpus: documents as accepted by `corpora.Dictionary` / `doc2bow`
            (presumably lists of tokens; confirm against the loader).
        :param pivot: forwarded unchanged to `models.TfidfModel`.
        :param slope: forwarded unchanged to `models.TfidfModel`.
        :return: the TF-IDF transformed corpus (`tfidf[bow_corpus]`).
        '''
        dictionary = corpora.Dictionary(corpus)  # token -> integer id mapping
        self.dict_size = len(dictionary)
        print('dictionary size:{}'.format(self.dict_size))
        # Bag-of-words representation of every document.
        bow_corpus = [dictionary.doc2bow(text) for text in corpus]
        tfidf = models.TfidfModel(bow_corpus, pivot=pivot, slope=slope)
        return tfidf[bow_corpus]

    def get_max_similarity(self, cluster_cores, vector):
        '''
        Find the cluster core most similar to `vector`.

        :param cluster_cores: dict mapping cluster key -> sparse core vector.
        :param vector: sparse vector to compare against every core.
        :return: `(key, similarity)` of the best core. `(-1, 0)` is returned when
            `cluster_cores` is empty or no core has a similarity above 0.
        '''
        max_value = 0
        max_index = -1
        for k, core in cluster_cores.items():
            similarity = matutils.cossim(vector, core)
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value

    def single_pass(self, corpus_vec, corpus, theta):
        '''
        Cluster `corpus_vec` in one pass.

        Requires `self.dict_size` to be set (done by `tfidf_vec`) as soon as a
        document joins an existing cluster. Prints a progress line every 100
        documents.

        :param corpus_vec: iterable of sparse document vectors.
        :param corpus: iterable paired element-wise (via `zip`) with
            `corpus_vec`; its items are what gets stored in the returned
            `cluster_text`.
        :param theta: a document joins its best cluster only when the similarity
            is strictly greater than `theta`.
        :return: `(clusters, cluster_text)` where both are dicts keyed by cluster
            id (0, 1, 2, ... in creation order); `clusters` holds the member
            vectors and `cluster_text` the paired items from `corpus`.
        '''
        clusters = {}
        cluster_cores = {}
        cluster_text = {}
        num_topic = 0
        for cnt, (vector, text) in enumerate(zip(corpus_vec, corpus), start=1):
            max_index, max_value = self.get_max_similarity(cluster_cores, vector)
            # max_index == -1 means "no candidate cluster" (nothing exists yet, or
            # every similarity is 0); never use it as a key, whatever theta is.
            if max_index != -1 and max_value > theta:
                members = clusters[max_index]
                members.append(vector)
                # Sparse -> dense, one row per member document.
                # NOTE: this re-densifies the whole cluster on every addition
                # (len(members) x dict_size values each time).
                text_matrix = matutils.corpus2dense(members, num_terms=self.dict_size,
                                                    num_docs=len(members)).T
                core = np.mean(text_matrix, axis=0)  # updated cluster centre
                # Store the centre as a sparse vector again so cossim can use it.
                cluster_cores[max_index] = matutils.any2sparse(core)
                cluster_text[max_index].append(text)
            else:
                # Start a new cluster whose centre is this document.
                clusters[num_topic] = [vector]
                cluster_cores[num_topic] = vector
                cluster_text[num_topic] = [text]
                num_topic += 1
            if cnt % 100 == 0:
                print('processing {}...'.format(cnt))
        return clusters, cluster_text

    def fit_transform(self, corpus, raw_data, theta=0.5):
        '''
        Vectorise `corpus` with TF-IDF and cluster it in a single pass.

        :param corpus: documents passed to `tfidf_vec`.
        :param raw_data: items paired with the documents of `corpus`; they are
            what ends up grouped in the returned `cluster_text`.
        :param theta: similarity threshold forwarded to `single_pass`.
        :return: `(clusters, cluster_text)` as returned by `single_pass`.
        '''
        tfidf_vec = self.tfidf_vec(corpus)  # sparse TF-IDF vectors
        return self.single_pass(tfidf_vec, raw_data, theta)

In [None]:
if __name__ == '__main__':
    # Driver: cluster the preprocessed samples and write the clusters, largest
    # first, to a text report.
    process_text = './tf_idf.txt'  # path of the preprocessed samples
    cluster_result = './cluster_result.txt'  # path of the report to write

    # NOTE(review): load_data / load_samples are not defined in the visible part
    # of this notebook; they must be defined before this cell runs.
    corpus = load_data(process_text)
    raw_text = load_samples(process_text)

    # Position of each sample -> its text; the positions are what gets clustered
    # and are mapped back to text when the report is written.
    index2corpus = collections.OrderedDict(enumerate(raw_text))
    text2index = list(index2corpus.keys())
    print('docs total size:{}'.format(len(text2index)))

    single_cluster = SingelPassCluster()

    clusters, cluster_text = single_cluster.fit_transform(corpus, text2index, theta=0.4)

    print("............................................................................................")
    print("得到的类数量有: {} 个 ...".format(len(clusters)))
    print("............................................................................................\n")
    # Sort the clusters by number of member documents, largest first.
    clusterTopic_list = sorted(cluster_text.items(), key=lambda x: len(x[1]), reverse=True)
    with open(cluster_result, 'w', encoding='utf-8') as file_write:
        for cluster_id, doc_indices in clusterTopic_list:
            # Use a separate name for the rendered text so that `cluster_text`
            # (the clustering result) stays intact after this cell has run.
            numbered_docs = '\n'.join(
                '(' + str(position) + '): ' + index2corpus[doc_index]
                for position, doc_index in enumerate(doc_indices, start=1)
            )
            file_write.write("【簇索引】:{} \n【簇中文档数】：{} \n【簇中文档】 ：\n{}".format(cluster_id, len(doc_indices), numbered_docs))
            file_write.write('\n')
        # No explicit flush needed: leaving the `with` block flushes and closes.