In [1]:
# -*- coding: utf-8 -*-

import gensim
import logging
import multiprocessing
import os
import re
import sys
import matplotlib.pyplot as plt
import numpy as np


from nltk import word_tokenize
from time import time
from sklearn.cluster import KMeans


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


def Show2dCorpora(corpus):
    nodes = list(corpus)
    ax0 = [x[0][1] for x in nodes] # 绘制各个doc代表的点
    ax1 = [x[1][1] for x in nodes]
    # print(ax0)
    # print(ax1)
    plt.plot(ax0,ax1,'o')
    plt.show()

def get_stop_words_set(file_name):
    with open(file_name,'r') as file:
        return set([line.strip() for line in file])


def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleanrn = re.compile('\n')
    cleantext = re.sub(cleanr, ' ', raw_html)
    cleantext = re.sub(cleanrn,'',cleantext)
    return cleantext


class MySentences(object):
    def __init__(self, dirname, stopwordfile):
        self.dirname = dirname
        self.stop_list = get_stop_words_set(stopwordfile)

    def __iter__(self):
        for root, dirs, files in os.walk(self.dirname):
            for filename in files:
                file_path = (root + '/' + filename)
                for line in open(file_path,encoding='utf-8'):
                    sline = line.strip()
                    if sline == "":
                        continue
                    rline = cleanhtml(sline)
                    tokenized_line = ' '.join(word_tokenize(rline))
                    is_alpha_word_line = [word for word in
                                          tokenized_line.lower().split()
                                          if (word.isalpha() and word not in self.stop_list)]
                    yield is_alpha_word_line



Using TensorFlow backend.


In [3]:
data_path = './test'
if (os.path.isfile(data_path)):
    print('file found')
else :
    print("data not found!")
    
    
stopword_path = './stopword.txt'
if (os.path.isfile(stopword_path)):
    print("found stopword file")
else :
    print("stopword file not found!")
    

sentences = MySentences(data_path, stopword_path)
dictionary = gensim.corpora.Dictionary(sentences)
corpus = [dictionary.doc2bow(text) for text in sentences]

2017-09-22 16:31:27,477 : INFO : adding document #0 to Dictionary(0 unique tokens: [])


data not found!
found stopword file


2017-09-22 16:31:30,952 : INFO : adding document #10000 to Dictionary(15890 unique tokens: ['peru', 'bus', 'haq', 'surigao', 'conops']...)
2017-09-22 16:31:34,351 : INFO : adding document #20000 to Dictionary(24116 unique tokens: ['peru', 'wmrdc', 'surigao', 'conops', 'idiotic']...)
2017-09-22 16:31:37,898 : INFO : adding document #30000 to Dictionary(30721 unique tokens: ['peru', 'khankitigress', 'wmrdc', 'surigao', 'conops']...)
2017-09-22 16:31:38,262 : INFO : built Dictionary(31259 unique tokens: ['peru', 'khankitigress', 'wmrdc', 'surigao', 'conops']...) from 31013 documents (total 216067 corpus positions)


In [4]:
begin = time()
model = gensim.models.Word2Vec(sentences,
                               size=100,
                               window=5,
                               min_count=1,
                               workers=multiprocessing.cpu_count())
model.save("data/model/word2vec_gensim")
model.wv.save_word2vec_format("data/model/word2vec_org",
                              "data/model/vocabulary",
                              binary=False)

end = time()
print
"Total procesing time: %d seconds" % (end - begin)


2017-09-22 16:31:48,884 : INFO : collecting all words and their counts
2017-09-22 16:31:48,887 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-09-22 16:31:52,075 : INFO : PROGRESS: at sentence #10000, processed 73839 words, keeping 15890 word types
2017-09-22 16:31:55,202 : INFO : PROGRESS: at sentence #20000, processed 141854 words, keeping 24116 word types
2017-09-22 16:32:00,888 : INFO : PROGRESS: at sentence #30000, processed 209180 words, keeping 30721 word types
2017-09-22 16:32:01,666 : INFO : collected 31259 word types from a corpus of 216067 raw words and 31013 sentences
2017-09-22 16:32:01,666 : INFO : Loading a fresh vocabulary
2017-09-22 16:32:01,791 : INFO : min_count=1 retains 31259 unique words (100% of original 31259, drops 0)
2017-09-22 16:32:01,792 : INFO : min_count=1 leaves 216067 word corpus (100% of original 216067, drops 0)
2017-09-22 16:32:01,934 : INFO : deleting the raw counts dictionary of 31259 items
2017-09-22 16:32:01,937 : 

'Total procesing time: 70 seconds'

In [5]:
tfidf_model = gensim.models.TfidfModel(corpus)
tfidf_m = tfidf_model[corpus]
lda = gensim.models.LdaModel(tfidf_m, id2word=dictionary, num_topics=200)
corpus_lda = lda[tfidf_m]
lda_csc_matrix = gensim.matutils.corpus2csc(corpus_lda).transpose()

from sklearn.cluster import KMeans
kmean = KMeans(n_clusters=10)
kmean.fit(lda_csc_matrix)


2017-09-22 16:34:06,179 : INFO : collecting document frequencies
2017-09-22 16:34:06,182 : INFO : PROGRESS: processing document #0
2017-09-22 16:34:06,295 : INFO : PROGRESS: processing document #10000
2017-09-22 16:34:06,355 : INFO : PROGRESS: processing document #20000
2017-09-22 16:34:06,400 : INFO : PROGRESS: processing document #30000
2017-09-22 16:34:06,407 : INFO : calculating IDF weights for 31013 documents and 31258 features (210360 matrix non-zeros)
2017-09-22 16:34:06,469 : INFO : using symmetric alpha at 0.005
2017-09-22 16:34:06,470 : INFO : using symmetric eta at 3.199078665344381e-05
2017-09-22 16:34:06,486 : INFO : using serial LDA version on this node
2017-09-22 16:34:33,310 : INFO : running online (single-pass) LDA training, 200 topics, 1 passes over the supplied corpus of 31013 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2017-09-22 16:34:33,364 : INFO : PROGRES

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [11]:
doc2 = MySentences('./testContent.txt'， stopword_path)
copDoc2 = dictionary.doc2bow(doc2)

tfidf_model = gensim.models.TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]
lda = gensim.models.LdaModel(corpus_tfidf, num_topics=50, id2word=dictionary)
corpus_lda = lda[corpus_tfidf]
Show2dCorpora(corpus_lda)


SyntaxError: invalid character in identifier (<ipython-input-11-710695a011a9>, line 1)

In [12]:
 from sklearn.decomposition import PCA

    weight = lda_csc_matrix.toArray()
    pca = PCA(n_components=2)  # 输出两维
    newData = pca.fit_transform(weight)  # 载入N维
    print(newData)

    # 5A景区
    x1 = []
    y1 = []
    i = 0
    while i < 400:
        x1.append(newData[i][0])
        y1.append(newData[i][1])
        i += 1

        # 动物
    x2 = []
    y2 = []
    i = 400
    while i < 600:
        x2.append(newData[i][0])
        y2.append(newData[i][1])
        i += 1

        # 人物
    x3 = []
    y3 = []
    i = 600
    while i < 800:
        x3.append(newData[i][0])
        y3.append(newData[i][1])
        i += 1

        # 国家
    x4 = []
    y4 = []
    i = 800
    while i < 1000:
        x4.append(newData[i][0])
        y4.append(newData[i][1])
        i += 1

        # 四种颜色 红 绿 蓝 黑
    PCA.plt.plot(x1, y1, 'or')
    PCA.plt.plot(x2, y2, 'og')
    PCA.plt.plot(x3, y3, 'ob')
    PCA.plt.plot(x4, y4, 'ok')
    PCA.plt.show()

    data_path = sys.argv[1]
    begin = time()

    sentences = MySentences(data_path)
    dictionary = gensim.corpora.Dictionary(sentences)
    corpus = [dictionary.doc2bow(text) for text in sentences]
    model = gensim.models.Word2Vec(sentences,
                                   size=100,
                                   window=5,
                                   min_count=5,
                                   workers=multiprocessing.cpu_count())

    model.save("data/model/word2vec_gensim")
    model.wv.save_word2vec_format("data/model/word2vec_org",
                                  "data/model/vocabulary",
                                  binary=False)

    end = time()
    print
    "Total procesing time: %d seconds" % (end - begin)

   

IndentationError: unexpected indent (<ipython-input-12-6d24b69fd2f0>, line 3)