# 15 电商产品评论数据分析 - jieba分词 

In [1]:
import os
import pandas as pd
import jieba

# Raw string: in a normal literal '\J' is an invalid escape sequence, which
# newer Python versions warn about (the resulting path text is unchanged).
os.chdir(r'E:\Jupyterspace')

In [10]:
# Input review files (negative / positive) and where the segmented text goes
inputfile1, inputfile2 = 'meidi_jd_neg.txt', 'meidi_jd_pos.txt'
outputfile1, outputfile2 = 'meidi_jd_neg_cut.txt', 'meidi_jd_pos_cut.txt'

# Read both files as header-less tables
raw_neg, raw_pos = (
    pd.read_csv(path, encoding='utf-8', header=None)
    for path in (inputfile1, inputfile2)
)

# Simple segmentation helper: jieba tokens joined by single spaces.
# It is applied column-wise ("broadcast") rather than in an explicit Python loop.
mycut = lambda s: ' '.join(jieba.cut(s))
data1, data2 = (frame[0].apply(mycut) for frame in (raw_neg, raw_pos))

# Save the segmented text
for series, path in ((data1, outputfile1), (data2, outputfile2)):
    series.to_csv(path, index=False, header=False, encoding='utf-8')

# 15 电商产品评论数据 - LDA主题模型

In [15]:
# gensim告警不显示
import warnings
from gensim import corpora, models
warnings.filterwarnings(action='ignore',category=UserWarning,module='gensim')


# Load the segmented reviews and the stop-word list
negfile = 'meidi_jd_neg_cut.txt'
posfile = 'meidi_jd_pos_cut.txt'
stoplist = 'stoplist.txt'
neg = pd.read_csv(negfile, encoding = 'utf-8', header = None)
pos = pd.read_csv(posfile, encoding = 'utf-8', header = None)
# sep is set to 'tipdm', a token that does not occur in the file, because the
# default comma separator collides with the comma stop word
stop = pd.read_csv(stoplist, encoding = 'utf-8', header = None, sep = 'tipdm',engine='python')
stop = [' ', ''] + list(stop[0]) # pandas filtered out the blank entries, so add them back by hand
stop_set = set(stop) # hashed lookup: avoids scanning the whole list for every token


# Stop-word removal
neg[1] = neg[0].apply(lambda s: s.split(' ')) # split each segmented review back into tokens
neg[2] = neg[1].apply(lambda x: [i for i in x if i not in stop_set]) # keep only tokens that are not stop words
pos[1] = pos[0].apply(lambda s: s.split(' '))
pos[2] = pos[1].apply(lambda x: [i for i in x if i not in stop_set])


# Fixed seed so the randomly initialised LDA models are reproducible across runs
LDA_SEED = 42

# Topic analysis of the negative reviews
neg_dict = corpora.Dictionary(neg[2]) # build the dictionary
neg_corpus = [neg_dict.doc2bow(i) for i in neg[2]] # build the bag-of-words corpus
neg_lda = models.LdaModel(neg_corpus, num_topics = 3, id2word = neg_dict, random_state = LDA_SEED) # train the LDA model
# Topic analysis of the positive reviews
pos_dict = corpora.Dictionary(pos[2])
pos_corpus = [pos_dict.doc2bow(i) for i in pos[2]]
pos_lda = models.LdaModel(pos_corpus, num_topics = 3, id2word = pos_dict, random_state = LDA_SEED)

# Show the results. Both are printed explicitly: a notebook cell only echoes
# the value of its last expression, so the first topic used to be discarded.
print(pos_lda.print_topic(1))
print(neg_lda.print_topic(1))


# 15 电商产品评论数据 - TFIDF模型

In [1]:
# 简单样例1
import jieba
from gensim import corpora,models,similarities
import warnings
warnings.filterwarnings(action='ignore',category=UserWarning,module='gensim')

# Corpus: eight short example sentences, collected in all_doc
all_doc = [
    "我不喜欢上海",
    "上海是一个好地方",
    "北京是一个好地方",
    "上海好吃的在哪里",
    "上海好玩的在哪里",
    "上海是好地方",
    "上海路和上海人",
    "喜欢小吃",
]
# The individual docN names stay available as well
doc0, doc1, doc2, doc3, doc4, doc5, doc6, doc7 = all_doc


# Tokenize every sentence with jieba; each entry becomes a list of words
all_doc_list = [list(jieba.cut(sentence)) for sentence in all_doc]


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\hey\AppData\Local\Temp\jieba.cache
Loading model cost 1.436 seconds.
Prefix dict has been built succesfully.


In [27]:
# 简单样例2
import jieba.posseg as pseg
import codecs
from gensim import corpora, models, similarities


import os
os.chdir(r'E:\Jupyterspace')  # raw string: '\J' is an invalid escape in a normal literal
# Load the stop-word list, one entry per line
stop_words = 'stoplist.txt'
with codecs.open(stop_words,'r',encoding='utf8') as stop_file:
    # the context manager closes the handle, which the bare codecs.open() call never did
    stopwords = stop_file.readlines()
stopwords = [ w.strip() for w in stopwords ]

# jieba part-of-speech flags to discard, per the original author's note:
# [punctuation, conjunction, particle, adverb, preposition, time morpheme,
#  the particle '的', numeral, locality word, pronoun]
stop_flag = ['x', 'c', 'u','d', 'p', 't', 'uj', 'm', 'f', 'r']

# jieba segmentation with stop-word removal. The original author notes this is
# slow and points to the apply-based ("broadcast") approach used earlier.
def tokenization(filename):
    """Segment a UTF-8 text file and return its tokens minus stop words.

    Args:
        filename: path of the text file to read.

    Returns:
        list of the words produced by ``pseg.cut`` whose POS flag is not in
        the module-level ``stop_flag`` and whose text is not in the
        module-level ``stopwords``, in their original order.
    """
    with open(filename, 'r',encoding='utf8') as f:
        text = f.read()
    # Build hashed lookups once per call so each token is checked in constant
    # time instead of scanning both lists for every word.
    banned_flags = set(stop_flag)
    banned_words = set(stopwords)
    return [
        word
        for word, flag in pseg.cut(text)
        if flag not in banned_flags and word not in banned_words
    ]

filenames = ['帮您稳血压.txt','脱脂奶.txt','ios.txt']
corpus = [tokenization(each) for each in filenames]

# Train the tf-idf model
dictionary = corpora.Dictionary(corpus)
doc_vectors = [dictionary.doc2bow(text) for text in corpus]
tfidf = models.TfidfModel(doc_vectors)
tfidf_vectors = tfidf[doc_vectors]
# num_features is given explicitly so the index width always matches the
# dictionary instead of being inferred from the ids present in the corpus
index = similarities.MatrixSimilarity(tfidf_vectors, num_features=len(dictionary))


# Similarity of the test document against the indexed documents
query = tokenization('降压药.txt')
query_bow = dictionary.doc2bow(query)
# The index holds tf-idf vectors, so the query must be mapped into the same
# space before comparison (it was previously passed as raw bag-of-words).
sims = index[tfidf[query_bow]]
print(list(enumerate(sims)))

In [59]:
# 读取数据
import pandas as pd
inputfile = 'huizong.csv'  # consolidated review file
data = pd.read_csv(inputfile, encoding='utf-8')

# One single-column frame of reviews per brand, with duplicate reviews
# dropped (first occurrence kept)
comments = data[['评论']]
brand = data['品牌']
data1 = comments[brand == '美的'].drop_duplicates(keep='first')
data2 = comments[brand == '海尔'].drop_duplicates(keep='first')
data3 = comments[brand == '万和'].drop_duplicates(keep='first')
test = comments[brand == '万家乐'].drop_duplicates(keep='first')

# Save each brand's reviews to its own file
filenames = ['meidi_jd.txt','haier_jd.txt','wanhe_jd.txt','wanjiale_jd.txt']
for frame, target in zip((data1, data2, data3, test), filenames):
    frame.to_csv(target, index=False, header=False, encoding='utf-8')

In [None]:
# Load the raw negative / positive review files
inputfile1 = 'meidi_jd_neg.txt'
inputfile2 = 'meidi_jd_pos.txt'
data1 = pd.read_csv(inputfile1, encoding = 'utf-8', header = None)
data2 = pd.read_csv(inputfile2, encoding = 'utf-8', header = None)


# Segment with jieba via apply ("broadcast") instead of an explicit loop
mycut = lambda s: ' '.join(jieba.cut(s)) # simple segmentation helper
data1 = data1[0].apply(mycut)
data2 = data2[0].apply(mycut)


# Load the previously saved segmented reviews and the stop-word list
negfile = 'meidi_jd_neg_cut.txt'
posfile = 'meidi_jd_pos_cut.txt'
stoplist = 'stoplist.txt'
neg = pd.read_csv(negfile, encoding = 'utf-8', header = None)
pos = pd.read_csv(posfile, encoding = 'utf-8', header = None)
# sep is set to 'tipdm', a token that does not occur in the file, because the
# default comma separator collides with the comma stop word
stop = pd.read_csv(stoplist, encoding = 'utf-8', header = None, sep = 'tipdm',engine='python')
stop = [' ', ''] + list(stop[0]) # pandas filtered out the blank entries, so add them back by hand
stop_set = set(stop) # hashed lookup: avoids scanning the whole list for every token


# Stop-word removal
neg[1] = neg[0].apply(lambda s: s.split(' ')) # split each segmented review back into tokens
neg[2] = neg[1].apply(lambda x: [i for i in x if i not in stop_set]) # keep only tokens that are not stop words
pos[1] = pos[0].apply(lambda s: s.split(' '))
pos[2] = pos[1].apply(lambda x: [i for i in x if i not in stop_set])

In [38]:
# Tokenize every sentence with jieba
all_doc_list = [list(jieba.cut(sentence)) for sentence in all_doc]

# Build the gensim dictionary from the tokenized sentences
dictionary = corpora.Dictionary(all_doc_list)

# Build the corpus: one bag-of-words vector per sentence
corpus = list(map(dictionary.doc2bow, all_doc_list))

In [39]:
all_doc_list

[['我', '不', '喜欢', '上海'],
 ['上海', '是', '一个', '好', '地方'],
 ['北京', '是', '一个', '好', '地方'],
 ['上海', '好吃', '的', '在', '哪里'],
 ['上海', '好玩', '的', '在', '哪里'],
 ['上海', '是', '好', '地方'],
 ['上海', '路', '和', '上海', '人'],
 ['喜欢', '小吃']]

In [69]:

print("########dictionary信息##########")
print(dictionary)

# (heading, value) pairs, printed in order
report = (
    ("##字典，{单词id，在多少文档中出现}", dictionary.dfs),        # {token id: number of documents containing it}
    ("##文档数目", dictionary.num_docs),                           # number of documents
    ("##字典，{单词id，对应的词}", dict(dictionary.items())),      # {token id: token}
    ("##所有词的个数", dictionary.num_pos),                         # total number of tokens
    ("##每个文件中不重复词个数的和", dictionary.num_nnz),           # sum of per-document unique-token counts
)
for heading, value in report:
    print(heading)
    print(value)


########dictionary信息##########
Dictionary(18 unique tokens: ['上海', '不', '喜欢', '我', '一个']...)
##字典，{单词id，在多少文档中出现}
{3: 1, 1: 1, 2: 2, 0: 6, 7: 3, 4: 2, 6: 3, 5: 3, 8: 1, 11: 1, 12: 2, 10: 2, 9: 2, 13: 1, 16: 1, 15: 1, 14: 1, 17: 1}
##文档数目
8
##字典，{单词id，对应的词}
{0: '上海', 1: '不', 2: '喜欢', 3: '我', 4: '一个', 5: '地方', 6: '好', 7: '是', 8: '北京', 9: '哪里', 10: '在', 11: '好吃', 12: '的', 13: '好玩', 14: '人', 15: '和', 16: '路', 17: '小吃'}
##所有词的个数
35
##每个文件中不重复词个数的和
34


In [71]:
# One bag-of-words vector per tokenized sentence
corpus = list(map(dictionary.doc2bow, all_doc_list))

In [79]:
print("词袋，列表[(单词id，词频)]")
# Tokenized sentences, then their encoding: each corpus entry lists
# (token id, frequency within that sentence) pairs
print(all_doc_list, corpus, sep='\n')


词袋，列表[(单词id，词频)]
[['我', '不', '喜欢', '上海'], ['上海', '是', '一个', '好', '地方'], ['北京', '是', '一个', '好', '地方'], ['上海', '好吃', '的', '在', '哪里'], ['上海', '好玩', '的', '在', '哪里'], ['上海', '是', '好', '地方'], ['上海', '路', '和', '上海', '人'], ['喜欢', '小吃']]
[[(0, 1), (1, 1), (2, 1), (3, 1)], [(0, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(0, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(0, 1), (9, 1), (10, 1), (12, 1), (13, 1)], [(0, 1), (5, 1), (6, 1), (7, 1)], [(0, 2), (14, 1), (15, 1), (16, 1)], [(2, 1), (17, 1)]]


In [81]:
# `corpus` is a plain list of bag-of-words documents, so it has no .item()
# method; walk the documents and their (token id, frequency) pairs directly.
for doc_bow in corpus:
    for token_id, freq in doc_bow:
        # dictionary[token_id] resolves the id to its token without relying on
        # the id2token reverse map having been filled beforehand
        print(token_id, dictionary[token_id], freq)

In [15]:
#通过“广播”形式,jieba分词，加快速度
import pandas as pd
# Simple segmentation helper: jieba tokens joined by single spaces
mycut = lambda s: ' '.join(jieba.cut(s))
# Wrap the sentences in a one-column frame and segment that column via apply
newdata = pd.DataFrame(all_doc)[0].apply(mycut)


In [20]:
# Test sentence: tokenize it, then encode it with the existing dictionary
doc_test="我喜欢上海的小吃"
doc_test_list = list(jieba.cut(doc_test))
doc_test_vec = dictionary.doc2bow(doc_test_list)

In [18]:
# Fit a tf-idf model on the bag-of-words corpus
tfidf = models.TfidfModel(corpus)

In [23]:
doc_test_list

['我', '喜欢', '上海', '的', '小吃']

In [24]:
tfidf[doc_test_vec]

[(0, 0.08112725037593049),
 (2, 0.3909393754390612),
 (3, 0.5864090631585919),
 (12, 0.3909393754390612),
 (17, 0.5864090631585919)]

In [26]:
# Index the tf-idf weighted corpus; the feature count is the dictionary size
feature_count = len(dictionary.keys())
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=feature_count)
# Similarity of the tf-idf weighted test sentence to every indexed document
sim = index[tfidf[doc_test_vec]]
# (document number, similarity) pairs, most similar first
sorted(enumerate(sim), key=lambda pair: pair[1], reverse=True)

[(7, 0.70477605),
 (0, 0.54680777),
 (3, 0.17724207),
 (4, 0.17724207),
 (5, 0.013545224),
 (6, 0.01279765),
 (1, 0.010553493),
 (2, 0.0)]

In [2]:
all_doc_list

[['我', '不', '喜欢', '上海'],
 ['上海', '是', '一个', '好', '地方'],
 ['北京', '是', '一个', '好', '地方'],
 ['上海', '好吃', '的', '在', '哪里'],
 ['上海', '好玩', '的', '在', '哪里'],
 ['上海', '是', '好', '地方'],
 ['上海', '路', '和', '上海', '人'],
 ['喜欢', '小吃']]

In [10]:
# Build the gensim dictionary from the tokenized sentences
dictionary = corpora.Dictionary(all_doc_list)

In [22]:

# -*- coding: utf-8 -*-

import jieba, os

import codecs

from gensim import corpora, models, similarities

from pprint import pprint

from collections import defaultdict

import sys

import pickle
import importlib

importlib.reload(sys)

 

def print_dict(dict):
    """Print one line per entry: the key's type, the key, and the value as text.

    Args:
        dict: mapping to dump. The name shadows the builtin; it is kept so
            existing keyword callers keep working.

    Returns:
        None. Output goes to stdout.
    """
    for key in dict:
        # The Python 2 leftovers (a trailing comma after the call and a bare
        # `print` statement) did nothing under Python 3 and were removed.
        print(type(key), key, str(dict[key]))

def test3():
    """Walk through the gensim Dictionary API on a toy corpus.

    Builds a Dictionary from two tiny documents, prints its statistics and
    mappings, converts a third document to bag-of-words (also reporting the
    tokens the dictionary does not know), and finally prunes the dictionary
    with filter_extremes.

    Returns:
        None. All results are written to stdout.
    """
    a = [['一','一','二'],['一','二','三']]
    b = ['一','一','三','四','四']

    dictionary = corpora.Dictionary(a)

    print ("########dictionary信息##########")
    print (str(dictionary))

    print ("字典，{单词id，在多少文档中出现}")
    print (dictionary.dfs) # {token id: number of documents containing it}

    print ("文档数目")
    print (dictionary.num_docs) # number of documents

    print ("dictionary.items()")
    print_dict(dict(dictionary.items()))

    print ("字典，{单词id，对应的词}")
    # NOTE: gensim fills id2token lazily; it is populated here because the
    # items() call above already looked every id up. Keep that order.
    print_dict(dictionary.id2token) # {token id: token}

    print ("字典，{词，对应的单词id}")
    print_dict(dictionary.token2id) # {token: token id}

    print ("所有词的个数")
    print (dictionary.num_pos) # total number of tokens

    print ("每个文件中不重复词个数的和")
    print (dictionary.num_nnz) # sum of per-document unique-token counts

    print ("########doc2bow##########")
    #dictionary.add_documents([b])
    # allow_update -> update the current dictionary;
    # return_missing -> also return the tokens that are not in the dictionary.
    # result is the bag-of-words of b: a list of (token id, frequency) pairs.
    result, missing = dictionary.doc2bow(b, allow_update=False, return_missing=True)

    print ("词袋b，列表[(单词id，词频)]")
    print (result)

    print ("不在字典中的词及其词频，字典[(单词，词频)]")
    print_dict(missing)

    print ("########bow信息##########")
    for token_id, freq in result:
        # dictionary[token_id] resolves the token without depending on the
        # id2token reverse map having been filled by an earlier call
        print (token_id, dictionary[token_id], freq)

    print ("########dictionary信息##########")
    # Keep only tokens that occur in at least no_below documents and in no more
    # than the fraction no_above of all documents, then keep at most keep_n of
    # them. The pruned dictionary is not printed here.
    dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=10)


In [23]:
test3()

########dictionary信息##########
Dictionary(3 unique tokens: ['一', '二', '三'])
字典，{单词id，在多少文档中出现}
{0: 2, 1: 2, 2: 1}
文档数目
2
dictionary.items()
<class 'int'> 0 一
<class 'int'> 1 二
<class 'int'> 2 三
字典，{单词id，对应的词}
<class 'int'> 0 一
<class 'int'> 1 二
<class 'int'> 2 三
字典，{词，对应的单词id}
<class 'str'> 一 0
<class 'str'> 二 1
<class 'str'> 三 2
所有词的个数
6
每个文件中不重复词个数的和
5
########doc2bow##########
词袋b，列表[(单词id，词频)]
[(0, 2), (2, 1)]
不在字典中的词及其词频，字典[(单词，词频)]
<class 'str'> 四 2
########bow信息##########
0 一 2
2 三 1
########dictionary信息##########


In [13]:
print(str(dictionary))

Dictionary(18 unique tokens: ['上海', '不', '喜欢', '我', '一个']...)


In [14]:
dictionary.dfs

{0: 6,
 1: 1,
 2: 2,
 3: 1,
 4: 2,
 5: 3,
 6: 3,
 7: 3,
 8: 1,
 9: 2,
 10: 2,
 11: 1,
 12: 2,
 13: 1,
 14: 1,
 15: 1,
 16: 1,
 17: 1}