In [1]:
# original : http://www.voidcn.com/blog/u014365862/article/p-6355756.html

import tensorflow as tf
import numpy as np
import nltk
import random
import pickle
from collections import Counter
 
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
"""
词形还原(lemmatizer)，即把一个任何形式的英语单词还原到一般形式，与词根还原不同(stemmer)，后者是抽取一个单词的词根。
"""

'\n\xe8\xaf\x8d\xe5\xbd\xa2\xe8\xbf\x98\xe5\x8e\x9f(lemmatizer)\xef\xbc\x8c\xe5\x8d\xb3\xe6\x8a\x8a\xe4\xb8\x80\xe4\xb8\xaa\xe4\xbb\xbb\xe4\xbd\x95\xe5\xbd\xa2\xe5\xbc\x8f\xe7\x9a\x84\xe8\x8b\xb1\xe8\xaf\xad\xe5\x8d\x95\xe8\xaf\x8d\xe8\xbf\x98\xe5\x8e\x9f\xe5\x88\xb0\xe4\xb8\x80\xe8\x88\xac\xe5\xbd\xa2\xe5\xbc\x8f\xef\xbc\x8c\xe4\xb8\x8e\xe8\xaf\x8d\xe6\xa0\xb9\xe8\xbf\x98\xe5\x8e\x9f\xe4\xb8\x8d\xe5\x90\x8c(stemmer)\xef\xbc\x8c\xe5\x90\x8e\xe8\x80\x85\xe6\x98\xaf\xe6\x8a\xbd\xe5\x8f\x96\xe4\xb8\x80\xe4\xb8\xaa\xe5\x8d\x95\xe8\xaf\x8d\xe7\x9a\x84\xe8\xaf\x8d\xe6\xa0\xb9\xe3\x80\x82\n'

In [2]:
# 创建词汇表
pos_file = "./data/p1/pos.txt"
neg_file = "./data/p1/neg.txt"
def create_lexicon(pos_file, neg_file, upper=0.9, lower=0.0001): # TODO 如何选择这里的lower和upper？
    lex = []
    # 读取文件
    def process_file(f):
        with open(pos_file, 'r') as f:
            lex = []
            lines = f.readlines()
            for line in lines:
                try:
                    words = word_tokenize(line.lower())
                    lex += words
                except Exception:
                    continue
            return lex
    def is_ascii(s):
        return all(ord(c) < 128 for c in s)
    
    lex += process_file(pos_file)
    lex += process_file(neg_file)
    # lex 包含了两个文件中所有的词汇
    lemmatizer = WordNetLemmatizer()
    lex = [lemmatizer.lemmatize(word) for word in lex if is_ascii(word)] # 词形还原 (cats->cat)
    word_count = Counter(lex)
    # {'.': 13944, ',': 10536, 'the': 10120, 'a': 9444, 'and': 7108, 'of': 6624, 'it': 4748, 'to': 3940......}
    # 去掉一些常用词,像the,a and等等，和一些不常用词; 这些词对判断一个评论是正面还是负面没有做任何贡献
    ret = []
    for word in word_count:
        if word_count[word] < len(lex) * upper and word_count[word] > len(lex) * lower:  # 这写死了，好像能用百分比
            ret.append(word)        # 齐普夫定律-使用Python验证文本的Zipf分布 http://blog.topspeedsnail.com/archives/9546
    return ret

In [3]:
res = create_lexicon(pos_file, neg_file)

In [4]:
len(res)

1006

In [5]:
# 把每条评论转换为向量, 转换原理：
# 假设lex为['woman', 'great', 'feel', 'actually', 'looking', 'latest', 'seen', 'is'] 当然实际上要大的多
# 评论'i think this movie is great' 转换为 [0,1,0,0,0,0,0,1], 把评论中出现的字在lex中标记，出现过的标记为1，其余标记为0
def normalize_dataset(lex):
    dataset = []
    # lex:词汇表；review:评论；clf:评论对应的分类，[0,1]代表负面评论 [1,0]代表正面评论 
    def string_to_vector(lex, review, clf):
        words = word_tokenize(line.lower())
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
        
        features = np.zeros(len(lex))
        for word in words:
            if word in lex:
                features[lex.index(word)] = 1  # 一个句子中某个词可能出现两次,可以用+=1，其实区别不大
        return [features, clf]
 
    with open(pos_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            try:
                one_sample = string_to_vector(lex, line, [1,0])  # [array([ 0.,  1.,  0., ...,  0.,  0.,  0.]), [1,0]]
                dataset.append(one_sample)
            except Exception:
                continue
    with open(neg_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            try:
                one_sample = string_to_vector(lex, line, [0,1])  # [array([ 0.,  0.,  0., ...,  0.,  0.,  0.]), [0,1]]]
                dataset.append(one_sample)
            except Exception:
                continue
    #print(len(dataset))
    return dataset
 
dataset = normalize_dataset(res)
random.shuffle(dataset)

In [6]:
#把整理好的数据保存到文件，方便使用。到此完成了数据的整理工作
with open('./data/p1/save.pickle', 'wb') as f:
    pickle.dump(dataset, f)

In [7]:
# 取样本中的10%做为测试数据
test_size = int(len(dataset) * 0.1)
 
dataset = np.array(dataset)
 
train_dataset = dataset[:-test_size]
test_dataset = dataset[-test_size:]
 
# Feed-Forward Neural Network
# 定义每个层有多少'神经元''
n_input_layer = len(res)  # 输入层
 
n_layer_1 = 1024    # hiden layer
n_layer_2 = 512    # hiden layer(隐藏层)听着很神秘，其实就是除输入输出层外的中间层
n_layer_3 = 256    # hiden layer

n_output_layer = 2       # 输出层

In [8]:
# 定义待训练的神经网络
def neural_network(data):
    # 定义第一层"神经元"的权重和biases
    layer_1_w_b = {'w_':tf.Variable(tf.random_normal([n_input_layer, n_layer_1])), 'b_':tf.Variable(tf.random_normal([n_layer_1]))}
    # 定义第二层"神经元"的权重和biases
    layer_2_w_b = {'w_':tf.Variable(tf.random_normal([n_layer_1, n_layer_2])), 'b_':tf.Variable(tf.random_normal([n_layer_2]))}
    # 定义输出层"神经元"的权重和biases
    layer_3_w_b = {'w_':tf.Variable(tf.random_normal([n_layer_2, n_layer_3])), 'b_':tf.Variable(tf.random_normal([n_layer_3]))}
    # 定义输出层"神经元"的权重和biases
    layer_output_w_b = {'w_':tf.Variable(tf.random_normal([n_layer_3, n_output_layer])), 'b_':tf.Variable(tf.random_normal([n_output_layer]))}
 
    # w·x+b
    # data n*1006, layer1 1006*1000, layer2 1000*1000, output, 1000*2
    layer_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
    layer_1 = tf.nn.relu(layer_1)  # 激活函数
    layer_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
    layer_2 = tf.nn.relu(layer_2 ) # 激活函数
    layer_3 = tf.add(tf.matmul(layer_2, layer_3_w_b['w_']), layer_3_w_b['b_'])
    layer_3 = tf.nn.relu(layer_3 ) # 激活函数
    layer_output = tf.add(tf.matmul(layer_3, layer_output_w_b['w_']), layer_output_w_b['b_'])
    
    return layer_output

In [9]:
# 每次使用128条数据进行训练
batch_size = 64
 
X = tf.placeholder('float', [None, len(train_dataset[0][0])]) 
#[None, len(train_x)]代表数据数据的高和宽（矩阵），好处是如果数据不符合宽高，tensorflow会报错，不指定也可以。
Y = tf.placeholder('float')

In [10]:
# 使用数据训练神经网络
def train_neural_network(X, Y):
    predict = neural_network(X)
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(predict, Y))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)  # learning rate 默认 0.001 
    
    epochs = 20
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        epoch_loss = 0
 
        i = 0
        random.shuffle(train_dataset)
        train_x = dataset[:, 0]
        train_y = dataset[:, 1]
        for epoch in range(epochs):
            while i < len(train_x):
                # create batch
                start = i
                end = i + batch_size
 
                batch_x = train_x[start:end]
                batch_y = train_y[start:end]
 
                _, c = session.run([optimizer, cost_func], feed_dict={X:list(batch_x),Y:list(batch_y)})
                epoch_loss += c
                i += batch_size
 
            print(epoch, ' : ', epoch_loss)
 
        text_x = test_dataset[: ,0]
        text_y = test_dataset[:, 1]
        correct = tf.equal(tf.argmax(predict,1), tf.argmax(Y,1))
        accuracy = tf.reduce_mean(tf.cast(correct,'float'))
        print('Accuracy: ', accuracy.eval({X:list(text_x) , Y:list(text_y)}))
        
train_neural_network(X,Y)

(0, ' : ', 355273.65789794922)
(1, ' : ', 355273.65789794922)
(2, ' : ', 355273.65789794922)
(3, ' : ', 355273.65789794922)
(4, ' : ', 355273.65789794922)
(5, ' : ', 355273.65789794922)
(6, ' : ', 355273.65789794922)
(7, ' : ', 355273.65789794922)
(8, ' : ', 355273.65789794922)
(9, ' : ', 355273.65789794922)
(10, ' : ', 355273.65789794922)
(11, ' : ', 355273.65789794922)
(12, ' : ', 355273.65789794922)
(13, ' : ', 355273.65789794922)
(14, ' : ', 355273.65789794922)
(15, ' : ', 355273.65789794922)
(16, ' : ', 355273.65789794922)
(17, ' : ', 355273.65789794922)
(18, ' : ', 355273.65789794922)
(19, ' : ', 355273.65789794922)
('Accuracy: ', 0.65292424)
