## 用LSTM实现情感分析

### 了解数据集
- pos.xls ：今天天气真好，心情也变得很好呢！
- neg.xls ：下雨天把鞋弄湿了，好烦啊 ><!

### 数据预处理
- 基本步骤

    - 转换源数据编码格式为utf-8格式
    - 过滤字符
        - 去除所有非中文字符，如标点符号、英文字符、数字、网站链接等特殊字符。
    - 过滤停用词
    - 对文本内容进行分词处理

In [None]:
# -*- coding: utf-8 -*-
#Created by huxiaoman 2018.1.28
#transfer.py:生成pos和neg数据
import jieba
import sys
import os
import re

# Check whether a string contains at least one Chinese character.
def check_contain_chinese(check_str):
    """Return True if ``check_str`` contains a CJK Unified Ideograph.

    Args:
        check_str: a UTF-8 encoded byte string, or already-decoded text.

    Returns:
        True when at least one character lies in U+4E00..U+9FFF, else False.

    Raises:
        UnicodeDecodeError: if a byte string is not valid UTF-8.
    """
    # Only byte strings need decoding. Calling .decode() on text fails
    # (implicit ASCII encode on Python 2, no such method on Python 3).
    if isinstance(check_str, bytes):
        check_str = check_str.decode('utf-8')
    return any(u'\u4e00' <= ch <= u'\u9fff' for ch in check_str)

# Load the label of every text file from the label index file.
def load_label_files(label_file):
    """Build a ``{key: label}`` mapping from a label index file.

    Every line is split on the literal "..": the text before the first ".."
    is the label and the text between the first and second ".." (or the end
    of the line) is the key. Both are stripped of surrounding whitespace.

    Args:
        label_file: path of the index file.

    Returns:
        dict mapping each key to its label. Lines without a ".." separator
        (for example blank lines) are skipped instead of raising IndexError.
    """
    label_dict = {}
    # The context manager closes the handle even if parsing fails.
    with open(label_file) as index_file:
        for line in index_file:
            fields = line.strip().split("..")
            if len(fields) < 2:
                continue
            label_dict[fields[1].strip()] = fields[0].strip()
    return label_dict

# Load the stop-word list.
def load_stop_train(stop_word_path):
    """Read the stop-word file into a dict used as a membership set.

    The file is decoded to text so that the keys compare equal to the tokens
    the callers look up. Those tokens come from ``jieba.cut`` and are text
    (the callers ``.encode("utf-8")`` them afterwards). On Python 2 a byte
    string key never matches a non-ASCII text token, so the stop words would
    be silently ignored.

    Args:
        stop_word_path: path of a file with one stop word per line.
            NOTE(review): assumed to be UTF-8 encoded -- confirm.

    Returns:
        dict mapping every stripped line to 1.
    """
    import io  # local import: io.open decodes on both Python 2 and 3

    with io.open(stop_word_path, encoding="utf-8") as stop_file:
        return {line.strip(): 1 for line in stop_file}

# Read the raw text files, convert them to UTF-8 and write the pos/neg samples.
def read_files(file_path,label_dict,stop_dict,pos_file_path,neg_file_path):
    """Segment every labelled file under ``file_path`` into the sample files.

    The directory tree is walked recursively. Each regular file is decoded
    from GBK, lines without any Chinese character are dropped, the rest is
    segmented with jieba, stop words are removed, and the remaining words of
    the whole file are written as ONE space-separated line to the "pos" or
    "neg" output, depending on the file's label.

    Args:
        file_path: root directory of the raw text files.
        label_dict: mapping from path key to label ("pos"/"neg"); files whose
            key is missing get the label "unk" and are not written anywhere.
        stop_dict: container of stop words to drop.
        pos_file_path: output file for positive samples (opened for append).
        neg_file_path: output file for negative samples (opened for append).

    Both outputs are opened once for the whole traversal and closed on exit.
    Opening them again at every recursion level would leave several buffered
    append handles on the same file, and would never close them.
    """
    with open(pos_file_path, 'a') as pos_file, open(neg_file_path, 'a') as neg_file:
        _collect_samples(file_path, label_dict, stop_dict,
                         {"pos": pos_file, "neg": neg_file})


def _collect_samples(dir_path, label_dict, stop_dict, outputs):
    """Recursively write the segmented files under ``dir_path`` to ``outputs``."""
    for name in os.listdir(dir_path):
        child = os.path.join(dir_path, name)
        if os.path.isdir(child):
            _collect_samples(child, label_dict, stop_dict, outputs)
            continue

        # NOTE(review): the hard-coded 10 presumably strips a directory prefix
        # so the remainder matches the keys of label_dict; it depends on the
        # length of the root path passed in -- confirm against the index file.
        key = child[10:]
        print(key)
        label = label_dict.get(key, "unk")
        print(label)

        out_file = outputs.get(label)
        if out_file is None:
            # Neither "pos" nor "neg": nothing would be written, so skip the
            # expensive segmentation altogether.
            continue

        words = []
        with open(child) as raw_file:
            for raw_line in raw_file:
                # Source data is GBK; undecodable bytes are dropped.
                text = raw_line.strip().decode("gbk", 'ignore').encode('utf-8')
                if not check_contain_chinese(text):
                    continue
                words.extend(word for word in jieba.cut(text, cut_all=False)
                             if word not in stop_dict)
        out_file.write(" ".join(words).encode("utf-8", "ignore") + "\n")

# Generate the word2vec training corpus.
def generate_word2vec(file_path,label_dict,stop_dict,word_vec):
    """Append the segmented text of every file under ``file_path`` to ``word_vec``.

    The directory tree is walked recursively. Each regular file is decoded
    from GBK, lines without any Chinese character are dropped, the rest is
    segmented with jieba, stop words are removed, and the remaining words of
    the whole file are written as ONE space-separated line.

    Args:
        file_path: root directory of the raw text files.
        label_dict: unused; kept so existing callers keep working.
        stop_dict: container of stop words to drop.
        word_vec: output corpus file (opened for append).

    The output is opened once for the whole traversal and closed on exit.
    """
    with open(word_vec, 'a') as corpus_file:
        _append_corpus(file_path, stop_dict, corpus_file)


def _append_corpus(dir_path, stop_dict, corpus_file):
    """Recursively write one segmented line per file under ``dir_path``."""
    file_count = 0  # progress counter; restarts in every directory
    for name in os.listdir(dir_path):
        child = os.path.join(dir_path, name)
        if os.path.isdir(child):
            _append_corpus(child, stop_dict, corpus_file)
            continue

        print(child[10:])
        file_count += 1
        print(file_count)

        words = []
        with open(child) as raw_file:
            for raw_line in raw_file:
                # Source data is GBK; undecodable bytes are dropped.
                text = raw_line.strip().decode("gbk", 'ignore').encode('utf-8')
                # Empty lines contain no Chinese character either, so this
                # single check also filters them out.
                if not check_contain_chinese(text):
                    continue
                words.extend(word for word in jieba.cut(text, cut_all=False)
                             if word not in stop_dict)
        # "ignore" was misspelled "ingore", which raises LookupError as soon
        # as an encoding error actually occurs.
        corpus_file.write(" ".join(words).encode("utf-8", "ignore") + "\n")

if __name__=="__main__":
    # Usage: python transfer.py <data_dir> <label_index_file>
    if len(sys.argv) < 3:
        sys.exit("usage: python transfer.py <data_dir> <label_index_file>")
    file_path = sys.argv[1]
    label_path = sys.argv[2]
    stop_word_path = "stop_words.txt"
    word_vec_path = "word2vec.txt"
    pos_data = "pos.txt"
    neg_data = "neg.txt"
    label_dict = load_label_files(label_path)
    stop_dict = load_stop_train(stop_word_path)
    # All outputs are opened in append mode, so re-running the script adds
    # to existing pos.txt / neg.txt / word2vec.txt instead of replacing them.
    read_files(file_path,label_dict,stop_dict,pos_data,neg_data)
    # word_vec_path was defined but never used, so the word2vec.txt corpus
    # was never produced.
    generate_word2vec(file_path,label_dict,stop_dict,word_vec_path)

- 运行脚本：run.sh

In [None]:
# run.sh: pass "test" to process the small test set; anything else
# (including no argument) processes the whole data set.
if [ "$1" = "test" ]; then
    echo "test"
    python transfer.py ../test/ ../data/pos.txt
else
    echo "whole"
    python transfer.py ../data/ ../trec06c/full/index
fi

- 运行方式：`sh run.sh`（处理全部数据）或 `sh run.sh test`（只处理测试数据）

- 运行结果
    - pos.txt: 正样本：正面情感
        - 今天 天气 真好 心情 也 变得 很好 呢  
    - neg.txt: 负样本：负面情感
        - 下雨 天 把 鞋 弄湿 了 好烦 啊
    - word2vec.txt: 所有文本的分词内容，为训练Word2vec模型提供语料
        - 今天 天气 真好 心情 也 变得 很好 呢 下雨 天 把 鞋 弄湿 了 好烦 啊
   

### 生成Word2vec模型文件

In [None]:
# -*- coding: utf-8 -*-
# Created by huxiaoman 2018.1.28
# word2vec.py:生成word2vec模型

import os
import sys
import numpy as np
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
import codecs

# Python 2 only (``reload`` is a builtin only there): re-import ``sys`` and
# then set the process-wide default string encoding to UTF-8. The order
# matters: reload(sys) must run before setdefaultencoding is called.
reload(sys)
sys.setdefaultencoding( "utf-8" )

class MySentences(object):
    """Re-iterable stream of tokenised sentences read from a directory.

    Every entry of ``dirname`` is opened as a UTF-8 text file (undecodable
    bytes are ignored) and each line is yielded as a list of
    whitespace-separated tokens. A blank line yields an empty list.

    Args:
        dirname: directory containing the corpus files.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # The context manager closes each file once it is exhausted or
            # the consumer stops iterating, instead of leaking the handle.
            with codecs.open(path, "r", encoding="utf-8", errors="ignore") as corpus:
                for line in corpus:
                    yield line.strip().split()

# Directory holding the word2vec.txt corpus file(s)
train_path = "rawData/"

# Directory the trained Word2vec model is written to
model_path = "/modelPath/"
sentences = MySentences(train_path)

# Per the gensim Word2Vec documentation: min_count=5 ignores words that occur
# fewer than 5 times in the corpus (it is NOT an n-gram order), size=100 is
# the dimensionality of the word vectors, and workers=15 trains with 15
# worker threads.
model = Word2Vec(sentences,min_count = 5,size=100,workers=15)

# Save the model
model.save(model_path+'/Word2vec_model.pkl')

- 运行方式： python word2vec.py
- 运行结果： Word2vec_model.pkl 

### 模型训练

#### 定义网络结构

In [None]:
## Define the network architecture
def train_lstm(n_symbols,embedding_weights,x_train,y_train,x_test,y_test):
    """Build, train, evaluate and save the LSTM classifier.

    The model is a Sequential stack: Embedding (initialised from
    ``embedding_weights``, ``mask_zero=True``) -> LSTM(64, relu) -> LSTM(32)
    -> Dropout(0.5) -> NonMasking -> Flatten -> Dense softmax with
    ``nb_classes`` outputs. It is compiled with categorical cross-entropy
    and the adam optimizer, fitted on the training data with the test data
    as validation data, then evaluated on the test data.

    The architecture is written to ``lstm_data/lstm_koubei.yml`` and the
    weights to ``lstm_data/lstm_koubei.h5``. Nothing is returned.

    ``vocab_dim``, ``input_length``, ``batch_size``, ``n_epoch``, ``yaml``,
    ``NonMasking`` and the Keras names are read from module scope; their
    definitions are not part of this cell.

    Args:
        n_symbols: passed as ``input_dim`` of the Embedding layer.
        embedding_weights: initial weights of the Embedding layer.
        x_train, y_train: training inputs and targets.
        x_test, y_test: inputs and targets used for validation and evaluation.
    """
    # NOTE(review): the preprocessing shown earlier only produces pos/neg
    # samples -- confirm where the third class comes from.
    nb_classes = 3
    print 'Defining a Simple Keras Model...'
    model = Sequential()  # or Graph or whatever
    model.add(Embedding(output_dim=vocab_dim,
                        input_dim=n_symbols,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=input_length))  # Adding Input Length
    print vocab_dim
    print n_symbols
    #model.add(LSTM(output_dim=50, activation='relu',inner_activation='hard_sigmoid'))
    #model.add(LSTM(output_dim=25, activation='relu', return_sequences=True))
    # Both LSTM layers return the full sequence; it is flattened further down.
    model.add(LSTM(64, input_dim=vocab_dim, activation='relu', return_sequences=True))
    model.add(LSTM(32, return_sequences=True))
    model.add(Dropout(0.5))
    #model.add(Dense(nb_classes))
    #model.add(Activation('softmax'))
    # The summary is requested here, before the last three layers are added,
    # so it can only describe the layers added so far.
    print model.summary()
    # NOTE(review): NonMasking is defined elsewhere; presumably it removes the
    # mask introduced by mask_zero=True so that Flatten can follow -- confirm.
    model.add(NonMasking())
    model.add(Flatten())
    model.add(Dense(output_dim=nb_classes, activation='softmax'))
    print 'Compiling the Model...'
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',metrics=['accuracy'])

    print "Train..."
    print y_train
    model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=n_epoch,verbose=1, validation_data=(x_test, y_test))
    print "Evaluate..."
    score = model.evaluate(x_test, y_test,
                                batch_size=batch_size)

    # NOTE(review): the value returned by model.to_yaml() is passed through
    # yaml.dump once more before being written; whatever reads this file
    # presumably has to yaml-load it before rebuilding the model -- confirm.
    yaml_string = model.to_yaml()
    with open('lstm_data/lstm_koubei.yml', 'w') as outfile:
        outfile.write( yaml.dump(yaml_string, default_flow_style=True) )
    model.save_weights('lstm_data/lstm_koubei.h5')
    print 'Test score:', score

#### 训练模型

In [None]:
# Train the model and save it
def train():
    """Run the full pipeline: load, tokenise, embed, then train the LSTM.

    Each stage is delegated to a helper defined elsewhere in the module
    (``loadfile``, ``tokenizer``, ``word2vec_train``, ``get_data``); the
    final call to ``train_lstm`` fits the model and saves it.
    """
    print('Loading Data...')
    corpus, labels = loadfile()
    print('%s %s' % (len(corpus), len(labels)))

    print('Tokenising...')
    tokens = tokenizer(corpus)

    print('Training a Word2vec model...')
    index_dict, word_vectors, tokens = word2vec_train(tokens)

    print('Setting up Arrays for Keras Embedding Layer...')
    dataset = get_data(index_dict, word_vectors, tokens, labels)
    # dataset is (n_symbols, embedding_weights, x_train, y_train, x_test, y_test)
    n_symbols, embedding_weights, x_train, y_train, x_test, y_test = dataset
    print('%s %s' % (x_train.shape, y_train.shape))

    train_lstm(*dataset)
    
# Run the training pipeline only when this file is executed as a script.
if __name__=='__main__':
    train()