In [1]:
import pandas as pd
import numpy as np
import jieba
import multiprocessing
import keras

from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing import sequence

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout,Activation
from keras.models import model_from_yaml
np.random.seed(1337)  # seed NumPy's global random state for reproducibility
import sys
# NOTE(review): the interpreter recursion limit is raised to one million here;
# none of the visible cells recurse themselves, so confirm this is still needed.
sys.setrecursionlimit(1000000)
import yaml

# ---- Hyper-parameters -------------------------------------------------------
cpu_count = multiprocessing.cpu_count()  # worker count passed to Word2Vec
vocab_dim = 100      # dimensionality of each word vector
n_iterations = 1     # Word2Vec training passes; ideally more
n_exposures = 2      # Word2Vec min_count: rarer words are left out of the vocabulary
window_size = 7      # max distance between the current word and the predicted word
n_epoch = 4          # LSTM training epochs
input_length = 100   # sequence length declared to the Embedding layer
maxlen = 100         # length every index sequence is padded/truncated to
batch_size = 32

# ---- Raw data: one GBK-encoded file per class -------------------------------
sen1, sen2, sen3 = [
    pd.read_table(path, encoding='gbk', index_col=None)
    for path in ('../data/1.txt', '../data/2.txt', '../data/3.txt')
]

# Documents of the three classes, concatenated in file order.
combined = np.concatenate([frame['content'] for frame in (sen1, sen2, sen3)])

# Ground-truth labels aligned with `combined`:
# 1 for rows of 1.txt, 0 for rows of 2.txt, -1 for rows of 3.txt.
y = np.concatenate((
    np.full(len(sen1), 1, dtype=int),
    np.full(len(sen2), 0, dtype=int),
    np.full(len(sen3), -1, dtype=int),
))

Using TensorFlow backend.


In [2]:
# Tokenise every document with jieba after removing embedded newlines.
combined = [
    jieba.lcut(doc.replace('\n', ''))
    for doc in combined
]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\sunhd13\AppData\Local\Temp\jieba.cache
Loading model cost 0.861 seconds.
Prefix dict has been built succesfully.


In [3]:
# Train word vectors on the tokenised corpus, then persist them so the
# inference cell can reload the same vocabulary and vectors.
model = Word2Vec(
    sentences=combined,
    size=vocab_dim,
    min_count=n_exposures,
    window=window_size,
    workers=cpu_count,
    iter=n_iterations,
)
model.save('../model/Word2vec_model.pkl')

In [4]:
# --- Vocabulary: word -> index and word -> vector ----------------------------
# NOTE: `model` is the Word2Vec model from the previous cell at this point and
# is rebound to the Keras network further down, while `combined` is replaced
# by its padded index matrix. Re-running this cell therefore requires
# re-running the tokenisation and Word2Vec cells first.
gensim_dict = Dictionary()
gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
# Shift every dictionary id by one so index 0 stays free for padding and for
# words that are not in the Word2Vec vocabulary (seen < n_exposures times).
w2indx = {v: k+1 for k, v in gensim_dict.items()}
# Look vectors up on the KeyedVectors; indexing the Word2Vec object itself is
# deprecated in gensim.
w2vec = {word: model.wv[word] for word in w2indx.keys()}

# --- Sentences -> fixed-length index sequences -------------------------------
# Out-of-vocabulary words map to 0. dict.get replaces the former bare
# `except:` so unrelated errors are no longer silently swallowed.
data = [[w2indx.get(word, 0) for word in sentence] for sentence in combined]
combined = sequence.pad_sequences(data, maxlen=maxlen)

# --- Embedding matrix --------------------------------------------------------
n_symbols = len(w2indx) + 1  # +1 for the reserved index 0
embedding_weights = np.zeros((n_symbols, vocab_dim))  # row 0 stays all-zero
for word, index in w2indx.items():
    embedding_weights[index, :] = w2vec[word]

# --- Train / test split and label encoding -----------------------------------
x_train, x_test, y_train, y_test = train_test_split(combined, y, test_size=0.2)
# NOTE(review): `y` holds -1 for the third class; this relies on
# to_categorical placing -1 in the last column (index 2) — confirm.
y_train = keras.utils.to_categorical(y_train, num_classes=3)

# --- Network: Embedding -> LSTM -> Dropout -> softmax over 3 classes ---------
model = Sequential()
model.add(Embedding(output_dim=vocab_dim,
                    input_dim=n_symbols,
                    mask_zero=True,  # index 0 (padding / unknown) is masked
                    weights=[embedding_weights],
                    input_length=input_length))
# `units` is the Keras 2 name for the former `output_dim` argument.
model.add(LSTM(units=50, activation='tanh'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=n_epoch, verbose=1)

  after removing the cwd from sys.path.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.callbacks.History at 0x1f05d5bff88>

In [5]:
# One-hot encode the held-out labels exactly once. The guard keeps this cell
# idempotent: re-running it would otherwise re-encode the already one-hot
# matrix and break model.evaluate.
if np.ndim(y_test) == 1:
    y_test = keras.utils.to_categorical(y_test, num_classes=3)
# With metrics=['accuracy'] the returned list is [loss, accuracy].
score = model.evaluate(x_test, y_test, batch_size=batch_size)
print ('Test score:', score)

Test score: [0.1387313417427087, 0.9610534310340881]


In [6]:
# Persist the architecture and the weights separately.
# The string returned by to_yaml() is wrapped once more by yaml.dump, so the
# file must be YAML-loaded to get the architecture string back.
yaml_string = model.to_yaml()
with open('../model/lstm.yml', 'w') as outfile:
    wrapped = yaml.dump(yaml_string)
    outfile.write(wrapped)
model.save_weights('../model/lstm.h5')

In [48]:
maxlen = 100  # length every index sequence is padded/truncated to

def create_dictionaries(model=None,
                        combined=None):
    ''' Build vocabulary look-ups from a Word2Vec model and index the sentences.

    Does three jobs:
        1- Creates a word -> index mapping (indices start at 1)
        2- Creates a word -> vector mapping
        3- Converts every sentence in `combined` to a padded index sequence

    Args:
        model: trained Word2Vec model exposing `model.wv`.
        combined: iterable of tokenised sentences (iterables of words).

    Returns:
        Tuple `(w2indx, w2vec, combined)` where `combined` is the array
        produced by `sequence.pad_sequences` with the module-level `maxlen`.
        If either argument is None, prints a message and returns None.
    '''
    if (combined is None) or (model is None):
        print ('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(),
                        allow_update=True)
    # Shift ids by one: index 0 is reserved for padding and for words that
    # are not in the Word2Vec vocabulary.
    w2indx = {v: k+1 for k, v in gensim_dict.items()}
    # Look vectors up on the KeyedVectors; indexing the Word2Vec object
    # itself is deprecated in gensim.
    w2vec = {word: model.wv[word] for word in w2indx.keys()}

    # Words -> integers; unknown words become 0. dict.get replaces the former
    # bare `except:` so unrelated errors are no longer silently swallowed.
    data = [[w2indx.get(word, 0) for word in sentence] for sentence in combined]
    combined = sequence.pad_sequences(data, maxlen=maxlen)
    return w2indx, w2vec, combined


def input_transform(string):
    ''' Convert one raw text string into a padded index sequence.

    The text is tokenised with jieba, shaped into a batch holding a single
    sentence, and indexed against the vocabulary of the Word2Vec model saved
    on disk. Only the padded sequence from `create_dictionaries` is returned.
    '''
    tokens = np.array(jieba.lcut(string))
    sentence_batch = tokens[np.newaxis, :]  # one row = one sentence
    w2v_model = Word2Vec.load('../model/Word2vec_model.pkl')
    return create_dictionaries(w2v_model, sentence_batch)[2]
string = '联通'

# The .yml file holds the architecture string wrapped by yaml.dump, so loading
# it yields a plain str. safe_load is sufficient for that and avoids calling
# yaml.load without a Loader (deprecated, and an error on newer PyYAML).
with open('../model/lstm.yml', 'r') as f:
    yaml_string = yaml.safe_load(f)
model = model_from_yaml(yaml_string)
model.load_weights('../model/lstm.h5')
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

data = input_transform(string)
# reshape returns a new array; the result was previously discarded.
data = data.reshape(1, -1)
result = model.predict_classes(data)

# NOTE(review): the network has three output classes, but only class 1 is
# reported as 满意; the other two are both printed as 不满意 — confirm intended.
if result[0]==1:
    print (string,' 满意')
else:
    print (string,' 不满意')



联通  不满意


