In [60]:
import pandas as pd
import numpy as np
import re
import jieba
import random
from collections import Counter
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,Bidirectional
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras import regularizers

In [2]:
data = pd.read_csv('douban_movie_comments.csv',encoding='gb18030')
data['comment'] = data['comment'].fillna('')
data.head()

Unnamed: 0,id,name,year,comment,star
0,1304485,穿越时空爱上你 Kate & Leopold,2001,在这个速食、什么都讲究快捷的年代，我们不可能被如此温柔绅士地对待，生活也不会随便扔给我们一个...,4
1,1304485,穿越时空爱上你 Kate & Leopold,2001,特么滥套的大俗片啊，特么不严谨的剧情啊，可是俺一点也不讨厌它呀，谁让演员长得好也演的好啊！,4
2,1304485,穿越时空爱上你 Kate & Leopold,2001,...太浪漫了...简直是每个女人都想拥有的男人 =vvvvvvv=,5
3,1304485,穿越时空爱上你 Kate & Leopold,2001,啊，布莱因小姐\r\n 您的舞姿就象……就象一群牲口\r\n 您是个少有的女人\...,3
4,1304485,穿越时空爱上你 Kate & Leopold,2001,我的软肋 就是爱情轻喜剧.....,5


In [3]:
# 样本均衡(查看样本分布情况)
count = Counter(data['star'])
count

Counter({4: 22692, 5: 16146, 3: 14966, 1: 10822, 2: 7134})

In [4]:
def is_CN_char(ch):
    """
    判断是否是中文
    """
    return ch >=u'\u4e00' and ch<=u'\u9fa5'

In [5]:
def cut(string):
    return list(jieba.cut(string))

In [6]:
def token(string):
    return ' '.join(re.findall('[\w|\d]+',string))

In [7]:
def get_stopwords(filepath='F:/python/NLP_course/NLP_course/Course_11/stop_words.txt'):
    stopwords_dic = open(filepath,encoding='utf-8')
    stopwords = stopwords_dic.readlines()
    stopwords = [w.strip() for w in stopwords]
    stopwords_dic.close()
    return stopwords

In [8]:
stopwords = get_stopwords()
print(stopwords[:50])

['\ufeff!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '--', '.', '..', '...', '......', '...................', './', '.一', '.数', '.日', '/', '//', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '://', '::', ';', '<', '=', '>', '>>', '?', '@', 'A', 'Lex', '[', '\\', ']']


In [9]:
def clean_sentence(sentence):
    stopwords = get_stopwords()
    sentence = ''.join(filter(is_CN_char,sentence))
    #words = (token(str(a)) for a in sentence)
    words = [w for w in cut(sentence) if len(w) > 1 and w not in stopwords]
    words = ' '.join(words)
    return words

In [10]:
data['comment'] = data['comment'].apply(clean_sentence)
data['comment'][:10]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\64248\AppData\Local\Temp\jieba.cache
Loading model cost 0.975 seconds.
Prefix dict has been built succesfully.


0    速食 讲究 快捷 年代 温柔 绅士 生活 随便 扔给 谦彬 公爵 穿越 喜欢 说话 慢条斯理...
1                          特么 滥套 大俗片 严谨 剧情 一点 讨厌 演员 长得
2                                          浪漫 女人 拥有 男人
3    布莱因 小姐 舞姿 一群 牲口 少有 女人 离开 一间 屋子 活跃 富有 布莱因 小姐 第三...
4                                            软肋 爱情 轻喜剧
5    前半段 笑点 太多 几段 后半段 挑起 少得 可怜 浪漫 因子 实在 美好 休叔 太帅 爱情...
6                                    穿越 唯美 写清 小姑娘 好好学习
7    穿越 印象 深刻 公爵 一本正经 鱿鱼 切成 一段 整部 片子 一部 浪漫 轻喜剧 段子 极...
8    岁月 两性 公平 梅格瑞恩 迈入 四十 大关 苍老 憔悴 干瘪 过期 蛋糕 杰克 曼愈 弥坚...
9                                       男主爱上 女主 理由 是因为
Name: comment, dtype: object

In [42]:
# id-to-word & word-to-id
def word_to_id(comment):
    vocab = ''.join(comment).split()
    counts = Counter(vocab)
    vocab = sorted(counts,key=counts.get,reverse=True)
    word_to_id = {word:i for i,word in enumerate(vocab)}
    # id_to_word = {i:word for i,word in enumerate(vocab)}
    return word_to_id

In [50]:
vocab = ''.join(data['comment']).split()
counts = Counter(vocab)
vocab = sorted(counts,key=counts.get,reverse=True)
word_to_id = {word:i for i,word in enumerate(vocab)}
word_to_id

{'电影': 0,
 '故事': 1,
 '导演': 2,
 '一部': 3,
 '喜欢': 4,
 '片子': 5,
 '剧情': 6,
 '真的': 7,
 '这部': 8,
 '感觉': 9,
 '生活': 10,
 '影片': 11,
 '演员': 12,
 '镜头': 13,
 '演技': 14,
 '不错': 15,
 '人物': 16,
 '爱情': 17,
 '结尾': 18,
 '角色': 19,
 '世界': 20,
 '表演': 21,
 '好看': 22,
 '实在': 23,
 '情节': 24,
 '一种': 25,
 '两个': 26,
 '观众': 27,
 '中国': 28,
 '节奏': 29,
 '结局': 30,
 '青春': 31,
 '人生': 32,
 '一点': 33,
 '孩子': 34,
 '配乐': 35,
 '细节': 36,
 '作品': 37,
 '剧本': 38,
 '女人': 39,
 '音乐': 40,
 '特别': 41,
 '真实': 42,
 '叙事': 43,
 '画面': 44,
 '风格': 45,
 '时间': 46,
 '喜剧': 47,
 '台词': 48,
 '经典': 49,
 '男人': 50,
 '动作': 51,
 '题材': 52,
 '看过': 53,
 '现实': 54,
 '社会': 55,
 '本片': 56,
 '情感': 57,
 '可爱': 58,
 '主角': 59,
 '可惜': 60,
 '精彩': 61,
 '地方': 62,
 '美国': 63,
 '永远': 64,
 '那种': 65,
 '家庭': 66,
 '东西': 67,
 '只能': 68,
 '编剧': 69,
 '时代': 70,
 '感情': 71,
 '表现': 72,
 '剪辑': 73,
 '完美': 74,
 '发现': 75,
 '简单': 76,
 '依然': 77,
 '那段': 78,
 '烂片': 79,
 '希望': 80,
 '之间': 81,
 '确实': 82,
 '几个': 83,
 '感动': 84,
 '美好': 85,
 '场景': 86,
 '令人': 87,
 '日本': 88,
 '场面': 89,
 '韩国': 90,
 '关系': 91

In [51]:
# comment to id
def comment_to_id(word_to_id,comments):
    comment_to_id = []
    for comment in comments:
        comment_to_id.append([word_to_id[word] for word in comment.split() if word in word_to_id])
    return comment_to_id

In [52]:
comment_to_id = comment_to_id(word_to_id,data['comment'])
comment_to_id[:10]

[[19959,
  2200,
  30102,
  110,
  493,
  2166,
  10,
  860,
  17175,
  42910,
  19960,
  410,
  4,
  685,
  19961,
  493,
  13448,
  138,
  3304,
  159,
  1711],
 [8265, 42912, 30103, 2284, 6, 33, 381, 12, 581],
 [206, 39, 389, 50],
 [42915,
  1321,
  17176,
  229,
  19962,
  2385,
  39,
  316,
  5059,
  6571,
  10279,
  2055,
  42915,
  1321,
  3499,
  19960,
  17177,
  927,
  1631,
  30104,
  42916,
  1972,
  10280,
  108,
  912,
  10,
  1972,
  7777,
  4076,
  1375],
 [6323, 17, 2121],
 [555,
  245,
  434,
  1541,
  393,
  12202,
  13449,
  608,
  206,
  30105,
  23,
  85,
  5707,
  1607,
  17,
  9465,
  9466,
  6286,
  42919,
  244,
  5249,
  776],
 [410, 875, 42921, 1608, 12381],
 [410,
  136,
  157,
  19960,
  4346,
  30106,
  42923,
  92,
  194,
  5,
  3,
  206,
  2121,
  770,
  42924,
  337,
  10281,
  1282],
 [748,
  8822,
  3305,
  12203,
  19963,
  3400,
  42926,
  5479,
  12204,
  5708,
  8278,
  5709,
  1126,
  42927,
  19964,
  42928,
  5480,
  6572,
  42929,
  4347,
  4

In [53]:
# padding
def pad_sequences(comment_to_id,maxlen,padding='post',truncating='post'):
    features = np.zeros((len(comment_to_id),maxlen),dtype=int)
    for i,comment in enumerate(comment_to_id):
        if len(comment) <= maxlen and padding == 'pre':
            features[i,-len(comment):] = np.array(comment)[:maxlen]
        if len(comment) <= maxlen and padding == 'post':
            features[i,:len(comment)] = np.array(comment)[:maxlen]
        if len(comment) > maxlen and truncating == 'post':
            features[i,:] = np.array(comment)[:maxlen]
        if len(comment) > maxlen and truncating == 'pre':
            features[i,:] = np.array(comment)[len(comment)-maxlen:]
    return features

In [54]:
pad_comments = pad_sequences(comment_to_id,maxlen=200,padding='post',truncating='post')
pad_comments[:10]

array([[19959,  2200, 30102, ...,     0,     0,     0],
       [ 8265, 42912, 30103, ...,     0,     0,     0],
       [  206,    39,   389, ...,     0,     0,     0],
       ...,
       [  410,   136,   157, ...,     0,     0,     0],
       [  748,  8822,  3305, ...,     0,     0,     0],
       [   93,   732,   210, ...,     0,     0,     0]])

In [55]:
def split_dataset(pad_comments,labels,split_frac):
    split_index = int(len(pad_comments)*split_frac)
    data_list = list(zip(pad_comments,labels))
    random.shuffle(data_list)
    pad_comments,labels = zip(*data_list)
    x_train,x_test = pad_comments[:split_index],pad_comments[split_index:]
    y_train,y_test = labels[:split_index],labels[split_index:]
    return x_train,y_train,x_test,y_test

In [56]:
num_labels = 5
#labels = (np.arange(num_labels) == np.array(sub_data['star'])[:,None]).astype(np.float32)
labels = (np.arange(num_labels) == np.array(data['star'])[:,None]).astype(np.float32)
print(labels[:10])

[[0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]


In [57]:
x_train,y_train,x_test,y_test = split_dataset(pad_comments,labels,0.8)

In [61]:
# Model
max_comment_length = 200
embedding_vector_length = 300

x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

model = Sequential()
model.add(Embedding(len(word_to_id),embedding_vector_length,input_length=max_comment_length,dropout=0.2))
model.add(Conv1D(filters=32,kernel_size=3,padding='same',activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(LSTM(50,return_sequences=True)))
model.add(Bidirectional(LSTM(10)))
model.add(Dropout(0.5))
model.add(Dense(5,activation='softmax',activity_regularizer=regularizers.l2(0.001)))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())
history = model.fit(x_train,y_train,validation_split=0.2,epochs=3,verbose=1,batch_size=128)

  # This is added back by InteractiveShellApp.init_path()




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 300)          46114500  
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 200, 32)           28832     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 100, 32)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100, 100)          33200     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 20)                8880      
_________________________________________________________________
dropout_2 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 105       
Total pa

In [None]:
# 使用matplotlib画出loss变化的趋势
print(history.history.keys())
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'],loc='upper left')
plt.show()

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'],loc='upper left')
plt.show()

pred = model.predict(x_test)
def accuracy(pred,labels):
    return (100.0*np.sum(np.argmax(pred,1) == np.argmax(labels,1)
            /pred.shape[0]))

print(accuracy(pred,y_test))