In [1]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from keras.preprocessing.text import Tokenizer
import jieba

In [2]:
import pandas as pd
df = pd.read_csv('./0819/tapei_tripadvisor_top200hotel_comment.csv')

In [2]:
df.isna().sum()

hotel_name             0
comment_title          0
comment_date           0
date_of_stay        5636
userName               0
user_profile           0
rating                 0
country             8817
recommend_count     9050
userId             22071
dtype: int64

In [13]:
df['rating'].value_counts()/ len(df)

5.0    0.575993
4.0    0.292847
3.0    0.086319
1.0    0.022464
2.0    0.022377
Name: rating, dtype: float64

In [3]:
feature = df['comment_title'].copy()
target = df['rating'].copy()

In [4]:
# word segmentation
def parsing(x):
    return " ".join(jieba.cut(x))

In [5]:
comment_segmentation = feature.apply(parsing).values

Building prefix dict from the default dictionary ...
I0910 15:21:54.084909  4976 __init__.py:111] Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Wade\AppData\Local\Temp\jieba.cache
I0910 15:21:54.086907  4976 __init__.py:131] Loading model from cache C:\Users\Wade\AppData\Local\Temp\jieba.cache
Loading model cost 0.494 seconds.
I0910 15:21:54.580904  4976 __init__.py:163] Loading model cost 0.494 seconds.
Prefix dict has been built succesfully.
I0910 15:21:54.581904  4976 __init__.py:164] Prefix dict has been built succesfully.


In [6]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(comment_segmentation)
string_sequences = tokenizer.texts_to_sequences(comment_segmentation)

In [7]:
from keras.preprocessing import sequence
sequence_padded = sequence.pad_sequences(string_sequences, maxlen=20)

In [8]:
le = LabelEncoder()
label_le = le.fit_transform(target)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(sequence_padded, label_le, test_size=0.3, random_state=42)

In [None]:
# 透過評論標題做情感分析 (把評分當成類別)

In [20]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.layers.embeddings import Embedding

model_CNN = Sequential()

model_CNN.add(Embedding(output_dim=64, input_dim=len(tokenizer.word_index), input_length=20))

model_CNN.add(Conv1D(64, 5, activation='relu'))
model_CNN.add(MaxPooling1D())
model_CNN.add(Conv1D(64, 5, activation='relu'))
model_CNN.add(GlobalMaxPooling1D())
model_CNN.add(Dense(5, activation='softmax'))
model_CNN.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 64)            591808    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 16, 64)            20544     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 8, 64)             0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 4, 64)             20544     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 325       
Total params: 633,221
Trainable params: 633,221
Non-trainable params: 0
________________________________________________

In [21]:
model_CNN.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_history_CNN = model_CNN.fit(X_train, y_train, batch_size=50, epochs=10, verbose=2, validation_split=0.2)

Train on 12988 samples, validate on 3247 samples
Epoch 1/10
 - 1s - loss: 1.0303 - acc: 0.5800 - val_loss: 0.9522 - val_acc: 0.5867
Epoch 2/10
 - 1s - loss: 0.8534 - acc: 0.6432 - val_loss: 0.8876 - val_acc: 0.6070
Epoch 3/10
 - 1s - loss: 0.7012 - acc: 0.7054 - val_loss: 0.9074 - val_acc: 0.6055
Epoch 4/10
 - 1s - loss: 0.5915 - acc: 0.7542 - val_loss: 0.9774 - val_acc: 0.5996
Epoch 5/10
 - 1s - loss: 0.5074 - acc: 0.7907 - val_loss: 1.0780 - val_acc: 0.5735
Epoch 6/10
 - 1s - loss: 0.4450 - acc: 0.8136 - val_loss: 1.1723 - val_acc: 0.5895
Epoch 7/10
 - 1s - loss: 0.3986 - acc: 0.8355 - val_loss: 1.2252 - val_acc: 0.5830
Epoch 8/10
 - 1s - loss: 0.3610 - acc: 0.8491 - val_loss: 1.2922 - val_acc: 0.5907
Epoch 9/10
 - 1s - loss: 0.3286 - acc: 0.8643 - val_loss: 1.4001 - val_acc: 0.5808
Epoch 10/10
 - 1s - loss: 0.3008 - acc: 0.8738 - val_loss: 1.4628 - val_acc: 0.5605


In [22]:
model_CNN.evaluate(X_test, y_test)



[1.4598028650911556, 0.565823512503593]

In [56]:
from keras import backend as K
from keras.engine.topology import Layer


class Position_Embedding(Layer):

    def __init__(self, size=None, mode='sum', **kwargs):
        self.size = size  # 必须为偶数
        self.mode = mode
        super(Position_Embedding, self).__init__(**kwargs)

    def call(self, x):
        if (self.size == None) or (self.mode == 'sum'):
            self.size = int(x.shape[-1])
        batch_size, seq_len = K.shape(x)[0], K.shape(x)[1]
        position_j = 1. / K.pow(10000., 2 * K.arange(self.size / 2, dtype='float32') / self.size)
        position_j = K.expand_dims(position_j, 0)
        position_i = K.cumsum(K.ones_like(x[:, :, 0]), 1) - 1  # K.arange不支持变长，只好用这种方法生成
        position_i = K.expand_dims(position_i, 2)
        position_ij = K.dot(position_i, position_j)
        position_ij = K.concatenate([K.cos(position_ij), K.sin(position_ij)], 2)
        if self.mode == 'sum':
            return position_ij + x
        elif self.mode == 'concat':
            return K.concatenate([position_ij, x], 2)

    def compute_output_shape(self, input_shape):
        if self.mode == 'sum':
            return input_shape
        elif self.mode == 'concat':
            return (input_shape[0], input_shape[1], input_shape[2] + self.size)


class Attention(Layer):

    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        # 歸一化
        self.size_per_head = size_per_head
        self.output_dim = nb_head * size_per_head
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.WQ = self.add_weight(name='WQ',
                                  shape=(input_shape[0][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WK = self.add_weight(name='WK',
                                  shape=(input_shape[1][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WV = self.add_weight(name='WV',
                                  shape=(input_shape[2][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        super(Attention, self).build(input_shape)

    def Mask(self, inputs, seq_len, mode='mul'):
        if seq_len == None:
            return inputs
        else:
            mask = K.one_hot(seq_len[:, 0], K.shape(inputs)[1])
            mask = 1 - K.cumsum(mask, 1)
            for _ in range(len(inputs.shape) - 2):
                mask = K.expand_dims(mask, 2)
            if mode == 'mul':
                return inputs * mask
            if mode == 'add':
                return inputs - (1 - mask) * 1e12

    def call(self, x):
        # 如果只传入Q_seq,K_seq,V_seq，那么就不做Mask
        # 如果同时传入Q_seq,K_seq,V_seq,Q_len,V_len，那么对多余部分做Mask
        if len(x) == 3:
            Q_seq, K_seq, V_seq = x
            Q_len, V_len = None, None
        elif len(x) == 5:
            Q_seq, K_seq, V_seq, Q_len, V_len = x
        # 对Q、K、V做线性变换
        Q_seq = K.dot(Q_seq, self.WQ)
        Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
        Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))
        K_seq = K.dot(K_seq, self.WK)
        K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
        K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))
        V_seq = K.dot(V_seq, self.WV)
        V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
        V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))
        # 计算内积，然后mask，然后softmax
        print(Q_seq.shape, K_seq.shape, K.batch_dot(Q_seq, K_seq, axes=[3, 3]).shape)
        A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.size_per_head ** 0.5
        A = K.permute_dimensions(A, (0, 3, 2, 1))
        A = self.Mask(A, V_len, 'add')
        A = K.permute_dimensions(A, (0, 3, 2, 1))
        A = K.softmax(A)
        # 输出并mask
        O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
        O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
        O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
        O_seq = self.Mask(O_seq, Q_len, 'mul')
        return O_seq

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.output_dim)

In [57]:
# __import__('0830.attention')
from keras.models import Model
from keras.layers import *

S_inputs = Input(shape=(None,), dtype='int32')
embeddings = Embedding(len(tokenizer.word_index), 128)(S_inputs)
embeddings = Position_Embedding()(embeddings)  # 增加Position_Embedding能轻微提高准确率
O_seq = Attention(10, 20)([embeddings, embeddings, embeddings]) # 給 Q, K, V
O_seq = GlobalAveragePooling1D()(O_seq)
O_seq = Dropout(0.5)(O_seq)
O_seq = Dense(units=50, activation='relu')(O_seq)
outputs = Dense(5, activation='softmax')(O_seq)

model = Model(inputs=S_inputs, outputs=outputs)
model.summary()

(?, 10, ?, 20) (?, 10, ?, 20) (?, 10, ?, 10, ?)


ValueError: Dimension must be 5 but is 4 for 'attention_24/transpose_8' (op: 'Transpose') with input shapes: [?,10,?,10,?], [4].

In [47]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_history_attention = model.fit(X_train, y_train, batch_size=30, epochs=10, verbose=2, validation_split=0.2)

W0910 16:48:27.726542  4976 deprecation_wrapper.py:119] From c:\users\wade\.virtualenvs\19'_summer_vacation-ib8vnh7u\lib\site-packages\keras\optimizers.py:793: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0910 16:48:27.741125  4976 deprecation_wrapper.py:119] From c:\users\wade\.virtualenvs\19'_summer_vacation-ib8vnh7u\lib\site-packages\keras\backend\tensorflow_backend.py:3622: The name tf.log is deprecated. Please use tf.math.log instead.

W0910 16:48:27.787633  4976 deprecation.py:323] From c:\users\wade\.virtualenvs\19'_summer_vacation-ib8vnh7u\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 12988 samples, validate on 3247 samples
Epoch 1/10


InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument: In[0] mismatch In[1] shape: 10 vs. 20: [30,4000,10] [30,20,200] 0 0
	 [[{{node attention_19/MatMul_4}}]]
	 [[loss/mul/_253]]
  (1) Invalid argument: In[0] mismatch In[1] shape: 10 vs. 20: [30,4000,10] [30,20,200] 0 0
	 [[{{node attention_19/MatMul_4}}]]
0 successful operations.
0 derived errors ignored.