## 利用CNN對中文文本進行正面與負面語意的分類
CNN只是一種提取特徵的工具，只要讓資料整理成類似於影像的格式，照樣可以丟進CNN<br />
而最後在測試集上的準確率可以達到90%以上!!

In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import math

### 讀取數據

In [2]:
def _read_stripped_lines(path):
    """Return every line of the UTF-8 text file at *path*, whitespace-stripped.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as the lines have been read, instead of lingering until garbage collection.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]


# One example per line; the two files hold the positive and the negative
# examples respectively.
positive_examples = _read_stripped_lines('chinese/rt-polarity.pos')
negative_examples = _read_stripped_lines('chinese/rt-polarity.neg')

### 數據預處理

In [3]:
# Tokenise: each example line is split on single spaces into a list of tokens.
# Positive examples come first, negative examples second.
sentences = [text.split(' ') for text in positive_examples + negative_examples]

# One-hot labels in the same order as `sentences`:
# [0, 1] marks a positive example, [1, 0] a negative one.
positive_labels = [[0, 1] for _ in range(len(positive_examples))]
negative_labels = [[1, 0] for _ in range(len(negative_examples))]
labels = np.concatenate((positive_labels, negative_labels), axis=0)

# Force every sentence to exactly `sequence_length` tokens.
sequence_length = 50
sentences_padding = []
for tokens in sentences:
    missing = sequence_length - len(tokens)
    if missing > 0:
        # Too short: append 'PAD' tokens up to sequence_length. The list is
        # extended in place, so the matching entry of `sentences` is padded too.
        tokens.extend(['PAD'] * missing)
        sentences_padding.append(tokens)
    else:
        # Long enough: keep only the first sequence_length tokens (a copy).
        sentences_padding.append(tokens[:sequence_length])

In [4]:
# Count how often every token (including 'PAD') occurs in the padded sentences.
token_frequency = {}
for tokens in sentences_padding:
    for token in tokens:
        token_frequency[token] = token_frequency.get(token, 0) + 1

# word_counts: list of (token, count) pairs, most frequent first.
word_counts = sorted(token_frequency.items(),
                     key=lambda pair: pair[1],
                     reverse=True)

# Vocabulary = the most frequent tokens. The slice bound below currently keeps
# every token; replace it with a smaller number to truncate the vocabulary.
words = [token for token, _ in word_counts]
words = words[:len(words) + 1]

# Map each vocabulary token to an integer ID (its rank in `words`).
word_to_int = {token: token_id for token_id, token in enumerate(words)}

# Encode every padded sentence as the list of its token IDs.
x = np.array([[word_to_int[token] for token in tokens]
              for tokens in sentences_padding])
y = np.array(labels)

# Hold out 10% of the examples as the test set (fixed seed for repeatability).
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.1, random_state=42)

### 建立神經網路

In [5]:
# Hyperparameters
# input_size: number of token IDs per example (sentences were padded/truncated
# to sequence_length).
input_size = sequence_length
num_classes = 2 # there are 2 label classes in total
batch_size = 30
# Floor division: a trailing partial batch is not counted here.
num_batch = len(train_x) // batch_size
vocabulary_size = len(word_to_int)
embedding_size = 256
num_filters = 512

# Graph inputs: X receives batches of token IDs, Y the one-hot labels, and
# dropout_keep_prob the keep probability (fed as 0.9 for training and 1 for
# evaluation by the training cell).
X = tf.placeholder(tf.int32, [None , input_size])
Y = tf.placeholder(tf.float32, [None , num_classes])
dropout_keep_prob = tf.placeholder(tf.float32)

# Embedding layer
# NOTE(review): vocabulary_size was already assigned above with the same
# expression; this re-assignment is redundant.
vocabulary_size = len(word_to_int)
with tf.variable_scope('embedding'):
    initializer = tf.contrib.layers.xavier_initializer()
    # Trainable lookup table with one embedding_size-wide row per vocabulary entry.
    W = tf.Variable(initializer([vocabulary_size , embedding_size]) , name = 'embedding')
    embedded_chars = tf.nn.embedding_lookup(W , X)
    # Append one trailing axis so the tensor resembles a single-channel image
    # and can be fed to conv2d.
    embedded_chars_expanded = tf.expand_dims(embedded_chars , 3)

![](工作流程.png)

依據考慮上下文的長度的不同，可以用不同長度的filter<br />
但最後的output的維度會不同，此時可以透過max pooling做調整，將輸出統一成同一個維度

<img src="cnn_filter.png" style="width:750px;height:400px;display:block;margin:0 auto">

In [6]:
print('step_0 → 輸入CNN之前的維度 : {}\n'.format(embedded_chars_expanded.shape))


def _conv_maxpool_branch(branch_no, filter_size):
    """Build one convolution + ReLU + max-over-time pooling branch.

    Reads the module-level ``embedded_chars_expanded``, ``embedding_size``,
    ``num_filters`` and ``input_size`` at call time.

    Args:
        branch_no: 1-based branch number, used only in the progress messages.
        filter_size: number of consecutive tokens each filter spans; it also
            names the variable scope (``conv_layer_<filter_size>``).

    Returns:
        The tuple ``(weight, bias, conv, activated, pooled)`` of tensors
        created for this branch.
    """
    with tf.variable_scope('conv_layer_{}'.format(filter_size)):
        initializer = tf.contrib.layers.xavier_initializer()
        # Each filter spans `filter_size` tokens and the full embedding width.
        filter_shape = [filter_size, embedding_size, 1, num_filters]
        weight = tf.Variable(initializer(filter_shape), name='weight')
        bias = tf.Variable(tf.constant(0.0001, shape=[num_filters]), name='bias')
        conv = tf.nn.conv2d(embedded_chars_expanded,
                            weight,
                            strides=[1, 1, 1, 1],
                            padding='VALID')
        activated = tf.nn.relu(conv + bias)
        print('step_{0}-1 → 以第{0}種filtert處理之後的維度 : {1}\n'.format(
            branch_no, activated.shape))

        # The pooling window covers all input_size - filter_size + 1 positions
        # left by the VALID convolution, so one value per filter remains.
        pooled = tf.nn.max_pool(activated,
                                ksize=[1, input_size - filter_size + 1, 1, 1],
                                strides=[1, 1, 1, 1],
                                padding='VALID')
        print('step_{0}-2 → 以第{0}次pooling之後的維度 : {1}\n'.format(
            branch_no, pooled.shape))
    return weight, bias, conv, activated, pooled


# Convolution + max-pool layers: three filter sizes, one branch each. The
# branches used to be three copy-pasted blocks; they are now built by the
# helper above, in the same order and with the same scope and variable names.
filter_sizes = [3, 4, 5]
W_conv1, b_conv1, conv_1, h_1, pooled_1 = _conv_maxpool_branch(1, filter_sizes[0])
W_conv2, b_conv2, conv_2, h_2, pooled_2 = _conv_maxpool_branch(2, filter_sizes[1])
W_conv3, b_conv3, conv_3, h_3, pooled_3 = _conv_maxpool_branch(3, filter_sizes[2])
pooled_outputs = [pooled_1, pooled_2, pooled_3]

# Concatenate the pooled branches along the channel axis, then flatten to
# one feature vector of num_filters_total values per example.
num_filters_total = num_filters * len(filter_sizes)
h_pool = tf.concat(pooled_outputs, axis=3)
print('step_4 → 將(pooled_1 , pooled_2 , pooled_3)拼接之後的維度 : {}'.format(h_pool.shape))
h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

# Dropout; the keep probability is supplied through the placeholder.
with tf.variable_scope('dropout'):
    h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

# Output layer: affine projection to num_classes followed by softmax, so
# `output` holds class probabilities rather than logits.
with tf.variable_scope('output'):
    # Unlike the other variables, this one is created with tf.get_variable.
    W_flat = tf.get_variable(shape=[num_filters_total, num_classes],
                             initializer=tf.contrib.layers.xavier_initializer(),
                             name='weight')
    b_flat = tf.Variable(tf.constant(0.1, shape=[num_classes]), name='bias')
    output = tf.nn.xw_plus_b(h_drop, W_flat, b_flat)
    output = tf.nn.softmax(output)

step_0 → 輸入CNN之前的維度 : (?, 50, 256, 1)

step_1-1 → 以第1種filtert處理之後的維度 : (?, 48, 1, 512)

step_1-2 → 以第1次pooling之後的維度 : (?, 1, 1, 512)

step_2-1 → 以第2種filtert處理之後的維度 : (?, 47, 1, 512)

step_2-2 → 以第2次pooling之後的維度 : (?, 1, 1, 512)

step_3-1 → 以第3種filtert處理之後的維度 : (?, 46, 1, 512)

step_3-2 → 以第3次pooling之後的維度 : (?, 1, 1, 512)

step_4 → 將(pooled_1 , pooled_2 , pooled_3)拼接之後的維度 : (?, 1, 1, 1536)


In [7]:
# Accuracy: threshold the softmax output at 0.5 to get a 0/1 vector and compare
# it element-wise with the one-hot label. An example counts as correct only if
# every position matches (the per-row minimum of the match flags is 1).
predicted_flags = tf.cast(tf.greater_equal(output, 0.5), tf.int32)
expected_flags = tf.cast(Y, tf.int32)
correct = tf.equal(predicted_flags, expected_flags)
per_example_hit = tf.reduce_min(tf.cast(correct, tf.float32), axis=1)
accuracy = tf.reduce_mean(per_example_hit)

# Loss: cross-entropy computed by hand from the softmax probabilities. The
# 1e-9 offset keeps log() away from zero. Averaged over the batch.
log_probabilities = tf.log(output + 1e-9)
cross_entropy_temp = -tf.reduce_sum(Y * log_probabilities, axis=1)
cross_entropy = tf.reduce_mean(cross_entropy_temp)

# Adam with learning rate 1e-4; gradients are computed and applied in two
# explicit steps.
optimizer = tf.train.AdamOptimizer(1e-4)
grads_and_vars = optimizer.compute_gradients(cross_entropy)
train_op = optimizer.apply_gradients(grads_and_vars)

# Saver over all global variables, then a session with every variable initialised.
saver = tf.train.Saver(tf.global_variables())
sess = tf.Session()
sess.run(tf.global_variables_initializer())

### 開始訓練神經網路

In [8]:
# Mini-batch row indices.
# `step` rounds the training-set size up to a whole number of batches. Row
# numbers past the end wrap around to the start (modulo), so the last batch is
# topped up with the first rows instead of being short. `index` is a list with
# one array of batch_size row numbers per batch.
epochs = 4
step = math.ceil(len(train_x) / batch_size) * batch_size
index = list((np.arange(step) % len(train_x)).reshape(-1, batch_size))

for epoch_i in range(epochs):
    # Walk over every prepared batch. The loop used to stop after
    # len(train_x) // batch_size batches, which skipped the wrapped tail batch
    # built above, so the trailing training rows were never used.
    for batch_i, batch_rows in enumerate(index):
        batch_x = train_x[batch_rows, :]
        batch_y = train_y[batch_rows, :]

        # Training step with dropout active (keep probability 0.9).
        feed_dict = {X: batch_x, Y: batch_y, dropout_keep_prob: 0.9}
        _, train_loss, train_acc = sess.run([train_op, cross_entropy, accuracy], feed_dict)

        # Progress report every 700 batches.
        if batch_i % 700 == 0:
            print('=' * 30)
            print('epoch_i : {}'.format(epoch_i))
            print('batch_i : {}'.format(batch_i))
            print('train_loss : {:.2f}'.format(train_loss))
            print('train_accuracy : {:.2%}\n'.format(train_acc))

    # After each epoch, evaluate on the whole test set in one pass with
    # dropout disabled (keep probability 1).
    feed_dict = {X: test_x, Y: test_y, dropout_keep_prob: 1}
    test_loss, test_acc = sess.run([cross_entropy, accuracy], feed_dict)

    print('*' * 30)
    print('epoch_i : {}'.format(epoch_i))
    print('test_loss : {:.2f}'.format(test_loss))
    print('test_accuracy : {:.2%}'.format(test_acc))
    print('*' * 30, '\n')

epoch_i : 0
batch_i : 0
train_loss : 0.69
train_accuracy : 36.67%

epoch_i : 0
batch_i : 700
train_loss : 0.34
train_accuracy : 86.67%

epoch_i : 0
batch_i : 1400
train_loss : 0.38
train_accuracy : 90.00%

******************************
epoch_i : 0
test_loss : 0.28
test_accuracy : 89.21%
****************************** 

epoch_i : 1
batch_i : 0
train_loss : 0.14
train_accuracy : 100.00%

epoch_i : 1
batch_i : 700
train_loss : 0.19
train_accuracy : 96.67%

epoch_i : 1
batch_i : 1400
train_loss : 0.37
train_accuracy : 90.00%

******************************
epoch_i : 1
test_loss : 0.26
test_accuracy : 90.86%
****************************** 

epoch_i : 2
batch_i : 0
train_loss : 0.09
train_accuracy : 100.00%

epoch_i : 2
batch_i : 700
train_loss : 0.08
train_accuracy : 96.67%

epoch_i : 2
batch_i : 1400
train_loss : 0.29
train_accuracy : 93.33%

******************************
epoch_i : 2
test_loss : 0.28
test_accuracy : 90.91%
****************************** 

epoch_i : 3
batch_i : 0
train_lo