# PTT Text Classification

In [1]:
import random
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from gensim.models import word2vec

%matplotlib inline

## Step 0. Loading dataset

### Step 0.1 Load article cutted and article df and define y

In [2]:
with open("data/article_cutted", "rb") as file:
    docs = pickle.load(file)

In [3]:
df = pd.read_csv('data/article_preprocessed.csv')

diff_threshold = 20
df = df[abs(df['push']-df['boo']) > diff_threshold].copy()
df['type'] = np.clip(df['push']-df['boo'], 0, 1)

### Step 0.2 Create word <-> id mappings and word vector

In [4]:
w2v = word2vec.Word2Vec.load('word2vec_model/CBOW')

In [5]:
word2id = { k:i for i, k in enumerate(w2v.wv.vocab.keys()) }
id2word = { i:k for k, i in word2id.items() }

In [6]:
words_len = len(word2id)

In [24]:
embedding = np.zeros((words_len+2, 256))
for k, v in word2id.items():
    embedding[v] = w2v.wv[k]

### Step 0.3 Transform sentence to sequence

In [8]:
input_length = 80
docs_id = []
for doc in docs:
    text = doc[:input_length]
    ids = [words_len]*input_length
    ids[:len(text)] = [ word2id[w] if w in word2id else words_len+1 for w in text ]
    
    docs_id.append(ids)

In [9]:
print(docs[0])
print(docs_id[0])

['韓瑜', '協志', '前妻', '正', '女演員', '周子', '瑜', 'TWICE', '團裡裡面', '台灣', '人', '正', '兩個', '要當', '鄉民', '老婆', '選', '五樓', '真', '勇氣']
[0, 1, 2, 3, 4, 5, 6, 7, 100035, 8, 9, 3, 10, 11, 12, 13, 14, 15, 16, 17, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034]


## Step 1. Data preprocessing

### Step 1.1 Create Training and Testing sets and create generator

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
train, test = train_test_split(df, test_size=0.2, shuffle=True, stratify=df['type'])

In [12]:
def train_data_generator(df, batch_size, docs_id):
    dfs = [ sub_df for key, sub_df in df.groupby('type')]
    df_n = len(dfs)
    
    docs_id = np.array(docs_id)
    while True:
        selected = pd.concat([ sub_df.sample(int(batch_size/2)) for sub_df in dfs ], axis=0)
        selected = selected.sample(frac=1)
        x = docs_id[selected['idx']]
        y = np.array(selected['type'].tolist()).reshape((batch_size, 1))
                    
        yield x, y
        
def test_data_generator(df, docs_id):
    docs_id = np.array(docs_id)
    x = docs_id[df['idx']]
    y = np.array(df['type'].tolist()).reshape((len(x), 1))

    return x, y

In [13]:
X_test, Y_test = test_data_generator(test, docs_id)

## Create RNN

In [14]:
epochs = 100
batch_size = 32
update_per_epochs = 100

In [15]:
def LSTM_cell(hidden_layer_size, batch_size, number_of_layers, dropout=True, dropout_rate=0.8):
    layer = tf.contrib.rnn.BasicLSTMCell(hidden_layer_size)
    
    if dropout:
        layer = tf.contrib.rnn.DropoutWrapper(layer, output_keep_prob=dropout_rate)
        
    cell = tf.contrib.rnn.MultiRNNCell([layer]*number_of_layers)
    
    init_state = cell.zero_state(batch_size, tf.float32)
    
    return cell, init_state

In [16]:
def output_layer(lstm_output, in_size, out_size):
    x = lstm_output[:, -1, :]
    print(x)
    
    weights = tf.Variable(tf.truncated_normal([in_size, out_size], stddev=0.05), name='output_layer_weights')
    bias = tf.Variable(tf.zeros([out_size]), name='output_layer_bias')
    
    output = tf.matmul(x, weights) + bias
    output = tf.nn.sigmoid(output)
    
    return output

In [17]:
def opt_loss(logits, targets, learning_rate, grad_clip_margin):
    loss = tf.losses.sigmoid_cross_entropy(targets, logits)
    
    # Cliping the gradient loss
    gradients = tf.gradients(loss, tf.trainable_variables())
    clipper_, _ = tf.clip_by_global_norm(gradients, grad_clip_margin)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_optimizer = optimizer.apply_gradients(zip(clipper_, tf.trainable_variables()))
    
    return loss, train_optimizer

In [26]:
class TextClassificationRNN(object):
    def __init__(self, learning_rate=0.001, hidden_layer_size=64, number_of_layers=1, dropout=True, 
                 dropout_rate=0.8, number_of_classes=1, gradient_clip_margin=4, input_length=input_length, wv=embedding):
    
        self.inputs = tf.placeholder(tf.int32, [None, input_length], name='input_data')
        self.targets = tf.placeholder(tf.float32, [None, 1], name='targets')
        self.bz = tf.placeholder(tf.int32, [], name='batch_size')
        
        ## embedding lookup table
        em_W = tf.Variable(wv.astype(np.float32), trainable=True)
        x = tf.nn.embedding_lookup(em_W, self.inputs)

        cell, init_state = LSTM_cell(hidden_layer_size, self.bz, number_of_layers, dropout, dropout_rate)

        outputs, states = tf.nn.dynamic_rnn(cell, x, initial_state=init_state)

        self.logits = output_layer(outputs, hidden_layer_size, number_of_classes)

        self.loss, self.opt = opt_loss(self.logits, self.targets, learning_rate, gradient_clip_margin)

In [27]:
tf.reset_default_graph()
model = TextClassificationRNN()

Tensor("strided_slice:0", shape=(?, 64), dtype=float32)


In [20]:
tf.global_variables()

[<tf.Variable 'Variable:0' shape=(100035, 256) dtype=float32_ref>,
 <tf.Variable 'rnn/multi_rnn_cell/cell_0/basic_lstm_cell/weights:0' shape=(320, 256) dtype=float32_ref>,
 <tf.Variable 'rnn/multi_rnn_cell/cell_0/basic_lstm_cell/biases:0' shape=(256,) dtype=float32_ref>,
 <tf.Variable 'output_layer_weights:0' shape=(64, 1) dtype=float32_ref>,
 <tf.Variable 'output_layer_bias:0' shape=(1,) dtype=float32_ref>,
 <tf.Variable 'beta1_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'beta2_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'Variable/Adam:0' shape=(100035, 256) dtype=float32_ref>,
 <tf.Variable 'Variable/Adam_1:0' shape=(100035, 256) dtype=float32_ref>,
 <tf.Variable 'rnn/multi_rnn_cell/cell_0/basic_lstm_cell/weights/Adam:0' shape=(320, 256) dtype=float32_ref>,
 <tf.Variable 'rnn/multi_rnn_cell/cell_0/basic_lstm_cell/weights/Adam_1:0' shape=(320, 256) dtype=float32_ref>,
 <tf.Variable 'rnn/multi_rnn_cell/cell_0/basic_lstm_cell/biases/Adam:0' shape=(256,) dtype=float32_ref

## Train the network

In [29]:
session =  tf.Session()

In [30]:
session.run(tf.global_variables_initializer())

In [31]:
from datetime import datetime
from sklearn.metrics import roc_auc_score

train_generate = train_data_generator(train, batch_size, docs_id)

train_loss = []
train_auc = []
test_loss = []
test_auc = []

for i in range(epochs):
    trained_scores = []
    epoch_loss = []
    for j in range(update_per_epochs):
        X_batch, y_batch = next(train_generate) 
        
        o, c, _ = session.run([model.logits, model.loss, model.opt], feed_dict={
            model.inputs: X_batch, 
            model.targets: y_batch,
            model.bz: np.array(batch_size)
        })
        
        epoch_loss.append(c)
        trained_scores.append(roc_auc_score(y_batch, o))
    
    to, tc = session.run([model.logits, model.loss], feed_dict={
        model.inputs: X_test, 
        model.targets: Y_test,
        model.bz:np.array(len(X_test))
    })
    
    train_loss.append(np.mean(epoch_loss))
    train_auc.append(np.mean(trained_scores))
    test_loss.append(tc)
    test_auc.append(roc_auc_score(Y_test, to))
    
    if (i % 5) == 0:
        print(str(datetime.now()),
              'Epoch {}/{}'.format(i, epochs), 
              'Train loss: {}'.format(np.mean(epoch_loss)), 
              'Train auc: {}'.format(np.mean(trained_scores)), 
              'Test loss: {}'.format(tc), 
              'Test auc: {}'.format(roc_auc_score(Y_test, to)))

2018-03-08 16:29:31.367544 Epoch 0/100 Train loss: 0.6992793083190918 Train auc: 0.531796875 Test loss: 0.6531446576118469 Test auc: 0.5295907051510311
2018-03-08 16:32:35.981090 Epoch 5/100 Train loss: 0.6408179998397827 Train auc: 0.7740234375 Test loss: 0.48110008239746094 Test auc: 0.6776090893367653
2018-03-08 16:35:53.274726 Epoch 10/100 Train loss: 0.6273868083953857 Train auc: 0.757890625 Test loss: 0.47387897968292236 Test auc: 0.6928253858440754
2018-03-08 16:39:19.684644 Epoch 15/100 Train loss: 0.6171111464500427 Train auc: 0.852265625 Test loss: 0.4186032712459564 Test auc: 0.6814764322267552
2018-03-08 16:42:41.228812 Epoch 20/100 Train loss: 0.6226093173027039 Train auc: 0.78921875 Test loss: 0.3884423077106476 Test auc: 0.6897465179924918
2018-03-08 16:46:02.989483 Epoch 25/100 Train loss: 0.5978072285652161 Train auc: 0.829140625 Test loss: 0.3968924582004547 Test auc: 0.6997207272283322
2018-03-08 16:49:09.750765 Epoch 30/100 Train loss: 0.6052713990211487 Train auc: 

In [35]:
# 顯示最後結果
print('Train loss: {}'.format(np.mean(epoch_loss)), 
      'Train auc: {}'.format(np.mean(trained_scores)), 
      'Test loss: {}'.format(tc), 
      'Test auc: {}'.format(roc_auc_score(Y_test, to)))

Train loss: 0.5530333518981934 Train auc: 0.8741015625 Test loss: 0.43621957302093506 Test auc: 0.6846856528064625
