<br><font color = "#CC3D3D">
# Module 8 - Sentiment Analysis


In [1]:
from keras.layers.core import Activation, Dense, Dropout, SpatialDropout1D
from keras.layers import Input, LSTM, Dense, Embedding
from keras.models import Sequential, Model
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard
from keras.utils import plot_model
import keras.backend as K

from sklearn.model_selection import train_test_split
from IPython.display import Image

import tensorflow as tf 

import collections
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import time

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
import os, shutil
def set_gpu_fraction(gpu_fraction=0, visible_devices='0'):
    """
    GPU 사용유무와 사용량을 설정하는 함수

    Args:
        gpu_fraction: 만일  gpu_fraction <= 0 경우 GPU 메모리를 필요한 만큼 증가 시키면서 할당한다.
        visible_devices: DEVICE ID 문자열 입력. GPU를 두장 사용할 경우 '0,1'과 같이 입력한다. 만일 CPU 모드로 동작시키려면 '-1'할>당

    Returns:
        Tensorflow Session 객체
    """
    import tensorflow as tf

    if visible_devices == '-1':
        print('force to use CPU!')
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        config = tf.ConfigProto()
    else:
        print('force to use GPU!')
        os.environ['CUDA_VISIBLE_DEVICES'] = visible_devices
        config = tf.ConfigProto()
        if gpu_fraction <= 0 :
            print('allocate dynamic GPU memory')
            config.gpu_options.allow_growth = True
        elif gpu_fraction  <= 0.99:
            print('allocate {}% GPU memory'.format(gpu_fraction*100))
            config.gpu_options.per_process_gpu_memory_fraction = gpu_fraction
        else:
            print('can not allocate GPU memory!')
            assert False


    return(tf.Session(config=config))



In [3]:
!pip install nltk --user

[33mYou are using pip version 9.0.2, however version 10.0.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## Data Preparation

In [4]:
DATA_DIR = "data"

MAX_FEATURES = 2000   # vocabulary size
MAX_SENTENCE_LENGTH = 40  

EMBEDDING_SIZE = 128
HIDDEN_LAYER_SIZE = 64
BATCH_SIZE = 512
NUM_EPOCHS = 100

#### Read training data and generate vocabulary


In [5]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/kookmin/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
maxlen = 0
word_freqs = collections.Counter()
num_recs = 0
#ftrain = open(os.path.join(DATA_DIR, "training2.txt"), 'rb')
ftrain = open(os.path.join(DATA_DIR, "umich-sentiment-train.txt"), 'rb')
for line in ftrain:
    label, sentence = line.decode('utf8').strip().split("\t")
    words = nltk.word_tokenize(sentence.lower())
    if len(words) > maxlen:
        maxlen = len(words)  # the maximum number of words in a sentence
    for word in words:
        word_freqs[word] += 1  # frequency for each word
    num_recs += 1 # total number of records
ftrain.close()

- collections.Counter: dict의 subclass로 요소들의 개수 저장

In [7]:
word_freqs['awesome']

1028

In [8]:
maxlen, len(word_freqs)

(42, 2328)

#### Mapping word to index / index to word

In [9]:
# 1 is UNK, 0 is PAD
vocab_size = min(MAX_FEATURES, len(word_freqs)) + 2
word2index = {x[0]: i+2 for i, x in enumerate(word_freqs.most_common(MAX_FEATURES))}
word2index["PAD"] = 0   # for sentences shorter than MAX_SENTENCE_LENGTH
word2index["UNK"] = 1   # for words not in the vocabulary
index2word = {v:k for k, v in word2index.items()}

#### Convert sentences to sequences

In [10]:
X = np.empty((num_recs, ), dtype=list)
y = np.zeros((num_recs, ))
i = 0
ftrain = open(os.path.join(DATA_DIR, "umich-sentiment-train.txt"), 'rb')

for line in ftrain:
    label, sentence = line.decode('utf8').strip().split("\t")
    words = nltk.word_tokenize(sentence.lower())
    seqs = []
    for word in words:
        if word in word2index:
            seqs.append(word2index[word])
        else:
            seqs.append(word2index["UNK"])
    X[i] = seqs
    y[i] = int(label)
    i += 1
ftrain.close()

In [11]:
X[0:5], y[0:5]

(array([list([5, 10, 9, 12, 101, 17, 48, 22, 4]),
        list([67, 19, 5, 115, 969, 970, 2, 358, 136, 110, 3, 44, 317, 319, 23, 971, 3, 6, 10, 9, 12, 137, 118, 972, 341, 67, 4]),
        list([2, 122, 5, 10, 9, 12, 18, 325, 4]),
        list([2, 122, 5, 10, 9, 12, 18, 325, 4]),
        list([2, 122, 5, 10, 9, 12, 44, 24, 973, 313, 66, 974, 25, 648, 24, 28, 485, 4])],
       dtype=object), array([1., 1., 1., 1., 1.]))

#### Pad the sequences (left padded with zeros)


In [12]:
X = sequence.pad_sequences(X, maxlen=MAX_SENTENCE_LENGTH)

#### Split input into training and test


In [13]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=0)
print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)

(5668, 40) (1418, 40) (5668,) (1418,)


#### Define performance measures

In [14]:
# AUC for a binary classifier
def auc(y_true, y_pred):
    ptas = tf.stack([binary_PTA(y_true, y_pred, k) for k in np.linspace(0,1,1000)], axis=0)
    pfas = tf.stack([binary_PFA(y_true, y_pred, k) for k in np.linspace(0,1,1000)], axis=0)
    pfas = tf.concat([tf.ones((1,)), pfas], axis=0)
    binSizes = -(pfas[1:]-pfas[:-1])
    s=ptas*binSizes
    return K.sum(s, axis=0)

# PFA, prob False alert for binary classifier
def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
    y_pred = K.cast(y_pred>=threshold, 'float32')
    # N = total number of negative labels
    N=K.sum(1-y_true) 
    # FP = total number of false alerts, alerts from the negative class labels
    FP=K.sum(y_pred-y_pred*y_true)
    return FP/N

# PTA, prob True alert for binary classifier
def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
    y_pred = K.cast(y_pred>=threshold, 'float32')
    # P = total number of positive labels
    P = K.sum(y_true)
    # TP = total number of correct alerts, alerts from the positive class labels
    TP = K.sum(y_pred*y_true)
    return TP/P

## Model building

In [15]:
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_SIZE, 
                    input_length=MAX_SENTENCE_LENGTH, mask_zero = True))
model.add(Dropout(0.2))
model.add(LSTM(HIDDEN_LAYER_SIZE, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(HIDDEN_LAYER_SIZE, recurrent_dropout=0.2, return_sequences=True))

model.add(LSTM(HIDDEN_LAYER_SIZE, recurrent_dropout=0.2))
model.add(Dense(1))
model.add(Activation("sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy", auc])


In [16]:
# ## using functional API
# from keras.layers import Input

# input=Input(shape=(MAX_SENTENCE_LENGTH,))
# embed=Embedding(vocab_size, EMBEDDING_SIZE, input_length=MAX_SENTENCE_LENGTH, mask_zero=True)(input)
# dropout=Dropout(0.2)(embed)
# lstm=LSTM(HIDDEN_LAYER_SIZE, recurrent_dropout=0.2)(dropout)
# dense=Dense(1)(lstm)
# out=Activation('sigmoid')(dense)
# model=Model(inputs=input, outputs=out)

# plot_model(model, show_shapes=True)
# Image('model.png')

In [17]:
set_gpu_fraction(0.5, '0')

force to use GPU!
allocate 50.0% GPU memory


<tensorflow.python.client.session.Session at 0x7f8d6fbc2f98>

In [18]:
now = time.strftime("%c")
tensorboard = TensorBoard(log_dir='./logs/sentiment_analysis/'+now, histogram_freq=1, write_graph=True)


model.fit(Xtrain, ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, validation_data=(Xtest, ytest))

Train on 5668 samples, validate on 1418 samples
Epoch 1/100


InternalError: Blas GEMM launch failed : a.shape=(512, 128), b.shape=(128, 64), m=512, n=64, k=128
	 [[Node: lstm_1/while/MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](lstm_1/while/TensorArrayReadV3, lstm_1/while/MatMul/Enter)]]
	 [[Node: lstm_1/while/add_5/_473 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_21051_lstm_1/while/add_5", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](^_clooplstm_1/while/Const_4/_123)]]

Caused by op 'lstm_1/while/MatMul', defined at:
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/opt/venv/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/opt/venv/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/opt/venv/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/opt/venv/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 112, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 421, in run_forever
    self._run_once()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 1426, in _run_once
    handle._run()
  File "/usr/lib/python3.6/asyncio/events.py", line 127, in _run
    self._callback(*self._args)
  File "/opt/venv/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 102, in _handle_events
    handler_func(fileobj, events)
  File "/opt/venv/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/opt/venv/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/opt/venv/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/opt/venv/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/opt/venv/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/opt/venv/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/opt/venv/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/opt/venv/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/opt/venv/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/opt/venv/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/opt/venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/opt/venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/opt/venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-96d6127358a9>", line 5, in <module>
    model.add(LSTM(HIDDEN_LAYER_SIZE, recurrent_dropout=0.2, return_sequences=True))
  File "/opt/venv/lib/python3.6/site-packages/keras/models.py", line 492, in add
    output_tensor = layer(self.outputs[0])
  File "/opt/venv/lib/python3.6/site-packages/keras/layers/recurrent.py", line 499, in __call__
    return super(RNN, self).__call__(inputs, **kwargs)
  File "/opt/venv/lib/python3.6/site-packages/keras/engine/topology.py", line 619, in __call__
    output = self.call(inputs, **kwargs)
  File "/opt/venv/lib/python3.6/site-packages/keras/layers/recurrent.py", line 2151, in call
    initial_state=initial_state)
  File "/opt/venv/lib/python3.6/site-packages/keras/layers/recurrent.py", line 608, in call
    input_length=timesteps)
  File "/opt/venv/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2767, in rnn
    swap_memory=True)
  File "/opt/venv/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2816, in while_loop
    result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants)
  File "/opt/venv/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2640, in BuildLoop
    pred, body, original_loop_vars, loop_vars, shape_invariants)
  File "/opt/venv/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2590, in _BuildLoop
    body_result = body(*packed_vars_for_body)
  File "/opt/venv/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2726, in _step
    tuple(constants))
  File "/opt/venv/lib/python3.6/site-packages/keras/layers/recurrent.py", line 599, in step
    return self.cell.call(inputs, states, **kwargs)
  File "/opt/venv/lib/python3.6/site-packages/keras/layers/recurrent.py", line 1920, in call
    x_i = K.dot(inputs_i, self.kernel_i)
  File "/opt/venv/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 1075, in dot
    out = tf.matmul(x, y)
  File "/opt/venv/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 1891, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "/opt/venv/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 2437, in _mat_mul
    name=name)
  File "/opt/venv/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/opt/venv/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/opt/venv/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InternalError (see above for traceback): Blas GEMM launch failed : a.shape=(512, 128), b.shape=(128, 64), m=512, n=64, k=128
	 [[Node: lstm_1/while/MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](lstm_1/while/TensorArrayReadV3, lstm_1/while/MatMul/Enter)]]
	 [[Node: lstm_1/while/add_5/_473 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_21051_lstm_1/while/add_5", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](^_clooplstm_1/while/Const_4/_123)]]


#### Evaluate the model


In [None]:
loss_test, acc_test, auc_test = model.evaluate(Xtest, ytest, batch_size=BATCH_SIZE)
print("Test loss: %.3f, accuracy: %.3f, auc: %.3f" % (loss_test, acc_test, auc_test))

#### Prediction for test sentences

In [None]:
for i in range(5):
    idx = np.random.randint(len(Xtest))
    xtest = Xtest[idx].reshape(1,40)
    ylabel = ytest[idx]
    ypred = model.predict(xtest)[0][0]
    sent = " ".join([index2word[x] for x in xtest[0].tolist() if x != 0])
    print("%.0f\t%d\t%s" % (ypred, ylabel, sent))

## Extract intermediate layer outputs

In [None]:
embedding_layer_model = Model(inputs=model.input, outputs=model.get_layer('embedding_1').output)
embedding_output = embedding_layer_model.predict(Xtrain)

In [None]:
np.min(embedding_output)

## Visualize word embedding

#### Extract embedding weights

In [None]:
weights = model.layers[0].get_weights()[0]

In [None]:
weights.shape

#### Visualize embedding weights to 2-D using PCA

In [None]:
from sklearn.decomposition import PCA
pca=PCA(n_components=2)
weight_pc=pca.fit_transform(weights)

fig = plt.figure(figsize=(10,10))
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.scatter(weight_pc[:, 0], weight_pc[:, 1])

words = list(index2word.values())
for i, word in enumerate(words):
    plt.annotate(word, xy=(weight_pc[i, 0], weight_pc[i, 1]))
plt.show()


In [None]:
fig = plt.figure(figsize=(10,10))
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.scatter(weight_pc[:, 0], weight_pc[:, 1])

words = list(index2word.values())
for i, word in enumerate(words):
    plt.annotate(word, xy=(weight_pc[i, 0], weight_pc[i, 1]))
plt.show()
