# TF2 기반 Bi-LSTM 모델로 Sentiment Classification

In [1]:
import os
import sys
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, optimizers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
from tensorflow import feature_column as fc
import tensorflow_datasets as tfds
plt.rcParams["font.family"] = 'NanumBarunGothic'
TENSORBOARD_BINARY = '/home/hoondori/anaconda3/envs/ai/bin/tensorboard'
os.environ['TENSORBOARD_BINARY'] =  TENSORBOARD_BINARY
%load_ext tensorboard

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # 텐서플로가 첫 번째 GPU만 사용하도록 제한
    # 프로그램 시작시에 메모리 증가가 설정되어야만 합니다
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        # 프로그램 시작시에 접근 가능한 장치가 설정되어야만 합니다
        print(e)

In [2]:
import numpy as np
import os
import shutil
import tensorflow as tf

from sklearn.metrics import accuracy_score, confusion_matrix


# 데이터 준비

In [72]:
def download_and_read(url):
    local_file = url.split('/')[-1]
    local_file = local_file.replace("%20", " ")
    p = tf.keras.utils.get_file(local_file, url, extract=True)   # ~/.keras/datasets
    local_folder = os.path.join("datasets", local_file.split('.')[0])
    labeled_sentences = []
    for labeled_filename in os.listdir(local_folder):
        if labeled_filename.endswith("_labelled.txt"):
            with open(os.path.join(local_folder, labeled_filename), "r") as f:
                for line in f:
                    sentence, label = line.strip().split('\t')
                    labeled_sentences.append((sentence, label))
    return labeled_sentences

# download and read data into data structures
labeled_sentences = download_and_read(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip")
sentences = [s for (s, l) in labeled_sentences]
labels = [int(l) for (s, l) in labeled_sentences]
for i in range(5):
    print(f'[{labels[i]}] {sentences[i]}')


[0] A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  
[0] Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  
[0] Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  
[0] Very little music or anything to speak of.  
[1] The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  


In [77]:
# tokenizer로 integer list로 변환 후 tf.dataset 준비

MAX_SEQLEN = 64

tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_counts)
print("vocabulary size: {:d}".format(vocab_size))

word2idx = tokenizer.word_index
idx2word = {v:k for (k, v) in word2idx.items()}
idx2word[0] = 'PAD'

# create dataset
sentences_as_ints = tokenizer.texts_to_sequences(sentences)
sentences_as_ints = tf.keras.preprocessing.sequence.pad_sequences(sentences_as_ints, maxlen=MAX_SEQLEN)
labels = np.array(labels)
dataset = tf.data.Dataset.from_tensor_slices((sentences_as_ints, labels))
for sentence, label in dataset.take(1):
    print(sentence)
    print(label)

vocabulary size: 5271
tf.Tensor(
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    4   17   17   17  230  746
 2267   24   54    4 2268 2269  878  324], shape=(64,), dtype=int32)
tf.Tensor(0, shape=(), dtype=int64)


In [80]:
# train/valid/test 분리

dataset = dataset.shuffle(10000)
test_size = len(sentences) // 3
val_size = (len(sentences) - test_size) // 10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)

BATCH_SIZE = 32
train_dataset = train_dataset.batch(BATCH_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)
train_dataset

<BatchDataset shapes: ((None, 64), (None,)), types: (tf.int32, tf.int64)>

# 모델 

In [83]:
class SentimentAnalysisModel(tf.keras.Model):
    
    def __init__(self, vocab_size, emb_sz, **kwargs):
        super(SentimentAnalysisModel, self).__init__(**kwargs)
        self.embedding = tf.keras.layers.Embedding(vocab_size, emb_sz)
        
        self.bilstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(emb_sz)
        )
        
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(1, activation='sigmoid')
        
    def call(self, x):
        x = self.embedding(x)
        x = self.bilstm(x)
        x = self.dense1(x)
        x = self.dense2(x)
        return x
    
    
EMBED_SZ = 64
model = SentimentAnalysisModel(vocab_size+1, EMBED_SZ) # vocab_size + 1 to account for PAD character
model.build(input_shape=(BATCH_SIZE, MAX_SEQLEN))
model.summary()        

Model: "sentiment_analysis_model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      multiple                  337408    
_________________________________________________________________
bidirectional_6 (Bidirection multiple                  66048     
_________________________________________________________________
dense_12 (Dense)             multiple                  8256      
_________________________________________________________________
dense_13 (Dense)             multiple                  65        
Total params: 411,777
Trainable params: 411,777
Non-trainable params: 0
_________________________________________________________________


# 학습

In [25]:
# compile
model.compile(
    loss="binary_crossentropy",
    optimizer="adam", 
    metrics=["accuracy"]
)

data_dir = '/tmp/logs/BiLSTM'

# train

# 개선된 epoch 마다 모델 저장
checkpoint_path = data_dir + "/cp-{epoch:04d}.ckpt"
checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
    save_weights_only=True,verbose=1, save_freq='epoch', save_best_only=True)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=data_dir)
num_epochs = 10
history = model.fit(train_dataset, epochs=num_epochs, 
    validation_data=val_dataset,
    callbacks=[checkpoint, tensorboard])

Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.62423, saving model to /tmp/logs/BiLSTM/cp-0001.ckpt
Epoch 2/10
Epoch 00002: val_loss improved from 0.62423 to 0.27867, saving model to /tmp/logs/BiLSTM/cp-0002.ckpt
Epoch 3/10
Epoch 00003: val_loss improved from 0.27867 to 0.18607, saving model to /tmp/logs/BiLSTM/cp-0003.ckpt
Epoch 4/10
Epoch 00004: val_loss improved from 0.18607 to 0.11631, saving model to /tmp/logs/BiLSTM/cp-0004.ckpt
Epoch 5/10
Epoch 00005: val_loss improved from 0.11631 to 0.08209, saving model to /tmp/logs/BiLSTM/cp-0005.ckpt
Epoch 6/10
Epoch 00006: val_loss improved from 0.08209 to 0.07620, saving model to /tmp/logs/BiLSTM/cp-0006.ckpt
Epoch 7/10
Epoch 00007: val_loss improved from 0.07620 to 0.02619, saving model to /tmp/logs/BiLSTM/cp-0007.ckpt
Epoch 8/10
Epoch 00008: val_loss did not improve from 0.02619
Epoch 9/10
Epoch 00009: val_loss did not improve from 0.02619
Epoch 10/10
Epoch 00010: val_loss improved from 0.02619 to 0.00785, saving model to /tmp/

In [27]:
best_model_path = tf.train.latest_checkpoint(os.path.dirname(checkpoint_path))
best_model_path

'/tmp/logs/BiLSTM/cp-0010.ckpt'

# 테스트 집합에 대한 성능 평가

In [89]:
best_model = SentimentAnalysisModel(vocab_size+1, EMBED_SZ) # vocab_size + 1 to account for PAD character
best_model.build(input_shape=(BATCH_SIZE, MAX_SEQLEN))
best_model.load_weights(best_model_path)
best_model.compile(loss="binary_crossentropy",optimizer="adam", metrics=["accuracy"])

test_loss, test_acc = best_model.evaluate(test_dataset)
print('test loss : {:.3f}, test accuracy {:.3f}'.format(test_loss, test_acc))

test loss : 0.030, test accuracy 0.993


In [101]:
total_labels, predictions = [], []
is_first_batch = True
for test_batch in test_dataset:
    features_b, labels_b = test_batch
    predicts_b = best_model.predict(features_b)
    predictions.extend( [1 if pred_pp > 0.5 else 0 for pred_pp in predicts_b] )
    total_labels.extend([l for l in labels_b.numpy()])
    if is_first_batch:
        print('LBL\tPRD:\tSENT')
        for rid in range(features_b.shape[0]):
            words = [idx2word[idx] for idx in features_b[rid].numpy()]
            words = [w for w in words if w != "PAD"]
            sentence = ' '.join(words)
            print('{:d}\t{:d}\t{:s}'.format(total_labels[rid], predictions[rid], sentence))
        is_first_batch = False

print("accuracy score: {:.3f}".format(accuracy_score(total_labels, predictions)))
print("confusion matrix")
print(confusion_matrix(total_labels, predictions))

LBL	PRD:	SENT
0	0	this is the first phone i've had that has been so cheaply made
0	0	the phone takes forever to charge like 2 to 5 hours literally
0	0	was not happy
1	1	great brunch spot
1	1	there was a warm feeling with the service and i felt like their guest for a special treat
0	0	overall this movie was cheap trash
1	1	the food is good
1	1	the rest of the cast also play well
0	0	after i pulled up my car i waited for another 15 minutes before being acknowledged
1	1	go to place for gyros
0	0	will never ever go back
1	1	my boyfriend and i came here for the first time on a recent trip to vegas and could not have been more pleased with the quality of food and service
1	1	this place is amazing
1	1	the first time i ever came here i had an amazing experience i still tell people how awesome the duck was
0	0	today the graphics are crap
1	1	love this product
0	0	painful on the ear
0	0	so i am here to warn you do not rent this movie it is the dumbest thing you have never seen
0	0	i was not impr