# Sentiment Analysis (Classification) using BERT - Tensorflow

This time, we will perform sentiment analysis on the movie review data using Google's BERT model with plain TensorFlow 2.x.

For a detailed explanation of BERT and of how fine-tuning is used to tackle this problem, please refer to the previous notebook '02_Sentiment _Analysis_using BERT', which is implemented in Keras.

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import *
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

I0626 14:05:38.525708 13176 file_utils.py:39] PyTorch version 1.5.0 available.
I0626 14:05:38.527736 13176 file_utils.py:55] TensorFlow version 2.2.0 available.


# 1. Load the data
##### The data loading and preprocessing steps are the same as in the Keras version

In [2]:
os.listdir('./data/bert')

['bert_config.json',
 'bert_model.ckpt.data-00000-of-00001',
 'bert_model.ckpt.index',
 'bert_model.ckpt.meta',
 'vocab.txt']

In [3]:
path = os.path.abspath('./data')

In [4]:
# Load the train and test rating files from the data directory into DataFrames.
train = pd.read_table(os.path.join(path,"ratings_train.txt"))
test = pd.read_table(os.path.join(path,"ratings_test.txt"))

In [5]:
print(train.shape)
print(test.shape)

(150000, 3)
(50000, 3)


In [6]:
train[0:10]

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
5,5403919,막 걸음마 뗀 3세부터 초등학교 1학년생인 8살용영화.ㅋㅋㅋ...별반개도 아까움.,0
6,7797314,원작의 긴장감을 제대로 살려내지못했다.,0
7,9443947,별 반개도 아깝다 욕나온다 이응경 길용우 연기생활이몇년인지..정말 발로해도 그것보단...,0
8,7156791,액션이 없는데도 재미 있는 몇안되는 영화,1
9,5912145,왜케 평점이 낮은건데? 꽤 볼만한데.. 헐리우드식 화려함에만 너무 길들여져 있나?,1


# 2. Create a BERT input
##### With Hugging Face Transformers, you can easily load the 'bert-base-multilingual-cased' tokenizer

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

I0626 14:05:50.342952 13176 filelock.py:274] Lock 1314914939912 acquired on C:\Users\bokhy/.cache\torch\transformers\96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729.lock
I0626 14:05:50.346936 13176 file_utils.py:436] https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt not found in cache or force_download set to True, downloading to C:\Users\bokhy\.cache\torch\transformers\tmp0g_0we9u


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…

I0626 14:05:51.235238 13176 file_utils.py:440] storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt in cache at C:\Users\bokhy/.cache\torch\transformers\96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729
I0626 14:05:51.236236 13176 file_utils.py:443] creating metadata file for C:\Users\bokhy/.cache\torch\transformers\96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729
I0626 14:05:51.238231 13176 filelock.py:318] Lock 1314914939912 released on C:\Users\bokhy/.cache\torch\transformers\96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729.lock
I0626 14:05:51.239228 13176 tokenization_utils.py:1015] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt from cache at 




Before we get into BERT, here is a quick introduction to the tokenizer: 

tokenizer.encode => Converts the sentence into numerical token IDs (which can be used as input to BERT) 
tokenizer.tokenize => Splits the sentence into subword tokens

In [8]:
# Example sentence (Korean), roughly: 'a really fun movie'
# encode() returns the token ids; tokenize() returns the subword strings.
print(tokenizer.encode("너무 재미있는 영화"))
print(tokenizer.tokenize("너무 재미있는 영화"))

[101, 9004, 32537, 9659, 22458, 84177, 42428, 102]
['너', '##무', '재', '##미', '##있는', '영화']


So we are basically tokenizing all of the training data. 
BERT takes three inputs: Token, Segment, and Mask

Token: Indexed numbers

Segment: Numbers that indicate whether a token belongs to the first sentence or the second sentence

Mask: Marks which positions hold real tokens and which are padding. If a position is valid (a real token), it is filled with 1; if not (padding), it is filled with 0

In [9]:
# Same calls on a longer sentence: ids first, then the subword strings.
print(tokenizer.encode("너무 재미있는 영화였다. 꼭 한번 다시 보고싶은 영화"))
print(tokenizer.tokenize("너무 재미있는 영화였다. 꼭 한번 다시 보고싶은 영화"))
# With max_length and pad_to_max_length the id list is right-padded with 0 up to 128 entries.
print(tokenizer.encode("너무 재미있는 영화였다. 꼭 한번 다시 보고싶은 영화", max_length=128, pad_to_max_length=True))

[101, 9004, 32537, 9659, 22458, 84177, 42428, 34776, 119, 8955, 9954, 35465, 25805, 98199, 119088, 10892, 42428, 102]
['너', '##무', '재', '##미', '##있는', '영화', '##였다', '.', '꼭', '한', '##번', '다시', '보고', '##싶', '##은', '영화']
[101, 9004, 32537, 9659, 22458, 84177, 42428, 34776, 119, 8955, 9954, 35465, 25805, 98199, 119088, 10892, 42428, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [10]:
def convert_data(data_df):
    """Convert a labelled DataFrame into BERT input arrays and a target array.

    Each text in ``data_df[DATA_COLUMN]`` is encoded with the module-level
    ``tokenizer`` using ``max_length=SEQ_LEN`` and ``pad_to_max_length=True``.
    Rows are read positionally, so the frame's index does not need to be a
    default 0..n-1 range.

    Args:
        data_df: DataFrame containing the ``DATA_COLUMN`` (text) and
            ``LABEL_COLUMN`` (0/1 label) columns.

    Returns:
        A pair ``([tokens, masks, segments], targets)`` of NumPy arrays with
        one row/entry per input row, in the order the model's inputs expect.
    """
    # Ask the tokenizer for its padding id instead of assuming it is 0.
    pad_id = tokenizer.pad_token_id

    tokens, masks, segments, targets = [], [], [], []

    rows = zip(data_df[DATA_COLUMN], data_df[LABEL_COLUMN])
    for text, label in tqdm(rows, total=len(data_df)):
        # token: the sentence as a list of vocabulary ids, padded to SEQ_LEN
        token = tokenizer.encode(text, max_length=SEQ_LEN, pad_to_max_length=True)

        # mask: 1 for real tokens, 0 for padding positions
        mask = [int(tok != pad_id) for tok in token]

        # segment: all zeros because every example is a single sentence
        segment = [0] * len(token)

        tokens.append(token)
        masks.append(mask)
        segments.append(segment)

        # target value: 1 or 0
        targets.append(label)

    # Convert the accumulated Python lists to NumPy arrays for Keras.
    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)
    targets = np.array(targets)

    return [tokens, masks, segments], targets

In [11]:
# Load dataframe and split it into train/test

def load_data(df):
    """Normalise column dtypes and convert a DataFrame into model inputs.

    Works on a copy, so the caller's DataFrame is left unmodified.

    Args:
        df: DataFrame containing the ``DATA_COLUMN`` and ``LABEL_COLUMN``
            columns.

    Returns:
        ``(data_x, data_y)`` exactly as returned by ``convert_data``.
    """
    # Copy first: assigning the converted columns would otherwise write into
    # the frame the caller passed in.
    data_df = df.copy()
    data_df[DATA_COLUMN] = data_df[DATA_COLUMN].astype(str)
    data_df[LABEL_COLUMN] = data_df[LABEL_COLUMN].astype(int)
    data_x, data_y = convert_data(data_df)
    return data_x, data_y

# Length of every encoded sequence; also the width of the model's Input layers.
SEQ_LEN = 128
# NOTE(review): BATCH_SIZE is not referenced by any other cell visible in this notebook.
BATCH_SIZE = 20

# Names of the text column and the label column in the ratings DataFrames.
DATA_COLUMN = "document"
LABEL_COLUMN = "label"

In [12]:
# Build the [tokens, masks, segments] inputs and the label arrays for both splits.
train_x, train_y = load_data(train)
test_x, test_y = load_data(test)

100%|████████████████████████████████████████████████████████████████████████| 150000/150000 [00:50<00:00, 2974.36it/s]
100%|██████████████████████████████████████████████████████████████████████████| 50000/50000 [00:16<00:00, 3006.82it/s]


# 3. Create a model

In [13]:
# Load the pretrained multilingual BERT encoder (TensorFlow weights).
model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

# define each input
# Each input is a batch of int32 sequences of length SEQ_LEN.
token_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
mask_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
segment_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')

# The model is called with [token ids, attention mask, segment ids] and returns a tuple:
# a per-token tensor of shape [batch_size, SEQ_LEN, 768] and a per-sequence tensor of
# shape [batch_size, 768].
bert_outputs = model([token_inputs, mask_inputs, segment_inputs])

I0626 14:07:27.754609 13176 filelock.py:274] Lock 1315105674888 acquired on C:\Users\bokhy/.cache\torch\transformers\45629519f3117b89d89fd9c740073d8e4c1f0a70f9842476185100a8afe715d1.65df3cef028a0c91a7b059e4c404a975ebe6843c71267b67019c0e9cfa8a88f0.lock
I0626 14:07:27.757560 13176 file_utils.py:436] https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json not found in cache or force_download set to True, downloading to C:\Users\bokhy\.cache\torch\transformers\tmpeelszb2b


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…

I0626 14:07:28.171469 13176 file_utils.py:440] storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json in cache at C:\Users\bokhy/.cache\torch\transformers\45629519f3117b89d89fd9c740073d8e4c1f0a70f9842476185100a8afe715d1.65df3cef028a0c91a7b059e4c404a975ebe6843c71267b67019c0e9cfa8a88f0
I0626 14:07:28.173465 13176 file_utils.py:443] creating metadata file for C:\Users\bokhy/.cache\torch\transformers\45629519f3117b89d89fd9c740073d8e4c1f0a70f9842476185100a8afe715d1.65df3cef028a0c91a7b059e4c404a975ebe6843c71267b67019c0e9cfa8a88f0
I0626 14:07:28.175475 13176 filelock.py:318] Lock 1315105674888 released on C:\Users\bokhy/.cache\torch\transformers\45629519f3117b89d89fd9c740073d8e4c1f0a70f9842476185100a8afe715d1.65df3cef028a0c91a7b059e4c404a975ebe6843c71267b67019c0e9cfa8a88f0.lock
I0626 14:07:28.182441 13176 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.j




I0626 14:08:48.558371 13176 filelock.py:274] Lock 1315105652936 acquired on C:\Users\bokhy/.cache\torch\transformers\273ed844d60ef1d5a4ea8f7857e3c3869d05d7b22296f4ae9bc56026ed40eeb7.1b4841f14bf42137fc7ecee17a46c1b2f22b417f636347e4b810bd06dd9c45ea.h5.lock
I0626 14:08:48.560464 13176 file_utils.py:436] https://cdn.huggingface.co/bert-base-multilingual-cased-tf_model.h5 not found in cache or force_download set to True, downloading to C:\Users\bokhy\.cache\torch\transformers\tmpt_00t2o_


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1083389348.0, style=ProgressStyle(descr…

I0626 14:13:07.895117 13176 file_utils.py:440] storing https://cdn.huggingface.co/bert-base-multilingual-cased-tf_model.h5 in cache at C:\Users\bokhy/.cache\torch\transformers\273ed844d60ef1d5a4ea8f7857e3c3869d05d7b22296f4ae9bc56026ed40eeb7.1b4841f14bf42137fc7ecee17a46c1b2f22b417f636347e4b810bd06dd9c45ea.h5
I0626 14:13:07.899083 13176 file_utils.py:443] creating metadata file for C:\Users\bokhy/.cache\torch\transformers\273ed844d60ef1d5a4ea8f7857e3c3869d05d7b22296f4ae9bc56026ed40eeb7.1b4841f14bf42137fc7ecee17a46c1b2f22b417f636347e4b810bd06dd9c45ea.h5
I0626 14:13:07.901078 13176 filelock.py:318] Lock 1315105652936 released on C:\Users\bokhy/.cache\torch\transformers\273ed844d60ef1d5a4ea8f7857e3c3869d05d7b22296f4ae9bc56026ed40eeb7.1b4841f14bf42137fc7ecee17a46c1b2f22b417f636347e4b810bd06dd9c45ea.h5.lock
I0626 14:13:07.902076 13176 modeling_tf_utils.py:393] loading weights file https://cdn.huggingface.co/bert-base-multilingual-cased-tf_model.h5 from cache at C:\Users\bokhy/.cache\torch\tra




I0626 14:13:11.355507 13176 modeling_tf_utils.py:435] Layers from pretrained model not used in TFBertModel: ['nsp___cls', 'mlm___cls']


In [14]:
bert_outputs

(<tf.Tensor 'tf_bert_model/Identity:0' shape=(None, 128, 768) dtype=float32>,
 <tf.Tensor 'tf_bert_model/Identity_1:0' shape=(None, 768) dtype=float32>)

In [15]:
# Keep only the second output: one 768-dimensional vector per input sequence.
bert_outputs = bert_outputs[1]

In [16]:
# Classification head: a single sigmoid unit on top of the per-sequence BERT vector.
sentiment_first = tf.keras.layers.Dense(
    1,
    activation='sigmoid',
    kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
)(bert_outputs)
sentiment_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], sentiment_first)

# Let's try the Rectified Adam optimizer from TensorFlow Addons.
import tensorflow_addons as tfa
# `learning_rate` is the documented argument name; `lr` is only a legacy alias.
opt = tfa.optimizers.RectifiedAdam(learning_rate=1.0e-5, weight_decay=0.0025)

# Binary cross-entropy matches the single sigmoid output and the 0/1 labels.
sentiment_model.compile(optimizer=opt, loss=tf.keras.losses.BinaryCrossentropy(), metrics = ['accuracy'])

In [17]:
sentiment_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_segment (InputLayer)      [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 128, 768), ( 177853440   input_word_ids[0][0]             
                                                                 input_masks[0][0]            

# 4. Model Training

In [None]:
# Fine-tune for one epoch, reporting loss/accuracy on the test split after the epoch.
# NOTE(review): batch_size is hard-coded to 100 here; the BATCH_SIZE constant (20) defined
# earlier is not used - confirm which value is intended.
sentiment_model.fit(train_x, train_y, epochs=1, shuffle=True, batch_size=100, validation_data=(test_x, test_y))

In [None]:
# Save weights for later use
# sentiment_model.save_weights(path+"/huggingface_bert.h5")

# 5. Model Testing with test-set

In [None]:
def predict_convert_data(data_df):
    """Convert an unlabelled DataFrame into BERT input arrays for prediction.

    Each text in ``data_df[DATA_COLUMN]`` is encoded with the module-level
    ``tokenizer`` using ``max_length=SEQ_LEN`` and ``pad_to_max_length=True``.
    Rows are read positionally, so the frame's index does not need to be a
    default 0..n-1 range.

    Args:
        data_df: DataFrame containing the ``DATA_COLUMN`` (text) column.

    Returns:
        ``[tokens, masks, segments]`` as NumPy arrays with one row per input
        row, in the order the model's inputs expect.
    """
    # Ask the tokenizer for its padding id instead of assuming it is 0.
    pad_id = tokenizer.pad_token_id

    tokens, masks, segments = [], [], []

    for text in tqdm(data_df[DATA_COLUMN], total=len(data_df)):
        token = tokenizer.encode(text, max_length=SEQ_LEN, pad_to_max_length=True)

        tokens.append(token)
        # mask: 1 for real tokens, 0 for padding positions
        masks.append([int(tok != pad_id) for tok in token])
        # segment: all zeros because every example is a single sentence
        segments.append([0] * len(token))

    return [np.array(tokens), np.array(masks), np.array(segments)]

def predict_load_data(df):
    """Cast the text column to ``str`` and build prediction inputs.

    Works on a copy, so the caller's DataFrame is left unmodified.

    Args:
        df: DataFrame containing the ``DATA_COLUMN`` column.

    Returns:
        The ``[tokens, masks, segments]`` list produced by
        ``predict_convert_data``.
    """
    # Copy first: assigning the converted column would otherwise write into
    # the frame the caller passed in.
    data_df = df.copy()
    data_df[DATA_COLUMN] = data_df[DATA_COLUMN].astype(str)
    return predict_convert_data(data_df)

In [None]:
test_set = predict_load_data(test)

In [None]:
test_set

In [None]:
# Run the model on the converted test inputs.
preds = sentiment_model.predict(test_set)

# The output layer is a single sigmoid unit, so each prediction is a value between 0 and 1.
# Labels: Negative: 0, Positive: 1 (values near 0 lean negative, values near 1 lean positive).
preds

In [None]:
# Check F1 Score
from sklearn.metrics import classification_report
y_true = test['label']
# Round the predicted probabilities to 0/1 labels and print the report (includes the F1 score).
print(classification_report(y_true, np.round(preds,0)))

# 6. Check with actual sentence

In [None]:
def sentence_convert_data(data):
    """Encode a single sentence as a batch of one for the sentiment model.

    The sentence is encoded with the module-level ``tokenizer`` using
    ``max_length=SEQ_LEN`` and ``pad_to_max_length=True``.

    Args:
        data: The sentence to encode.

    Returns:
        ``[tokens, masks, segments]`` as NumPy arrays, each holding exactly
        one row.
    """
    token = tokenizer.encode(data, max_length=SEQ_LEN, pad_to_max_length=True)

    # mask: 1 for real tokens, 0 for padding. The padding id comes from the
    # tokenizer rather than being assumed to be 0.
    pad_id = tokenizer.pad_token_id
    mask = [int(tok != pad_id) for tok in token]

    # segment: all zeros because the input is a single sentence
    segment = [0] * len(token)

    # Wrap each list in an outer list so every array has a leading batch axis of 1.
    return [np.array([token]), np.array([mask]), np.array([segment])]

def movie_evaluation_predict(sentence):
    """Print the predicted sentiment of one review sentence.

    The sentence is converted with ``sentence_convert_data`` and scored by
    ``sentiment_model``. The printed line states whether the review is judged
    negative or positive, together with the probability of that class.

    Args:
        sentence: The review text to classify.

    Returns:
        None. The result is printed.
    """
    data_x = sentence_convert_data(sentence)
    predict = sentiment_model.predict(data_x)
    predict_value = np.ravel(predict)
    predict_answer = np.round(predict_value,0).item()

    # Pull out a plain Python float for formatting; applying "%.2f" to a
    # 1-element ndarray relies on implicit array-to-scalar conversion.
    positive_prob = float(predict_value[0])

    if predict_answer == 0:
        print("(부정 확률 : %.2f) 부정적인 영화 평가입니다." % (1 - positive_prob))
    elif predict_answer == 1:
        print("(긍정 확률 : %.2f) 긍정적인 영화 평가입니다." % positive_prob)

In [None]:
movie_evaluation_predict("보던거라 계속보고있는데 전개도 느리고 주인공인 은희는 한두컷 나오면서 소극적인모습에 ")