# Sentiment Analysis terhadap Vaksinasi COVID-19 di Indonesia: BERT Model
<h2>Tim Yaudahlah</h2>


---

Kaenova Mahendra Auditama<sup>1</sup><br>
Fendi Irfan Amorokhman<sup>2</sup><br>
Ananda Affan Fattahila<sup>3</sup><br>
<sup>1</sup><a href="mailto:kaenova@student.telkomuniversity.ac.id">kaenova@student.telkomuniversity.ac.id</a><br>
<sup>2</sup><a href="mailto:fendiirfan@student.telkomuniversity.ac.id">fendiirfan@student.telkomuniversity.ac.id</a><br>
<sup>3</sup><a href="mailto:affanfattahila@student.telkomuniversity.ac.id">affanfattahila@student.telkomuniversity.ac.id</a><br>
Informatics Engineering, Telkom University, Indonesia<br>
2021


# Installing and Importing Prerequisites, Setting up GPU

## PIP Prerequisites

In [None]:
# Install the notebook's extra dependencies into the Colab runtime.
# NOTE(review): versions are unpinned; the recorded run below resolved to
# transformers 4.9.0 and tensorboard_plugin_profile 2.4.0 -- consider pinning for reproducibility.
!pip install tqdm transformers tensorboard_plugin_profile

Collecting transformers
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 8.2 MB/s 
[?25hCollecting tensorboard_plugin_profile
  Downloading tensorboard_plugin_profile-2.4.0-py3-none-any.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 58.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 55.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 62.1 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.6 MB/s 
Collecting gviz-api>=1.9.0
  Downloading gviz_a

## Checking GPU Availability

In [None]:
import tensorflow as tf

# Look up the GPU device TensorFlow can see; a falsy name means none was found.
device_name = tf.test.gpu_device_name()
if device_name:
  print('Found GPU at: {}'.format(device_name))
else:
  # Stop the notebook here rather than continuing without a GPU.
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


## Importing Libraries and Modules

In [None]:
# Optional
# Colab only: mount Google Drive so the '/content/drive/...' paths used by the
# later cells (modules, data, models, TensorBoard logs) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys
sys.path.append('/content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/notebooks/modules')

import pandas as pd
import numpy as np
import tensorflow as tf

from postprocessing import DataPostProcessing
from preprocessing import DataPreProcessing

# Modeling

In [None]:
###
# reference:
# https://github.com/huggingface/transformers/issues/3627
###

import tensorflow as tf
from transformers import TFBertModel, TFBertForSequenceClassification
from datetime import datetime
import os
from tqdm import tqdm

class BertModel:
  """Keras classifier made of a pretrained BERT encoder, a Flatten layer and a 2-way softmax head.

  The wrapped, compiled `tf.keras` model is available as `self.model` (or via
  `GetModel()`). It takes two inputs, `[input_ids, attention_mask]`, each of
  shape `(batch, max_length)`, and is compiled with categorical cross-entropy,
  so labels are expected one-hot encoded with two columns.
  """
  # Compiled tf.keras model used by every method below.
  model = None
  # Hugging Face TFBertForSequenceClassification the encoder is taken from;
  # stays None when the model is restored from `load_path`.
  transformer_model = None

  def __init__(self, model_name:str = 'MODEL', max_length:int=128, learning_rate=3e-6, epsilon=None, clipnorm=None, bert_trainable:bool = True, load_path:str = None):
    """Build a fresh model from a pretrained checkpoint, or restore a saved one.

    Args:
      model_name: checkpoint name/path passed to `from_pretrained`.
      max_length: token sequence length of both model inputs.
      learning_rate, epsilon, clipnorm: forwarded to the Adam optimizer.
      bert_trainable: when False the BERT encoder is frozen.
      load_path: if given, a full saved Keras model is loaded from this path
        and every other argument is ignored.
    """
    if load_path is not None:
      # Models saved by SaveModel() reference the custom `get_f1` metric by
      # name; Keras needs it in `custom_objects` to deserialize the model.
      self.model = tf.keras.models.load_model(load_path, custom_objects={'get_f1': self.get_f1})
      print("BertModel[Initialize]: Model Loaded")
    else:
      print("BertModel[Initialize]: Fetching Bert Model")
      self.transformer_model = TFBertForSequenceClassification.from_pretrained(model_name, output_hidden_states=True)
      self.model = self.__CreateModel__(self.transformer_model, max_length, learning_rate, epsilon, clipnorm, bert_trainable)
      print("BertModel[Initialize]: Model Created, \n    To train the model, Call Model.train(train_data, val_data, epoch)")

  @staticmethod
  def get_f1(y_true, y_pred): #taken from old keras source code
    """F1 score computed from the rounded tensors Keras passes to the metric.

    Kept under the name `get_f1` because that name is what Keras records as
    the metric name in logs and in saved models.
    """
    K = tf.keras.backend
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # epsilon() guards every division against a zero denominator.
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

  def __CreateModel__(self, transformer_model, max_length, learning_rate, epsilon, clipnorm, bert_trainable):
    """Assemble and compile the Keras model around `transformer_model.bert`.

    Returns the compiled `tf.keras.models.Model`.
    """
    input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32)
    # Element [0] of the encoder output is flattened and fed to the classifier head.
    # NOTE(review): `training=bert_trainable` is fixed when the graph is built;
    # confirm dropout is not left active during evaluate/predict.
    embeddings = transformer_model.bert(input_ids, attention_mask=mask, training = bert_trainable)[0]
    out = tf.keras.layers.Flatten()(embeddings)
    output = tf.keras.layers.Dense(2, activation='softmax')(out)

    model = tf.keras.models.Model(inputs = [input_ids, mask], outputs = output)

    # Freeze the encoder BEFORE compiling: Keras documents that `trainable`
    # changes made after compile() only take effect once compile() is called again.
    if not bert_trainable:
      transformer_model.bert.trainable = False

    model.compile(
      tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon, clipnorm=clipnorm), 
      loss = 'categorical_crossentropy', 
      metrics = [
        tf.keras.metrics.TruePositives(name='TruePositive'),
        tf.keras.metrics.FalsePositives(name='FalsePositive'),
        tf.keras.metrics.TrueNegatives(name='TrueNegative'),
        tf.keras.metrics.FalseNegatives(name='FalseNegative'), 
        tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc'),
        self.get_f1
      ], 
      # loss_weights = None, 
      # sample_weight_mode = None, 
      # weighted_metrics = None, 
      # target_tensors = None
    )

    return model

  def Train(self, x, y, epoch:int = 5, batch_size:int = 32,callbacks=None):
    """Fit the model on `(x, y)` and return the Keras History object."""
    model_history = self.model.fit(x, y,
                          epochs=epoch,
                          verbose=1,
                          callbacks=callbacks,
                          batch_size = batch_size
                          )
    return model_history
    
  def Evaluate(self, x, y, callbacks=None):
    """Evaluate the model on `(x, y)` and return whatever `model.evaluate` returns."""
    return self.model.evaluate(x, y, callbacks=callbacks)

  def SaveWeights(self,path:str, model_name:str='BERT_WEIGHTS', time:bool = True):
    """Save the weights (HDF5) and an architecture diagram under `path/model_name/`.

    Args:
      path: parent directory; created if missing.
      model_name: sub-directory name for this save.
      time: when True a timestamp is appended to `model_name`.
    """
    if time:
      now = datetime.now()
      dt_string = now.strftime("D%dM%mY%Y H%HM%MS%S")
      model_name = (model_name+'_{}'.format(dt_string))

    # Join with os.path so `path` works with or without a trailing slash; the
    # empty last component keeps a trailing separator on `dir_save`.
    dir_save = os.path.join(path, model_name, '')
    os.makedirs(dir_save, exist_ok=True)

    self.model.save_weights(dir_save+'model.h5', save_format='h5')
    tf.keras.utils.plot_model(  self.model,
                                to_file=dir_save+"model.png",
                                show_shapes=True,
                                show_dtype=True,
                                show_layer_names=True,
                                rankdir="TB",
                                expand_nested=True,
                                dpi=300
                              )
    print("BertModel[save_weight]: Model Weights and Image saved in {}".format(dir_save))

  def SaveModel(self,path:str, model_name:str='BERT_MODELS'):
    """Save the full model to `path/model_name.h5`, creating `path` if needed."""
    # makedirs also creates missing parent directories and tolerates an existing one.
    os.makedirs(path, exist_ok=True)

    final_path = os.path.join(path, model_name+'.h5')

    self.model.save(final_path)
    print("BertModel[save_model]: Model saved in {}".format(final_path))

  def LoadWeights(self, path:str = None):
    """Load weights from `path` into the current model."""
    self.model.load_weights(path)
    print("BertModel[load_weights]: Model weights loaded")

  def ModelSummary(self):
    """Print the Keras summary of the wrapped model."""
    self.model.summary()
    
  def GetModel(self):
    """Return the wrapped Keras model."""
    return self.model

  def Predict(self, x):
    """Return the model's predictions for `x`."""
    prediction = self.model.predict(x)
    return prediction


---
# === Additional Code (Important) ===

## Training a Model

### Fetch Training Data

In [None]:
# Load the training set from the shared Drive (requires the Drive mount above).
df = pd.read_csv('/content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/data/training/dataFixBangetTrainingAffan.csv')

In [None]:
## Shrink the dataset (optional): sample 200 rows for each of the labels 0, 1 and -1
# df_temp = (df[df['labels'] == 0].sample(200))
# df_temp = pd.concat([df_temp, df[df['labels'] == 1].sample(200)])
# df_temp = pd.concat([df_temp, df[df['labels'] == -1].sample(200)] )
# df_temp.reset_index()
# df = df_temp.reset_index()
# df.drop(columns='index', axis=1, inplace=True)

### Assigning Constants

In [None]:
# reference: https://stackoverflow.com/questions/7740683/set-environment-variable-with-space-in-linux

from datetime import datetime
from tensorflow.keras.callbacks import TensorBoard
import os

#@markdown ---
#@markdown # Model Naming Constant
MODEL_NAME = "BERT_P2_Balance_V2" #@param {type:"string"}
PATH_SAVE = "/content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/models" #@param {type:"string"}
PATH_FILE_LOAD = None #@param {type:"string"}
USE_TIME = True  #@param ["False", "True"] {type:"raw"}

if MODEL_NAME is None:
    MODEL_NAME = 'BERT'
if USE_TIME:
    # Append a timestamp to the run name.
    now = datetime.now()
    dt_string = now.strftime("D%dM%mY%Y H%HM%MS%S")
    MODEL_NAME = (MODEL_NAME+'_{}'.format(dt_string))

#@markdown ---
#@markdown # Tensorboard Settings
use_tensorboard = True #@param ["False", "True"] {type:"raw"}
if use_tensorboard:
  # reference: https://colab.research.google.com/github/tensorflow/tensorboard/blob/master/docs/tensorboard_profiling_keras.ipynb
  #@markdown If you have whitespaces in the directories, don't forget to add `\`.
  PATH_TO_TENSORBOARD_LOGS_DIR = "/content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/tensorboard_logs/" #@param {type:"string"}
  # Logs are written to a "TEMP_" directory; the training cell moves it to its
  # final name once training has finished.
  PATH_TENSORBOARD_CALLBACKS = PATH_TO_TENSORBOARD_LOGS_DIR + "TEMP_" + MODEL_NAME
  # makedirs(exist_ok=True) creates missing parents and lets the cell be re-run
  # with the same MODEL_NAME (e.g. USE_TIME = False) without raising.
  os.makedirs(PATH_TENSORBOARD_CALLBACKS, exist_ok=True)
  tensorboard = TensorBoard(log_dir = PATH_TENSORBOARD_CALLBACKS, histogram_freq=1)
  os.environ['PATH_TO_TENSORBOARD_LOGS_DIR'] = PATH_TO_TENSORBOARD_LOGS_DIR
  # Reported only here: the path variable does not exist when TensorBoard is disabled.
  print('Directory created at {}'.format(PATH_TENSORBOARD_CALLBACKS))

#@markdown ---
#@markdown # Bert Model
BERT_MODEL = 'indobenchmark/indobert-base-p2'#@param {type:"string"}

#@markdown ---
#@markdown # Data Preparation Constant
MAX_LENGTH = 128 #@param {type:"integer"}
BATCH_SIZE = 32 #@param {type:"integer"}
#@markdown Split Ratio Test/Training Data
SPLIT_RATIO = 0.2 #@param {type:"slider", min:0, max:1, step:0.01}

#@markdown ---
#@markdown # Learning and Fitting Constant
LEARNING_RATE = 3e-6 #@param {type:"raw"}
EPSILON = 1e-08 #@param {type:"raw"}
CLIPNORM = 1.0 #@param {type:"raw"}
EPOCH =  5#@param {type:"integer"}
#@markdown If you use tensorboard don't forget to add `[tensorboard]` in the callbacks
# `tensorboard` is only defined when use_tensorboard is True; set this field to
# [] or None when TensorBoard is disabled.
CALLBACKS = [tensorboard] #@param {type:"raw"}


Directory created at /content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/tensorboard_logs/TEMP_BERT_P2_Balance_V2_D26M07Y2021 H08M35S48


### Preprocessed Data

In [None]:
# Tokenize `df` and split it into train/test sets using the helper from the
# external `preprocessing` module (not shown in this notebook).
# Per the log below, inputs are shaped (N, 2, MAX_LENGTH); the training cell
# feeds x[:, 0] and x[:, 1] as the model's two inputs.
x_train,x_test,y_train,y_test, one_hot_mappings = DataPreProcessing.PreProcessBatchBERT(df, BERT_MODEL, MAX_LENGTH, SPLIT_RATIO)

DataPreProcessing[PreProcessBatch|OneHotEncodingLabels]: Below zero detected in labels data
DataPreProcessing[PreProcessBatch|OneHotEncodingLabels]: Non-Category datatype detected, converting to Category datatype
DataPreProcessing[PreProcessBatch|Tokenize]: Fetching Tokenizer


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=229167.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1534.0, style=ProgressStyle(description…


DataPreProcessing[PreProcessBatch]: labels shape (13496, 2), input shape (13496, 2, 128)
DataPreProcessing[PreProcessBatch]: Creating train and testing set with split ratio 0.2
DataPreProcessing[PreProcessBatch]: Train and Test data Created with Train data shape (10796, 2, 128), use out_mappings for argmax prediction


In [None]:
# Display the mapping returned by the preprocessing step; the preprocessing log
# says to use it when converting argmax predictions.
one_hot_mappings

{0: -1, 1: 1}

### Create Model

In [None]:
# Build the classifier with MAX_LENGTH (rather than a literal 128) so the model's
# input width always matches the length the data was tokenized to.
model = BertModel(BERT_MODEL, MAX_LENGTH, LEARNING_RATE, EPSILON, CLIPNORM, bert_trainable=True)

BertModel[Initialize]: Fetching Bert Model


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel[Initialize]: Model Created, 
    To train the model, Call Model.train(train_data, val_data, epoch)


In [None]:
# Print the Keras layer summary of the wrapped model.
model.ModelSummary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 124441344   input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 98304)        0           bert[0][13]                

### Training Model

In [None]:
train = True #@param ["False", "True"] {type:"raw"}

if train:
  # Columns 0 and 1 of axis 1 are passed as the model's two inputs.
  model.Train([x_train[:,0], x_train[:,1]], y_train, EPOCH, BATCH_SIZE, callbacks=CALLBACKS)
  model.SaveModel(path=PATH_SAVE, model_name=MODEL_NAME)
  if use_tensorboard:
    # Rename the "TEMP_" log directory to its final name now that training is done.
    # shutil.move is used instead of an unquoted shell `mv`: these paths contain
    # spaces and a colon, which the shell would split into several arguments.
    import shutil
    shutil.move(PATH_TENSORBOARD_CALLBACKS, PATH_TO_TENSORBOARD_LOGS_DIR + MODEL_NAME)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
BertModel[save_model]: Model saved in /content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/models/BERT_P2_Balance_V2_D26M07Y2021 H08M35S48.h5


# Evaluate part 1

In [None]:
# Evaluate on the test split returned by the preprocessing step; CALLBACKS is
# passed as the `callbacks` argument of BertModel.Evaluate.
model.Evaluate([x_test[:,0],x_test[:,1]], y_test, CALLBACKS)



# Evaluate part 2

In [None]:
# Load the separate validation CSV and preprocess it with the same tokenizer
# model and sequence length as the training data.
# Note: this rebinds `one_hot_mappings` from the training preprocessing step.
df_val = pd.read_csv('/content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/data/validation/validation_test.csv')
x_val, y_val, one_hot_mappings = DataPreProcessing.PreProcessBatchValidation(df_val, BERT_MODEL, MAX_LENGTH)

DataPreProcessing[PreProcessBatch|OneHotEncodingLabels]: Below zero detected in labels data
DataPreProcessing[PreProcessBatch|OneHotEncodingLabels]: Non-Category datatype detected, converting to Category datatype
DataPreProcessing[PreProcessBatch|Tokenize]: Fetching Tokenizer
DataPreProcessing[PreProcessBatchValidation]: labels shape (80, 2), input shape (80, 2, 128)


In [None]:
# Evaluate on the validation set loaded in the previous cell.
model.Evaluate([x_val[:,0],x_val[:,1]], y_val, CALLBACKS)



---
# === Additional Code (Not Important) ===

## TF Js Conversion

In [None]:
# Only needed for the TensorFlow.js conversion cells below.
# NOTE(review): the version is unpinned.
!pip install tensorflowjs

In [None]:
import tensorflowjs as tfjs
# The wrapper class defined in this notebook is `BertModel`; `Model` is not
# defined anywhere here and raised a NameError.
# NOTE(review): this path says "GEMASTIK IV" while every other path in the
# notebook says "GEMASTIK XIV" -- confirm which shared drive is intended.
load_model_convert = BertModel(load_path='/content/drive/Shareddrives/GEMASTIK IV: Yaudahlah/models/BERT_BASE_D19M07Y2021 H04M24S31.h5')
# Keep only the underlying Keras model, which is what the converter is given.
load_model_convert = load_model_convert.model

In [None]:
# Write the loaded Keras model out in TensorFlow.js format.
# NOTE(review): output path says "GEMASTIK IV" while the rest of the notebook
# uses "GEMASTIK XIV" -- confirm.
tfjs.converters.save_keras_model(load_model_convert, '/content/drive/Shareddrives/GEMASTIK IV: Yaudahlah/models/TFJS_MODEL_BERT_BASE')

## Useful Command Lines (no need to execute)

In [None]:
# Deletes one specific "TEMP_" TensorBoard log directory from an earlier run.
# Destructive and irreversible; run only if that directory is no longer needed.
!rm -f -r "/content/drive/Shareddrives/GEMASTIK XIV: Yaudahlah/tensorboard_logs/TEMP_BERT_P2_Balance_Kaenova_D26M07Y2021 H03M38S51"

## Not So Useful Code

In [None]:
# BertTokenizer is not imported anywhere else in the notebook, so import it
# here to keep this cell runnable on a fresh kernel.
from transformers import BertTokenizer

sentence = 'saya suka mangga'

bert_model_name = 'indobenchmark/indobert-base-p2'
tokenizer_luar = BertTokenizer.from_pretrained(bert_model_name)
# Pad/truncate to 128 tokens and keep only the `input_ids` tensor.
tokens = tokenizer_luar(sentence, max_length=128,
                                    truncation=True, padding='max_length',
                                    return_attention_mask = True,
                                    return_token_type_ids=False, return_tensors='tf')['input_ids']

In [None]:
# Display the `input_ids` tensor produced by the tokenizer cell above.
tokens

In [None]:
# BertTokenizer is not imported anywhere else in the notebook, so import it
# here to keep this cell runnable on a fresh kernel.
from transformers import BertTokenizer

sentence = 'Udah pusing disini deh beb. Kita disiplin dng kesadaran sendiri, vaksin buru2, eeeeh… ada dan banyak yg terbalik malah hoax didengerin'

bert_model_name = 'indobenchmark/indobert-base-p2'
tokenizer_luar = BertTokenizer.from_pretrained(bert_model_name)
# Pad/truncate to 128 tokens and keep only the `input_ids` tensor.
tokens = tokenizer_luar(sentence, max_length=128,
                                    truncation=True, padding='max_length',
                                    return_attention_mask = True,
                                    return_token_type_ids=False, return_tensors='tf')['input_ids']