<a href="https://colab.research.google.com/github/jmachima/Is_this_love/blob/main/Loving_Sentiment_with_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Jasmine Machima

jasmine.machima@gmail.com

A Transformer model for sentiment analysis of Thai text segments on the subject of romantic relationships.



In [None]:
!pip install attacut # Thai Word Segmenter 
# Most accurate Thai tokenizer so far
from attacut import tokenize, Tokenizer
atta = Tokenizer(model="attacut-c")

**AttaCut: A Fast and Accurate Neural Thai Word Segmenter**
https://arxiv.org/ftp/arxiv/papers/1911/1911.07056.pdf

by Pattarawat Chormai, Ponrawee Prasertsom, Attapol Rutherford.

AttaCut will be used to segment Thai text into separate words. Word tokenization for the Thai language is not straightforward because words are written without spaces between them. For example, 'ความรักเป็นเรื่องซับซ้อนและละเอียดละอ่อน' is a whole Thai sentence meaning 'Love is a complicated and delicate matter.'

In [2]:
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
pd.options.display.max_rows = 9999
import collections
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
tf.get_logger().setLevel('ERROR')
print("TensorFlow version",tf.__version__)


TensorFlow version 2.8.2


In [3]:
# Set up for a TPU environment.

# The TPU address is read from the COLAB_TPU_ADDR environment variable; the lookup raises
# KeyError when that variable is not set.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))
# `strategy` is the distribution strategy whose scope the model cell and the test cell enter.
strategy = tf.distribute.TPUStrategy(resolver)

All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU')]


In [4]:
# NOTE(review): this cell repeats the TPU setup of the previous cell, differing only in tpu=''
# instead of the explicit COLAB_TPU_ADDR address. `resolver` and `strategy` are rebound and
# initialize_tpu_system is called a second time. Presumably only one of the two cells is
# needed — confirm and drop the other.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))
strategy = tf.distribute.TPUStrategy(resolver)

All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU')]


In [None]:
from google.colab import files
# Interactive upload step; the two CSV files named below are read by file name in the next
# code cell, so they must be uploaded under exactly these names.
files.upload()
# upload PantipLoveSentiments.csv and love_test.csv

This dataset was manually curated from the popular Thai-language web forum Pantip.com and consists of short text segments about romantic relationship experiences. Each segment is labeled with one of two sentiment classes: positive or negative.

In [6]:
# Corpus used for training and validation (split later with train_test_split).
bangrak = pd.read_csv('PantipLoveSentiments.csv')  # Mainly from the Bangrak board on Pantip
# Separate test corpus; its 'text' column is also tokenized when the vocabulary is built.
love_test = pd.read_csv('love_test.csv')

# Preview the first rows of the training/validation corpus.
bangrak.head()

Unnamed: 0,text,category
0,อาจเป็นเพราะเราเข้าใจกัน,positive
1,พอมาถึงช่วงนึง เขาก็มีคนอื่นเข้ามาเพราะไปเที่ย...,negative
2,แฟนเราบอกว่า ทำไมไม่ให้ตังใช้บ้างเลย,negative
3,เปรียบเราเป็นผู้หญิงที่สวยที่สุดสำหรับเขา,positive
4,ผมไปรุกหาคู่ชีวิตสร้างอนคตไปด้วยกัน,positive


In [7]:
# Collect all the "words" (group of successive characters) tokenized by the AttaCut model.

def token(Z):
  """Tokenize every text in ``Z`` with the module-level AttaCut tokenizer ``atta``.

  Args:
    Z: Iterable of strings, e.g. a list or a pandas Series of text segments.

  Returns:
    A list with one entry per input text, in input order; each entry is
    whatever ``atta.tokenize`` returns for that text.
  """
  # Iterate over the values themselves instead of indexing with ``Z[i]``:
  # on a pandas Series ``Z[i]`` is a label lookup, which fails (or picks the
  # wrong row) as soon as the index is not the default 0..n-1.
  return [atta.tokenize(text) for text in Z]

# Build vocabulary from both train/validation datasets and the separate test set.
# NOTE(review): because the test sentences contribute to the vocabulary, no test token is ever
# mapped to the "unknown" id — confirm this is intended for the reported test accuracy.
Tokens = token(bangrak['text'])
Test_Tokens = token(love_test['text'])

# Build the vocabulary list "Words": every distinct token, in order of first appearance
# (training/validation sentences first, then test sentences).
# dict.fromkeys de-duplicates with a hash lookup per token while keeping insertion order,
# replacing the quadratic "if word not in Words" scan over a growing list.
Words = list(dict.fromkeys(
    word
    for sentences in (Tokens, Test_Tokens)
    for sentence in sentences
    for word in sentence
))


In [8]:
class Thai_to_id():
  """Convert tokenized texts into sequences of integer ids.

  Id layout produced by this class:
    1  beginning-of-text marker (always the first id of a sequence)
    2  end-of-text marker (always the last id of a sequence)
    3  "unknown" marker for tokens that are not in the vocabulary
    3, 4, 5, ...  vocabulary words, numbered in the order given by ``Vocab``

  Id 0 is never emitted here.

  NOTE(review): the first vocabulary word and the "unknown" marker share id 3.
  The numbering is kept unchanged so that ids stay identical to what this
  class has always produced; consider starting words at 4 if unknown tokens
  can actually occur.

  The word -> id table is built once, at construction time, from ``Vocab``;
  create a new converter if the vocabulary changes.
  """

  BOS_ID = 1         # beginning of text
  EOS_ID = 2         # end of text
  UNK_ID = 3         # token not found in the vocabulary
  FIRST_WORD_ID = 3  # id given to the first entry of Vocab

  def __init__(self, Vocab):
    """Store the vocabulary and pre-compute the word -> id lookup table.

    Args:
      Vocab: Iterable of vocabulary entries; each entry is keyed by ``str(entry)``.
    """
    self.Vocab = Vocab
    self._vocab_ids = self._build_vocab(Vocab)

  @classmethod
  def _build_vocab(cls, words):
    """Return an OrderedDict mapping ``str(word)`` to consecutive ids from FIRST_WORD_ID."""
    vocab = collections.OrderedDict()
    for index, word in enumerate(words, start=cls.FIRST_WORD_ID):
      # A repeated word keeps its position in the dict but takes the later id,
      # exactly as a plain assignment loop would.
      vocab[str(word)] = index
    return vocab

  def convert_vocab_ids(self, items):
    """Convert one tokenized text into ids.

    Args:
      items: Iterable of tokens for a single text.

    Returns:
      A tuple ``(output, vocab)``: ``output`` is ``[1, <id per token>, 2]``
      with unknown tokens mapped to 3, and ``vocab`` is the word -> id
      OrderedDict used for the lookup (shared between calls; do not mutate).
    """
    vocab = self._vocab_ids
    output = [self.BOS_ID] # Beginning of text
    output.extend(vocab.get(item, self.UNK_ID) for item in items)
    output.append(self.EOS_ID) # End of text

    return output, vocab

  def id_seq(self, tokens, max_len=64):
    """Convert a collection of tokenized texts into id sequences.

    Texts with more than ``max_len`` tokens are printed (length and tokens)
    and their position is recorded, but they are still converted in full.
    The length check counts tokens only; each id sequence is two entries
    longer because of the begin/end markers.

    Args:
      tokens: Iterable of tokenized texts (each an iterable of tokens).
      max_len: Token-count threshold above which a text is reported.

    Returns:
      A tuple ``(IDs, exc)``: the list of id sequences, and the list of
      positions of the texts that exceeded ``max_len`` tokens.
    """
    IDs = []
    exc = []
    for i, sentence in enumerate(tokens):
      token_id, _ = self.convert_vocab_ids(sentence)
      if len(sentence) > max_len:
        print(len(sentence),"    ",(sentence))
        exc.append(i)

      IDs.append(token_id)

    return IDs, exc

thai_to_id = Thai_to_id(Vocab=Words)

# id_seq also returns the positions of over-long texts. Bind them to real names instead of
# `_` / `__`, which IPython uses for its output history and stops updating once assigned.
IDs, train_overlength = thai_to_id.id_seq(Tokens)
test_id, test_overlength = thai_to_id.id_seq(Test_Tokens)

In [9]:
Class = set(bangrak['category'])  # Unique categories
# Number the categories in sorted order. Iterating the set directly gives an order that can
# change between interpreter runs (string hashing is randomized), which would silently flip
# the meaning of labels 0 and 1. enumerate also covers every category instead of only the
# first two that zip(..., range(2)) would keep.
CL = {category: number for number, category in enumerate(sorted(Class, key=str))} # convert categories into numericals
rvCL = {number: category for category, number in CL.items()} # reverse lookup for numerical label

print(CL)

def col_label(Table):
  """Add an integer ``Label`` column derived from ``Table['category']``.

  Every category is translated through the module-level mapping ``CL``.
  The column is written into ``Table`` itself (the caller's DataFrame is
  modified) and that same object is returned.

  Args:
    Table: pandas DataFrame with a ``category`` column.

  Returns:
    ``Table``, now carrying an int64 ``Label`` column.

  Raises:
    KeyError: if a category is not a key of ``CL``; ``Table`` is left
      without a new column in that case.
  """
  # Vectorized lookup instead of a per-row .at loop; categories missing from CL come back as NaN.
  codes = Table['category'].map(CL)
  unmapped = codes.isna()
  if unmapped.any():
    raise KeyError(Table.loc[unmapped, 'category'].iloc[0])

  Table['Label'] = codes.astype('int64') # want integers only

  return Table

# Label the training/validation frame (col_label writes into `bangrak` and returns it, so
# `data` is the same object), then take the integer labels out as a separate NumPy array.
data = col_label(bangrak)
labels = data['Label'].to_numpy(copy=True)

{'negative': 0, 'positive': 1}


In [10]:
# Pad ids to the length of 64 with 0 for both train/validation and test datasets.
# Padding zeros are appended after the real ids (padding='post').
# NOTE(review): no `truncating` argument is passed, so sequences longer than 64 ids are cut on
# whichever side is the Keras default — confirm that this is the intended side, since each
# sequence starts with the beginning-of-text id and ends with the end-of-text id.

Padded_IDs = np.array(tf.keras.preprocessing.sequence.pad_sequences(IDs, maxlen=64, dtype='int32', padding='post', value=0))
test_IDs = tf.keras.preprocessing.sequence.pad_sequences(test_id, maxlen=64, dtype='int32', padding='post', value=0)

# 80/20 train/validation split with a fixed random_state; a later cell re-splits the raw text
# with the same test_size and random_state to recover the matching validation rows.
train_inp,val_inp,train_label,val_label=train_test_split(Padded_IDs,labels,test_size=0.2, random_state=88)

In [11]:
# Sanity check: both arrays should have 64 columns (the padded sequence length).
print('train_inp shape:',train_inp.shape)
print('val_inp shape:',val_inp.shape)

train_inp shape: (559, 64)
val_inp shape: (140, 64)


This basic Transformer model is built with elements from "Transformer Model for Language Understanding" at TensorFlow.org. The code below does NOT use any of the models available at TensorFlow Hub so that hyperparameters can be freely tuned.

In [12]:
with strategy.scope():
  # --- Hyperparameters ---
  # Number of rows in the embedding table. Thai_to_id numbers vocabulary words from 3 upwards
  # (1 = beginning of text, 2 = end of text, 0 = padding), so the largest id it can emit is
  # len(Words) + 2 and the table needs at least len(Words) + 3 rows. The previous fixed value
  # of 1000 is kept as a lower bound so a small vocabulary builds the same model as before.
  input_vocab_size = max(1000, len(Words) + 3)
  maximum_position_encoding = 1000  # number of positions pre-computed by positional_encoding
  d_model = 1024                    # embedding width, also the width of every Transformer block
  hidden_units = [2048, 1024]       # Dense widths of the classification head (see mlp)
  rate = 0.1                        # NOTE(review): appears unused — classifier() passes rate=0.0 and literal 0.1 values
  dff = 2048                        # width of the feed-forward layer inside each Transformer block
  learning_rate = 7.2e-6
  num_epochs = 22
  batch_size = 64
  num_heads = 8                     # d_model must be divisible by this (asserted in MultiHeadAttention)
  transformer_layers = 12           # number of stacked Transformer blocks
  num_classes = 2
  seq_len = 64                      # must equal the maxlen used when padding the id sequences

  def get_angles(pos, i, d_model):
    """Return the sinusoid arguments ``pos / 10000 ** (2 * (i // 2) / d_model)``.

    ``pos`` and ``i`` are combined with ordinary NumPy broadcasting, so a
    column of positions and a row of dimension indices give a 2-D table.
    """
    exponent = (2 * (i//2)) / np.float32(d_model)
    inverse_wavelength = 1 / np.power(10000, exponent)
    return pos * inverse_wavelength

  def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
      position: Number of positions to encode (rows of the table).
      d_model: Encoding width (columns of the table).

    Returns:
      A float32 tensor of shape (1, position, d_model): sine of the angle in
      the even columns, cosine of the angle in the odd columns.
    """
    row_positions = np.arange(position)[:, np.newaxis]
    column_indices = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_positions, column_indices, d_model)

    # even columns (2i) carry the sine, odd columns (2i+1) the cosine
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # leading axis of size 1 so the table broadcasts over the batch dimension
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

  def create_padding_mask(seq):
    """Return a float mask that is 1.0 where ``seq`` equals 0 and 0.0 elsewhere.

    Two singleton axes are inserted after the first axis so the mask can be
    added to the attention logits: (batch_size, 1, 1, seq_len).
    """
    is_padding = tf.math.equal(seq, 0)
    mask = tf.cast(is_padding, tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

  
  # Set up positional encoding and sequence-ids embedding
  # The table is pre-computed for maximum_position_encoding positions; emb_pos only uses the
  # first seq_len of them.
  pos_encoding = positional_encoding(maximum_position_encoding, d_model)

  # Single embedding layer instance, applied to the id sequences inside emb_pos.
  embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)

  def emb_pos(x):
    """Embed the id sequence ``x``, scale it by sqrt(d_model) and add the positional encoding.

    Uses the surrounding ``embedding`` layer, ``pos_encoding`` table,
    ``d_model`` and ``seq_len``; only the first ``seq_len`` positions of the
    table are added.
    """
    scale = tf.math.sqrt(tf.cast(d_model, tf.float32))
    scaled_embedding = embedding(x) * scale
    return scaled_embedding + pos_encoding[:, :seq_len, :]


  def dropout(rate):
    """Return a new Keras Dropout layer configured with the given rate."""
    dropout_layer = layers.Dropout(rate)
    return dropout_layer

  def mlp(x, hidden_units, dropout_rate):
    """Stack one ReLU Dense layer plus one Dropout layer per entry of ``hidden_units``.

    Args:
      x: Input tensor.
      hidden_units: Iterable of layer widths, applied in order.
      dropout_rate: Rate used for every Dropout layer.

    Returns:
      The output tensor of the last Dropout layer (``x`` unchanged when
      ``hidden_units`` is empty).
    """
    for width in hidden_units:
      activated = layers.Dense(width, activation=tf.nn.relu)(x)
      x = layers.Dropout(dropout_rate)(activated)
    return x

  class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected to ``d_model`` features,
    split into ``num_heads`` heads of ``d_model // num_heads`` features,
    attended per head, merged again and passed through a final Dense layer.
    """

    def __init__(self, d_model, num_heads):
      super().__init__()
      self.num_heads = num_heads
      self.d_model = d_model

      # every head must get the same number of features
      assert d_model % self.num_heads == 0
      self.depth = d_model // self.num_heads

      # input projections for query, key and value
      self.wq = tf.keras.layers.Dense(d_model)
      self.wk = tf.keras.layers.Dense(d_model)
      self.wv = tf.keras.layers.Dense(d_model)

      # output projection applied to the merged heads
      self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
      """Reshape (batch_size, seq_len, d_model) into (batch_size, num_heads, seq_len, depth)."""
      per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
      return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def scaled_dot_product_attention(self, q, k, v, mask):
      """Return ``(softmax(q k^T / sqrt(depth) + mask * -1e9) v, attention_weights)``.

      ``mask`` may be None; otherwise positions where it is 1 receive a large
      negative logit and therefore (almost) zero attention weight.
      """
      key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
      logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)  # (..., seq_len_q, seq_len_k)

      if mask is not None:
        logits = logits + (mask * -1e9)

      # normalize over the key axis
      attention_weights = tf.nn.softmax(logits, axis=-1)

      return tf.matmul(attention_weights, v), attention_weights

    def call(self, v, k, q, mask):
      """Run attention; note the argument order (value, key, query, mask).

      Returns:
        ``(output, attention_weights)`` with ``output`` of shape
        (batch_size, seq_len_q, d_model) and ``attention_weights`` of shape
        (batch_size, num_heads, seq_len_q, seq_len_k).
      """
      batch_size = tf.shape(q)[0]

      # project, then split into heads: (batch_size, num_heads, seq_len, depth)
      q = self.split_heads(self.wq(q), batch_size)
      k = self.split_heads(self.wk(k), batch_size)
      v = self.split_heads(self.wv(v), batch_size)

      per_head_output, attention_weights = self.scaled_dot_product_attention(q, k, v, mask)

      # bring the head axis next to depth again and merge both into d_model
      per_head_output = tf.transpose(per_head_output, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
      merged = tf.reshape(per_head_output, (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

      return self.dense(merged), attention_weights
    
  class TransformerBlock(tf.keras.layers.Layer):
    """One Transformer encoder block.

    Self-attention followed by a two-layer feed-forward network; each of the
    two sub-layers is followed by dropout, a residual connection and layer
    normalization.

    Args:
      d_model: Feature width of the block's input and output.
      num_heads: Number of attention heads.
      dff: Width of the hidden feed-forward layer.
      batch_size: Accepted for interface compatibility; not used by the block.
      rate: Dropout rate for both dropout layers.
    """

    def __init__(self, d_model, num_heads, dff, batch_size, rate):
      super(TransformerBlock, self).__init__()

      self.mha = MultiHeadAttention(d_model, num_heads)

      self.layernorm1 = tf.keras.layers.LayerNormalization(axis = -1,epsilon=1e-7)
      self.layernorm2 = tf.keras.layers.LayerNormalization(axis = -1,epsilon=1e-7)

      self.dropout1 = tf.keras.layers.Dropout(rate)
      self.dropout2 = tf.keras.layers.Dropout(rate)

      self.dense1 = tf.keras.layers.Dense(dff, activation='relu')
      self.dense2 =  tf.keras.layers.Dense(d_model)

    def call(self, x, mask, training=None):
      """Apply the block to ``x``.

      Args:
        x: Input of shape (batch_size, input_seq_len, d_model).
        mask: Padding mask passed on to the attention layer (may be None).
        training: Standard Keras training flag. It is forwarded to the
          dropout layers so that dropout is active during training only;
          previously dropout was forced on for inference as well.

      Returns:
        Tensor with the same shape as ``x``.
      """
      attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
      attn_output = self.dropout1(attn_output, training=training)
      out1 = self.layernorm1(x + attn_output)

      ffn_output = self.dense2(self.dense1(out1))
      ffn_output = self.dropout2(ffn_output, training=training)

      return self.layernorm2(out1 + ffn_output)

  def classifier():
    """Build the (uncompiled) Keras sentiment classifier.

    Pipeline: token ids -> embedding + positional encoding ->
    ``transformer_layers`` Transformer blocks (sharing one padding mask) ->
    layer normalization -> flatten -> dropout -> MLP head -> Dense layer with
    ``num_classes`` outputs.

    Returns:
      A ``tf.keras.Model`` mapping an integer id tensor of shape
      (batch, seq_len) to raw logits of shape (batch, num_classes).
    """
    # Token ids are integers, so declare the input as int32 rather than float32.
    # The local name also no longer shadows the builtin `input`.
    token_ids = tf.keras.Input(shape=(seq_len, ),dtype=tf.int32, name='ID')

    x = emb_pos(token_ids)
    mask = create_padding_mask(token_ids)

    # Create multiple layers of the Transformer block.
    # NOTE(review): dropout inside the blocks is switched off here (rate=0.0) and the
    # surrounding `rate` hyperparameter is not used — confirm this is intended.
    blocks = [TransformerBlock(d_model, num_heads, dff,batch_size, rate=0.0)
                      for _ in range(transformer_layers)]

    for block in blocks:
        x = block(x, mask)

    representation = layers.LayerNormalization(axis = -1, epsilon=1e-7)(x)
    # Flatten keeps every position, so the head sees seq_len * d_model features per text.
    representation = layers.Flatten()(representation)
    representation = layers.Dropout(0.1)(representation)

    features = mlp(representation, hidden_units=hidden_units, dropout_rate=0.1)
    logits = layers.Dense(num_classes)(features)

    model = tf.keras.Model(inputs=token_ids, outputs=logits)
    return model

  # Build the model while still inside strategy.scope().
  model = classifier()

  optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon = 1e-7)

  # The model's last layer is a Dense layer without activation, hence from_logits=True.
  model.compile(
              optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy"))
             
  # Train on the 80% split and report validation metrics on the remaining 20% after each epoch.
  history =  model.fit(x=train_inp,y=train_label, batch_size=batch_size,epochs=num_epochs, validation_data = (val_inp, val_label)) 


Epoch 1/22
Epoch 2/22
Epoch 3/22
Epoch 4/22
Epoch 5/22
Epoch 6/22
Epoch 7/22
Epoch 8/22
Epoch 9/22
Epoch 10/22
Epoch 11/22
Epoch 12/22
Epoch 13/22
Epoch 14/22
Epoch 15/22
Epoch 16/22
Epoch 17/22
Epoch 18/22
Epoch 19/22
Epoch 20/22
Epoch 21/22
Epoch 22/22


In [13]:
# def to prepare a table with predicted category vs. true category

def rev_label(pred, X,y_true):
  """Turn model outputs into predicted labels and category names.

  Args:
    pred: Per-sample score vectors, indexable by position (``pred[j]``);
      the predicted class is the argmax of each vector.
    X: Collection whose length gives the number of samples to process.
    y_true: True integer labels, in the same order as ``pred``. Read by
      position, so a pandas Series with any index works.

  Returns:
    A tuple ``(predicted, pred_cat, true_cat)``: predicted integer labels,
    predicted category names and true category names. Names are looked up in
    the module-level reverse mapping ``rvCL``.
  """
  # Materialize the labels so that `true_labels[j]` is positional. Indexing a pandas Series
  # with `y_true[j]` is a label lookup and picks the wrong row (or raises) after a shuffle.
  true_labels = list(y_true)

  predicted =[]
  pred_cat = []
  true_cat = []
  for j in range(len(X)):
    index = int(np.argmax(pred[j]))
    predicted.append(index)
    pred_cat.append(rvCL[index])
    true_cat.append(rvCL[true_labels[j]])
  return predicted, pred_cat, true_cat


In [14]:
# Re-split the raw text with the same test_size and random_state as the split of the padded
# ids, to recover the texts and labels that belong to val_inp.
# NOTE(review): this relies on train_test_split producing the same partition for both calls —
# confirm, or split the text together with the ids in a single call.
X_tr, X_ts, Y_tr, Y_ts=train_test_split(data['text'],data['Label'],test_size=0.2, random_state = 88)

val_set = pd.DataFrame(X_ts)
val_set['Label'] = Y_ts
# Plain list so that later positional access does not go through the Series index.
y_true = list(Y_ts)


In [15]:
# Score the validation inputs.
# NOTE(review): the model is called here outside strategy.scope(), whereas the test cell calls
# it inside the scope — confirm both are intended.
pred = model(val_inp)
predicted, pred_cat, true_cat = rev_label(pred, X_ts, y_true)
val_set['predicted'] = predicted
val_set['true_cat'] = true_cat
val_set['pred_cat'] = pred_cat

# Rows whose predicted label differs from the true label, and how many of those were truly positive.
print('number of wrong predictions:',len(val_set[val_set['Label']!=val_set['predicted']]))  
wrong=val_set[val_set['Label']!=val_set['predicted']]  
print('predicted negative while actually positive:',len(wrong[wrong['true_cat']=='positive']))


number of wrong predictions: 29
predicted negative while actually positive: 18


In [16]:
# Show the first few misclassified validation rows.
wrong.head()

Unnamed: 0,text,Label,predicted,true_cat,pred_cat
265,สิ่งเล็ก ๆ น้อย ๆ เหล่านั้นมันทำให้เราค่อย ๆ ร...,1,0,positive,negative
642,จะกลับมาคบ แต่เราเองเป็นฝ่ายที่ไม่คบ,0,1,negative,positive
356,เราเป็นคนไม่ค่อยซีเรียส,1,0,positive,negative
278,ไม่ว่าจะยังไงยังอยากมีเขาอยุ่,1,0,positive,negative
529,เวลาคุยกับเราตะคอกด่าว่าเราสาระพัด,0,1,negative,positive


In [17]:
# Re-read the test file and attach integer labels.
# NOTE(review): test_IDs was built from the earlier `love_test` frame, so the rows line up with
# `l_test` only if the file is unchanged between the two reads — presumably true; confirm.
l_test=col_label(pd.read_csv('love_test.csv'))  # Separate test dataset

with strategy.scope():
  test_pred = model(test_IDs)


# test_true_cat is returned by rev_label but not used further in this cell.
test_predicted, test_pred_cat, test_true_cat = rev_label(test_pred, l_test['text'], l_test['Label'])

l_test['predicted'] = test_predicted
l_test['pred_cat'] = test_pred_cat

# Fraction of test rows whose predicted label equals the true label, printed as a percentage.
ACCU = sklearn.metrics.accuracy_score(l_test['Label'], l_test['predicted'], normalize=True, sample_weight=None)
print('Test Accuracy =',round((ACCU*100),2),'%')

Test Accuracy = 81.9 %
