# IndoXTC - Fine-tune Full TripAdvisor [XLM-R]
Exploring Indonesian hate speech/abusive language and sentiment text classification using multilingual language models.

This kernel is a part of my undergraduate final year project.
Check out the full GitHub repository:
https://github.com/ilhamfp/indonesian-text-classification-multilingual

In [1]:
# ------------------------------------------------------------------
# Experiment configuration: every tunable value for this run.
# ------------------------------------------------------------------

# Indonesian dataset: short name (also used in the result file name)
# and its input directory.
INDO_DATA_NAME = 'trip_advisor'
DATA_PATH_INDO = '../input/dataset-tripadvisor'

# Foreign-language dataset: short name and its input directory.
FOREIGN_DATA_NAME = 'yelp'
DATA_PATH_FOREIGN = '../input/yelp-review-dataset'

# Key selecting the pretrained checkpoint: 'XLM_R' or 'mBERT'.
MODEL_NAME = 'XLM_R'

# Experiment scenario; allowed values: A / B / C.
EXPERIMENT_TYPE = 'C'

# Amount of data requested from the loader;
# values used: 500 / 750 / 1000 / 2500 / 5000 / 7500 / 9816.
TOTAL_DATA = 9816

# Foreign-language data multiplier; values used: 0.5 / 1 / 1.5 / 2 / 3.
FOREIGN_LANG_DATA_MULT = 3

# Seed handed to set_seed() and to the training shuffle.
RANDOM_SEED = 1

# Forwarded to the loader as valid_size.
VALIDATION_DATA = 0.1

# Upper bound on training epochs and the learning rate handed to build_model().
EPOCHS = 20
LEARNING_RATE = 5e-6

# Whether to try to attach to a TPU.
USE_TPU = True

# Forwarded to the loader as remove_duplication.
REMOVE_DUPLICATION = True

In [2]:
from model_full import set_seed, regular_encode, build_model, callback
from load_data import load_experiment_dataset
import os
import random
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
import transformers
from transformers import TFAutoModel, AutoTokenizer

# Seed the run through the project helper imported from model_full.
# NOTE(review): which libraries get seeded is decided inside set_seed and is
# not visible from this notebook - confirm it covers TensorFlow and NumPy.
set_seed(seed=RANDOM_SEED)

## TPU Configs

In [3]:
# Choose the distribution strategy, the global batch size, and the pretrained
# checkpoint for this run.
#
# `strategy` is always defined after this cell, whether or not a TPU is
# requested or found, so later cells can rely on it.
tpu = None
if USE_TPU:
    try:
        # TPU detection. No parameters necessary if TPU_NAME environment
        # variable is set: this is always the case on Kaggle.
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        # No TPU could be resolved; fall back to the default strategy below.
        tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

if USE_TPU:
    # 8 examples per replica.
    BATCH_SIZE = 8 * strategy.num_replicas_in_sync
else:
    BATCH_SIZE = 8 * 8

print("REPLICAS: ", strategy.num_replicas_in_sync)
AUTO = tf.data.experimental.AUTOTUNE
MAX_LEN = 512

# Map the configured model key to its pretrained checkpoint identifier.
_PRETRAINED_CHECKPOINTS = {
    'XLM_R': 'jplu/tf-xlm-roberta-large',
    'mBERT': 'bert-base-multilingual-cased',
}
if MODEL_NAME not in _PRETRAINED_CHECKPOINTS:
    # Fail here rather than with a NameError on MODEL in a later cell.
    raise ValueError(
        "Unknown MODEL_NAME {!r}; expected one of {}".format(
            MODEL_NAME, sorted(_PRETRAINED_CHECKPOINTS)
        )
    )
MODEL = _PRETRAINED_CHECKPOINTS[MODEL_NAME]

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8
REPLICAS:  8


## Load Data

In [4]:
# Load the three (texts, labels) splits for the configured experiment.
train_split, valid_split, test_split = load_experiment_dataset(
    INDO_DATA_NAME,
    FOREIGN_DATA_NAME,
    tipe=EXPERIMENT_TYPE,
    total_data=TOTAL_DATA,
    foreign_mult=FOREIGN_LANG_DATA_MULT,
    valid_size=VALIDATION_DATA,
    remove_duplication=REMOVE_DUPLICATION,
)

x_train, y_train = train_split
x_valid, y_valid = valid_split
x_test, y_test = test_split

~~~Train Data~~~
Shape:  (12389, 2)
                                                text  label
0  number ruangan itu tidak siap oleh number 30 j...      0
1  ada tidak ada pantai dan number menit berjalan...      0

Label:
1    6281
0    6108
Name: label, dtype: int64

~~~Test Data~~~
Shape:  (2429, 2)
                                                text  label
0  kekecewaan untuk ritz standar menginap number ...      0
1  kekecewaan untuk ritz standar menginap number ...      0
2  kekurangan number tidak ada fasilitas apapun l...      0
3  kelebihan lokasi strategis breakfast standard ...      0

Label:
0    1304
1    1125
Name: label, dtype: int64
Removing duplication...
Previous shape:  (12389, 2)
Current shape:  (9816, 2)
Duplicate removed.
~~~Data~~~
Shape:  (560000, 2)
   label                                               text
0      0  unfortunately the frustration of being dr gold...
1      1  been going to dr goldberg for over 10 years i ...

Label:
1    280000
0    280000
N

## Tokenizing

In [5]:
# Load the tokenizer that belongs to the checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=513.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




In [6]:
%%time 

x_train = regular_encode(x_train, tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(x_valid, tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(x_test, tokenizer, maxlen=MAX_LEN)

CPU times: user 37.8 s, sys: 165 ms, total: 38 s
Wall time: 38 s


## Build Datasets Objects

In [7]:
# Training pipeline.
# Shuffle BEFORE repeat: with a buffer as large as the dataset this yields a
# fresh full permutation per pass, so every example is seen once per pass.
# Repeating first would mix epoch boundaries, letting some examples appear
# several times within an epoch while others are skipped.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .shuffle(len(x_train),
             seed=RANDOM_SEED,
             reshuffle_each_iteration=True)
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation pipeline: fixed order, batched once and cached for reuse across
# epochs.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

# Test pipeline: inputs only (no labels), kept in original order so that the
# predictions line up with y_test.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_SIZE)
)

## Load Model

In [8]:
%%time
if USE_TPU:
    with strategy.scope():
        transformer_layer = TFAutoModel.from_pretrained(MODEL)
        model = build_model(transformer_layer, max_len=MAX_LEN, learning_rate=LEARNING_RATE)
        
else:
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN, learning_rate=LEARNING_RATE)
    
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3271420488.0, style=ProgressStyle(descr…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
tf_roberta_model (TFRobertaM ((None, 512, 1024), (None 559890432 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 1024)]            0         
_________________________________________________________________
dropout_74 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense (Dense)                (None, 1)                 1025      
Total params: 559,891,457
Trainable params: 559,891,457
Non-trainable params: 0
_________________________________________________________________
CPU times: user 2min 1s, sys: 39.4 s, total: 2min 40s
Wall time: 2min 41s


## Train

In [9]:
# Number of full batches in one pass over the training data (the remainder is
# dropped by the floor division).
num_train_examples = x_train.shape[0]
n_steps = num_train_examples // BATCH_SIZE
print(n_steps)

552


In [10]:
# Fine-tune for up to EPOCHS epochs, running n_steps batches per epoch.
# Callbacks are supplied by the project helper callback() from model_full.
fit_options = dict(
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    callbacks=callback(),
)
train_history = model.fit(train_dataset, **fit_options)

Train for 552 steps, validate for 16 steps
Epoch 1/20


  num_elements)
  num_elements)


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 00007: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-06.
Epoch 8/20
Epoch 9/20
Epoch 00009: ReduceLROnPlateau reducing learning rate to 1.249999968422344e-06.
Epoch 10/20
Epoch 00010: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-07.
Epoch 11/20
Epoch 00011: ReduceLROnPlateau reducing learning rate to 3.12499992105586e-07.
Epoch 12/20
Epoch 00012: ReduceLROnPlateau reducing learning rate to 1.56249996052793e-07.


## Save

In [11]:
# Persist only the model weights (not the full model) to 'model.h5' in the
# current working directory.
model.save_weights('model.h5') 

In [12]:
# Score the test set and write predictions alongside the true labels.
result = pd.DataFrame()
result['y_pred'] = model.predict(test_dataset, verbose=1).flatten()

# Assign the labels by position, not by index label. If y_test is a pandas
# Series whose index is not 0..n-1, a plain column assignment would align on
# the index and silently reorder labels or introduce NaN. A length mismatch
# raises ValueError here instead of going unnoticed.
result['y_true'] = np.asarray(y_test).ravel()

# The file name encodes the experiment configuration.
result_path = 'result_{}_{}_{}_{}_{}_{}_full.csv'.format(
    INDO_DATA_NAME,
    FOREIGN_DATA_NAME,
    MODEL_NAME,
    EXPERIMENT_TYPE,
    TOTAL_DATA,
    FOREIGN_LANG_DATA_MULT,
)
result.to_csv(result_path, index=False)

