# IndoXTC - Fine-tune Head Toxic [mBERT] [ALL]
Exploring Indonesian hate speech/abusive & sentiment text classification using a multilingual language model.

This kernel is a part of my undergraduate final year project.
Check out the full GitHub repository:
https://github.com/ilhamfp/indonesian-text-classification-multilingual

In [1]:
# ----------------------------------------------------------------
# Experiment configuration: every tunable value of this notebook.
# ----------------------------------------------------------------

# Indonesian dataset: short tag (used in result file names) and the
# directory handed to load_experiment_features as the first argument.
INDO_DATA_NAME = "trip_advisor"
DATA_PATH_INDO = "../input/indoxtc-extracting-tripadvisor-features-mbert"

# Foreign dataset: short tag and the directory handed to
# load_experiment_features as the second argument.
FOREIGN_DATA_NAME = "yelp"
DATA_PATH_FOREIGN = "../input/indoxtc-combining-yelp-features-mbert"

# Model tag; in the visible cells it only labels the result CSV files.
MODEL_NAME = "mBERT"

# Experiment scenarios to run (A / B / C). Only scenario 'C' iterates over
# every entry of FOREIGN_LANG_DATA_MULT_LIST; the others run once per size.
EXPERIMENT_TYPE_LIST = ["A", "B", "C"]

# Values passed as `total_data` to the feature loader.
TOTAL_DATA_LIST = [
    500,
    1000,
    2500,
    5000,
    7500,
    12389,
]

# Values passed as `foreign_mult` to the feature loader; the amount of
# foreign data recorded for scenario 'C' is int(total_data * multiplier).
FOREIGN_LANG_DATA_MULT_LIST = [
    0.25, 0.5, 0.75,
    1, 1.5,
    2, 3, 4, 5, 6, 7, 8, 9, 10,
]

# Each seed triggers one full sweep and one per-seed result file.
RANDOM_SEED_LIST = [4, 5, 6]

# Passed as `valid_size` to the loader -- presumably the fraction of the
# training data held out for validation; confirm in load_data.
VALIDATION_DATA = 0.1

In [2]:
import os
import numpy as np
import pandas as pd
from load_data import load_experiment_features, set_random_seed_data
from model_head import set_seed, train, test, evaluate

CUDA is not available.  Training on CPU ...


## Experiment

In [3]:
# Hyper-parameters of the classification head, named once so that the
# train() and test() calls cannot silently drift apart.
LEARNING_RATE = 0.0001
INPUT_DIM = 768  # feature width given to train/test; presumably the mBERT embedding size -- TODO confirm
DECISION_THRESHOLD = 0.5  # fixed cut-off handed to evaluate()

# One record (dict) per run, accumulated over all seeds; read by the "Save" cell.
result_list_final = []
for RANDOM_SEED in RANDOM_SEED_LIST:
    print("\n############################")
    print("### START RANDOM_SEED {} ###".format(RANDOM_SEED))
    print("############################")
    result_list = []  # records produced under the current seed only
    for TOTAL_DATA in TOTAL_DATA_LIST:
        print("\n##### TOTAL_DATA {} #####".format(TOTAL_DATA))

        for EXPERIMENT_TYPE in EXPERIMENT_TYPE_LIST:
            # Only type 'C' sweeps the foreign-data multiplier. Every other
            # type runs exactly once and still receives the first multiplier
            # of the list as `foreign_mult`, so the loader sees the same
            # arguments as before.
            sweeps_multiplier = EXPERIMENT_TYPE == 'C'
            if sweeps_multiplier:
                mult_candidates = FOREIGN_LANG_DATA_MULT_LIST
            else:
                mult_candidates = FOREIGN_LANG_DATA_MULT_LIST[:1]

            for FOREIGN_LANG_DATA_MULT in mult_candidates:
                # Re-seed both helpers before every single run.
                set_seed(seed=RANDOM_SEED)
                set_random_seed_data(seed=RANDOM_SEED)

                if sweeps_multiplier:
                    print('\n~~Result_{}_{}_{}~~'.format(EXPERIMENT_TYPE,
                                                         TOTAL_DATA,
                                                         FOREIGN_LANG_DATA_MULT))
                else:
                    print('\n~~Result_{}_{}~~'.format(EXPERIMENT_TYPE,
                                                      TOTAL_DATA))

                train_loader, valid_loader, test_loader = load_experiment_features(
                    DATA_PATH_INDO,
                    DATA_PATH_FOREIGN,
                    tipe=EXPERIMENT_TYPE,
                    total_data=TOTAL_DATA,
                    foreign_mult=FOREIGN_LANG_DATA_MULT,
                    valid_size=VALIDATION_DATA)

                train(train_loader, valid_loader,
                      learning_rate=LEARNING_RATE, input_dim=INPUT_DIM)
                print("## Test phase...")
                y_true, y_pred_proba = test(test_loader, input_dim=INPUT_DIM)
                max_f1, max_recall, max_precision, max_threshold = evaluate(
                    y_true,
                    y_pred_proba,
                    threshold=DECISION_THRESHOLD)

                # Values written to the result table. Types A and B do not
                # sweep the multiplier, so they get sentinel multipliers
                # (0 and -1) and a fixed foreign-data count.
                if EXPERIMENT_TYPE == 'A':
                    OUTPUT_FOREIGN_LANG_DATA_MULT = 0
                    OUTPUT_TOTAL_FOREIGN_DATA = 0
                elif EXPERIMENT_TYPE == 'B':
                    OUTPUT_FOREIGN_LANG_DATA_MULT = -1
                    OUTPUT_TOTAL_FOREIGN_DATA = TOTAL_DATA
                else:
                    OUTPUT_FOREIGN_LANG_DATA_MULT = FOREIGN_LANG_DATA_MULT
                    OUTPUT_TOTAL_FOREIGN_DATA = int(TOTAL_DATA * FOREIGN_LANG_DATA_MULT)

                result_list.append({
                    'type': EXPERIMENT_TYPE,
                    'total_data': TOTAL_DATA,
                    'seed': RANDOM_SEED,
                    'foreign_mult': OUTPUT_FOREIGN_LANG_DATA_MULT,
                    'total_foreign_data': OUTPUT_TOTAL_FOREIGN_DATA,
                    'f1': max_f1,
                    'recall': max_recall,
                    'precision': max_precision
                })

                # Delete the model file before the next run starts
                # (presumably the checkpoint written by train() -- confirm in model_head).
                os.remove("model.pt")

    # Persist this seed's results on their own, so a later failure does not
    # lose the seeds that already finished.
    seed_result = pd.DataFrame(result_list).sort_values(by=['total_data', 'foreign_mult'])
    seed_result.to_csv('final_{}_{}_{}_result_combined_seed_{}.csv'.format(INDO_DATA_NAME,
                                                                           FOREIGN_DATA_NAME,
                                                                           MODEL_NAME,
                                                                           RANDOM_SEED),
                       index=False)

    result_list_final.extend(result_list)


############################
### START RANDOM_SEED 4 ###
############################

##### TOTAL_DATA 500 #####

~~Result_A_500~~
EarlyStopping! Epoch 20
Last: 7 	T-Loss: 0.673595 	T-Acc: 0.593333 	V-Loss: 0.703982 	V-Acc: 0.500000
## Test phase...
Final test Loss: 0.709043
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.48358873 	Recall: 0.60177778 	Prec: 0.45836154

~~Result_B_500~~
EarlyStopping! Epoch 32
Last: 19 	T-Loss: 0.684471 	T-Acc: 0.566667 	V-Loss: 0.698719 	V-Acc: 0.480000
## Test phase...
Final test Loss: 0.698008
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.48426899 	Recall: 0.59822222 	Prec: 0.45844687

~~Result_C_500_0.25~~
EarlyStopping! Epoch 46
Last: 33 	T-Loss: 0.666279 	T-Acc: 0.600355 	V-Loss: 0.697509 	V-Acc: 0.480000
## Test phase...
Final test Loss: 0.708653
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


EarlyStopping! Epoch 52
Last: 39 	T-Loss: 0.555052 	T-Acc: 0.712717 	V-Loss: 0.606468 	V-Acc: 0.664969
## Test phase...
Final test Loss: 0.628227
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.62770583 	Recall: 0.76266667 	Prec: 0.57583893

~~Result_C_12389_10~~
EarlyStopping! Epoch 33
Last: 20 	T-Loss: 0.554769 	T-Acc: 0.713001 	V-Loss: 0.619492 	V-Acc: 0.646640
## Test phase...
Final test Loss: 0.641569
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.61337394 	Recall: 0.75200000 	Prec: 0.56400000

############################
### START RANDOM_SEED 5 ###
############################

##### TOTAL_DATA 500 #####

~~Result_A_500~~
EarlyStopping! Epoch 30
Last: 17 	T-Loss: 0.672133 	T-Acc: 0.595556 	V-Loss: 0.711188 	V-Acc: 0.500000
## Test phase...
Final test Loss: 0.714548
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.475947

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


EarlyStopping! Epoch 41
Last: 28 	T-Loss: 0.554993 	T-Acc: 0.712241 	V-Loss: 0.615585 	V-Acc: 0.651731
## Test phase...
Final test Loss: 0.624673
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.63640210 	Recall: 0.78755556 	Prec: 0.58174655

~~Result_C_12389_10~~
EarlyStopping! Epoch 40
Last: 27 	T-Loss: 0.553115 	T-Acc: 0.712976 	V-Loss: 0.617350 	V-Acc: 0.664969
## Test phase...
Final test Loss: 0.631599
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.62427832 	Recall: 0.76888889 	Prec: 0.57246856

############################
### START RANDOM_SEED 6 ###
############################

##### TOTAL_DATA 500 #####

~~Result_A_500~~
EarlyStopping! Epoch 14
Last: 1 	T-Loss: 0.703487 	T-Acc: 0.511111 	V-Loss: 0.689131 	V-Acc: 0.580000
## Test phase...
Final test Loss: 0.700725
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.3174001

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


EarlyStopping! Epoch 30
Last: 17 	T-Loss: 0.557558 	T-Acc: 0.709282 	V-Loss: 0.612309 	V-Acc: 0.653768
## Test phase...
Final test Loss: 0.629169
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.62595343 	Recall: 0.76266667 	Prec: 0.57429719

~~Result_C_12389_10~~
EarlyStopping! Epoch 33
Last: 20 	T-Loss: 0.556212 	T-Acc: 0.710001 	V-Loss: 0.624410 	V-Acc: 0.647658
## Test phase...
Final test Loss: 0.631984
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.62227495 	Recall: 0.74933333 	Prec: 0.57191316


## Save

In [4]:
# Gather the records of every seed into one table, order the rows by data
# size and multiplier, and write the combined CSV (without the index column).
combined_csv_name = 'final_{}_{}_{}_result_combined.csv'.format(
    INDO_DATA_NAME, FOREIGN_DATA_NAME, MODEL_NAME)
final_result = (
    pd.DataFrame(result_list_final)
    .sort_values(by=['total_data', 'foreign_mult'])
)
final_result.to_csv(combined_csv_name, index=False)


In [5]:
# Sanity check: print the table dimensions, then preview the first rows.
n_rows, n_cols = final_result.shape
print((n_rows, n_cols))
final_result.head(n=5)

(288, 8)


Unnamed: 0,type,total_data,seed,foreign_mult,total_foreign_data,f1,recall,precision
1,B,500,4,-1.0,500,0.484269,0.598222,0.458447
97,B,500,5,-1.0,500,0.479491,0.608889,0.456362
193,B,500,6,-1.0,500,0.318254,1.0,0.463535
0,A,500,4,0.0,0,0.483589,0.601778,0.458362
96,A,500,5,0.0,0,0.475948,0.637333,0.457562
