# IndoXTC - Fine-tune Head TripAdvisor [XLM-R] [ALL]
Exploring Indonesian hate speech/abusive language and sentiment text classification using a multilingual language model.

This kernel is a part of my undergraduate final year project.
Check out the full GitHub repository:
https://github.com/ilhamfp/indonesian-text-classification-multilingual

In [1]:
# ------------------------------------------------------------------
# Experiment configuration
# ------------------------------------------------------------------

# Language model tag; only used to build the output CSV file names.
MODEL_NAME = 'XLM_R'

# Data set tags (used in the output CSV file names) and the directories
# handed to load_experiment_features as its first two arguments.
INDO_DATA_NAME = 'trip_advisor'
FOREIGN_DATA_NAME = 'yelp'
DATA_PATH_INDO = '../input/indoxtc-extracting-tripadvisor-features-xlm-r'
DATA_PATH_FOREIGN = '../input/indoxtc-combining-yelp-features-xlm-r'

# Experiment types. In the experiment cell 'A' and 'B' are each run once
# per (seed, total_data) pair, while 'C' is run once per entry of
# FOREIGN_LANG_DATA_MULT_LIST.
EXPERIMENT_TYPE_LIST = [
    'A',
    'B',
    'C',
]

# Values swept as the `total_data` argument of load_experiment_features.
TOTAL_DATA_LIST = [
    500,
    1000,
    2500,
    5000,
    7500,
    12389,
]

# Values swept as the `foreign_mult` argument for experiment type 'C'.
FOREIGN_LANG_DATA_MULT_LIST = [
    0.25, 0.5, 0.75,
    1, 1.5, 2,
    3, 4, 5, 6, 7, 8, 9, 10,
]

# Every configuration is repeated once per seed.
RANDOM_SEED_LIST = [4, 5, 6]

# Passed as `valid_size` to load_experiment_features
# (presumably the fraction held out for validation -- TODO confirm).
VALIDATION_DATA = 0.1

In [2]:
import os
import numpy as np
import pandas as pd
from load_data import load_experiment_features, set_random_seed_data
from model_head import set_seed, train, test, evaluate

CUDA is not available.  Training on CPU ...


## Experiment

In [3]:
# Settings shared by every run in this cell (formerly inline literals).
LEARNING_RATE = 0.0001      # passed to train(...)
DECISION_THRESHOLD = 0.5    # passed to evaluate(...)


def _foreign_mults_for(experiment_type, mult_list):
    """Return the foreign-data multipliers to run for one experiment type.

    Type 'C' sweeps every entry of ``mult_list``. Any other type is run exactly
    once, with the first entry of ``mult_list`` as its ``foreign_mult`` argument.
    An empty ``mult_list`` yields no runs for any type.
    """
    if experiment_type == 'C':
        return list(mult_list)
    return list(mult_list[:1])


def _reported_foreign_fields(experiment_type, total_data, foreign_mult):
    """Return ``(foreign_mult, total_foreign_data)`` as stored in the result table.

    * 'A' is recorded as ``(0, 0)``.
    * 'B' is recorded as ``(-1, total_data)``; -1 is a sentinel multiplier.
    * Any other type is recorded as ``(foreign_mult, int(total_data * foreign_mult))``.
    """
    if experiment_type == 'A':
        return 0, 0
    if experiment_type == 'B':
        return -1, total_data
    return foreign_mult, int(total_data * foreign_mult)


# One record (dict) per run, accumulated over all seeds; consumed by the Save cell.
result_list_final = []
for RANDOM_SEED in RANDOM_SEED_LIST:
    print("\n############################")
    print("### START RANDOM_SEED {} ###".format(RANDOM_SEED))
    print("############################")
    result_list = []  # records of the current seed only
    for TOTAL_DATA in TOTAL_DATA_LIST:
        print("\n##### TOTAL_DATA {} #####".format(TOTAL_DATA))

        for EXPERIMENT_TYPE in EXPERIMENT_TYPE_LIST:
            for FOREIGN_LANG_DATA_MULT in _foreign_mults_for(EXPERIMENT_TYPE,
                                                             FOREIGN_LANG_DATA_MULT_LIST):
                # Re-seed before every single run so each configuration starts
                # from the same random state for a given seed.
                set_seed(seed=RANDOM_SEED)
                set_random_seed_data(seed=RANDOM_SEED)

                # The multiplier is only part of the header for type 'C'.
                if EXPERIMENT_TYPE != 'C':
                    print('\n~~Result_{}_{}~~'.format(EXPERIMENT_TYPE,
                                                       TOTAL_DATA))
                else:
                    print('\n~~Result_{}_{}_{}~~'.format(EXPERIMENT_TYPE,
                                                   TOTAL_DATA,
                                                   FOREIGN_LANG_DATA_MULT))

                train_loader, valid_loader, test_loader = load_experiment_features(
                    DATA_PATH_INDO,
                    DATA_PATH_FOREIGN,
                    tipe=EXPERIMENT_TYPE,
                    total_data=TOTAL_DATA,
                    foreign_mult=FOREIGN_LANG_DATA_MULT,
                    valid_size=VALIDATION_DATA)

                train(train_loader, valid_loader, learning_rate=LEARNING_RATE)
                print("## Test phase...")
                y_true, y_pred_proba = test(test_loader)
                max_f1, max_recall, max_precision, max_threshold = evaluate(
                    y_true,
                    y_pred_proba,
                    threshold=DECISION_THRESHOLD)

                OUTPUT_FOREIGN_LANG_DATA_MULT, OUTPUT_TOTAL_FOREIGN_DATA = _reported_foreign_fields(
                    EXPERIMENT_TYPE,
                    TOTAL_DATA,
                    FOREIGN_LANG_DATA_MULT)

                result_list.append({
                    'type': EXPERIMENT_TYPE,
                    'total_data': TOTAL_DATA,
                    'seed': RANDOM_SEED,
                    'foreign_mult': OUTPUT_FOREIGN_LANG_DATA_MULT,
                    'total_foreign_data': OUTPUT_TOTAL_FOREIGN_DATA,
                    'f1': max_f1,
                    'recall': max_recall,
                    'precision': max_precision
                })

                # Delete the file left behind by this run before the next one starts
                # (presumably the checkpoint written by train() -- TODO confirm).
                os.remove("model.pt")

    # Persist the results of this seed on their own, ordered by training-set
    # size and then by the recorded foreign multiplier.
    final_result = pd.DataFrame(result_list)
    final_result = final_result.sort_values(by=['total_data', 'foreign_mult'])
    final_result.to_csv('final_{}_{}_{}_result_combined_seed_{}.csv'.format(INDO_DATA_NAME,
                                                                           FOREIGN_DATA_NAME,
                                                                           MODEL_NAME,
                                                                           RANDOM_SEED),
                        index=False)

    result_list_final.extend(result_list)


############################
### START RANDOM_SEED 4 ###
############################

##### TOTAL_DATA 500 #####

~~Result_A_500~~
EarlyStopping! Epoch 36
Last: 23 	T-Loss: 0.682547 	T-Acc: 0.664444 	V-Loss: 0.673167 	V-Acc: 0.620000
## Test phase...
Final test Loss: 0.690155
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.51678565 	Recall: 0.82044444 	Prec: 0.50217628

~~Result_B_500~~
EarlyStopping! Epoch 36
Last: 23 	T-Loss: 0.689532 	T-Acc: 0.548889 	V-Loss: 0.677213 	V-Acc: 0.500000
## Test phase...
Final test Loss: 0.694628
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.46409262 	Recall: 0.66755556 	Prec: 0.45460048

~~Result_C_500_0.25~~
EarlyStopping! Epoch 43
Last: 30 	T-Loss: 0.674175 	T-Acc: 0.655417 	V-Loss: 0.668412 	V-Acc: 0.660000
## Test phase...
Final test Loss: 0.684803
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


EarlyStopping! Epoch 45
Last: 32 	T-Loss: 0.401991 	T-Acc: 0.835023 	V-Loss: 0.411892 	V-Acc: 0.813646
## Test phase...
Final test Loss: 0.435642
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.80238028 	Recall: 0.87288889 	Prec: 0.74450341

~~Result_C_12389_10~~
EarlyStopping! Epoch 43
Last: 30 	T-Loss: 0.399397 	T-Acc: 0.835783 	V-Loss: 0.415293 	V-Acc: 0.827902
## Test phase...
Final test Loss: 0.432160
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.80485635 	Recall: 0.87200000 	Prec: 0.74828375

############################
### START RANDOM_SEED 5 ###
############################

##### TOTAL_DATA 500 #####

~~Result_A_500~~
EarlyStopping! Epoch 14
Last: 1 	T-Loss: 0.696380 	T-Acc: 0.513333 	V-Loss: 0.673274 	V-Acc: 0.620000
## Test phase...
Final test Loss: 0.704791
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.3165447

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


EarlyStopping! Epoch 67
Last: 54 	T-Loss: 0.386750 	T-Acc: 0.840647 	V-Loss: 0.409568 	V-Acc: 0.815682
## Test phase...
Final test Loss: 0.421284
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.80938645 	Recall: 0.87466667 	Prec: 0.75344564

~~Result_C_12389_10~~
EarlyStopping! Epoch 59
Last: 46 	T-Loss: 0.377284 	T-Acc: 0.842531 	V-Loss: 0.400623 	V-Acc: 0.818737
## Test phase...
Final test Loss: 0.417441
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.81349541 	Recall: 0.88533333 	Prec: 0.75454545

############################
### START RANDOM_SEED 6 ###
############################

##### TOTAL_DATA 500 #####

~~Result_A_500~~
EarlyStopping! Epoch 23
Last: 10 	T-Loss: 0.684933 	T-Acc: 0.593333 	V-Loss: 0.679587 	V-Acc: 0.700000
## Test phase...
Final test Loss: 0.688743
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.526585

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


EarlyStopping! Epoch 68
Last: 55 	T-Loss: 0.380717 	T-Acc: 0.840509 	V-Loss: 0.400525 	V-Acc: 0.812627
## Test phase...
Final test Loss: 0.413873
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.81103322 	Recall: 0.87466667 	Prec: 0.75576037

~~Result_C_12389_10~~
EarlyStopping! Epoch 53
Last: 40 	T-Loss: 0.390967 	T-Acc: 0.838135 	V-Loss: 0.412075 	V-Acc: 0.813646
## Test phase...
Final test Loss: 0.427073
[Evaluate] Threshold argument set. Using 0.5 as threshold
[Evaluate] ##MAX## 
THRESHOLD: 0.500 	F1: 0.80401993 	Recall: 0.87733333 	Prec: 0.74490566


## Save

In [4]:
# Combine the records of all seeds into a single table, ordered by
# training-set size and then by the recorded foreign multiplier,
# and write it next to the per-seed CSV files.
final_result = (
    pd.DataFrame(result_list_final)
    .sort_values(by=['total_data', 'foreign_mult'])
)
final_result.to_csv(
    'final_{}_{}_{}_result_combined.csv'.format(INDO_DATA_NAME, FOREIGN_DATA_NAME, MODEL_NAME),
    index=False,
)


In [5]:
# Sanity check of the combined table: print its (rows, columns) shape,
# then show the first five rows as the cell output.
print(final_result.shape)
final_result.head(5)

(288, 8)


Unnamed: 0,type,total_data,seed,foreign_mult,total_foreign_data,f1,recall,precision
1,B,500,4,-1.0,500,0.464093,0.667556,0.4546
97,B,500,5,-1.0,500,0.316545,1.0,0.463154
193,B,500,6,-1.0,500,0.350663,0.002667,0.25
0,A,500,4,0.0,0,0.516786,0.820444,0.502176
96,A,500,5,0.0,0,0.316545,1.0,0.463154
