In [1]:
import warnings
warnings.filterwarnings( 'ignore' )

import gc
import os
import time

# Both variables are placed in the environment BEFORE TensorFlow is imported.
# TF_CPP_MIN_LOG_LEVEL only takes effect if it is already set when TensorFlow's
# native runtime starts logging, so assigning it after `import tensorflow`
# (as this cell previously did) does not reliably silence the start-up messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # suppress tensorflow warnings

# Expose only the first GPU to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import text, sequence

from contextlib import redirect_stdout
from tensorflow.keras.utils import plot_model

In [2]:
import All_UT_Models
import UT_Utils

In [3]:
# ---------------------------------------------------------------------------
# Hyper-parameters of this experiment (every tunable value lives in this cell)
# ---------------------------------------------------------------------------

# --- input / encoder ---
MAX_LEN = 130              # tokenizer max_length and width of the model's input layers
EMBED_SIZE = 768           # NOTE(review): not referenced by the visible cells; the dense head hard-codes 768
BERT_TRAINABLE = True      # assigned to BERT_MODEL.trainable before the classifier is built

# --- classification head ---
DRPT = 0.4                       # dropout rate applied to the BERT output
FC_ACT = 'elu'                   # activation of the fully connected layer
FC_WEIGHTS_INIT = 'he_uniform'   # kernel initializer of the fully connected layer

# --- optimisation ---
OPTIMIZER = 'adam'         # optimizer identifier handed to model.compile
LR_RATE = 1e-5             # learning rate written onto the compiled optimizer

# --- training schedule ---
BATCH = 32                 # batch size for fit() and predict()
NEPOCHS = 20               # upper bound on epochs per fold
PATIENCE = 4               # forwarded to UT_Utils.F1_score_callback_HF (early stopping)
DECAY = True               # forwarded to the same callback; exact semantics are defined in UT_Utils
DECAY_RATE = 0.3           # forwarded to the same callback
DECAY_AFTER = 1            # forwarded to the same callback

In [4]:
# Experiment name; reused for checkpoint, summary and result file names.
modelname = 'hfFineTuneBert'

# Output locations (relative to the notebook's working directory).
modelpath = './Saved Models/' + modelname + '/'   # per-fold model checkpoints
modelresults = './Model Results'                  # metric summaries and predictions
modelsummaries = './Model - Summaries-Figures'    # architecture text summary and diagram

DIRECTORIES_TO_BE_CREATED = [ modelpath, modelresults, modelsummaries ]
for directory in DIRECTORIES_TO_BE_CREATED:
    # exist_ok=True makes the cell idempotent on re-run and avoids the
    # check-then-create race of `if not os.path.exists(...)`.
    os.makedirs( directory, exist_ok=True )

In [5]:
# Identifier of the pretrained checkpoint used for the tokenizer, config and model.
bertmodelname = 'bert-base-multilingual-cased'

# Directory passed as `cache_dir` when the model weights are loaded.
hfcachedir = f'./Embeddings/huggingface/{bertmodelname}/'
hfcachedir

'./Embeddings/huggingface/bert-base-multilingual-cased/'

In [6]:
def hms_string( sec_elapsed ):
    """Format a duration given in seconds as 'H hrs MM mins SS.ss secs'.

    Parameters
    ----------
    sec_elapsed : int or float
        Elapsed time in seconds (expected to be non-negative).

    Returns
    -------
    str
        Hours unpadded, minutes zero-padded to two digits, seconds
        zero-padded with two decimals.
    """
    # Round to whole centiseconds BEFORE splitting into h/m/s. Splitting first
    # and rounding only while formatting lets values such as 59.999 be rendered
    # as "60.00 secs" instead of rolling over into the next minute.
    total_centis = int( round( sec_elapsed * 100 ) )
    h, remainder = divmod( total_centis, 60 * 60 * 100 )
    m, remainder = divmod( remainder, 60 * 100 )
    s = remainder / 100
    return "{} hrs {:>02} mins {:>05.2f} secs".format( h, m, s )

In [7]:
# Column with the comment text and column with the binary label.
xcolumn = 'Urdu'
ycolumn = 'Toxic'

# Read both sheets in one pass over the workbook (a list of sheet names
# returns a dict of DataFrames) instead of parsing the file twice.
sheets = pd.read_excel( 'RUT_UT_Comments.xlsx', sheet_name=[ 'Sheet1', 'Sheet2' ] )
df = pd.concat( [ sheets[ 'Sheet1' ], sheets[ 'Sheet2' ] ], ignore_index=True )

# Drop incomplete rows BEFORE the string cast: astype( 'str' ) can turn a
# missing value into the literal text 'nan', which a later dropna() no longer
# recognises, so empty comments would silently enter the data set.
df.dropna( inplace=True )

df[ xcolumn ] = df[ xcolumn ].astype( 'str' ).apply( UT_Utils.urdu_preprocessing )

# Kept from the original flow: remove rows for which preprocessing itself
# produced a missing value, then rebuild a 0..n-1 index so that positional
# indices from the cross-validation splitter match the index labels.
df.dropna( inplace=True )
df.reset_index( drop=True, inplace=True )

del sheets
gc.collect()

df.shape

(72771, 3)

In [8]:
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.models import Model

def FineTuneBertWithDenseLayer(
    bert_model, max_len=128, drpt=0.1, fc_weights_init='', fc_act='', optimizer='' ):
    """Build and compile a binary classifier on top of a BERT encoder.

    Architecture: (input ids, attention mask) -> bert_model -> element 1 of its
    output -> Dropout -> Dense(768) -> Dense(1, sigmoid).

    Parameters
    ----------
    bert_model : callable Keras-compatible encoder
        Called as ``bert_model( input_ids, attention_mask=... )``; element
        ``[ 1 ]`` of the result is used as the sentence representation
        (for Hugging Face ``TFBertModel`` this is the pooled output).
    max_len : int
        Fixed sequence length of both integer inputs.
    drpt : float
        Dropout rate applied to the encoder output.
    fc_weights_init : str
        Kernel initializer of the hidden dense layer. An empty value falls
        back to Keras' default ``'glorot_uniform'``.
    fc_act : str
        Activation of the hidden dense layer. An empty value means no
        activation (Keras' default).
    optimizer : str or optimizer instance
        Passed to ``model.compile``. An empty value falls back to Keras'
        default ``'rmsprop'``.

    Returns
    -------
    tensorflow.keras.Model
        Model compiled with binary cross-entropy loss and accuracy metric.
    """
    # The signature defaults are empty strings, which are not valid Keras
    # identifiers; map them to Keras' own defaults so that calling the builder
    # without these arguments works instead of raising.
    kernel_init = fc_weights_init or 'glorot_uniform'
    activation = fc_act or None
    optimizer = optimizer or 'rmsprop'

    input_ids_in = tf.keras.layers.Input( shape=( max_len, ), dtype='int32' )
    input_masks_in = tf.keras.layers.Input( shape=( max_len, ), dtype='int32' )

    # Index 1 selects the second element of the encoder output.
    embedding_layer = bert_model( input_ids_in, attention_mask=input_masks_in )[ 1 ]

    d = Dropout( drpt )( embedding_layer )
    d = Dense( 768, activation=activation, kernel_initializer=kernel_init )( d )
    outp = Dense( 1, activation='sigmoid' )( d )

    model = Model( inputs=[ input_ids_in, input_masks_in ], outputs=outp )

    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy']
    )
    return model

In [9]:
from transformers import BertTokenizer, BertConfig, TFBertModel
from transformers import utils
# Tokenizer parallelism is disabled to avoid deadlocks.
os.environ[ "TOKENIZERS_PARALLELISM" ] = "false"
# Only show Hugging Face log records of severity ERROR (numeric level 40) and above.
utils.logging.set_verbosity_error()

In [10]:
# Tokenizer belonging to the pretrained checkpoint named by `bertmodelname`.
BERT_TOKENIZER = BertTokenizer.from_pretrained( pretrained_model_name_or_path=bertmodelname )

def get_tokens_and_masks( bert_tokenizer, sentences, max_length=128, padding='max_length' ):
    """Encode sentences with a BERT tokenizer and return ids and masks as arrays.

    Parameters
    ----------
    bert_tokenizer : callable
        Tokenizer invoked as ``bert_tokenizer( sentences, **options )``; its
        result must provide the keys 'input_ids' and 'attention_mask'.
    sentences : list of str
        Texts to encode.
    max_length : int
        Forwarded to the tokenizer; truncation is always enabled.
    padding : str or bool
        Forwarded to the tokenizer as its padding strategy.

    Returns
    -------
    tuple of numpy.ndarray
        ``( input_ids, attention_mask )`` converted with ``np.array``.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding=padding,
        truncation=True,
        return_attention_mask=True,
    )
    encoding = bert_tokenizer( sentences, **tokenizer_options )

    token_ids = np.array( encoding[ 'input_ids' ] )
    attention_masks = np.array( encoding[ 'attention_mask' ] )
    return token_ids, attention_masks

In [11]:
# Configuration of the pretrained checkpoint; reused for every per-fold model load.
BERT_CONFIG = BertConfig.from_pretrained( pretrained_model_name_or_path=bertmodelname )
BERT_CONFIG

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 119547
}

In [12]:
# Five stratified folds, shuffled with a fixed seed so the fold membership is repeatable.
skf = StratifiedKFold( n_splits=5, shuffle=True, random_state=0 )
print( skf )

StratifiedKFold(n_splits=5, random_state=0, shuffle=True)


In [13]:
# Cross-validated fine-tuning: for every fold produced by `skf`, tokenize the
# data, build and train a fresh BERT classifier, reload the saved model, score
# the validation and test parts, and collect metrics and test predictions.
warnings.filterwarnings( 'ignore' )
start_time = time.time()
print( 'Local System Time: {}'.format( time.strftime( "%I:%M %p", time.localtime() ) ) )

# per-fold metric accumulators (one entry appended per fold)
valaccuracy, valprecision, valrecall, valf1, valcm = [], [], [], [], []
testaccuracy, testprecision, testrecall, testf1, testcm = [], [], [], [], []
# out-of-fold test predictions, concatenated over all folds
com_text, com_label, com_predicted, com_prob = [], [], [], []
com_indices = []
# Keras training history per fold, keyed 'fold1', 'fold2', ...
hist = {}

fold = 1
for train_index, test_index in skf.split( df[ xcolumn ], df[ ycolumn ] ):
    # separating train and test sets
    # NOTE(review): skf.split yields positional indices while .loc is label based;
    # this relies on df having a 0..n-1 index (it is reset in the loading cell).
    xtrain = df.loc[ train_index ][ xcolumn ].values
    xtest = df.loc[ test_index ][ xcolumn ].values
    
    ytrain = df.loc[ train_index ][ ycolumn ].values
    ytest = df.loc[ test_index ][ ycolumn ].values
    
    # splitting train and validation sets
    # NOTE(review): no `stratify=` is passed, so the 15% validation part is a plain
    # random sample and its class ratio can differ slightly from the training part.
    xtrain, xval, ytrain, yval = train_test_split( xtrain, ytrain, test_size=0.15, random_state=0 )
    
    # encode all three parts to fixed-length (MAX_LEN) id and mask arrays
    xtrain_input_ids, xtrain_attention_masks = get_tokens_and_masks(
        BERT_TOKENIZER, xtrain.tolist(), max_length=MAX_LEN, padding='max_length'
    )
    
    xval_input_ids, xval_attention_masks = get_tokens_and_masks(
        BERT_TOKENIZER, xval.tolist(), max_length=MAX_LEN, padding='max_length'
    )
    
    xtest_input_ids, xtest_attention_masks = get_tokens_and_masks(
        BERT_TOKENIZER, xtest.tolist(), max_length=MAX_LEN, padding='max_length'
    )
    
    # instantiate bert model
    # a fresh pretrained encoder is loaded in every fold
    BERT_MODEL = TFBertModel.from_pretrained( bertmodelname, config=BERT_CONFIG, cache_dir=hfcachedir )
    BERT_MODEL.trainable = BERT_TRAINABLE
    
    # define a model
    model = FineTuneBertWithDenseLayer(
        bert_model=BERT_MODEL,
        max_len=MAX_LEN, drpt=DRPT, fc_weights_init=FC_WEIGHTS_INIT, fc_act=FC_ACT, optimizer=OPTIMIZER
    )
    
    # save model summaries and model architecture diagrams in the first fold only
    if fold == 1:
        plot_model( model=model, to_file='{}/{}.png'.format( modelsummaries, modelname ), show_shapes=False )
        
        with open( '{}/{}.txt'.format( modelsummaries, modelname ), 'w' ) as s:
            with redirect_stdout( s ):
                model.summary()
    
    # the optimizer is created from a string identifier, so its learning rate is
    # overwritten here with the configured LR_RATE
    K.set_value( model.optimizer.lr, LR_RATE )
    
    # train the model with callbacks for early stopping
    # Validation data is handed to the callback, not to fit(). Presumably the
    # callback computes validation F1 per epoch, saves the model to `filepath`,
    # and handles early stopping and learning-rate decay (see UT_Utils; TODO confirm).
    f1callback = UT_Utils.F1_score_callback_HF(
        val_data=( xval_input_ids, xval_attention_masks, yval ),
        filepath=modelpath + modelname + str( fold ), patience=PATIENCE,
        decay=DECAY, decay_rate=DECAY_RATE, decay_after=DECAY_AFTER
    )
    histmodel = model.fit(
        [ xtrain_input_ids, xtrain_attention_masks ], ytrain,
        batch_size=BATCH, epochs=NEPOCHS, verbose=0, callbacks=[ f1callback ]
    )
    
    # save history of all folds
    hist[ 'fold' + str( fold ) ] = histmodel.history.copy()
    
    # NOTE(review): this runs after fit() has returned, so it stores the encoder
    # weights as they are at the end of training. Whether those equal the best
    # validation-F1 epoch depends on what the callback does — verify.
    hffinetuned = './Embeddings/hf_finetuned/{}-fold{}/'.format( bertmodelname, fold )
    BERT_MODEL.save_pretrained( hffinetuned )
    
    # delete trained model object
    del model, histmodel, f1callback, BERT_MODEL
    K.clear_session()
    gc.collect()
    
    # load saved model
    # the full model is read from the same path that was given to the callback as `filepath`
    FINETUNED_BERT_MODEL = TFBertModel.from_pretrained( hffinetuned, config=BERT_CONFIG, cache_dir=hfcachedir )
    FINETUNED_BERT_MODEL.trainable = False
    loaded_model = load_model(
        modelpath + modelname + str( fold ),
        custom_objects={ 'TFBertModel': FINETUNED_BERT_MODEL }
    )
    
    # get predictions (probabilities) for validation and test sets respectively
    valpredictions = loaded_model.predict(
        [ xval_input_ids, xval_attention_masks ], verbose=0, batch_size=BATCH
    )
    testpredictions = loaded_model.predict(
        [ xtest_input_ids, xtest_attention_masks ], verbose=0, batch_size=BATCH
    )
    
    # delete loaded model
    del loaded_model, FINETUNED_BERT_MODEL
    K.clear_session()
    gc.collect()
    
    # optimize threshold on validation set
    # the threshold is chosen on validation predictions only and then applied to the test fold
    threshold = UT_Utils.optimize_threshold( yval, valpredictions )
    
    # save accuracy, precision, recall, f1 and confusion matrices
    vallabels = ( valpredictions >= threshold ).astype( 'int32' )
    testlabels = ( testpredictions >= threshold ).astype( 'int32' )
    
    valaccuracy.append( accuracy_score( yval, vallabels ) )
    valprecision.append( precision_score( yval, vallabels ) )
    valrecall.append( recall_score( yval, vallabels ) )
    valf1.append( f1_score( yval, vallabels ) )
    valcm.append( confusion_matrix( yval, vallabels ) )    
    
    testaccuracy.append( accuracy_score( ytest, testlabels ) )
    testprecision.append( precision_score( ytest, testlabels ) )
    testrecall.append( recall_score( ytest, testlabels ) )
    testf1.append( f1_score( ytest, testlabels ) )
    testcm.append( confusion_matrix( ytest, testlabels ) )
    
    # save for future analysis and ensemble
    # predictions are indexed as 2-D arrays; column 0 is taken to get flat lists
    com_indices.extend( test_index.tolist() )
    com_text.extend( df.loc[ test_index ][ xcolumn ] )
    com_label.extend( df.loc[ test_index ][ ycolumn ].tolist() )
    com_predicted.extend( testlabels[:,0].tolist() )
    com_prob.extend( testpredictions[:,0].tolist() )
    
    print( 'Fold: {:02d} out of {:02d} completed.'.format( fold, skf.get_n_splits() ) )
    print( 'Local System Time: {}'.format( time.strftime( "%I:%M %p", time.localtime() ) ) )
    
    fold = fold + 1
# wall-clock duration of the complete cross-validation run
time_took = time.time() - start_time
print( f"Total runtime: { hms_string( time_took ) }" )

Local System Time: 02:22 AM
Epoch: 001 --LR: 1e-05 --MaxValF1: 0.8651685 --CurValF1: 0.8651685 --Patience: 01 --F1 improved: 0.8651685
Epoch: 002 --LR: 1e-05 --MaxValF1: 0.8898445 --CurValF1: 0.8898445 --Patience: 01 --F1 improved: 0.8898445
Epoch: 003 --LR: 1e-05 --MaxValF1: 0.8898474 --CurValF1: 0.8898474 --Patience: 01 --F1 improved: 0.8898474
Epoch: 004 --LR: 1e-05 --MaxValF1: 0.8924303 --CurValF1: 0.8924303 --Patience: 01 --F1 improved: 0.8924303
Epoch: 005 --LR: 1e-05 --MaxValF1: 0.8980263 --CurValF1: 0.8980263 --Patience: 01 --F1 improved: 0.8980263
Epoch: 006 --LR: 1e-05 --MaxValF1: 0.8980263 --CurValF1: 0.8933909 --Patience: 01
Epoch: 007 --LR: 3e-06 --MaxValF1: 0.9007021 --CurValF1: 0.9007021 --Patience: 02 --F1 improved: 0.9007021
Epoch: 008 --LR: 3e-06 --MaxValF1: 0.9007021 --CurValF1: 0.8957346 --Patience: 01
Epoch: 009 --LR: 9e-07 --MaxValF1: 0.9007021 --CurValF1: 0.8959306 --Patience: 02
Epoch: 010 --LR: 3e-07 --MaxValF1: 0.9007021 --CurValF1: 0.8955422 --Patience: 03
Ep

In [14]:
# Per-fold validation scores (4 decimals) followed by mean +- standard deviation.
validation_metrics = (
    ( 'Validation Accuracy', valaccuracy ),
    ( 'Validation Precision', valprecision ),
    ( 'Validation Recall', valrecall ),
    ( 'Validation F1', valf1 ),
)
last_position = len( validation_metrics ) - 1
for position, ( heading, fold_scores ) in enumerate( validation_metrics ):
    print( heading )
    summary = [
        [ f'{score:0.4f}' for score in fold_scores ],
        np.mean( fold_scores ), '+-', np.std( fold_scores ),
    ]
    # every section except the last is followed by an extra newline argument
    if position != last_position:
        summary.append( '\n' )
    print( *summary )

Validation Accuracy
['0.9660', '0.9658', '0.9666', '0.9692', '0.9652'] 0.9665407076605976 +- 0.0013994380961825006 

Validation Precision
['0.9251', '0.9201', '0.9285', '0.9398', '0.9103'] 0.9247527191584325 +- 0.00969827975654265 

Validation Recall
['0.8775', '0.8824', '0.8778', '0.8817', '0.8902'] 0.881937634531497 +- 0.004573058170255745 

Validation F1
['0.9007', '0.9008', '0.9025', '0.9098', '0.9001'] 0.9027913272326711 +- 0.00360075513209736


In [15]:
# Show each fold's validation confusion matrix rotated by 180 degrees, i.e. with
# both the row order and the column order reversed.
for fold_matrix in valcm:
    print( np.rot90( fold_matrix, 2 ), '\n' )

[[1347  188]
 [ 109 7089]] 

[[1358  181]
 [ 118 7076]] 

[[1351  188]
 [ 104 7090]] 

[[1357  182]
 [  87 7107]] 

[[1370  169]
 [ 135 7059]] 



In [16]:
# Per-fold test scores (4 decimals) followed by mean +- standard deviation.
test_metrics = (
    ( 'Test Accuracy', testaccuracy ),
    ( 'Test Precision', testprecision ),
    ( 'Test Recall', testrecall ),
    ( 'Test F1', testf1 ),
)
final_position = len( test_metrics ) - 1
for position, ( heading, fold_scores ) in enumerate( test_metrics ):
    print( heading )
    summary = [
        [ f'{score:0.4f}' for score in fold_scores ],
        np.mean( fold_scores ), '+-', np.std( fold_scores ),
    ]
    # every section except the last is followed by an extra newline argument
    if position != final_position:
        summary.append( '\n' )
    print( *summary )

Test Accuracy
['0.9648', '0.9632', '0.9652', '0.9658', '0.9627'] 0.9643401838245864 +- 0.0011997628524600974 

Test Precision
['0.9410', '0.9165', '0.9171', '0.9323', '0.9074'] 0.9228542926066915 +- 0.012104933094552782 

Test Recall
['0.8584', '0.8752', '0.8870', '0.8732', '0.8828'] 0.8753162590464865 +- 0.009821960179478917 

Test F1
['0.8978', '0.8954', '0.9018', '0.9018', '0.8949'] 0.8983334738044839 +- 0.002997072821619214


In [17]:
# Show each fold's test confusion matrix rotated by 180 degrees, i.e. with
# both the row order and the column order reversed.
for fold_matrix in testcm:
    print( np.rot90( fold_matrix, 2 ), '\n' )

[[ 2249   371]
 [  141 11794]] 

[[ 2293   327]
 [  209 11725]] 

[[ 2323   296]
 [  210 11725]] 

[[ 2287   332]
 [  166 11769]] 

[[ 2312   307]
 [  236 11699]] 



In [18]:
# Append one summary row for this model to the shared results file:
#   modelname,<accuracy>,<precision>,<recall>,<f1>
# where every metric cell is "<mean>+-<std>" over the test folds.
# NOTE(review): the values are shortened by slicing the string form (7 and 6
# characters), which truncates instead of rounding and would cut a value
# printed in scientific notation; kept unchanged so the row format stays the same.
summary_cells = [ modelname ]
for fold_scores in ( testaccuracy, testprecision, testrecall, testf1 ):
    summary_cells.append( str(np.mean( fold_scores ))[:7] + '+-' + str(np.std( fold_scores ))[:6] )

# The context manager closes the handle even if the write fails, and the row is
# written with a single call so a failure cannot leave a partial line behind.
with open( '{}/ResultsMain.csv'.format( modelresults ), mode='a' ) as file:
    file.write( ','.join( summary_cells ) + '\n' )

In [19]:
# Out-of-fold test predictions of all folds, one row per comment. The comment
# text itself is deliberately not exported; rows are identified by their index.
dfPredictions = pd.DataFrame( {
    'comment_indices': com_indices,
    'comment_label': com_label,
    'comment_predicted': com_predicted,
    'comment_prob': com_prob,
} )
dfPredictions.to_csv( f'{modelresults}/{modelname}.csv', index=False )
dfPredictions.shape

(72771, 4)