In [1]:
import warnings
warnings.filterwarnings( 'ignore' )

import gc
import os
import time

# These environment variables are set BEFORE TensorFlow is imported: the native
# logging level is picked up when the TensorFlow runtime loads, so setting it
# after the import is too late to silence the import-time messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # suppress tensorflow warnings

# Expose only the first GPU to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import text, sequence

from contextlib import redirect_stdout
from tensorflow.keras.utils import plot_model

In [2]:
import All_UT_Models
import UT_Utils

In [3]:
# hyper parameters for this model

# --- encoder / input ---
BERT_TRAINABLE = True           # assigned to BERT_MODEL.trainable in the training loop
MAX_LEN = 131                   # tokenizer max_length and the model's input length
EMBED_SIZE = 768                # equals d_model in the printed MT5 config; not referenced by the visible cells

# --- classification head ---
FC_ACT = 'elu'                  # activation of the hidden Dense layer
FC_WEIGHTS_INIT = 'he_uniform'  # kernel initializer of the hidden Dense layer
DRPT = 0.4                      # dropout rate applied to the pooled encoder output

# --- optimisation ---
OPTIMIZER = 'adam'
LR_RATE = 6e-5                  # written into model.optimizer.lr after compile

# --- training schedule ---
NEPOCHS = 20
BATCH = 32

# Forwarded to UT_Utils.F1_score_callback_HF (early stopping / LR decay).
# NOTE(review): exact semantics live in UT_Utils -- confirm there.
PATIENCE = 4
DECAY = True
DECAY_AFTER = 1
DECAY_RATE = 0.3

In [4]:
# Name used for saved models, the summary/figure files and the results CSV.
modelname = 'hfFineTuneMT5'

modelpath = './Saved Models/' + modelname + '/'     # per-fold saved Keras models
modelresults = './Model Results'                    # metric summary + predictions CSVs
modelsummaries = './Model - Summaries-Figures'      # model.summary() text and architecture plot

# Create the output folders up front. exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of os.path.exists + makedirs.
DIRECTORIES_TO_BE_CREATED = [ modelpath, modelresults, modelsummaries ]
for directory in DIRECTORIES_TO_BE_CREATED:
    os.makedirs( directory, exist_ok=True )

In [5]:
# Hugging Face checkpoint id; used for the tokenizer, the config and the encoder.
bertmodelname = 'google/mt5-base'

# Local download cache, one sub-folder per checkpoint id.
hfcachedir = f'./Embeddings/huggingface/{bertmodelname}/'
hfcachedir

'./Embeddings/huggingface/google/mt5-base/'

In [6]:
def hms_string( sec_elapsed ):
    """Format a duration in seconds as '<h> hrs <mm> mins <ss.ss> secs'.

    Parameters
    ----------
    sec_elapsed : int or float
        Elapsed time in seconds.

    Returns
    -------
    str
        Hours (unpadded), minutes (2 digits) and seconds (2 decimals,
        zero-padded to width 5).
    """
    # Round to the displayed precision *before* splitting into h/m/s.
    # Otherwise a value such as 59.999 keeps minutes at 0 while the seconds
    # field rounds up at format time, yielding the impossible "60.00 secs".
    sec_elapsed = round( sec_elapsed, 2 )

    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{} hrs {:>02} mins {:>05.2f} secs".format( h, m, s )

In [7]:
xcolumn = 'Urdu'    # text column fed to the tokenizer
ycolumn = 'Toxic'   # binary label column

# Read both sheets in one call so the workbook is opened and parsed only once;
# with a list of sheet names read_excel returns a dict keyed by sheet name.
sheets = pd.read_excel( 'RUT_UT_Comments.xlsx', sheet_name=[ 'Sheet1', 'Sheet2' ] )
df = pd.concat( [ sheets[ 'Sheet1' ], sheets[ 'Sheet2' ] ], ignore_index=True )

# Drop incomplete rows BEFORE the string cast: astype('str') turns a missing
# value into the literal text 'nan', which a later dropna() cannot detect.
df = df.dropna()

df[ xcolumn ] = df[ xcolumn ].astype( 'str' )
df[ xcolumn ] = df[ xcolumn ].apply( UT_Utils.urdu_preprocessing )

# Second pass in case preprocessing produced missing values, then rebuild a
# clean 0..n-1 index (the training loop indexes df with positional fold indices).
df = df.dropna()
df = df.reset_index( drop=True )

del sheets
gc.collect()

df.shape

(72771, 3)

In [8]:
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.models import Model

def FineTuneBertWithDenseLayer(
    bert_model, max_len=128, drpt=0.1, fc_weights_init='', fc_act='', optimizer='', fc_units=768 ):
    """Build and compile a binary classifier on top of a Hugging Face encoder.

    Architecture: (input_ids, attention_mask) -> bert_model -> hidden state at
    sequence position 0 -> Dropout -> Dense(fc_units) -> Dense(1, sigmoid).

    Parameters
    ----------
    bert_model : callable
        Encoder called as bert_model(input_ids, attention_mask=...); its output
        must expose `.last_hidden_state`.
    max_len : int
        Length of the token-id and attention-mask inputs.
    drpt : float
        Dropout rate applied to the pooled vector.
    fc_weights_init : str
        Kernel initializer of the hidden Dense layer.
    fc_act : str
        Activation of the hidden Dense layer.
    optimizer : str or optimizer object
        Passed straight to model.compile.
    fc_units : int
        Width of the hidden Dense layer (default 768, the previously
        hard-coded value).

    Returns
    -------
    tf.keras.Model
        Model compiled with binary cross-entropy loss and an accuracy metric.

    Notes
    -----
    The empty-string defaults of fc_weights_init / fc_act / optimizer are
    placeholders; the training cell always passes explicit values.
    """
    # Two int32 inputs of identical length: token ids and their attention mask.
    input_ids_in = tf.keras.layers.Input( shape=( max_len, ), dtype='int32' )
    input_masks_in = tf.keras.layers.Input( shape=( max_len, ), dtype='int32' )
    
    embedding_layer = bert_model( input_ids_in, attention_mask=input_masks_in )
    
    # Use the hidden state of the first sequence position as the sentence vector.
    # NOTE(review): T5-style tokenizers do not prepend a [CLS]-like token, so
    # this is the first sub-word's embedding -- confirm this pooling is intended.
    hidden_state = embedding_layer.last_hidden_state
    pooler = hidden_state[:, 0]
    
    d = Dropout( drpt )( pooler )
    d = Dense( fc_units, activation=fc_act, kernel_initializer=fc_weights_init )( d )
    outp = Dense( 1, activation='sigmoid' )( d )
    
    model = Model( inputs=[ input_ids_in, input_masks_in ], outputs=outp )
    
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy']
    )
    return model

In [9]:
from transformers import T5Tokenizer, MT5Config, TFMT5EncoderModel
from transformers import utils
# suppress huggingface warnings: only records at ERROR level (numeric value 40) or above are shown
utils.logging.set_verbosity( utils.logging.ERROR )
# disabling tokenizers parallelism to avoid deadlocks
os.environ[ "TOKENIZERS_PARALLELISM" ] = "false"

In [10]:
# Initialize the tokenizer for the `bertmodelname` checkpoint.
# It is a T5Tokenizer; the BERT_ prefix is just this notebook's naming convention.
BERT_TOKENIZER = T5Tokenizer.from_pretrained( bertmodelname )

# tokenize input Urdu comments with the given tokenizer
def get_tokens_and_masks( bert_tokenizer, sentences, max_length=128, padding='max_length' ):
    """Encode `sentences` and return their token ids and attention masks.

    Parameters
    ----------
    bert_tokenizer : callable
        Tokenizer invoked as bert_tokenizer(sentences, **options); its result
        must provide the keys 'input_ids' and 'attention_mask'.
    sentences : list of str
        Texts to encode.
    max_length : int
        Forwarded as the tokenizer's max_length (truncation is enabled).
    padding : str
        Forwarded as the tokenizer's padding strategy.

    Returns
    -------
    tuple of np.ndarray
        (input_ids, attention_masks) as numpy arrays.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
        padding=padding,
        return_attention_mask=True,
    )
    encoding = bert_tokenizer( sentences, **tokenizer_options )

    input_ids = np.array( encoding[ 'input_ids' ] )
    attention_masks = np.array( encoding[ 'attention_mask' ] )
    return input_ids, attention_masks

In [11]:
# Load the configuration of the `bertmodelname` checkpoint. The same config
# object is passed to TFMT5EncoderModel.from_pretrained in every fold of the
# training loop. The bare name on the last line displays it as the cell output.
BERT_CONFIG = MT5Config.from_pretrained( bertmodelname )
BERT_CONFIG

MT5Config {
  "_name_or_path": "/home/patrick/hugging_face/t5/mt5-base",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.20.1",
  "use_cache": true,
  "vocab_size": 250112
}

In [12]:
# 5-fold stratified cross-validation splitter. Shuffling with a fixed
# random_state makes the fold assignment reproducible across runs.
skf = StratifiedKFold( n_splits=5, random_state=0, shuffle=True )
print(skf)

StratifiedKFold(n_splits=5, random_state=0, shuffle=True)


In [13]:
# 5-fold cross-validation: for each fold, fine-tune a fresh encoder + dense head,
# reload the saved model, tune the decision threshold on the validation split
# and score both the validation and the held-out test split.
warnings.filterwarnings( 'ignore' )
start_time = time.time()
print( 'Local System Time: {}'.format( time.strftime( "%I:%M %p", time.localtime() ) ) )

# per-fold metrics
valaccuracy, valprecision, valrecall, valf1, valcm = [], [], [], [], []
testaccuracy, testprecision, testrecall, testf1, testcm = [], [], [], [], []
# out-of-fold test predictions, concatenated over all folds
com_text, com_label, com_predicted, com_prob = [], [], [], []
com_indices = []
# Keras training history per fold, keyed 'fold<k>'
hist = {}


def _append_fold_scores( y_true, y_pred, accuracies, precisions, recalls, f1s, matrices ):
    """Score one fold's hard labels and append each metric to its running list."""
    accuracies.append( accuracy_score( y_true, y_pred ) )
    precisions.append( precision_score( y_true, y_pred ) )
    recalls.append( recall_score( y_true, y_pred ) )
    f1s.append( f1_score( y_true, y_pred ) )
    matrices.append( confusion_matrix( y_true, y_pred ) )


for fold, ( train_index, test_index ) in enumerate( skf.split( df[ xcolumn ], df[ ycolumn ] ), start=1 ):
    # separating train and test sets
    # (df has a fresh 0..n-1 index, so the positional fold indices are valid labels;
    #  selecting row and column in one .loc call avoids copying whole row frames)
    xtrain = df.loc[ train_index, xcolumn ].values
    xtest = df.loc[ test_index, xcolumn ].values
    
    ytrain = df.loc[ train_index, ycolumn ].values
    ytest = df.loc[ test_index, ycolumn ].values
    
    # splitting train and validation sets
    # NOTE(review): this inner split is not stratified although the outer folds
    # are; consider stratify=ytrain (left unchanged to keep results comparable).
    xtrain, xval, ytrain, yval = train_test_split( xtrain, ytrain, test_size=0.15, random_state=0 )
    
    xtrain_input_ids, xtrain_attention_masks = get_tokens_and_masks(
        BERT_TOKENIZER, xtrain.tolist(), max_length=MAX_LEN, padding='max_length'
    )
    
    xval_input_ids, xval_attention_masks = get_tokens_and_masks(
        BERT_TOKENIZER, xval.tolist(), max_length=MAX_LEN, padding='max_length'
    )
    
    xtest_input_ids, xtest_attention_masks = get_tokens_and_masks(
        BERT_TOKENIZER, xtest.tolist(), max_length=MAX_LEN, padding='max_length'
    )
    
    # instantiate a fresh pretrained encoder for this fold
    BERT_MODEL = TFMT5EncoderModel.from_pretrained( bertmodelname, config=BERT_CONFIG, cache_dir=hfcachedir )
    BERT_MODEL.trainable = BERT_TRAINABLE
    
    # define a model
    model = FineTuneBertWithDenseLayer(
        bert_model=BERT_MODEL,
        max_len=MAX_LEN, drpt=DRPT, fc_weights_init=FC_WEIGHTS_INIT, fc_act=FC_ACT, optimizer=OPTIMIZER
    )
    
    # save model summaries and model architecture diagrams in the first fold only
    if fold == 1:
        plot_model( model=model, to_file='{}/{}.png'.format( modelsummaries, modelname ), show_shapes=False )
        
        with open( '{}/{}.txt'.format( modelsummaries, modelname ), 'w' ) as s:
            with redirect_stdout( s ):
                model.summary()
    
    # the optimizer was created from its string name, so set the learning rate explicitly
    K.set_value( model.optimizer.lr, LR_RATE )
    
    # train the model with callbacks for early stopping
    # (presumably the callback also saves the best-F1 model under `filepath`,
    #  since load_model below reads that same path -- confirm in UT_Utils)
    f1callback = UT_Utils.F1_score_callback_HF(
        val_data=( xval_input_ids, xval_attention_masks, yval ),
        filepath=modelpath + modelname + str( fold ), patience=PATIENCE,
        decay=DECAY, decay_rate=DECAY_RATE, decay_after=DECAY_AFTER
    )
    histmodel = model.fit(
        [ xtrain_input_ids, xtrain_attention_masks ], ytrain,
        batch_size=BATCH, epochs=NEPOCHS, verbose=0, callbacks=[ f1callback ]
    )
    
    # save history of all folds
    hist[ 'fold' + str( fold ) ] = histmodel.history.copy()
    
    # persist this fold's fine-tuned encoder in Hugging Face format
    hffinetuned = './Embeddings/hf_finetuned/{}-fold{}/'.format( bertmodelname, fold )
    BERT_MODEL.save_pretrained( hffinetuned )
    
    # delete trained model object
    del model, histmodel, f1callback, BERT_MODEL
    K.clear_session()
    gc.collect()
    
    # load saved model
    # NOTE(review): custom_objects maps the class name to a model *instance*,
    # not a class -- confirm this is what the saved format expects.
    FINETUNED_BERT_MODEL = TFMT5EncoderModel.from_pretrained( hffinetuned, config=BERT_CONFIG, cache_dir=hfcachedir )
    FINETUNED_BERT_MODEL.trainable = False
    loaded_model = load_model(
        modelpath + modelname + str( fold ),
        custom_objects={ 'TFMT5EncoderModel': FINETUNED_BERT_MODEL }
    )
    
    # get predictions (probabilities) for validation and test sets respectively
    valpredictions = loaded_model.predict(
        [ xval_input_ids, xval_attention_masks ], verbose=0, batch_size=BATCH
    )
    testpredictions = loaded_model.predict(
        [ xtest_input_ids, xtest_attention_masks ], verbose=0, batch_size=BATCH
    )
    
    # delete loaded model
    del loaded_model, FINETUNED_BERT_MODEL
    K.clear_session()
    gc.collect()
    
    # optimize the decision threshold on the validation set only;
    # the same threshold is then applied to the test set
    threshold = UT_Utils.optimize_threshold( yval, valpredictions )
    
    # save accuracy, precision, recall, f1 and confusion matrices
    vallabels = ( valpredictions >= threshold ).astype( 'int32' )
    testlabels = ( testpredictions >= threshold ).astype( 'int32' )
    
    _append_fold_scores( yval, vallabels, valaccuracy, valprecision, valrecall, valf1, valcm )
    _append_fold_scores( ytest, testlabels, testaccuracy, testprecision, testrecall, testf1, testcm )
    
    # save for future analysis and ensemble
    # (xtest / ytest already hold this fold's texts and labels in test_index order;
    #  predictions come from a single sigmoid unit, hence the [:,0] column)
    com_indices.extend( test_index.tolist() )
    com_text.extend( xtest.tolist() )
    com_label.extend( ytest.tolist() )
    com_predicted.extend( testlabels[:,0].tolist() )
    com_prob.extend( testpredictions[:,0].tolist() )
    
    print( 'Fold: {:02d} out of {:02d} completed.'.format( fold, skf.get_n_splits() ) )
    print( 'Local System Time: {}'.format( time.strftime( "%I:%M %p", time.localtime() ) ) )

time_took = time.time() - start_time
print( f"Total runtime: { hms_string( time_took ) }" )

Local System Time: 11:32 PM
Epoch: 001 --LR: 6e-05 --MaxValF1: 0.6864216 --CurValF1: 0.6864216 --Patience: 01 --F1 improved: 0.6864216
Epoch: 002 --LR: 6e-05 --MaxValF1: 0.8027496 --CurValF1: 0.8027496 --Patience: 01 --F1 improved: 0.8027496
Epoch: 003 --LR: 6e-05 --MaxValF1: 0.8479173 --CurValF1: 0.8479173 --Patience: 01 --F1 improved: 0.8479173
Epoch: 004 --LR: 6e-05 --MaxValF1: 0.8511489 --CurValF1: 0.8511489 --Patience: 01 --F1 improved: 0.8511489
Epoch: 005 --LR: 6e-05 --MaxValF1: 0.8768043 --CurValF1: 0.8768043 --Patience: 01 --F1 improved: 0.8768043
Epoch: 006 --LR: 6e-05 --MaxValF1: 0.8833561 --CurValF1: 0.8833561 --Patience: 01 --F1 improved: 0.8833561
Epoch: 007 --LR: 6e-05 --MaxValF1: 0.8899868 --CurValF1: 0.8899868 --Patience: 01 --F1 improved: 0.8899868
Epoch: 008 --LR: 6e-05 --MaxValF1: 0.8940217 --CurValF1: 0.8940217 --Patience: 01 --F1 improved: 0.8940217
Epoch: 009 --LR: 6e-05 --MaxValF1: 0.8940217 --CurValF1: 0.8886019 --Patience: 01
Epoch: 010 --LR: 2e-05 --MaxValF1:

Epoch: 011 --LR: 6e-05 --MaxValF1: 0.8896435 --CurValF1: 0.8896435 --Patience: 01 --F1 improved: 0.8896435
Epoch: 012 --LR: 6e-05 --MaxValF1: 0.8919189 --CurValF1: 0.8919189 --Patience: 01 --F1 improved: 0.8919189
Epoch: 013 --LR: 6e-05 --MaxValF1: 0.8919189 --CurValF1: 0.8894772 --Patience: 01
Epoch: 014 --LR: 2e-05 --MaxValF1: 0.8955026 --CurValF1: 0.8955026 --Patience: 02 --F1 improved: 0.8955026
Epoch: 015 --LR: 2e-05 --MaxValF1: 0.8955026 --CurValF1: 0.8939597 --Patience: 01
Epoch: 016 --LR: 5e-06 --MaxValF1: 0.8955026 --CurValF1: 0.8936312 --Patience: 02
Epoch: 017 --LR: 2e-06 --MaxValF1: 0.8955026 --CurValF1: 0.8938585 --Patience: 03
Epoch: 018 --LR: 5e-07 --MaxValF1: 0.8955026 --CurValF1: 0.8944556 --Patience: 04
Training stopped due to the patience parameter. --Patience: 04
Fold: 05 out of 05 completed.
Local System Time: 02:24 PM
Total runtime: 14 hrs 51 mins 24.04 secs


In [14]:
# Per-fold validation scores (4 decimals) followed by their mean and standard deviation.
val_report = [
    ( 'Validation Accuracy', valaccuracy ),
    ( 'Validation Precision', valprecision ),
    ( 'Validation Recall', valrecall ),
    ( 'Validation F1', valf1 ),
]
for position, ( heading, fold_scores ) in enumerate( val_report, start=1 ):
    print( heading )
    fields = [
        [ f'{score:0.4f}' for score in fold_scores ],
        np.mean( fold_scores ), '+-', np.std( fold_scores ),
    ]
    if position < len( val_report ):
        fields.append( '\n' )  # blank line between metrics, none after the last one
    print( *fields )

Validation Accuracy
['0.9658', '0.9662', '0.9655', '0.9670', '0.9638'] 0.9656704454368488 +- 0.0010579465899416979 

Validation Precision
['0.9204', '0.9174', '0.9182', '0.9229', '0.9118'] 0.9181612950435041 +- 0.0037115679456559696 

Validation Recall
['0.8814', '0.8882', '0.8830', '0.8869', '0.8798'] 0.8838889841324267 +- 0.0032175877327522656 

Validation F1
['0.9005', '0.9026', '0.9003', '0.9046', '0.8955'] 0.900696122323612 +- 0.0030282651919723734


In [15]:
# Print each fold's validation confusion matrix rotated by 180 degrees, so the
# row/column of label 1 comes first: [[TP, FN], [FP, TN]] instead of the
# [[TN, FP], [FN, TP]] layout returned by confusion_matrix.
for fold_matrix in valcm:
    flipped = fold_matrix[ ::-1, ::-1 ]
    print( flipped, '\n' )

[[1353  182]
 [ 117 7081]] 

[[1367  172]
 [ 123 7071]] 

[[1359  180]
 [ 121 7073]] 

[[1365  174]
 [ 114 7080]] 

[[1354  185]
 [ 131 7063]] 



In [16]:
# Per-fold test scores (4 decimals) followed by their mean and standard deviation.
test_report = [
    ( 'Test Accuracy', testaccuracy ),
    ( 'Test Precision', testprecision ),
    ( 'Test Recall', testrecall ),
    ( 'Test F1', testf1 ),
]
for position, ( heading, fold_scores ) in enumerate( test_report, start=1 ):
    print( heading )
    fields = [
        [ f'{score:0.4f}' for score in fold_scores ],
        np.mean( fold_scores ), '+-', np.std( fold_scores ),
    ]
    if position < len( test_report ):
        fields.append( '\n' )  # blank line between metrics, none after the last one
    print( *fields )

Test Accuracy
['0.9642', '0.9643', '0.9649', '0.9646', '0.9607'] 0.9637492894772484 +- 0.0015436513351136962 

Test Precision
['0.9275', '0.9073', '0.9173', '0.9198', '0.9076'] 0.9158965700762352 +- 0.007664175002954552 

Test Recall
['0.8691', '0.8931', '0.8847', '0.8801', '0.8702'] 0.8794377843649899 +- 0.009037805249388955 

Test F1
['0.8973', '0.9002', '0.9007', '0.8995', '0.8885'] 0.8972409008526521 +- 0.0045169921986704855


In [17]:
# Print each fold's test confusion matrix rotated by 180 degrees, so the
# row/column of label 1 comes first: [[TP, FN], [FP, TN]] instead of the
# [[TN, FP], [FN, TP]] layout returned by confusion_matrix.
for fold_matrix in testcm:
    flipped = fold_matrix[ ::-1, ::-1 ]
    print( flipped, '\n' )

[[ 2277   343]
 [  178 11757]] 

[[ 2340   280]
 [  239 11695]] 

[[ 2317   302]
 [  209 11726]] 

[[ 2305   314]
 [  201 11734]] 

[[ 2279   340]
 [  232 11703]] 



In [18]:
# Append one summary row for this model to the shared results file:
#   <modelname>,<acc>,<precision>,<recall>,<f1>
# where every metric cell is '<mean>+-<std>' over the test folds.

def _mean_std_cell( fold_scores ):
    """Return '<mean>+-<std>', keeping the first 7 / 6 characters of each number's text."""
    return str(np.mean( fold_scores ))[:7] + '+-' + str(np.std( fold_scores ))[:6]

results_row = ','.join(
    [ modelname ] + [ _mean_std_cell( scores ) for scores in ( testaccuracy, testprecision, testrecall, testf1 ) ]
)

# The context manager guarantees the handle is closed even if the write fails.
with open( '{}/ResultsMain.csv'.format( modelresults ), mode='a' ) as file:
    file.write( results_row + '\n' )

In [19]:
# Out-of-fold test predictions collected by the training loop: original row
# index, true label, thresholded prediction and predicted probability.
dfPredictions = pd.DataFrame( {
    'comment_indices': com_indices,
    # 'comment_text': com_text,  # comment text (left out of the export)
    'comment_label': com_label,
    'comment_predicted': com_predicted,
    'comment_prob': com_prob,
} )
dfPredictions.to_csv( f'{modelresults}/{modelname}.csv', index=False )
dfPredictions.shape

(72771, 4)