In [None]:
import warnings
warnings.filterwarnings( 'ignore' )
import gc
import os
import time
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

import tensorflow as tf
import keras.backend as K
from keras.models import load_model
from keras.preprocessing import text, sequence

In [None]:
import All_RUT_Models
import RUT_Utils

In [None]:
# hyper parameters for this model, grouped by the part of the pipeline that consumes them

# -- input and embedding layer --
max_len = 150                        # sequences are padded/truncated to this length
embed_size = 300                     # width of each embedding vector
pre_trained_flag = True              # when True the pre-trained embedding file is loaded
embed_trainable = False
emb_weights_init = 'glorot_normal'

# -- regularisation (presumably spatial dropout / dropout rates; confirm in All_RUT_Models) --
spdrpt = 0.4
drpt = 0.2

# -- fully connected layers --
fc_weights_init = 'glorot_uniform'
fc_act = 'elu'

# -- optimisation --
optimizer = 'adam'
lr_rate = 1e-3                       # written into model.optimizer.lr before training
batch = 64
nepochs = 30

# -- focal loss parameters (alpha, gamma) --
fcl_loss_alp = 0.25
fcl_loss_gam = 5

# -- multi-GPU settings forwarded to the model builder --
multi_gpu_flag = 0
gpus = 2

# -- settings forwarded to the RUT_Utils.F1Metrics callback --
patience = 5
decay = True
decay_rate = 0.5
decay_after = 3

In [None]:
# Pre-trained embedding file to load; exactly one of the alternatives below should be active.
#embeddingfile = './General_Embeddings/glove.txt'
#embeddingfile = './General_Embeddings/w2v_cbow.txt'
#embeddingfile = './General_Embeddings/w2v_sg.txt'
#embeddingfile = './General_Embeddings/ft_cbow.vec'
embeddingfile = './General_Embeddings/ft_sg.vec'

# Placeholder handed to the model builder when pre_trained_flag is False;
# it is replaced by get_vectors() inside each fold when the flag is True.
embedding_matrix = []
# Vocabulary cap: used as the Tokenizer's num_words and as the row limit in get_vectors().
max_features = 100000

modelname = 'BGRU_P_ft_sg'

# Per-fold checkpoints are written below this directory.
modelpath = './Models/' + modelname + '/'

# exist_ok=True creates the directories only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs( modelpath, exist_ok=True )
os.makedirs( './Results/', exist_ok=True )

In [None]:
def hms_string(sec_elapsed):
    """Render an elapsed time given in seconds as ``H:MM:SS.ss``.

    Hours are unpadded, minutes are zero-padded to two digits and seconds are
    zero-padded to five characters with two decimals.
    """
    secs_per_minute = 60
    secs_per_hour = 60 * 60
    hours = int(sec_elapsed / secs_per_hour)
    minutes = int((sec_elapsed % secs_per_hour) / secs_per_minute)
    seconds = sec_elapsed % secs_per_minute
    return f"{hours}:{minutes:>02}:{seconds:>05.2f}"

In [None]:
# Read both worksheets of the workbook and stack them into a single frame
# whose index is renumbered from 0 (fold indices are later used with df.loc).
df = pd.concat(
    [ pd.read_excel( 'RomanUrduToxicity.xlsx', sheet ) for sheet in ( 'Sheet1', 'Sheet2' ) ],
    ignore_index=True,
)
# Cast every comment to str before it is handed to the tokenizer.
df[ 'Comment' ] = df[ 'Comment' ].astype( 'str' )
df.shape

In [None]:
def get_coefs( word, *arr ):
    """Pair an embedding token with its coefficients.

    Args:
        word: the token (first field of an embedding-file line).
        *arr: the remaining fields of the line, convertible to float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray( arr, dtype='float32' )
    return word, vector

def get_vectors( tokenizer ):
    """Build the embedding matrix for a fitted tokenizer's vocabulary.

    Reads the module-level ``max_features``, ``embed_size`` and
    ``embeddings_index``.

    Args:
        tokenizer: object exposing ``word_index``, a mapping of token to
            integer index.

    Returns:
        A ``(min(max_features, len(word_index) + 1), embed_size)`` array.
        Row ``i`` holds the pre-trained vector of the token with index ``i``;
        rows for tokens missing from ``embeddings_index`` stay zero.
    """
    vocab = tokenizer.word_index
    n_rows = min( max_features, len( vocab ) + 1 )
    matrix = np.zeros( ( n_rows, embed_size ) )
    for token, row in vocab.items(  ):
        # tokens whose index falls outside the vocabulary cap get no row
        if row < max_features:
            vector = embeddings_index.get( token )
            if vector is not None:
                matrix[row] = vector
    gc.collect()
    return matrix

if pre_trained_flag == True:
    # Build {token: float32 vector} from the text embedding file, one
    # space-separated "token v1 v2 ..." record per line.
    # The context manager closes the file once the dict is built.
    with open( embeddingfile, encoding='utf-8' ) as embedding_file:
        embeddings_index = dict(
            get_coefs( *fields )
            for fields in ( line.rstrip().rsplit( ' ' ) for line in embedding_file )
            # word2vec/fastText text files may start with a "<vocab_size> <dim>"
            # header. Parsed as a record it becomes a 1-element vector that would
            # broadcast over a whole embedding row if a token matched it, so
            # two-field lines are skipped unless the embeddings really are 1-d.
            if not ( len( fields ) == 2 and embed_size != 1 )
        )

In [None]:
# 5-fold stratified splitter; shuffling with a fixed seed yields the same folds on every run.
skf = StratifiedKFold( n_splits=5, shuffle=True, random_state=0 )
print( skf )

In [None]:
# Cross-validation driver: for every fold produced by `skf`, fit a tokenizer on
# the training fold, train a BGRU_P model, reload the checkpoint written during
# training, tune a decision threshold on the validation split and record
# validation/test metrics plus the out-of-fold test predictions.
warnings.filterwarnings( 'ignore' )
start_time = time.time()

# one entry per fold: metrics on the validation split and on the held-out test fold
valaccuracy, valprecision, valrecall, valf1, valcm = [], [], [], [], []
testaccuracy, testprecision, testrecall, testf1, testcm = [], [], [], [], []
# out-of-fold test-set text, labels, thresholded predictions and raw probabilities
com_text, com_label, com_predicted, com_prob = [], [], [], []
# row positions in df of every test sample, in the order the folds were processed
com_indices = []

# 1-based fold counter, used in the checkpoint file name and the progress message
fold = 1
for train_index, test_index in skf.split( df.Comment, df.Toxic ):
    # clearing previous sessions
    K.clear_session()
    tf.reset_default_graph()
    
    # tokenization with keras tokenizer
    # (fitted on the training fold only, before the validation split is carved out of it)
    tokenizer = text.Tokenizer( num_words=max_features )
    tokenizer.fit_on_texts( df.loc[ train_index ][ 'Comment' ].values )

    traincomments = tokenizer.texts_to_sequences( df.loc[ train_index ][ 'Comment' ].values )
    testcomments = tokenizer.texts_to_sequences( df.loc[ test_index ][ 'Comment' ].values )
    
    # pad the tokenized sequences
    xtrain = sequence.pad_sequences( traincomments, maxlen=max_len )
    xtest = sequence.pad_sequences( testcomments, maxlen=max_len )
    
    ytrain = df.loc[ train_index ][ 'Toxic' ].values
    ytest = df.loc[ test_index ][ 'Toxic' ].values
    
    # split train and val
    # (15% of the training fold; note that no `stratify` argument is passed here)
    xtrain, xval, ytrain, yval = train_test_split( xtrain, ytrain, test_size=0.15, random_state=0 )
    
    # check if pre-trained word embeddings flag is true
    # (otherwise the module-level placeholder `embedding_matrix` is passed through unchanged)
    if pre_trained_flag == True:
        embedding_matrix = get_vectors( tokenizer=tokenizer)
    
    # define a model
    model = All_RUT_Models.BGRU_P( tokenizer=tokenizer, max_len=max_len, embed_size=embed_size,
                                  embedding_matrix=embedding_matrix, embed_trainable=embed_trainable,
                                  spdrpt=spdrpt, drpt=drpt, emb_weights_init=emb_weights_init,
                                  fc_weights_init=fc_weights_init, fc_act=fc_act, optimizer=optimizer,
                                  fcl_loss_alp=fcl_loss_alp, fcl_loss_gam=fcl_loss_gam,
                                  multi_gpu_flag=multi_gpu_flag, gpus=gpus )
    
    # override the optimizer's learning rate with the configured value
    K.set_value( model.optimizer.lr, lr_rate )
    
    # train the model with callbacks for early stopping
    # NOTE(review): F1Metrics presumably checkpoints the best model to the given .h5 path,
    # since that same path is loaded right after training -- confirm in RUT_Utils
    f1metric = RUT_Utils.F1Metrics( modelpath + modelname + str( fold ) + '.h5', patience=patience,
                                   decay=decay, decay_rate=decay_rate, decay_after=decay_after, softmax=False )
    hist = model.fit( xtrain, ytrain, batch_size=batch, validation_data=( xval,yval ),
                     epochs=nepochs, verbose=0, callbacks=[ f1metric ] )
    
    # load saved model
    # (the focal loss must be supplied as a custom object for deserialization)
    custt_obss = { 'focal_loss_fixed':All_RUT_Models.focal_loss(alpha=fcl_loss_alp, gamma=fcl_loss_gam) }
    loaded_model = load_model( modelpath + modelname + str(fold) + '.h5', custom_objects=custt_obss )
    
    # get predictions (probabilities) for validation and test sets respectively
    valpredictions = loaded_model.predict( xval, verbose=0, batch_size=2048 )
    testpredictions = loaded_model.predict( xtest, verbose=0, batch_size=2048 )
    
    # optimize the decision threshold on the validation set
    threshold = RUT_Utils.optimize_threshold( yval, valpredictions )
    
    # save accuracy, precision, recall, f1 and confusion matrices
    # (the threshold tuned on validation data is applied unchanged to the test fold)
    vallabels = (valpredictions>=threshold).astype( 'int32' )
    testlabels = (testpredictions>=threshold).astype( 'int32' )
    
    valaccuracy.append( accuracy_score( yval, vallabels ) )
    valprecision.append( precision_score( yval, vallabels ) )
    valrecall.append( recall_score( yval, vallabels ) )
    valf1.append( f1_score( yval, vallabels ) )
    valcm.append( confusion_matrix( yval, vallabels ) )    
    
    testaccuracy.append( accuracy_score( ytest, testlabels ) )
    testprecision.append( precision_score( ytest, testlabels ) )
    testrecall.append( recall_score( ytest, testlabels ) )
    testf1.append( f1_score( ytest, testlabels ) )
    testcm.append( confusion_matrix( ytest, testlabels ) )
    
    # save for future analysis and ensemble
    # (predictions are 2-D; column 0 is taken as the per-sample score/label)
    com_indices.extend( test_index.tolist() )
    com_text.extend( df.loc[ test_index ][ 'Comment' ] )
    com_label.extend( df.loc[ test_index ][ 'Toxic' ].tolist() )
    com_predicted.extend( testlabels[:,0].tolist() )
    com_prob.extend( testpredictions[:,0].tolist() )
    
    print( 'Fold: {:02d} out of {:02d} completed.'.format( fold, skf.get_n_splits() ) )
    
    fold = fold + 1
# wall-clock time for all folds, formatted as H:MM:SS.ss
time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")

In [None]:
# Per-fold validation scores (4 decimals) followed by their mean +- std across folds.
# Each entry is (heading, per-fold scores, extra trailing print arguments);
# the last metric is printed without the blank separator line.
_val_report = (
    ( 'Validation Accuracy', valaccuracy, ( '\n', ) ),
    ( 'Validation Precision', valprecision, ( '\n', ) ),
    ( 'Validation Recall', valrecall, ( '\n', ) ),
    ( 'Validation F1', valf1, () ),
)
for _heading, _scores, _tail in _val_report:
    print( _heading )
    print( [ '{:0.4f}'.format( x ) for x in _scores ], np.mean( _scores ), '+-', np.std( _scores ), *_tail )

In [None]:
# Show each fold's validation confusion matrix rotated by 180 degrees,
# i.e. with both the row and the column order reversed.
for fold_cm in valcm:
    print( np.rot90( fold_cm, 2 ), '\n' )

In [None]:
# Per-fold test scores (4 decimals) followed by their mean +- std across folds.
# Each entry is (heading, per-fold scores, extra trailing print arguments);
# the last metric is printed without the blank separator line.
_test_report = (
    ( 'Test Accuracy', testaccuracy, ( '\n', ) ),
    ( 'Test Precision', testprecision, ( '\n', ) ),
    ( 'Test Recall', testrecall, ( '\n', ) ),
    ( 'Test F1', testf1, () ),
)
for _heading, _scores, _tail in _test_report:
    print( _heading )
    print( [ '{:0.4f}'.format( x ) for x in _scores ], np.mean( _scores ), '+-', np.std( _scores ), *_tail )

In [None]:
# Show each fold's test confusion matrix rotated by 180 degrees,
# i.e. with both the row and the column order reversed.
for fold_cm in testcm:
    print( np.rot90( fold_cm, 2 ), '\n' )

In [None]:
# Append one summary row to the shared results file:
#   modelname,<acc mean+-std>,<precision mean+-std>,<recall mean+-std>,<f1 mean+-std>
# Means are truncated to 7 characters and standard deviations to 6.
# The row is assembled first and written with a single call inside a context
# manager, so the handle is always closed and a failure while computing the
# statistics cannot leave a partial row behind.
summary_row = [ modelname ]
for fold_scores in ( testaccuracy, testprecision, testrecall, testf1 ):
    summary_row.append( str(np.mean( fold_scores ))[:7] + '+-' + str(np.std( fold_scores ))[:6] )

with open( 'Results/ResultsMain.csv', mode='a' ) as file:
    file.write( ','.join( summary_row ) + '\n' )

In [None]:
# Collect the out-of-fold test predictions into one frame (one row per sample)
# and store them next to the other results under the model's name.
dfPredictions = pd.DataFrame( {
    'comment_indices': com_indices,
    #'comment_text': com_text, #comment text
    'comment_label': com_label,
    'comment_predicted': com_predicted,
    'comment_prob': com_prob,
} )
dfPredictions.to_csv( 'Results/' + modelname + '.csv', index=False )
dfPredictions.shape