In [1]:
import warnings
warnings.filterwarnings( 'ignore' )

import gc
import os
import time

# Runtime configuration is exported BEFORE TensorFlow is imported: the native
# logging level is read while the library loads, so setting it afterwards is too
# late to silence the start-up messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # suppress tensorflow info/warning logs
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # expose only GPU 0 to this process

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import text, sequence

from contextlib import redirect_stdout
from tensorflow.keras.utils import plot_model

In [2]:
import All_UT_Models
import UT_Utils

In [3]:
# hyper parameters for this model
# (all values below are consumed by the cross-validation cell further down)

MAX_LEN = 100                 # length every tokenized comment is padded/truncated to (pad_sequences maxlen)
EMBED_SIZE = 300              # embedding dimensionality; passed as embed_size to get_vectors and the model
PRE_TRAINED_FLAG = True       # when True the pre-trained vectors are loaded and an embedding matrix is built per fold
EMBED_TRAINABLE = False       # forwarded to the model as embed_trainable

EMB_WEIGHTS_INIT = 'he_normal'    # forwarded as emb_weights_init
DRPT = 0.4                        # forwarded as drpt (presumably the dropout rate - confirm in All_UT_Models)
CONV_WEIGHTS_INIT = 'he_uniform'  # forwarded as conv_weights_init
CONV_ACT = 'elu'                  # forwarded as conv_act
FC_WEIGHTS_INIT = 'he_uniform'    # forwarded as fc_weights_init
FC_ACT = 'elu'                    # forwarded as fc_act
LR_RATE = 1e-4                    # written into model.optimizer.lr before training starts
OPTIMIZER = 'adam'                # forwarded as optimizer

BATCH = 32          # batch size for both fit() and predict()
NEPOCHS = 30        # epochs argument of fit()
PATIENCE = 5        # forwarded to UT_Utils.F1_score_callback as patience
DECAY = True        # forwarded to the callback as decay
DECAY_RATE = 0.3    # forwarded to the callback as decay_rate
DECAY_AFTER = 1     # forwarded to the callback as decay_after

In [4]:
# Location of the pre-trained word vectors (alternative file kept for reference).
#EMBEDDING_FILE = './Embeddings/urduvec_140M_100K_300d.txt'
EMBEDDING_FILE = './Embeddings/cc.ur.300.vec'

# Placeholder; replaced by a real matrix inside each fold when pre-trained vectors are used.
EMBEDDING_MATRIX = []

modelname = 'CNN_gram-ft'

# Output locations: per-fold checkpoints, metric/prediction CSVs, summaries and diagrams.
modelpath = './Saved Models/{}/'.format( modelname )
modelresults = './Model Results'
modelsummaries = './Model - Summaries-Figures'

# Create whichever output directories are not present yet.
DIRECTORIES_TO_BE_CREATED = [ modelpath, modelresults, modelsummaries ]
for directory in DIRECTORIES_TO_BE_CREATED:
    if os.path.exists( directory ):
        continue
    os.makedirs( directory )

In [5]:
def hms_string( sec_elapsed ):
    """Render a duration given in seconds as '<h> hrs <mm> mins <ss.ss> secs'.

    Hours and minutes are truncated to whole numbers; the seconds part keeps
    two decimals and is zero-padded to five characters.
    """
    secs_per_min = 60
    secs_per_hour = 60 * 60
    hours = int( sec_elapsed / secs_per_hour )
    minutes = int( ( sec_elapsed % secs_per_hour ) / secs_per_min )
    seconds = sec_elapsed % secs_per_min
    return f"{hours} hrs {minutes:>02} mins {seconds:>05.2f} secs"

In [6]:
# Load both annotated sheets in a single pass over the workbook; with a list of
# sheet names read_excel returns a dict of DataFrames keyed by sheet name.
sheets = list( pd.read_excel( 'RUT_UT_Comments.xlsx', sheet_name=[ 'Sheet1', 'Sheet2' ] ).values() )
df = pd.concat( sheets, ignore_index=True )

# Drop incomplete rows BEFORE the string cast: astype('str') turns NaN into the
# literal text 'nan', which a later dropna() can no longer recognise as missing.
df = df.dropna().reset_index( drop=True )

df[ 'Urdu' ] = df[ 'Urdu' ].astype( 'str' )
df[ 'Urdu' ] = df[ 'Urdu' ].apply( UT_Utils.urdu_preprocessing )

# Second pass keeps the original behaviour of removing any row left with a
# missing value after preprocessing.
df = df.dropna().reset_index( drop=True )

del sheets
gc.collect()

# Names of the text (input) and label (target) columns used by the CV loop.
xcolumn = 'Urdu'
ycolumn = 'Toxic'

df.shape

(72771, 3)

In [7]:
def get_coefs( word, *arr ):
    """Pair a token with its coefficients converted to a float32 NumPy array.

    word -- the token (first field of an embedding-file line)
    arr  -- the remaining fields, i.e. the vector components
    Returns a (word, vector) tuple.
    """
    vector = np.asarray( arr, dtype=np.float32 )
    return ( word, vector )

def get_vectors( tokenizer, emb_mean, emb_std, embed_size, embeddings_index=None ):
    """Build the embedding matrix for the vocabulary held by `tokenizer`.

    Row i holds the pre-trained vector of the word whose tokenizer index is i.
    Words without a pre-trained vector keep a random row drawn from
    N(emb_mean, emb_std); row 0 (the padding index) is all zeros.

    tokenizer        -- object exposing a `word_index` mapping of word -> int index
    emb_mean, emb_std -- mean / std used to initialise rows of unknown words
    embed_size       -- dimensionality of every embedding vector
    embeddings_index -- optional mapping word -> vector; when omitted the
                        module-level EMBEDDINGS_INDEX is used (original behaviour),
                        which raises NameError if that global has been deleted

    Prints the number of hits/misses and returns the (num_words, embed_size) matrix.
    """
    if embeddings_index is None:
        embeddings_index = EMBEDDINGS_INDEX

    word_index = tokenizer.word_index
    num_words = len( word_index ) + 1  # +1 because index 0 is reserved for padding

    embedding_matrix = np.random.normal( emb_mean, emb_std, ( num_words, embed_size ) ) # for unk tokens
    embedding_matrix[ 0 ] = np.zeros( ( embed_size ) ) # replace zero padding with zeros

    hits = 0
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get( word )
        if embedding_vector is not None:
            embedding_matrix[ i ] = embedding_vector
            hits += 1
    misses = len( word_index ) - hits

    print( "Pretrained Embeddings: %d hits (%d misses)" % ( hits, misses ) )
    gc.collect()
    return embedding_matrix

# Load the pre-trained vectors into a word -> vector dict and compute the global
# mean / std that get_vectors uses to initialise out-of-vocabulary rows.
if PRE_TRAINED_FLAG == True:
    # `with` guarantees the file handle is closed once the dict is built.
    with open( EMBEDDING_FILE, encoding='utf-8' ) as emb_file:
        # Lines of 19 characters or fewer are skipped (presumably the
        # "<count> <dim>" header line of the .vec file - confirm).
        EMBEDDINGS_INDEX = dict(
            get_coefs( *line.strip().split() ) for line in emb_file if len( line ) > 19
        )
    print( 'Total Words in Pretrained Embeddings: ', len( EMBEDDINGS_INDEX.values() ) )
    # np.stack needs a real sequence; a dict_values view is rejected by newer NumPy.
    all_embs = np.stack( list( EMBEDDINGS_INDEX.values() ) )
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    print( 'Pretrained Embeddings mean:', emb_mean, ' std: ', emb_std )

Total Words in Pretrained Embeddings:  1153999
Pretrained Embeddings mean: -0.00027458658  std:  0.024344679


In [8]:
# Stratified 5-fold splitter; shuffling is seeded so the folds are the same on every run.
skf = StratifiedKFold( n_splits=5, shuffle=True, random_state=0 )
print( skf )

StratifiedKFold(n_splits=5, random_state=0, shuffle=True)


In [9]:
# Stratified k-fold cross-validation.  For every fold: split off a validation set,
# tokenize and pad, build and train the model with the F1 callback, reload the model
# stored at the callback's filepath, tune a decision threshold on the validation set
# and score both validation and test sets.
warnings.filterwarnings( 'ignore' )
start_time = time.time()
print( 'Local System Time: {}'.format( time.strftime( "%I:%M %p", time.localtime() ) ) )

# per-fold metrics (one entry appended per fold)
valaccuracy, valprecision, valrecall, valf1, valcm = [], [], [], [], []
testaccuracy, testprecision, testrecall, testf1, testcm = [], [], [], [], []
# out-of-fold test predictions, concatenated over all folds
com_text, com_label, com_predicted, com_prob = [], [], [], []
com_indices = []
# training history per fold, keyed 'fold<k>'
hist = {}

total_folds = skf.get_n_splits()  # loop-invariant, looked up once

fold = 1
for train_index, test_index in skf.split( df[ xcolumn ], df[ ycolumn ] ):
    # separating train and test sets (single-step .loc instead of chained indexing)
    xtrain = df.loc[ train_index, xcolumn ].values
    xtest = df.loc[ test_index, xcolumn ].values

    ytrain = df.loc[ train_index, ycolumn ].values
    ytest = df.loc[ test_index, ycolumn ].values

    # splitting train and validation sets
    # NOTE(review): this inner split is seeded but not stratified on the label.
    xtrain, xval, ytrain, yval = train_test_split( xtrain, ytrain, test_size=0.15, random_state=0 )

    # tokenization with keras tokenizer (vocabulary is fitted on train + validation text)
    tokenizer = text.Tokenizer(  )
    tokenizer.fit_on_texts( np.append( xtrain, xval ) )

    xtrain = tokenizer.texts_to_sequences( xtrain )
    xval = tokenizer.texts_to_sequences( xval )
    xtest = tokenizer.texts_to_sequences( xtest )

    # pad the tokenized sequences
    xtrain = sequence.pad_sequences( xtrain, maxlen=MAX_LEN )
    xval = sequence.pad_sequences( xval, maxlen=MAX_LEN )
    xtest = sequence.pad_sequences( xtest, maxlen=MAX_LEN )

    # check if pre-trained word embeddings flag is true
    if PRE_TRAINED_FLAG == True:
        EMBEDDING_MATRIX = get_vectors(
            tokenizer=tokenizer, emb_mean=emb_mean, emb_std=emb_std, embed_size=EMBED_SIZE
        )

    # define a model
    EMBED_INP_SIZE = len( tokenizer.word_index ) + 1
    model = All_UT_Models.CNN_gram(
        max_len=MAX_LEN, embed_inp=EMBED_INP_SIZE, embed_size=EMBED_SIZE,
        embedding_matrix=EMBEDDING_MATRIX, embed_trainable=EMBED_TRAINABLE,
        drpt=DRPT, emb_weights_init=EMB_WEIGHTS_INIT,
        conv_weights_init=CONV_WEIGHTS_INIT, conv_act=CONV_ACT,
        fc_weights_init=FC_WEIGHTS_INIT, fc_act=FC_ACT, optimizer=OPTIMIZER
    )

    # save model summaries and model architecture diagrams in the first fold only
    if fold == 1:
        plot_model( model=model, to_file='{}/{}.png'.format( modelsummaries, modelname ), show_shapes=False )

        with open( '{}/{}.txt'.format( modelsummaries, modelname ), 'w' ) as s:
            with redirect_stdout( s ):
                model.summary()

    # override the optimizer's learning rate with the configured value
    K.set_value( model.optimizer.lr, LR_RATE )

    # train the model with callbacks for early stopping
    f1callback = UT_Utils.F1_score_callback(
        val_data=( xval, yval ), filepath=modelpath + modelname + str( fold ),
        patience=PATIENCE, decay=DECAY, decay_rate=DECAY_RATE, decay_after=DECAY_AFTER
    )
    histmodel = model.fit(
        xtrain, ytrain, batch_size=BATCH, epochs=NEPOCHS, verbose=0, callbacks=[ f1callback ]
    )

    # save history of all folds
    hist[ 'fold' + str( fold ) ] = histmodel.history.copy()

    # delete trained model object
    del model, histmodel, f1callback
    EMBEDDING_MATRIX = []
    K.clear_session()
    gc.collect()
    # delete pre trained embeddings from the memory once the last fold has been trained
    # (this cell cannot be re-run afterwards without reloading the embeddings)
    if ( PRE_TRAINED_FLAG == True ) and ( fold == total_folds ):
        del EMBEDDINGS_INDEX
        gc.collect()

    # load saved model (the path given to the callback as `filepath`)
    loaded_model = load_model( modelpath + modelname + str( fold ) )

    # get predictions (probabilities) for validation and test sets respectively
    valpredictions = loaded_model.predict( xval, verbose=0, batch_size=BATCH )
    testpredictions = loaded_model.predict( xtest, verbose=0, batch_size=BATCH )

    # delete loaded model
    del loaded_model
    K.clear_session()
    gc.collect()

    # optimize threshold on validation set
    threshold = UT_Utils.optimize_threshold( yval, valpredictions )

    # save accuracy, precision, recall, f1 and confusion matrices
    vallabels = ( valpredictions >= threshold ).astype( 'int32' )
    testlabels = ( testpredictions >= threshold ).astype( 'int32' )

    valaccuracy.append( accuracy_score( yval, vallabels ) )
    valprecision.append( precision_score( yval, vallabels ) )
    valrecall.append( recall_score( yval, vallabels ) )
    valf1.append( f1_score( yval, vallabels ) )
    valcm.append( confusion_matrix( yval, vallabels ) )

    testaccuracy.append( accuracy_score( ytest, testlabels ) )
    testprecision.append( precision_score( ytest, testlabels ) )
    testrecall.append( recall_score( ytest, testlabels ) )
    testf1.append( f1_score( ytest, testlabels ) )
    testcm.append( confusion_matrix( ytest, testlabels ) )

    # save for future analysis and ensemble
    com_indices.extend( test_index.tolist() )
    com_text.extend( df.loc[ test_index, xcolumn ].tolist() )
    com_label.extend( df.loc[ test_index, ycolumn ].tolist() )
    # predictions are indexed as 2-D arrays; column 0 holds the single output
    com_predicted.extend( testlabels[:,0].tolist() )
    com_prob.extend( testpredictions[:,0].tolist() )

    print( 'Fold: {:02d} out of {:02d} completed.'.format( fold, total_folds ) )
    print( 'Local System Time: {}'.format( time.strftime( "%I:%M %p", time.localtime() ) ) )

    fold = fold + 1
time_took = time.time() - start_time
print( f"Total runtime: { hms_string( time_took ) }" )

Local System Time: 01:52 AM
Pretrained Embeddings: 28830 hits (16914 misses)
Epoch: 001 --LR: 1e-04 --MaxValF1: 0.7500000 --CurValF1: 0.7500000 --Patience: 01 --F1 improved: 0.7500000
Epoch: 002 --LR: 1e-04 --MaxValF1: 0.7956917 --CurValF1: 0.7956917 --Patience: 01 --F1 improved: 0.7956917
Epoch: 003 --LR: 1e-04 --MaxValF1: 0.8098847 --CurValF1: 0.8098847 --Patience: 01 --F1 improved: 0.8098847
Epoch: 004 --LR: 1e-04 --MaxValF1: 0.8128378 --CurValF1: 0.8128378 --Patience: 01 --F1 improved: 0.8128378
Epoch: 005 --LR: 1e-04 --MaxValF1: 0.8170155 --CurValF1: 0.8170155 --Patience: 01 --F1 improved: 0.8170155
Epoch: 006 --LR: 1e-04 --MaxValF1: 0.8170155 --CurValF1: 0.8065395 --Patience: 01
Epoch: 007 --LR: 3e-05 --MaxValF1: 0.8245734 --CurValF1: 0.8245734 --Patience: 02 --F1 improved: 0.8245734
Epoch: 008 --LR: 3e-05 --MaxValF1: 0.8245734 --CurValF1: 0.8095900 --Patience: 01
Epoch: 009 --LR: 9e-06 --MaxValF1: 0.8245734 --CurValF1: 0.8232462 --Patience: 02
Epoch: 010 --LR: 3e-06 --MaxValF1: 

In [10]:
# Per-fold validation scores followed by their mean and standard deviation.
val_report = [
    ( 'Validation Accuracy', valaccuracy ),
    ( 'Validation Precision', valprecision ),
    ( 'Validation Recall', valrecall ),
    ( 'Validation F1', valf1 ),
]
for position, ( title, scores ) in enumerate( val_report ):
    per_fold = [ '{:0.4f}'.format( x ) for x in scores ]
    # every stanza except the last one is followed by a blank separator line
    separator = ( '\n', ) if position < len( val_report ) - 1 else ()
    print( title )
    print( per_fold, np.mean( scores ), '+-', np.std( scores ), *separator )

Validation Accuracy
['0.9414', '0.9385', '0.9363', '0.9410', '0.9386'] 0.9391732508874385 +- 0.0018483763906086024 

Validation Precision
['0.8683', '0.8508', '0.8157', '0.8832', '0.8433'] 0.8522507227908095 +- 0.0229645396028335 

Validation Recall
['0.7857', '0.7895', '0.8252', '0.7667', '0.8005'] 0.7935208149460392 +- 0.01923005239646285 

Validation F1
['0.8249', '0.8190', '0.8204', '0.8209', '0.8213'] 0.8213045672330272 +- 0.001957640948972297


In [11]:
# Show each fold's validation confusion matrix rotated by 180 degrees
# (both the row order and the column order are reversed).
for c in valcm:
    rotated = np.rot90( c, k=2 )
    print( rotated, '\n' )

[[1206  329]
 [ 183 7015]] 

[[1215  324]
 [ 213 6981]] 

[[1270  269]
 [ 287 6907]] 

[[1180  359]
 [ 156 7038]] 

[[1232  307]
 [ 229 6965]] 



In [12]:
# Per-fold test scores followed by their mean and standard deviation.
test_report = [
    ( 'Test Accuracy', testaccuracy ),
    ( 'Test Precision', testprecision ),
    ( 'Test Recall', testrecall ),
    ( 'Test F1', testf1 ),
]
for position, ( title, scores ) in enumerate( test_report ):
    per_fold = [ '{:0.4f}'.format( x ) for x in scores ]
    # every stanza except the last one is followed by a blank separator line
    separator = ( '\n', ) if position < len( test_report ) - 1 else ()
    print( title )
    print( per_fold, np.mean( scores ), '+-', np.std( scores ), *separator )

Test Accuracy
['0.9406', '0.9385', '0.9359', '0.9347', '0.9369'] 0.9373238893740445 +- 0.0020439038268676877 

Test Precision
['0.8830', '0.8595', '0.8257', '0.8636', '0.8563'] 0.8576402035563808 +- 0.0184564747747249 

Test Recall
['0.7721', '0.7870', '0.8160', '0.7568', '0.7805'] 0.7824697090259379 +- 0.019561590099848012 

Test F1
['0.8239', '0.8217', '0.8208', '0.8067', '0.8166'] 0.8179310820789691 +- 0.006099230790065267


In [13]:
# Show each fold's test confusion matrix rotated by 180 degrees
# (both the row order and the column order are reversed).
for c in testcm:
    rotated = np.rot90( c, k=2 )
    print( rotated, '\n' )

[[ 2023   597]
 [  268 11667]] 

[[ 2062   558]
 [  337 11597]] 

[[ 2137   482]
 [  451 11484]] 

[[ 1982   637]
 [  313 11622]] 

[[ 2044   575]
 [  343 11592]] 



In [14]:
def _mean_std_cell( scores ):
    """Format fold scores as '<mean>+-<std>' using fixed-point notation.

    Fixed-point formatting keeps the same field widths as before for values
    below 10 (7 characters for the mean, 6 for the std) but, unlike slicing
    str(value), never cuts the exponent off a number that would be rendered
    in scientific notation (e.g. a very small std).
    """
    return '{:.5f}+-{:.4f}'.format( np.mean( scores ), np.std( scores ) )

# One CSV row per model: name, accuracy, precision, recall, F1 (test set, mean+-std over folds).
results_row = [ modelname ] + [
    _mean_std_cell( scores ) for scores in ( testaccuracy, testprecision, testrecall, testf1 )
]

# Append mode keeps the rows already present in the file;
# the context manager closes the file even if the write fails.
with open( '{}/ResultsMain.csv'.format( modelresults ), mode='a' ) as file:
    file.write( ','.join( results_row ) + '\n' )

In [15]:
# Collect the out-of-fold test predictions of all folds into one table and save it.
# The raw comment text (com_text) is intentionally left out of the export:
#   'comment_text': com_text
prediction_columns = {
    'comment_indices': com_indices,
    'comment_label': com_label,
    'comment_predicted': com_predicted,
    'comment_prob': com_prob,
}
dfPredictions = pd.DataFrame( prediction_columns )

predictions_csv = '{}/{}.csv'.format( modelresults, modelname )
dfPredictions.to_csv( predictions_csv, index=False )
dfPredictions.shape

(72771, 4)