In [None]:
import warnings
warnings.filterwarnings( 'ignore' )
import gc
import os
import time
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

import tensorflow as tf
import keras.backend as K
from keras.models import load_model
from keras.preprocessing import text, sequence

In [2]:
import All_RUT_Models
import RUT_Utils

In [3]:
# hyper parameters for this model

# --- input and embedding layer ---
max_len = 150                       # sequences are padded to this length via pad_sequences
embed_size = 300                    # number of columns in the embedding matrix
pre_trained_flag = True             # when True, pre-trained vectors are read from `embeddingfile`
embed_trainable = False             # forwarded to the model builder
emb_weights_init = 'glorot_normal'  # forwarded to the model builder

# --- convolutional and fully connected layers (all forwarded to the model builder) ---
conv_weights_init = 'glorot_uniform'
conv_act = 'elu'
fc_weights_init = 'glorot_uniform'
fc_act = 'elu'
spdrpt = 0.3                        # presumably a spatial dropout rate; confirm in All_RUT_Models
drpt = 0.15                         # presumably a dropout rate; confirm in All_RUT_Models

# --- optimisation ---
optimizer = 'adam'
lr_rate = 0.001                     # written into model.optimizer.lr before training
batch = 64                          # batch_size passed to model.fit
nepochs = 30                        # epochs passed to model.fit

# --- callback settings (all forwarded to RUT_Utils.F1Metrics) ---
patience = 7
decay = True
decay_rate = 0.5
decay_after = 3

# --- hardware (forwarded to the model builder) ---
multi_gpu_flag = 0
gpus = 2

In [4]:
# Pre-trained embedding file to load. Exactly one candidate is active; the others are
# kept commented out so the experiment can be switched by moving the comment marker.
#embeddingfile = './General_Embeddings/glove.txt'
#embeddingfile = './General_Embeddings/w2v_cbow.txt'
#embeddingfile = './General_Embeddings/w2v_sg.txt'
#embeddingfile = './General_Embeddings/ft_cbow.vec'
embeddingfile = './General_Embeddings/ft_sg.vec'

# Placeholder value; it is replaced by a real matrix only when pre-trained vectors are used.
embedding_matrix = []
# Vocabulary cap: passed to the tokenizer as num_words and used to bound the embedding rows.
max_features = 100000

# Name used for the checkpoint files, the results row and the predictions CSV.
modelname = 'CNN_Tweaked_ft_sg'

modelpath = './Models/' + modelname + '/'

# exist_ok=True makes directory creation idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs( modelpath, exist_ok=True )
os.makedirs( './Results/', exist_ok=True )

In [5]:
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS.ss``.

    The duration is rounded to whole hundredths of a second *before* it is
    split into hours, minutes and seconds. Rounding only at format time would
    let values such as 59.999 render as ``0:00:60.00`` instead of carrying
    into the minutes field.

    Parameters
    ----------
    sec_elapsed : int or float
        Elapsed time in seconds (expected to be non-negative).

    Returns
    -------
    str
        Hours (unpadded), zero-padded minutes and zero-padded seconds with
        two decimals, e.g. ``"1:01:01.50"``.
    """
    total_centis = int(round(sec_elapsed * 100))
    h, remainder = divmod(total_centis, 60 * 60 * 100)
    m, centis = divmod(remainder, 60 * 100)
    s = centis / 100
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

In [6]:
# Read both worksheets with a single parse of the workbook (passing a list of sheet
# names returns a dict keyed by sheet name) instead of opening the file twice.
sheets = pd.read_excel( 'RomanUrduToxicity.xlsx', sheet_name=[ 'Sheet1', 'Sheet2' ] )

# Stack the sheets in a fixed order; ignore_index=True gives the combined frame a fresh
# 0..n-1 index, which the cross-validation cell relies on for row lookups.
df = pd.concat( [ sheets[ 'Sheet1' ], sheets[ 'Sheet2' ] ], ignore_index=True )

# Force every comment to a string so the tokenizer never receives non-text values.
df.Comment = df.Comment.astype( 'str' )
del sheets
df.shape

(72771, 2)

In [7]:
def get_coefs( word, *arr ):
    """Pair a word with its embedding coefficients.

    Parameters
    ----------
    word : str
        The token at the start of an embedding-file line.
    *arr
        The remaining fields of the line; they are converted to float32.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a 1-D float32 numpy array.
    """
    vector = np.asarray( arr, dtype='float32' )
    return word, vector

def get_vectors( tokenizer ):
    """Build the embedding matrix for a fitted tokenizer.

    Reads the module-level ``max_features``, ``embed_size`` and
    ``embeddings_index``.

    Parameters
    ----------
    tokenizer
        Object exposing a ``word_index`` mapping of word -> integer index.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(min(max_features, len(word_index) + 1), embed_size)``.
        Row ``i`` holds the pre-trained vector of the word whose index is ``i``;
        rows for words without a pre-trained vector stay all-zero.
    """
    vocab = tokenizer.word_index
    n_rows = min( max_features, len( vocab ) + 1 )
    matrix = np.zeros( ( n_rows, embed_size ) )
    for token, row in vocab.items():
        # indices at or beyond the vocabulary cap have no row in the matrix
        if row < max_features:
            pretrained = embeddings_index.get( token )
            if pretrained is not None:
                matrix[row] = pretrained
    gc.collect()
    return matrix

if pre_trained_flag == True:
    # Parse the text-format embedding file into {word: float32 vector}.
    # The file is opened in a `with` block so the handle is closed even if a line
    # fails to parse (the previous generator expression never closed it).
    embeddings_index = {}
    with open( embeddingfile, encoding='utf-8' ) as embedding_lines:
        for line in embedding_lines:
            word, vector = get_coefs( *line.rstrip().rsplit( ' ' ) )
            # Keep only full-width vectors. This drops blank lines and the
            # "<count> <dim>" header that word2vec/fastText text files typically
            # start with; such entries could otherwise be broadcast into a whole
            # embedding row if a comment token happened to match them.
            if vector.shape[0] == embed_size:
                embeddings_index[ word ] = vector

In [8]:
# Stratified 5-fold splitter; shuffling with a fixed random_state keeps the fold
# assignment the same on every run.
skf = StratifiedKFold( n_splits=5, shuffle=True, random_state=0 )
print( skf )

StratifiedKFold(n_splits=5, random_state=0, shuffle=True)


In [9]:
# Cross-validation driver. For every fold produced by `skf` this cell fits a tokenizer
# on the fold's training comments, trains a CNN, reloads the checkpoint written during
# training, tunes a decision threshold on a held-out validation split, and records
# validation/test metrics plus the per-comment test predictions.
warnings.filterwarnings( 'ignore' )
start_time = time.time()

# per-fold metric accumulators: one entry is appended to each list per fold
valaccuracy, valprecision, valrecall, valf1, valcm = [], [], [], [], []
testaccuracy, testprecision, testrecall, testf1, testcm = [], [], [], [], []
# per-comment test-fold results, concatenated across folds in fold order
com_text, com_label, com_predicted, com_prob = [], [], [], []
com_indices = []

fold = 1
for train_index, test_index in skf.split( df.Comment, df.Toxic ):
    # clearing previous sessions so each fold builds its model from a clean Keras/TF state
    K.clear_session()
    tf.reset_default_graph()
    
    # tokenization with keras tokenizer; the vocabulary is fitted on this fold's
    # training comments only and then applied to both training and test comments.
    # NOTE: df.loc is label-based, so using the integer arrays from skf.split here
    # relies on df carrying a default 0..n-1 index.
    tokenizer = text.Tokenizer( num_words=max_features )
    tokenizer.fit_on_texts( df.loc[ train_index ][ 'Comment' ].values )

    traincomments = tokenizer.texts_to_sequences( df.loc[ train_index ][ 'Comment' ].values )
    testcomments = tokenizer.texts_to_sequences( df.loc[ test_index ][ 'Comment' ].values )
    
    # pad the tokenized sequences to a common length of max_len
    xtrain = sequence.pad_sequences( traincomments, maxlen=max_len )
    xtest = sequence.pad_sequences( testcomments, maxlen=max_len )
    
    ytrain = df.loc[ train_index ][ 'Toxic' ].values
    ytest = df.loc[ test_index ][ 'Toxic' ].values
    
    # split train and val: 15% of the fold's training rows become the validation set
    # (seeded random split; no `stratify` argument is passed)
    xtrain, xval, ytrain, yval = train_test_split( xtrain, ytrain, test_size=0.15, random_state=0 )
    
    # check if pre-trained word embeddings flag is true; when it is not,
    # embedding_matrix keeps whatever value it already holds
    if pre_trained_flag == True:
        embedding_matrix = get_vectors( tokenizer=tokenizer)
    
    # define a model (architecture and compilation live in All_RUT_Models)
    model = All_RUT_Models.CNN_Tweaked( tokenizer=tokenizer, max_len=max_len, embed_size=embed_size,
                                       embedding_matrix=embedding_matrix, embed_trainable=embed_trainable,
                                       spdrpt=spdrpt, drpt=drpt, emb_weights_init=emb_weights_init,
                                       conv_weights_init=conv_weights_init, conv_act=conv_act,
                                       fc_weights_init=fc_weights_init, fc_act=fc_act, optimizer=optimizer,
                                       multi_gpu_flag=multi_gpu_flag, gpus=gpus )
    
    # overwrite the optimizer's learning rate with the configured lr_rate
    K.set_value( model.optimizer.lr, lr_rate )
    
    # train the model with callbacks for early stopping
    # NOTE(review): F1Metrics receives the same checkpoint path that load_model reads
    # below, so it presumably saves the best-validation-F1 model there; confirm in RUT_Utils.
    f1metric = RUT_Utils.F1Metrics( modelpath + modelname + str( fold ) + '.h5', patience=patience,
                                   decay=decay, decay_rate=decay_rate, decay_after=decay_after, softmax=False )
    hist = model.fit( xtrain, ytrain, batch_size=batch, validation_data=( xval,yval ),
                     epochs=nepochs, verbose=0, callbacks=[ f1metric ] )
    
    # load saved model from the per-fold checkpoint file
    loaded_model = load_model( modelpath + modelname + str(fold) + '.h5' )
    
    # get predictions (probabilities) for validation and test sets respectively
    valpredictions = loaded_model.predict( xval, verbose=0, batch_size=2048 )
    testpredictions = loaded_model.predict( xtest, verbose=0, batch_size=2048 )
    
    # optimize the decision threshold on the validation set; the same threshold is
    # applied to the test predictions below
    threshold = RUT_Utils.optimize_threshold( yval, valpredictions )
    
    # save accuracy, precision, recall, f1 and confusion matrices
    vallabels = (valpredictions>=threshold).astype( 'int32' )
    testlabels = (testpredictions>=threshold).astype( 'int32' )
    
    valaccuracy.append( accuracy_score( yval, vallabels ) )
    valprecision.append( precision_score( yval, vallabels ) )
    valrecall.append( recall_score( yval, vallabels ) )
    valf1.append( f1_score( yval, vallabels ) )
    valcm.append( confusion_matrix( yval, vallabels ) )    
    
    testaccuracy.append( accuracy_score( ytest, testlabels ) )
    testprecision.append( precision_score( ytest, testlabels ) )
    testrecall.append( recall_score( ytest, testlabels ) )
    testf1.append( f1_score( ytest, testlabels ) )
    testcm.append( confusion_matrix( ytest, testlabels ) )
    
    # save for future analysis and ensemble; predictions are indexed as 2-D arrays
    # and column 0 is stored as the label / probability for each comment
    com_indices.extend( test_index.tolist() )
    com_text.extend( df.loc[ test_index ][ 'Comment' ] )
    com_label.extend( df.loc[ test_index ][ 'Toxic' ].tolist() )
    com_predicted.extend( testlabels[:,0].tolist() )
    com_prob.extend( testpredictions[:,0].tolist() )
    
    print( 'Fold: {:02d} out of {:02d} completed.'.format( fold, skf.get_n_splits() ) )
    
    # advance the 1-based fold counter used in checkpoint file names and progress output
    fold = fold + 1
# wall-clock time for all folds
time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")

Epoch: 000 --MaxValF1: 0.85482782 --CurValF1: 0.85482782 --Patience: 00 --improved f1: 0.85482782
Epoch: 001 --MaxValF1: 0.86225523 --CurValF1: 0.86225523 --Patience: 00 --improved f1: 0.86225523
Epoch: 002 --MaxValF1: 0.86620870 --CurValF1: 0.86620870 --Patience: 00 --improved f1: 0.86620870
Epoch: 003 --MaxValF1: 0.86620870 --CurValF1: 0.86531427 --Patience: 00
Epoch: 004 --MaxValF1: 0.86620870 --CurValF1: 0.86375661 --Patience: 01
Epoch: 005 --MaxValF1: 0.86620870 --CurValF1: 0.86459378 --Patience: 02
Epoch: 006 --MaxValF1: 0.86620870 --CurValF1: 0.86117493 --Patience: 03
Epoch: 007 --MaxValF1: 0.86620870 --CurValF1: 0.86410684 --Patience: 04
Epoch: 008 --MaxValF1: 0.86620870 --CurValF1: 0.85843568 --Patience: 05
Epoch: 009 --MaxValF1: 0.86620870 --CurValF1: 0.86208043 --Patience: 06
Training stopped due to the patience parameter. --Patience: 07
Fold: 01 out of 05 completed.
Epoch: 000 --MaxValF1: 0.85017645 --CurValF1: 0.85017645 --Patience: 00 --improved f1: 0.85017645
Epoch: 001 

In [10]:
# Report each validation metric as: heading, per-fold scores (4 decimals), mean +- std.
# The third tuple element carries the extra '\n' printed after every metric except the last.
val_report = (
    ( 'Validation Accuracy', valaccuracy, ( '\n', ) ),
    ( 'Validation Precision', valprecision, ( '\n', ) ),
    ( 'Validation Recall', valrecall, ( '\n', ) ),
    ( 'Validation F1', valf1, () ),
)
for heading, fold_scores, tail in val_report:
    print( heading )
    print( [ '{:0.4f}'.format( score ) for score in fold_scores ],
           np.mean( fold_scores ), '+-', np.std( fold_scores ), *tail )

Validation Accuracy
['0.9532', '0.9539', '0.9549', '0.9587', '0.9566'] 0.9554334134890645 +- 0.0019865107928273224 

Validation Precision
['0.8699', '0.8838', '0.8924', '0.9171', '0.8919'] 0.8910229101983141 +- 0.015381245866475584 

Validation Recall
['0.8625', '0.8499', '0.8460', '0.8415', '0.8577'] 0.8515204890014878 +- 0.007667908504861516 

Validation F1
['0.8662', '0.8665', '0.8686', '0.8777', '0.8745'] 0.8706860346715013 +- 0.004580823159118673


In [11]:
# Print every fold's validation confusion matrix rotated by 180 degrees, which reverses
# both the row and the column order of the matrix returned by confusion_matrix.
for fold_cm in valcm:
    print( np.rot90( fold_cm, k=2 ), '\n' )

[[1324  211]
 [ 198 7000]] 

[[1308  231]
 [ 172 7022]] 

[[1302  237]
 [ 157 7037]] 

[[1295  244]
 [ 117 7077]] 

[[1320  219]
 [ 160 7034]] 



In [12]:
# Report each test metric as: heading, per-fold scores (4 decimals), mean +- std.
# The third tuple element carries the extra '\n' printed after every metric except the last.
test_report = (
    ( 'Test Accuracy', testaccuracy, ( '\n', ) ),
    ( 'Test Precision', testprecision, ( '\n', ) ),
    ( 'Test Recall', testrecall, ( '\n', ) ),
    ( 'Test F1', testf1, () ),
)
for heading, fold_scores, tail in test_report:
    print( heading )
    print( [ '{:0.4f}'.format( score ) for score in fold_scores ],
           np.mean( fold_scores ), '+-', np.std( fold_scores ), *tail )

Test Accuracy
['0.9483', '0.9488', '0.9492', '0.9439', '0.9480'] 0.9476439705208058 +- 0.001904368607708735 

Test Precision
['0.8869', '0.8924', '0.8913', '0.9066', '0.8889'] 0.8932292909719652 +- 0.006973980620971416 

Test Recall
['0.8168', '0.8137', '0.8175', '0.7675', '0.8125'] 0.8056028610652046 +- 0.01915638953989145 

Test F1
['0.8504', '0.8513', '0.8528', '0.8313', '0.8490'] 0.8469462905143592 +- 0.007938130015467478


In [13]:
# Print every fold's test confusion matrix rotated by 180 degrees, which reverses
# both the row and the column order of the matrix returned by confusion_matrix.
for fold_cm in testcm:
    print( np.rot90( fold_cm, k=2 ), '\n' )

[[ 2140   480]
 [  273 11662]] 

[[ 2132   488]
 [  257 11677]] 

[[ 2141   478]
 [  261 11674]] 

[[ 2010   609]
 [  207 11728]] 

[[ 2128   491]
 [  266 11669]] 



In [14]:
# Append one summary row for this model to the shared results file:
#   modelname,acc_mean+-acc_std,prec_mean+-prec_std,rec_mean+-rec_std,f1_mean+-f1_std
# Fixed-point formatting (5 decimals for the mean, 4 for the std) keeps the same field
# widths as before but cannot mangle values that str() would print in scientific
# notation, which slicing the string representation did.
summary_fields = [ modelname ]
for fold_scores in ( testaccuracy, testprecision, testrecall, testf1 ):
    summary_fields.append( '{:.5f}+-{:.4f}'.format( np.mean( fold_scores ), np.std( fold_scores ) ) )

# The with-block closes the file even if the write raises.
with open( 'Results/ResultsMain.csv', mode='a' ) as file:
    file.write( ','.join( summary_fields ) + '\n' )

In [15]:
# Gather the per-comment test-fold results into one table (one row per comment) and
# write it next to the other results under the model's name.
dfPredictions = pd.DataFrame( {
    'comment_indices': com_indices,
    #'comment_text': com_text, #comment text
    'comment_label': com_label,
    'comment_predicted': com_predicted,
    'comment_prob': com_prob,
} )
predictions_csv = 'Results/' + modelname + '.csv'
dfPredictions.to_csv( predictions_csv, index=False )
dfPredictions.shape

(72771, 4)