In [None]:
import warnings
warnings.filterwarnings( 'ignore' )
import gc
import os
import time
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

In [None]:
import All_RUT_Models
import RUT_Utils

In [None]:
# Random-forest settings for this notebook. They are handed to
# All_RUT_Models.RF_Model(...) in the training cell.
# NOTE(review): presumably forwarded to scikit-learn's RandomForestClassifier;
# confirm in All_RUT_Models.

nestimators = 1500            # passed as n_est
criterion = 'entropy'         # passed as crit
bootstrap = True
oob_score = True
class_weight = 'balanced'

In [None]:
# Short model identifier; reused for the model directory, the pickled fold
# files and the per-model results CSV.
modelname = 'RF'

# Directory that receives one pickled model per cross-validation fold.
modelpath = './Models/' + modelname + '/'

# Create the output directories up front. exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of os.path.exists() followed by
# os.makedirs(); it still raises if the path exists but is not a directory.
os.makedirs( modelpath, exist_ok=True )
os.makedirs( './Results/', exist_ok=True )

In [None]:
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS.ss``.

    Parameters
    ----------
    sec_elapsed : int or float
        Elapsed time in seconds (expected to be non-negative).

    Returns
    -------
    str
        Hours (unpadded), minutes (2 digits) and seconds (2 digits plus two
        decimals), e.g. ``3661.5 -> "1:01:01.50"``.
    """
    # Round to whole centiseconds *before* splitting into fields. Rounding only
    # the seconds field at format time lets 59.999 s render as "0:00:60.00"
    # instead of carrying into the minutes (and hours) fields.
    total_centis = int(round(sec_elapsed * 100))
    hours, remainder = divmod(total_centis, 60 * 60 * 100)
    minutes, centis = divmod(remainder, 60 * 100)
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, centis / 100)

In [None]:
# Read both worksheets of the workbook and stack them into one frame with a
# fresh 0..n-1 index (the fold indices used later are looked up with .loc).
df = pd.concat(
    [ pd.read_excel( 'RomanUrduToxicity.xlsx', sheet ) for sheet in ( 'Sheet1', 'Sheet2' ) ],
    ignore_index=True,
)
# Cast every comment to text so the vectorizer only ever receives strings.
df[ 'Comment' ] = df.Comment.astype( 'str' )
df.shape

In [None]:
# 5-fold stratified splitter; shuffling with a fixed seed keeps the fold
# assignment the same on every run.
skf = StratifiedKFold( shuffle=True, random_state=0, n_splits=5 )
print( skf )

In [None]:
warnings.filterwarnings( 'ignore' )
start_time = time.time()

# Per-fold metric accumulators: one entry is appended per fold.
valaccuracy, valprecision, valrecall, valf1, valcm = [], [], [], [], []
testaccuracy, testprecision, testrecall, testf1, testcm = [], [], [], [], []
# Test-fold rows, labels and predictions, concatenated in fold order.
com_text, com_label, com_predicted, com_prob = [], [], [], []
com_indices = []

fold = 1
for train_index, test_index in skf.split( df.Comment, df.Toxic ):
    # Raw text and labels of this fold (df has a 0..n-1 index, so the
    # positional fold indices are valid .loc labels).
    train_text = df.loc[ train_index, 'Comment' ].values
    test_text = df.loc[ test_index, 'Comment' ].values
    ytrain = df.loc[ train_index, 'Toxic' ].values
    ytest = df.loc[ test_index, 'Toxic' ].values

    # TF-IDF is fitted on the training fold only, then applied to both parts.
    # NOTE(review): the fit happens before the validation rows are carved out,
    # so the validation rows contribute to the vocabulary / idf weights.
    # NOTE(review): the fitted vectorizer is not saved next to the model.
    vectorizer = TfidfVectorizer(  )
    vectorizer.fit( train_text )
    xtrain = vectorizer.transform( train_text )
    xtest = vectorizer.transform( test_text )

    # Hold out 15% of the training fold for threshold tuning.
    # NOTE(review): this split is not stratified.
    xtrain, xval, ytrain, yval = train_test_split( xtrain, ytrain, test_size=0.15, random_state=1 )

    # Build and train the classifier with the notebook's hyper-parameters.
    model = All_RUT_Models.RF_Model( n_est=nestimators, crit=criterion, bootstrap=bootstrap,
                                    oob_score=oob_score, class_weight=class_weight )
    model.fit( xtrain, ytrain )

    # Persist the fold model, then continue with the copy read back from disk.
    model_file = modelpath + modelname + str(fold) + '.pkl'
    with open( model_file, 'wb' ) as sink:
        pickle.dump( model, sink )
    with open( model_file, 'rb' ) as source:
        model = pickle.load( source )

    # Probability of the second class (column 1) for validation and test rows.
    valpredictions = model.predict_proba( xval )[ :, 1 ]
    testpredictions = model.predict_proba( xtest )[ :, 1 ]

    # The decision threshold is chosen on the validation part and reused
    # unchanged on the test fold.
    threshold = RUT_Utils.optimize_threshold( yval, valpredictions )
    vallabels = (valpredictions>=threshold).astype( 'int32' )
    testlabels = (testpredictions>=threshold).astype( 'int32' )

    # Record accuracy, precision, recall, F1 and the confusion matrix.
    for store, metric in ( ( valaccuracy, accuracy_score ), ( valprecision, precision_score ),
                           ( valrecall, recall_score ), ( valf1, f1_score ),
                           ( valcm, confusion_matrix ) ):
        store.append( metric( yval, vallabels ) )

    for store, metric in ( ( testaccuracy, accuracy_score ), ( testprecision, precision_score ),
                           ( testrecall, recall_score ), ( testf1, f1_score ),
                           ( testcm, confusion_matrix ) ):
        store.append( metric( ytest, testlabels ) )

    # Keep the test-fold rows for later analysis / ensembling.
    com_indices.extend( test_index.tolist() )
    com_text.extend( test_text.tolist() )
    com_label.extend( ytest.tolist() )
    com_predicted.extend( testlabels )
    com_prob.extend( testpredictions )

    print( 'Fold: {:02d} out of {:02d} completed.'.format( fold, skf.get_n_splits() ) )

    fold += 1

time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")

In [None]:
# Per-fold validation scores (4 decimals) followed by their mean and standard
# deviation. The extra '\n' argument leaves a blank line after every section
# except the last one.
for title, scores, tail in (
    ( 'Validation Accuracy', valaccuracy, ( '\n', ) ),
    ( 'Validation Precision', valprecision, ( '\n', ) ),
    ( 'Validation Recall', valrecall, ( '\n', ) ),
    ( 'Validation F1', valf1, () ),
):
    print( title )
    print( [ '{:0.4f}'.format( x ) for x in scores ], np.mean( scores ), '+-', np.std( scores ), *tail )

In [None]:
# Print each fold's validation confusion matrix rotated by 180 degrees, i.e.
# with both the row and the column order reversed.
# NOTE(review): with 0/1 labels this presumably puts class 1 in the top-left
# cell; confirm the label encoding of df.Toxic.
for fold_cm in valcm:
    print( np.rot90( fold_cm, 2 ), '\n' )

In [None]:
# Per-fold test scores (4 decimals) followed by their mean and standard
# deviation. The extra '\n' argument leaves a blank line after every section
# except the last one.
for title, scores, tail in (
    ( 'Test Accuracy', testaccuracy, ( '\n', ) ),
    ( 'Test Precision', testprecision, ( '\n', ) ),
    ( 'Test Recall', testrecall, ( '\n', ) ),
    ( 'Test F1', testf1, () ),
):
    print( title )
    print( [ '{:0.4f}'.format( x ) for x in scores ], np.mean( scores ), '+-', np.std( scores ), *tail )

In [None]:
# Print each fold's test confusion matrix rotated by 180 degrees, i.e. with
# both the row and the column order reversed.
# NOTE(review): with 0/1 labels this presumably puts class 1 in the top-left
# cell; confirm the label encoding of df.Toxic.
for fold_cm in testcm:
    print( np.rot90( fold_cm, 2 ), '\n' )

In [None]:
# Append this model's cross-validated test metrics to the shared results file
# as one CSV row: model name, accuracy, precision, recall, F1, each rendered
# as "<mean>+-<std>".
summary_fields = [ modelname ]
for fold_scores in ( testaccuracy, testprecision, testrecall, testf1 ):
    # The slices keep at most 7 / 6 characters of the float's text form, so the
    # values are truncated rather than rounded.
    # NOTE(review): a very small value that str() renders in scientific
    # notation (e.g. '1.2345678e-05') would lose its exponent here.
    summary_fields.append( str(np.mean( fold_scores ))[:7] + '+-' + str(np.std( fold_scores ))[:6] )

# The row is built before the file is opened and written in a single call, so
# a failure while computing a metric cannot leave a partial row behind. The
# context manager closes the handle even if the write raises.
with open( 'Results/ResultsMain.csv', mode='a' ) as results_file:
    results_file.write( ','.join( summary_fields ) + '\n' )

In [None]:
# Gather the test-fold predictions of all folds into one table and write it to
# 'Results/<modelname>.csv' without the index column. The comment text
# (com_text) is collected during training but is not part of this export.
dfPredictions = pd.DataFrame( {
    'comment_indices': com_indices,
    'comment_label': com_label,
    'comment_predicted': com_predicted,
    'comment_prob': com_prob,
} )
dfPredictions.to_csv( 'Results/' + modelname + '.csv', index=False )
dfPredictions.shape