In [1]:
import warnings
warnings.filterwarnings( 'ignore' )

import gc
import os
import time
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

In [2]:
import All_UT_Models
import UT_Utils

In [3]:
# hyper parameters for this model
# (these four values are passed to All_UT_Models.SVM_Model when each fold's model is built)

KERNEL = 'rbf'  # forwarded as the `ker` argument
C = 10  # forwarded as the `c` argument
GAMMA = 0.9  # forwarded as the `gam` argument
CLASS_WEIGHT = 'balanced'  # forwarded as the `class_weight` argument

In [4]:
# Name of this model; it is embedded in the saved-model file names and in the result files.
modelname = 'SVM'

# Output locations, relative to the notebook's working directory.
# modelpath keeps its trailing '/' because file names are concatenated directly onto it later.
modelpath = './Saved Models/' + modelname + '/'
modelresults = './Model Results'
modelsummaries = './Model - Summaries-Figures'

DIRECTORIES_TO_BE_CREATED = [ modelpath, modelresults, modelsummaries ]
for directory in DIRECTORIES_TO_BE_CREATED:
    # exist_ok=True makes the call idempotent and avoids the check-then-create
    # race of testing os.path.exists() first (e.g. several notebooks started together).
    os.makedirs( directory, exist_ok=True )

In [5]:
def hms_string( sec_elapsed ):
    """Format a duration given in seconds as 'H hrs MM mins SS.ss secs'.

    Parameters
    ----------
    sec_elapsed : int or float
        Elapsed time in seconds.

    Returns
    -------
    str
        For example ``'0 hrs 46 mins 24.43 secs'``. A negative duration is
        rendered as its absolute value with a leading '-'.
    """
    # Round to hundredths of a second *before* splitting into units; otherwise a
    # value such as 59.999 is formatted as '00 mins 60.00 secs' because the
    # rounding only happens inside the format spec, after the units were split.
    total_centis = int( round( float( sec_elapsed ) * 100 ) )
    sign = '-' if total_centis < 0 else ''
    total_centis = abs( total_centis )

    h, remainder = divmod( total_centis, 60 * 60 * 100 )
    m, centis = divmod( remainder, 60 * 100 )
    s = centis / 100
    return "{}{} hrs {:>02} mins {:>05.2f} secs".format( sign, h, m, s )

In [6]:
# Column holding the comment text and column holding the label.
xcolumn = 'Urdu'
ycolumn = 'Toxic'

# Read both sheets with a single pass over the workbook; a list of sheet names
# makes read_excel return a dict keyed by sheet name.
sheets = pd.read_excel( 'RUT_UT_Comments.xlsx', sheet_name=[ 'Sheet1', 'Sheet2' ] )
df = pd.concat( [ sheets[ 'Sheet1' ], sheets[ 'Sheet2' ] ], ignore_index=True )

# Drop incomplete rows BEFORE casting the text to str: astype( 'str' ) turns a
# missing value into the literal text 'nan', which dropna() can no longer
# detect, so empty comments would otherwise be kept as the token "nan".
df = df.dropna().reset_index( drop=True )

df[ xcolumn ] = df[ xcolumn ].astype( 'str' ).apply( UT_Utils.urdu_preprocessing )

# Second pass in case the preprocessing step produced missing values
# (NOTE(review): depends on what UT_Utils.urdu_preprocessing returns - confirm).
df = df.dropna().reset_index( drop=True )

del sheets
gc.collect()

df.shape

(72771, 3)

In [7]:
# Stratified 5-fold splitter used for the outer train/test folds;
# shuffling with a fixed random_state keeps the fold assignment repeatable.
skf = StratifiedKFold( n_splits=5, shuffle=True, random_state=0 )
print( skf )

StratifiedKFold(n_splits=5, random_state=0, shuffle=True)


In [8]:
warnings.filterwarnings( 'ignore' )
start_time = time.time()
print( 'Local System Time: {}'.format( time.strftime( "%I:%M %p", time.localtime() ) ) )

# Per-fold metrics and confusion matrices for the validation and test splits.
valaccuracy, valprecision, valrecall, valf1, valcm = [], [], [], [], []
testaccuracy, testprecision, testrecall, testf1, testcm = [], [], [], [], []
# Test-fold rows, labels, thresholded predictions and probabilities, accumulated over all folds.
com_text, com_label, com_predicted, com_prob = [], [], [], []
com_indices = []

n_folds = skf.get_n_splits()

fold = 1
for train_index, test_index in skf.split( df[ xcolumn ], df[ ycolumn ] ):
    # separating train and test sets
    # skf.split yields *positional* indices, so select with .iloc; label-based
    # .loc only gives the same rows while df carries a default 0..n-1 index.
    xtrain = df[ xcolumn ].iloc[ train_index ].values
    xtest = df[ xcolumn ].iloc[ test_index ].values

    ytrain = df[ ycolumn ].iloc[ train_index ].values
    ytest = df[ ycolumn ].iloc[ test_index ].values

    # splitting train and validation sets
    xtrain, xval, ytrain, yval = train_test_split( xtrain, ytrain, test_size=0.15, random_state=0 )

    # tf.idf vectorization
    # NOTE(review): the vocabulary/IDF weights are fitted on train + validation
    # text, so the validation split is not fully unseen when the threshold is
    # tuned on it. The test fold is only transformed. Left unchanged to keep
    # the reported numbers reproducible.
    vectorizer = TfidfVectorizer(  )
    vectorizer.fit( np.append( xtrain, xval ) )

    xtrain = vectorizer.transform( xtrain )
    xval = vectorizer.transform( xval )
    xtest = vectorizer.transform( xtest )

    # define a model
    model = All_UT_Models.SVM_Model( c=C, ker=KERNEL, gam=GAMMA, class_weight=CLASS_WEIGHT )

    # train the model
    model.fit( xtrain, ytrain )

    # save trained model (one pickle per fold)
    model_file = modelpath + modelname + str(fold) + '.pkl'
    with open( model_file, 'wb' ) as f:
        pickle.dump( model, f )

    # delete model
    del model
    gc.collect()

    # load saved model
    # (the pickle was written just above by this cell; never unpickle untrusted files)
    with open( model_file, 'rb' ) as f:
        loaded_model = pickle.load( f )

    # get predictions (probabilities) for validation and test sets respectively
    # column 1 of predict_proba is presumably the positive (toxic) class - TODO confirm
    valpredictions = loaded_model.predict_proba( xval )[ :, 1 ]
    testpredictions = loaded_model.predict_proba( xtest )[ :, 1 ]

    # delete loaded model
    del loaded_model
    gc.collect()

    # optimize the decision threshold on the validation set only
    threshold = UT_Utils.optimize_threshold( yval, valpredictions )

    # turn probabilities into hard labels with the validation-tuned threshold
    vallabels = ( valpredictions >= threshold ).astype( 'int32' )
    testlabels = ( testpredictions >= threshold ).astype( 'int32' )

    # save accuracy, precision, recall, f1 and confusion matrices
    valaccuracy.append( accuracy_score( yval, vallabels ) )
    valprecision.append( precision_score( yval, vallabels ) )
    valrecall.append( recall_score( yval, vallabels ) )
    valf1.append( f1_score( yval, vallabels ) )
    valcm.append( confusion_matrix( yval, vallabels ) )

    testaccuracy.append( accuracy_score( ytest, testlabels ) )
    testprecision.append( precision_score( ytest, testlabels ) )
    testrecall.append( recall_score( ytest, testlabels ) )
    testf1.append( f1_score( ytest, testlabels ) )
    testcm.append( confusion_matrix( ytest, testlabels ) )

    # save for future analysis and ensemble
    com_indices.extend( test_index.tolist() )
    com_text.extend( df[ xcolumn ].iloc[ test_index ] )
    com_label.extend( df[ ycolumn ].iloc[ test_index ].tolist() )
    com_predicted.extend( testlabels )
    com_prob.extend( testpredictions )

    print( 'Fold: {:02d} out of {:02d} completed.'.format( fold, n_folds ) )
    print( 'Local System Time: {}'.format( time.strftime( "%I:%M %p", time.localtime() ) ) )

    fold = fold + 1
time_took = time.time() - start_time
print( f"Total runtime: { hms_string( time_took ) }" )

Local System Time: 06:36 PM
Fold: 01 out of 05 completed.
Local System Time: 06:45 PM
Fold: 02 out of 05 completed.
Local System Time: 06:54 PM
Fold: 03 out of 05 completed.
Local System Time: 07:04 PM
Fold: 04 out of 05 completed.
Local System Time: 07:13 PM
Fold: 05 out of 05 completed.
Local System Time: 07:22 PM
Total runtime: 0 hrs 46 mins 24.43 secs


In [9]:
# For each validation metric: the per-fold scores (4 decimals), then mean +- std over folds.
# The third field says whether a blank separator line follows the metric.
val_summary = (
    ( 'Validation Accuracy', valaccuracy, True ),
    ( 'Validation Precision', valprecision, True ),
    ( 'Validation Recall', valrecall, True ),
    ( 'Validation F1', valf1, False ),
)
for heading, fold_scores, blank_line_after in val_summary:
    print( heading )
    formatted = [ '{:0.4f}'.format( score ) for score in fold_scores ]
    trailer = ( '\n', ) if blank_line_after else ()
    print( formatted, np.mean( fold_scores ), '+-', np.std( fold_scores ), *trailer )

Validation Accuracy
['0.9599', '0.9592', '0.9565', '0.9608', '0.9587'] 0.9590289705713959 +- 0.001463556194542134 

Validation Precision
['0.9022', '0.8833', '0.8765', '0.8950', '0.8850'] 0.8884288991512779 +- 0.009096096915310516 

Validation Recall
['0.8658', '0.8856', '0.8765', '0.8811', '0.8798'] 0.8777729944356608 +- 0.006661709906137988 

Validation F1
['0.8836', '0.8845', '0.8765', '0.8880', '0.8824'] 0.8830130485388701 +- 0.003738370138215867


In [10]:
# Show each fold's validation confusion matrix rotated by 180 degrees
# (both axes reversed, so the last row/column is printed first).
for fold_cm in valcm:
    print( np.rot90( fold_cm, 2 ), '\n' )

[[1329  206]
 [ 144 7054]] 

[[1363  176]
 [ 180 7014]] 

[[1349  190]
 [ 190 7004]] 

[[1356  183]
 [ 159 7035]] 

[[1354  185]
 [ 176 7018]] 



In [11]:
# For each test metric: the per-fold scores (4 decimals), then mean +- std over folds.
# The third field says whether a blank separator line follows the metric.
test_summary = (
    ( 'Test Accuracy', testaccuracy, True ),
    ( 'Test Precision', testprecision, True ),
    ( 'Test Recall', testrecall, True ),
    ( 'Test F1', testf1, False ),
)
for heading, fold_scores, blank_line_after in test_summary:
    print( heading )
    formatted = [ '{:0.4f}'.format( score ) for score in fold_scores ]
    trailer = ( '\n', ) if blank_line_after else ()
    print( formatted, np.mean( fold_scores ), '+-', np.std( fold_scores ), *trailer )

Test Accuracy
['0.9593', '0.9568', '0.9598', '0.9590', '0.9577'] 0.9585274243961542 +- 0.0011079713052996973 

Test Precision
['0.9056', '0.8801', '0.8815', '0.8874', '0.8824'] 0.8873952697114655 +- 0.00942513479518696 

Test Recall
['0.8641', '0.8798', '0.8973', '0.8843', '0.8828'] 0.88165376913862 +- 0.010617645342731252 

Test F1
['0.8844', '0.8799', '0.8893', '0.8858', '0.8826'] 0.8844127003039537 +- 0.003138309037483437


In [12]:
# Show each fold's test confusion matrix rotated by 180 degrees
# (both axes reversed, so the last row/column is printed first).
for fold_cm in testcm:
    print( np.rot90( fold_cm, 2 ), '\n' )

[[ 2264   356]
 [  236 11699]] 

[[ 2305   315]
 [  314 11620]] 

[[ 2350   269]
 [  316 11619]] 

[[ 2316   303]
 [  294 11641]] 

[[ 2312   307]
 [  308 11627]] 



In [13]:
# Append one summary row for this model to the shared results file:
#   modelname,accuracy,precision,recall,f1   (each metric as mean+-std over the test folds)
# The row is assembled before the file is opened so a failure while computing a
# metric cannot leave a half-written line behind.
summary_fields = [ modelname ]
for fold_scores in ( testaccuracy, testprecision, testrecall, testf1 ):
    # string truncation (not rounding) keeps the exact format of the existing rows
    summary_fields.append( str(np.mean( fold_scores ))[:7] + '+-' + str(np.std( fold_scores ))[:6] )

# the context manager closes the handle even if the write raises
with open( '{}/ResultsMain.csv'.format( modelresults ), mode='a' ) as file:
    file.write( ','.join( summary_fields ) + '\n' )

In [14]:
# Out-of-fold test predictions for every comment, saved for later analysis / ensembling.
dfPredictions = pd.DataFrame( {
    'comment_indices': com_indices,
    #'comment_text': com_text, #comment text
    'comment_label': com_label,
    'comment_predicted': com_predicted,
    'comment_prob': com_prob,
} )
dfPredictions.to_csv( '{}/{}.csv'.format( modelresults, modelname ), index=False )
dfPredictions.shape

(72771, 4)