In [1]:
import warnings
warnings.filterwarnings( 'ignore' )

import gc
import os
import time
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

In [2]:
import All_UT_Models
import UT_Utils

In [3]:
# Hyper-parameters for this model.
# They are handed to All_UT_Models.LR_Model when each fold's model is built.
# NOTE(review): the names suggest they map onto scikit-learn LogisticRegression
# arguments, but LR_Model is defined elsewhere — confirm in All_UT_Models.

PENALTY = 'l2'               # passed as pen=
C = 18                       # passed as c=
SOLVER = 'newton-cg'         # passed as sol=
CLASS_WEIGHT = 'balanced'    # passed as class_weight=

In [4]:
# Short tag for this model; used in saved-model file names and result files.
modelname = 'LR'

# Output locations (relative to the notebook's working directory).
modelpath = './Saved Models/' + modelname + '/'      # pickled per-fold models
modelresults = './Model Results'                     # metric summaries and predictions
modelsummaries = './Model - Summaries-Figures'

DIRECTORIES_TO_BE_CREATED = [ modelpath, modelresults, modelsummaries ]
for directory in DIRECTORIES_TO_BE_CREATED:
    # exist_ok=True makes the cell safely re-runnable and avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs( directory, exist_ok=True )

In [5]:
def hms_string( sec_elapsed ):
    """Format a duration in seconds as 'H hrs MM mins SS.ss secs'.

    Parameters
    ----------
    sec_elapsed : int or float
        Elapsed time in seconds; expected to be non-negative.

    Returns
    -------
    str
        Hours (unpadded), minutes (zero-padded to 2 digits) and seconds
        (zero-padded, two decimals).
    """
    # Round to whole centiseconds BEFORE splitting into fields. Formatting the
    # raw remainder with two decimals could otherwise show "60.00 secs"
    # (e.g. for 59.999) instead of carrying into the minutes field.
    total_centis = int( round( sec_elapsed * 100 ) )
    h, remainder = divmod( total_centis, 60 * 60 * 100 )
    m, centis = divmod( remainder, 60 * 100 )
    s = centis / 100
    return "{} hrs {:>02} mins {:>05.2f} secs".format( h, m, s )

In [6]:
# Column names used throughout the notebook.
xcolumn = 'Urdu'     # comment text
ycolumn = 'Toxic'    # label column

# Read both sheets in a single pass over the workbook (a list of sheet names
# makes read_excel return a dict keyed by sheet name), then stack them.
sheets = pd.read_excel( 'RUT_UT_Comments.xlsx', sheet_name=[ 'Sheet1', 'Sheet2' ] )
df = pd.concat( [ sheets[ 'Sheet1' ], sheets[ 'Sheet2' ] ], ignore_index=True )

# Drop incomplete rows BEFORE casting the text to str: astype('str') turns a
# missing value into the literal string 'nan', which dropna() can no longer
# detect, so empty comments would otherwise survive as the token "nan".
df = df.dropna().reset_index( drop=True )

df[ xcolumn ] = df[ xcolumn ].astype( 'str' ).apply( UT_Utils.urdu_preprocessing )

# NOTE(review): urdu_preprocessing is defined in UT_Utils; if it can return a
# missing value, this second pass removes those rows (it is a no-op otherwise).
df = df.dropna().reset_index( drop=True )

del sheets
gc.collect()

df.shape

(72771, 3)

In [7]:
# Five shuffled, stratified folds; the fixed seed keeps the split repeatable.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=0,
)
print( skf )

StratifiedKFold(n_splits=5, random_state=0, shuffle=True)


In [8]:
warnings.filterwarnings( 'ignore' )


def _fold_metrics( y_true, y_pred ):
    """Return (accuracy, precision, recall, f1, confusion matrix) for one set of hard labels."""
    return (
        accuracy_score( y_true, y_pred ),
        precision_score( y_true, y_pred ),
        recall_score( y_true, y_pred ),
        f1_score( y_true, y_pred ),
        confusion_matrix( y_true, y_pred ),
    )


start_time = time.time()

# Per-fold metric histories (one entry appended per fold).
valaccuracy, valprecision, valrecall, valf1, valcm = [], [], [], [], []
testaccuracy, testprecision, testrecall, testf1, testcm = [], [], [], [], []

# Out-of-fold test predictions, accumulated across folds for later analysis / ensembling.
com_text, com_label, com_predicted, com_prob = [], [], [], []
com_indices = []

# skf.split yields POSITIONAL indices, so index plain arrays with them instead
# of going through df.loc (label-based), which is only equivalent while df has
# a fresh 0..n-1 index. Extracting the columns once also avoids re-slicing the
# frame several times per fold.
all_texts = df[ xcolumn ].values
all_labels = df[ ycolumn ].values

for fold, ( train_index, test_index ) in enumerate( skf.split( all_texts, all_labels ), start=1 ):
    # separating train and test sets
    xtest_raw = all_texts[ test_index ]
    ytest = all_labels[ test_index ]

    # splitting train and validation sets
    # NOTE(review): this inner split is not stratified (no stratify=); left
    # unchanged so the recorded metrics stay reproducible.
    xtrain, xval, ytrain, yval = train_test_split(
        all_texts[ train_index ], all_labels[ train_index ], test_size=0.15, random_state=0
    )

    # tf.idf vectorization
    # NOTE(review): the vocabulary/IDF weights are fitted on train + validation
    # text, so the validation set used for threshold tuning is not fully
    # unseen. Kept as in the original experiment; the test fold is never fitted on.
    vectorizer = TfidfVectorizer(  )
    vectorizer.fit( np.append( xtrain, xval ) )

    xtrain = vectorizer.transform( xtrain )
    xval = vectorizer.transform( xval )
    xtest = vectorizer.transform( xtest_raw )

    # define a model
    model = All_UT_Models.LR_Model( pen=PENALTY, c=C, sol=SOLVER, class_weight=CLASS_WEIGHT )

    # train the model
    model.fit( xtrain, ytrain )

    # save the model together with its fitted vectorizer: the classifier alone
    # cannot score raw text later without the exact vocabulary it was trained on
    model_file = modelpath + modelname + str(fold) + '.pkl'
    vectorizer_file = modelpath + modelname + str(fold) + '_vectorizer.pkl'
    with open( model_file, 'wb' ) as f:
        pickle.dump( model, f )
    with open( vectorizer_file, 'wb' ) as f:
        pickle.dump( vectorizer, f )

    # delete trained model
    del model
    gc.collect()

    # load saved model (round trip through disk so the evaluated model is
    # exactly the persisted artefact; the file was written just above, so
    # unpickling it here does not involve untrusted input)
    with open( model_file, 'rb' ) as f:
        loaded_model = pickle.load( f )

    # get predictions (probabilities of class 1) for validation and test sets respectively
    valpredictions = loaded_model.predict_proba( xval )[ :, 1 ]
    testpredictions = loaded_model.predict_proba( xtest )[ :, 1 ]

    # delete loaded model
    del loaded_model
    gc.collect()

    # optimize the decision threshold on the validation set only
    threshold = UT_Utils.optimize_threshold( yval, valpredictions )

    # turn probabilities into hard labels with the tuned threshold
    vallabels = ( valpredictions >= threshold ).astype( 'int32' )
    testlabels = ( testpredictions >= threshold ).astype( 'int32' )

    # save accuracy, precision, recall, f1 and confusion matrices
    val_stores = ( valaccuracy, valprecision, valrecall, valf1, valcm )
    for store, value in zip( val_stores, _fold_metrics( yval, vallabels ) ):
        store.append( value )

    test_stores = ( testaccuracy, testprecision, testrecall, testf1, testcm )
    for store, value in zip( test_stores, _fold_metrics( ytest, testlabels ) ):
        store.append( value )

    # save for future analysis and ensemble
    com_indices.extend( test_index.tolist() )
    com_text.extend( xtest_raw )
    com_label.extend( ytest.tolist() )
    com_predicted.extend( testlabels )
    com_prob.extend( testpredictions )

    print( 'Fold: {:02d} out of {:02d} completed.'.format( fold, skf.get_n_splits() ) )

time_took = time.time() - start_time
print( f"Total runtime: { hms_string( time_took ) }" )

Fold: 01 out of 05 completed.
Fold: 02 out of 05 completed.
Fold: 03 out of 05 completed.
Fold: 04 out of 05 completed.
Fold: 05 out of 05 completed.
Total runtime: 0 hrs 00 mins 25.80 secs


In [9]:
# Per-fold validation scores (4 decimals) followed by mean +- std over the folds.
val_report = (
    ( 'Validation Accuracy', valaccuracy ),
    ( 'Validation Precision', valprecision ),
    ( 'Validation Recall', valrecall ),
    ( 'Validation F1', valf1 ),
)
val_last_title = val_report[ -1 ][ 0 ]

for val_title, val_scores in val_report:
    print( val_title )
    val_formatted = [ '{:0.4f}'.format( score ) for score in val_scores ]
    # every section except the final one is followed by an extra blank line
    val_trailer = () if val_title == val_last_title else ( '\n', )
    print( val_formatted, np.mean( val_scores ), '+-', np.std( val_scores ), *val_trailer )

Validation Accuracy
['0.9553', '0.9559', '0.9547', '0.9567', '0.9567'] 0.955868544600939 +- 0.0007986073868848263 

Validation Precision
['0.8860', '0.8893', '0.8972', '0.8842', '0.8867'] 0.8886912340265815 +- 0.0045415644134817875 

Validation Recall
['0.8560', '0.8564', '0.8389', '0.8681', '0.8648'] 0.8568452377172875 +- 0.010149717074840027 

Validation F1
['0.8708', '0.8726', '0.8670', '0.8761', '0.8757'] 0.8724164841381492 +- 0.0033348432544904176


In [10]:
# Show each fold's validation confusion matrix rotated by 180 degrees,
# i.e. with both the row order and the column order reversed.
for val_matrix in valcm:
    print( np.rot90( val_matrix, 2 ), '\n' )

[[1314  221]
 [ 169 7029]] 

[[1318  221]
 [ 164 7030]] 

[[1291  248]
 [ 148 7046]] 

[[1336  203]
 [ 175 7019]] 

[[1331  208]
 [ 170 7024]] 



In [11]:
# Per-fold test scores (4 decimals) followed by mean +- std over the folds.
test_report = (
    ( 'Test Accuracy', testaccuracy ),
    ( 'Test Precision', testprecision ),
    ( 'Test Recall', testrecall ),
    ( 'Test F1', testf1 ),
)
test_last_title = test_report[ -1 ][ 0 ]

for test_title, test_scores in test_report:
    print( test_title )
    test_formatted = [ '{:0.4f}'.format( score ) for score in test_scores ]
    # every section except the final one is followed by an extra blank line
    test_trailer = () if test_title == test_last_title else ( '\n', )
    print( test_formatted, np.mean( test_scores ), '+-', np.std( test_scores ), *test_trailer )

Test Accuracy
['0.9549', '0.9556', '0.9566', '0.9519', '0.9554'] 0.954899616193796 +- 0.001599396645913729 

Test Precision
['0.8842', '0.8829', '0.8957', '0.8700', '0.8875'] 0.8840548981011944 +- 0.008312031003289986 

Test Recall
['0.8626', '0.8687', '0.8591', '0.8614', '0.8614'] 0.8626398398083296 +- 0.003235010855818735 

Test F1
['0.8733', '0.8757', '0.8770', '0.8657', '0.8742'] 0.8731896774867194 +- 0.00395982889921393


In [12]:
# Show each fold's test confusion matrix rotated by 180 degrees,
# i.e. with both the row order and the column order reversed.
for test_matrix in testcm:
    print( np.rot90( test_matrix, 2 ), '\n' )

[[ 2260   360]
 [  296 11639]] 

[[ 2276   344]
 [  302 11632]] 

[[ 2250   369]
 [  262 11673]] 

[[ 2256   363]
 [  337 11598]] 

[[ 2256   363]
 [  286 11649]] 



In [13]:
def _mean_std_cell( values ):
    """Render one CSV cell as 'mean+-std' (mean with 5 decimals, std with 4).

    Fixed-point formatting replaces slicing str(float): a very small std is
    printed by str() in scientific notation (e.g. '5.12345e-05'), and cutting
    that string to 6 characters silently drops the exponent.
    """
    return '{:.5f}+-{:.4f}'.format( np.mean( values ), np.std( values ) )


# One summary row per run: model name, then accuracy / precision / recall / f1
# on the test folds. The file is opened in append mode so rows from other
# model notebooks accumulate in the same CSV.
summary_row = [ modelname ] + [
    _mean_std_cell( scores )
    for scores in ( testaccuracy, testprecision, testrecall, testf1 )
]

# 'with' guarantees the handle is closed even if the write fails.
with open( '{}/ResultsMain.csv'.format( modelresults ), mode='a' ) as file:
    file.write( ','.join( summary_row ) + '\n' )

In [14]:
# Out-of-fold test predictions gathered during cross-validation, one row per
# comment. com_text is collected as well but is not exported here.
prediction_columns = {
    'comment_indices': com_indices,
    'comment_label': com_label,
    'comment_predicted': com_predicted,
    'comment_prob': com_prob,
}
dfPredictions = pd.DataFrame( prediction_columns )

predictions_file = '{}/{}.csv'.format( modelresults, modelname )
dfPredictions.to_csv( predictions_file, index=False )
dfPredictions.shape

(72771, 4)