In [14]:
import pandas as pd
import numpy as np
import pickle
from tensorflow.keras.models import load_model
# NLP library
import nltk
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, average_precision_score
from sklearn.metrics import f1_score, precision_recall_curve, roc_auc_score, auc, classification_report

# function
import sys
sys.path.append('function/')
from ursar import nlp

In [2]:
#function
def preprocess_text(sentence):
    id_stop = set(nltk.corpus.stopwords.words('indonesian'))
    factory_Stemmer = StemmerFactory()
    stemmer = factory_Stemmer.create_stemmer()
    sentence = re.sub(r'\W', ' ', str(sentence))
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    sentence = re.sub(r'\^[a-zA-Z]\s+', ' ', sentence)
    sentence = sentence.replace("\n"," ")
    sentence = re.sub(r'\s+', ' ', sentence)
    sentence = re.sub(r"\s+$", "", sentence)
    sentence = re.sub(r"^\s+", "", sentence)
    sentence = sentence.lower()
    sentence = sentence.split()
    sentence = [word for word in sentence if word not in id_stop]
    sentence = [word for word in sentence if len(word) > 3]
    sentence = ' '.join(sentence)
    sentence = stemmer.stem(sentence)
    return sentence

def tokenize_matrix(corpus,mode):
    #corpus, 5000, 'post', 120
    # create the tokenizer
    # load label train dataset file here
    with open('model/tokenizer_ann', 'rb') as picklefile:
        tokenizer = pickle.load(picklefile)

    # encode training data set
    sen = tokenizer.texts_to_matrix(corpus, mode=mode)
    return(sen)

def tokenize_embedding(corpus,padding_type,max_length):
    #corpus, 5000, 'post', 120
    # create the tokenizer
    # load label train dataset file here
    with open('model/tokenizer_embed', 'rb') as picklefile:
        tokenizer = pickle.load(picklefile)

    # encode training data set
    sen = tokenizer.texts_to_sequences(corpus)
    sen = pad_sequences(sen, padding=padding_type, maxlen=max_length)
    return(sen)

def predict_embedding(reviews, model,input):
    # apply preprocess_text function to out training dataset
    # encode
    if (input == "embedding"):
        encoded = tokenize_embedding(reviews,'post', 120)
    if (input == "matrixs"):
        encoded = tokenize_matrix(reviews,"freq")
    # prediction
    yhat = model.predict(encoded, verbose=0)

    if (yhat[0,0]>=0.5):
        res = "positive review"
    else:
        res = "negative review"
    return (res,yhat[0,0])

def print_score (y_test,y_pred,y_probs):
    print("comfusion matrix = ")
    print(confusion_matrix(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    (tn,fp,fn,tp) = confusion_matrix(y_test, y_pred).ravel()

    print("")
    accuracy = accuracy_score(y_test, y_pred)
    print('accuracy_score = ', accuracy)
    bas = balanced_accuracy_score(y_test, y_pred)
    print('balanced_accuracy_score = ', bas)
    #balanced accuracy is equal to the arithmetic mean of sensitivity (true positive rate) and specificity (true negative rate),
    #or the area under the ROC curve with binary predictions rather than scores.

    #In multilabel classification,
    #this function computes subset accuracy: the set of labels predicted for
    #a sample must exactly match the corresponding set of labels in y_true

    print("")
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    aps = average_precision_score(y_test, y_pred)
    print ("precision score = ", precision)
    print ("average precision score = ", aps)
    print ("recall score = ", recall)

    #precision An interesting one to look at is the accuracy of the positive pre‐ dictions; this is called the precision of the classifier
    # recall, also called sensitivity or true positive rate (TPR): this is the ratio of positive instances that are correctly detected by the classifier
    #precision = TP/TP + FP
    #recall = TP/TP + FN

    print("")
    f1 = f1_score(y_test, y_pred)
    print ("F1 score = ", f1)
    precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
    aucs = auc(recall, precision)
    print ("AUC of Precision-Recall Curve on Testing = ", aucs)
    aucroc = roc_auc_score(y_test,y_probs)
    print ("AUC of ROC = ", aucroc)
    gini = aucs*2 - 1
    print("Gini = ", gini)

    print("")
    cr = classification_report(y_test,y_pred)
    print("classification_report")
    print(cr)

    #The F1 score is the harmonic mean of precision and recall (Equation 3-3).
    #Whereas the regular mean treats all values equally,
    #the harmonic mean gives much more weight to low values.

# Load Test dataset

In [3]:
print("Load Test dataset")
test = pd.read_csv('DATA/test_data_restaurant.tsv', sep='\t',header=None).sample(frac=1).reset_index(drop=True)
test.columns = ['sentence', 'label']
print("\ndata testing shape")
print(test.shape)

Load Test dataset

data testing shape
(185, 2)


In [4]:
print("\nis the test dataset contain the null values?")
print(test.isnull().sum())


is the test dataset contain the null values?
sentence    0
label       0
dtype: int64


In [5]:
test["label"] = list(map(lambda x: 1 if x=="positive" else 0, test["label"]))
print("\nchange label for test dataset")
test["label"].unique()
y_test = test["label"]


change label for test dataset


In [6]:
print("\napply preprocess_text function to out testing dataset")
reviews_test = []
sentences = list(test["sentence"])
for sen in sentences:
    reviews_test.append(preprocess_text(sen))


apply preprocess_text function to out testing dataset


In [7]:
print("\nmake token for test dataset")
token_test_ann = tokenize_matrix(reviews_test,"freq")
token_test = tokenize_embedding(reviews_test,'post', 120)


make token for test dataset


In [8]:
print("\nload matrix_ANN model")
model = load_model('model/model_matrix_ANN.h5')


load matrix_ANN model


In [17]:
loss, accuracy = model.evaluate(token_test_ann,y_test , verbose=False)
print("\nTesting Accuracy:  {:.4f}".format(accuracy))
print("Testing loss:  {:.4f}\n".format(loss))
print_score(y_test,np.round(model.predict(token_test_ann)),model.predict(token_test_ann))


Testing Accuracy:  0.7892
Testing loss:  0.4885

comfusion matrix = 
[[ 32  33]
 [  6 114]]

accuracy_score =  0.7891891891891892
balanced_accuracy_score =  0.7211538461538461

precision score =  0.7755102040816326
average precision score =  0.7691671263099834
recall score =  0.95

F1 score =  0.8539325842696629
AUC of Precision-Recall Curve on Testing =  0.8606574410209081
AUC of ROC =  0.8242307692307692
Gini =  0.7213148820418163

classification_report
              precision    recall  f1-score   support

           0       0.84      0.49      0.62        65
           1       0.78      0.95      0.85       120

    accuracy                           0.79       185
   macro avg       0.81      0.72      0.74       185
weighted avg       0.80      0.79      0.77       185



In [18]:
print("\nload CNN model")
model = load_model('model/model_CNN.h5')


load CNN model


In [20]:
loss, accuracy = model.evaluate(token_test,y_test, verbose=False)
print("\nTesting Accuracy:  {:.4f}".format(accuracy))
print("Testing loss:  {:.4f}\n".format(loss))
print_score(y_test,np.round(model.predict(token_test)),model.predict(token_test))


Testing Accuracy:  0.8270
Testing loss:  0.5228

comfusion matrix = 
[[ 43  22]
 [ 10 110]]

accuracy_score =  0.827027027027027
balanced_accuracy_score =  0.7891025641025641

precision score =  0.8333333333333334
average precision score =  0.8179429429429429
recall score =  0.9166666666666666

F1 score =  0.8730158730158729
AUC of Precision-Recall Curve on Testing =  0.8840873698179894
AUC of ROC =  0.8451282051282051
Gini =  0.7681747396359788

classification_report
              precision    recall  f1-score   support

           0       0.81      0.66      0.73        65
           1       0.83      0.92      0.87       120

    accuracy                           0.83       185
   macro avg       0.82      0.79      0.80       185
weighted avg       0.83      0.83      0.82       185



In [22]:
print("\nload CNN LSTM model")
model = load_model('model/model_CNN_LSTM.h5')


load CNN LSTM model


In [23]:
loss, accuracy = model.evaluate(token_test,y_test, verbose=False)
print("\nTesting Accuracy:  {:.4f}".format(accuracy))
print("Testing loss:  {:.4f}\n".format(loss))
print_score(y_test,np.round(model.predict(token_test)),model.predict(token_test))


Testing Accuracy:  0.7946
Testing loss:  0.5262

comfusion matrix = 
[[ 47  18]
 [ 20 100]]

accuracy_score =  0.7945945945945946
balanced_accuracy_score =  0.7782051282051282

precision score =  0.847457627118644
average precision score =  0.8143227973736449
recall score =  0.8333333333333334

F1 score =  0.8403361344537815
AUC of Precision-Recall Curve on Testing =  0.9035301805555064
AUC of ROC =  0.8628205128205128
Gini =  0.8070603611110128

classification_report
              precision    recall  f1-score   support

           0       0.70      0.72      0.71        65
           1       0.85      0.83      0.84       120

    accuracy                           0.79       185
   macro avg       0.77      0.78      0.78       185
weighted avg       0.80      0.79      0.80       185



In [24]:
print("\nload CNN wiki model")
model = load_model('model/model_CNN_wiki.h5')


load CNN wiki model


In [25]:
loss, accuracy = model.evaluate(token_test,y_test, verbose=False)
print("\nTesting Accuracy:  {:.4f}".format(accuracy))
print("Testing loss:  {:.4f}\n".format(loss))
print_score(y_test,np.round(model.predict(token_test)),model.predict(token_test))


Testing Accuracy:  0.7730
Testing loss:  0.5976

comfusion matrix = 
[[ 31  34]
 [  8 112]]

accuracy_score =  0.772972972972973
balanced_accuracy_score =  0.7051282051282052

precision score =  0.7671232876712328
average precision score =  0.7592249784030606
recall score =  0.9333333333333333

F1 score =  0.8421052631578947
AUC of Precision-Recall Curve on Testing =  0.8765774603184949
AUC of ROC =  0.816923076923077
Gini =  0.7531549206369899

classification_report
              precision    recall  f1-score   support

           0       0.79      0.48      0.60        65
           1       0.77      0.93      0.84       120

    accuracy                           0.77       185
   macro avg       0.78      0.71      0.72       185
weighted avg       0.78      0.77      0.76       185



In [26]:
print("\nload CNN LSTM wiki model")
model = load_model('model/model_CNN_LSTM_wiki.h5')


load CNN LSTM wiki model


In [27]:
loss, accuracy = model.evaluate(token_test,y_test, verbose=False)
print("\nTesting Accuracy:  {:.4f}".format(accuracy))
print("Testing loss:  {:.4f}\n".format(loss))
print_score(y_test,np.round(model.predict(token_test)),model.predict(token_test))


Testing Accuracy:  0.7676
Testing loss:  0.6052

comfusion matrix = 
[[ 36  29]
 [ 14 106]]

accuracy_score =  0.7675675675675676
balanced_accuracy_score =  0.7185897435897436

precision score =  0.7851851851851852
average precision score =  0.7692559225892559
recall score =  0.8833333333333333

F1 score =  0.8313725490196078
AUC of Precision-Recall Curve on Testing =  0.8986731472238412
AUC of ROC =  0.8398717948717946
Gini =  0.7973462944476823

classification_report
              precision    recall  f1-score   support

           0       0.72      0.55      0.63        65
           1       0.79      0.88      0.83       120

    accuracy                           0.77       185
   macro avg       0.75      0.72      0.73       185
weighted avg       0.76      0.77      0.76       185

