<a href="https://colab.research.google.com/github/kairamilanifitria/Bootcamp-NLP/blob/main/13092024_Evaluation_Metrics_for_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, GRU, SimpleRNN
from keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

from sklearn.metrics import classification_report, confusion_matrix

In [2]:
data = pd.read_csv('/content/drive/MyDrive/Bootcamp AI/Dataset/dataset_spam.csv')
data.tail()

Unnamed: 0,content,label
6663,f a n s b e t i n g agent bola terpercaya di i...,spam
6664,pasti ada campur tangan ulah orang dalam karen...,normal
6665,mauuu puunyaa kullittt puutiiihh secaaraa inns...,spam
6666,bagi aku kaaa dikit aja,normal
6667,mila sipitt allah tuh benci sama orang yg meru...,normal


In [3]:
# mapping  label dan konten, X untuk konten, dan Y untuk label
X = data.content
Y = data.label
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(Y)
Y = Y.reshape(-1,1)

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
# 6000 row
# 70% data train dan 30% data test.
# 80< maka dia spam

In [5]:
# conversi teks ke sequence
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train) # indexing
sequences_matrix = pad_sequences(sequences,maxlen=max_len)

In [6]:
# define RNN
def RNN():
    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(max_words, 50, input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

In [7]:
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])



In [8]:
model.fit(sequences_matrix, Y_train,batch_size=128, epochs=10,
          validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])

Epoch 1/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 411ms/step - accuracy: 0.8596 - loss: 0.4328 - val_accuracy: 0.9111 - val_loss: 0.3332
Epoch 2/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 480ms/step - accuracy: 0.9160 - loss: 0.2931 - val_accuracy: 0.9272 - val_loss: 0.2811
Epoch 3/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 234ms/step - accuracy: 0.9365 - loss: 0.2367 - val_accuracy: 0.9336 - val_loss: 0.2228
Epoch 4/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 269ms/step - accuracy: 0.9511 - loss: 0.1759 - val_accuracy: 0.9465 - val_loss: 0.1919
Epoch 5/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 306ms/step - accuracy: 0.9633 - loss: 0.1318 - val_accuracy: 0.9572 - val_loss: 0.1512
Epoch 6/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 232ms/step - accuracy: 0.9643 - loss: 0.1227 - val_accuracy: 0.9540 - val_loss: 0.1385
Epoch 7/10
[1m30/30[0

<keras.src.callbacks.history.History at 0x7958b2730fd0>

In [9]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = pad_sequences(test_sequences, maxlen=max_len)

In [10]:
accr = model.evaluate(test_sequences_matrix, Y_test)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9515 - loss: 0.1802


In [11]:
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.162
  Accuracy: 0.957


# Matriks NLP Dasar

* Akurasi, Presisi, Recall, F1-Score

In [12]:
y_prediction = model.predict(test_sequences_matrix)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step


In [13]:
confusion_matrix = confusion_matrix(Y_test, np.rint(y_prediction))

In [14]:
confusion_matrix

array([[1696,   14],
       [  73,  218]])

In [15]:
print(classification_report(Y_test, np.rint(y_prediction)))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      1710
           1       0.94      0.75      0.83       291

    accuracy                           0.96      2001
   macro avg       0.95      0.87      0.90      2001
weighted avg       0.96      0.96      0.95      2001



# Matriks NLP Lanjut

* BLEU

In [16]:
from nltk.translate.bleu_score import sentence_bleu

In [17]:
reference = [['budi', 'pergi', 'ke', 'sekolah']]
candidate = ['budi', 'pergi', 'ke', 'sekolah'] # from machine translation
score = sentence_bleu(reference, candidate)
print(score)

1.0


In [18]:
reference = [["saya", "suka", "makan", "nasi", "goreng", "di", "restoran", "ini"] ]
candidate = ["aku", "suka", "makan", "nasi", "goreng", "di", "tempat", "ini"]

score = sentence_bleu(reference, candidate)
print(score)

0.5410822690539396


In [19]:
# 1-gram individual BLEU

reference = [["saya", "suka", "makan", "nasi", "goreng", "di", "restoran", "ini"] ]
candidate = ["aku", "suka", "makan", "nasi", "goreng", "di", "tempat", "ini"]

score = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
print(score)

0.75


In [20]:
# n-gram individual BLEU

print('Individual 1-gram: %f' % sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)))
print('Individual 2-gram: %f' % sentence_bleu(reference, candidate, weights=(0, 1, 0, 0)))
print('Individual 3-gram: %f' % sentence_bleu(reference, candidate, weights=(0, 0, 1, 0)))
print('Individual 4-gram: %f' % sentence_bleu(reference, candidate, weights=(0, 0, 0, 1)))

Individual 1-gram: 0.750000
Individual 2-gram: 0.571429
Individual 3-gram: 0.500000
Individual 4-gram: 0.400000


In [21]:
# 4-gram cumulative BLEU

score = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))
print(score)

0.5410822690539396


In [22]:
# cumulative BLEU scores

print('Cumulative 1-gram: %f' % sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)))
print('Cumulative 2-gram: %f' % sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0)))
print('Cumulative 3-gram: %f' % sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0)))

print('Cumulative 4-gram: %f' % sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25)))

Cumulative 1-gram: 0.750000
Cumulative 2-gram: 0.654654
Cumulative 3-gram: 0.601489
Cumulative 4-gram: 0.541082


In [23]:
# perfect match
reference = [["saya", "suka", "makan", "nasi", "goreng", "di", "restoran", "ini"] ]
candidate = ["saya", "suka", "makan", "nasi", "goreng", "di", "restoran", "ini"]
score = sentence_bleu(reference, candidate)
print(score)

1.0


In [24]:
# berbeda satu kata
reference = [["saya", "suka", "makan", "nasi", "goreng", "di", "restoran", "ini"] ]
candidate = ["saya", "suka", "makan", "nasi", "goreng", "di", "tempat", "ini"]
score = sentence_bleu(reference, candidate)
print(score)

0.7071067811865475


In [25]:
# berbeda tiga kata
reference = [["saya", "suka", "makan", "nasi", "goreng", "di", "restoran", "ini"] ]
candidate = ["saya", "suka", "beli", "nasi", "goreng", "di", "tempat", "ini"]
score = sentence_bleu(reference, candidate)
print(score)

5.87583260478785e-78


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [26]:
# berbeda keseluruhan
reference = [["saya", "suka", "makan", "nasi", "goreng", "di", "restoran", "ini"] ]
candidate = ["q", "w", "e", "r", "t", "y", "u", "i"]
score = sentence_bleu(reference, candidate)
print(score)

0


* ROUGE

In [27]:
!pip install evaluate
!pip install rouge-score



In [28]:
import evaluate
rouge = evaluate.load('rouge')
predictions = ["Hari ini saya belajar NLP"]
references = [
              ["Hari ini saya belajar NLP di rumah"]
             ]
results = rouge.compute(predictions=predictions, references=references)
print(results)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.8333333333333333, 'rouge2': 0.8, 'rougeL': 0.8333333333333333, 'rougeLsum': 0.8333333333333333}


In [29]:
rouge = evaluate.load('rouge')
predictions = ["Hari ini saya belajar NLP di rumah"]
references = [
              ["Hari ini saya belajar NLP di rumah"]
             ]
results = rouge.compute(predictions=predictions, references=references)
print(results)

{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}


In [30]:
rouge = evaluate.load('rouge')
predictions = ["Hari ini saya belajar di rumah"]
references = [
              ["Hari ini saya belajar NLP di rumah"]
             ]
results = rouge.compute(predictions=predictions, references=references)
print(results)

{'rouge1': 0.923076923076923, 'rouge2': 0.7272727272727272, 'rougeL': 0.923076923076923, 'rougeLsum': 0.923076923076923}


* Perplexity

In [31]:
!pip install transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [32]:
import torch

In [33]:
inputs = tokenizer("Machine Learing adalah topik yang cukup hangat dalam pembicaraan", return_tensors = "pt")
loss = model(input_ids = inputs["input_ids"], labels = inputs["input_ids"]).loss
ppl = torch.exp(loss)
print(ppl)

tensor(1172.3922, grad_fn=<ExpBackward0>)


In [34]:
inputs_wiki_text = tokenizer("Generative Pretrained Transformer is an opensource artificial intelligence created by OpenAI in February 2019", return_tensors = "pt")
loss = model(input_ids = inputs_wiki_text["input_ids"], labels = inputs_wiki_text["input_ids"]).loss
ppl = torch.exp(loss)
print(ppl)

tensor(211.8131, grad_fn=<ExpBackward0>)
