In [1]:
from typing import Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import models
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BatchEncoding, PreTrainedTokenizerBase, TFBertModel

# Raise the TensorFlow logger threshold to ERROR so lower-severity messages are not emitted.
tf.get_logger().setLevel('ERROR')

In [3]:
# Token length every text is truncated/padded to by the tokenizer call in encode_texts.
MAX_LENGTH = 360
# Number of label categories — presumably the width of the model's output layer; TODO confirm.
CATEGORIES_NUMBER = 5

In [4]:
# Load the held-out test set; the first row supplies the column names.
# Later cells read the "txt" and "category" columns from this frame.
df = pd.read_csv("./test-dataset.csv", header=0)

In [5]:
# Load the pretrained multilingual uncased BERT tokenizer.
# NOTE(review): presumably the same checkpoint the saved model was trained with — confirm.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")

In [6]:
def encode_texts(dataset_: pd.DataFrame, pretrained_tokenizer: PreTrainedTokenizerBase) -> Tuple[BatchEncoding, np.ndarray, LabelEncoder]:
    """Tokenize the texts of a dataset and one-hot encode its labels.

    Args:
        dataset_: Frame providing a "txt" column (input texts) and a
            "category" column (class labels).
        pretrained_tokenizer: Tokenizer applied to the "txt" column; every
            sequence is truncated/padded to MAX_LENGTH tokens.

    Returns:
        A tuple ``(encodings, one_hot_labels, label_encoder)``:
        the tokenizer output, a one-hot label matrix with one row per sample
        and CATEGORIES_NUMBER columns, and the LabelEncoder fitted on
        ``dataset_["category"]``.
    """
    _encodings = pretrained_tokenizer(
        dataset_["txt"].tolist(),
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    # NOTE(review): the encoder is re-fitted on this dataset's labels. Its
    # integer ids only agree with the ones used at training time if the same
    # set of categories is present here — verify, or reuse the training encoder.
    _label_encoder = LabelEncoder()
    encoded_labels = _label_encoder.fit_transform(dataset_["category"])

    # Fix the one-hot width explicitly. Without num_classes the width is
    # inferred as max(label id) + 1, so a dataset lacking the highest-id
    # category would yield a matrix narrower than the model's output.
    categorical_labels = to_categorical(encoded_labels, num_classes=CATEGORIES_NUMBER)

    return _encodings, categorical_labels, _label_encoder

In [7]:
# Tokenize the test texts and one-hot encode their categories; the fitted
# LabelEncoder is kept so label ids can be mapped back to category names.
encodings, labels, label_encoder = encode_texts(df, tokenizer)

In [8]:
# Load the saved classifier from disk. TFBertModel is registered through
# custom_objects — presumably because the saved model embeds that class and
# Keras must resolve it while deserializing; confirm against the training code.
model = models.load_model("./news-classification-model", custom_objects={"TFBertModel": TFBertModel})

In [9]:
def create_tf_dataset(
    encodings_: BatchEncoding,
    labels_: np.ndarray,
    batch_size: int = 1,
    shuffle_buffer_size: int = 1024,
) -> tf.data.Dataset:
    """Build a batched ``tf.data.Dataset`` of (model inputs, labels) pairs.

    Args:
        encodings_: Tokenizer output; its "input_ids" and "attention_mask"
            entries become the model's input dictionary.
        labels_: Label rows aligned one-to-one with the encoded samples.
        batch_size: Number of samples per batch. Defaults to 1, the value
            this notebook originally hard-coded.
        shuffle_buffer_size: Buffer size passed to ``Dataset.shuffle``.
            Defaults to 1024 (the original hard-coded value); pass 0 to keep
            the samples in their original order.

    Returns:
        Dataset yielding ``({"input_ids", "attention_mask"}, labels)`` batches.
    """
    input_dict = {
        "input_ids": encodings_["input_ids"],
        "attention_mask": encodings_["attention_mask"],
    }

    dataset_ = tf.data.Dataset.from_tensor_slices((input_dict, labels_))

    # Shuffling is optional so that evaluation can run in a fixed order.
    if shuffle_buffer_size > 0:
        dataset_ = dataset_.shuffle(shuffle_buffer_size)

    return dataset_.batch(batch_size)

In [10]:
# Wrap the encoded test samples and their one-hot labels in a tf.data pipeline for evaluation.
test_dataset = create_tf_dataset(encodings, labels)

In [11]:
# Evaluate the loaded model on the test pipeline.
# NOTE(review): unpacking two values assumes the model was compiled with
# exactly one metric (accuracy) in addition to the loss — confirm.
test_loss, test_accuracy = model.evaluate(test_dataset)

# Report the number of samples, not len(test_dataset): the latter counts
# batches and only matches the sample count while the batch size is 1.
print(f"The size of test dataset: {len(labels)}")
print(f"The loss of the model on the test dataset: {round(test_loss, 5)}")
print(f"The accuracy of the model on the test dataset: {round(test_accuracy*100, 4)}%")

The size of test dataset: 550
The loss of the model on the test dataset: 1.15759
The accuracy of the model on the test dataset: 76.3636%
