In [1]:
from typing import Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import models, layers, optimizers, losses
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BatchEncoding, PreTrainedTokenizerBase

tf.get_logger().setLevel('ERROR')

In [2]:
# Fixed token length: every text is padded/truncated to this many tokens by the
# tokenizer, and the model's input layers are declared with the same length.
MAX_LENGTH = 360
# Number of units in the model's final Dense (logits) layer.
# NOTE(review): presumably equals the number of distinct values in the
# "category" column -- confirm against the fitted label encoder.
CATEGORIES_NUMBER = 5

In [3]:
# Load the train/test splits; the first CSV row is treated as the header.
# encode_texts reads the "txt" (input text) and "category" (label) columns.
df_train = pd.read_csv("./train-dataset.csv", header=0)
df_test = pd.read_csv("./test-dataset.csv", header=0)

In [4]:
# Tokenizer for the same checkpoint name that create_model loads as the encoder.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")

In [9]:
def encode_texts(
    dataset_: pd.DataFrame,
    pretrained_tokenizer: PreTrainedTokenizerBase,
    label_encoder_: LabelEncoder,
) -> Tuple[BatchEncoding, np.ndarray]:
    """Tokenize the ``txt`` column and one-hot encode the ``category`` column.

    Args:
        dataset_: Frame with a ``txt`` column (texts) and a ``category`` column (labels).
        pretrained_tokenizer: Tokenizer applied to the texts; every sequence is
            truncated/padded to ``MAX_LENGTH`` tokens.
        label_encoder_: Already-fitted encoder mapping category values to integers.

    Returns:
        A ``(encodings, labels)`` pair: the tokenizer output and a one-hot label
        matrix with one column per class known to ``label_encoder_``.
    """
    _encodings = pretrained_tokenizer(
        dataset_["txt"].tolist(),
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    encoded_labels = label_encoder_.transform(dataset_["category"])
    # Pin the one-hot width to the encoder's class count. Without num_classes the
    # width is inferred from the largest label present, so a split that lacks the
    # highest class would yield a narrower matrix than the model's output.
    categorical_labels = to_categorical(
        encoded_labels,
        num_classes=len(label_encoder_.classes_),
    )

    return _encodings, categorical_labels

In [10]:
# Learn the category <-> integer mapping from the training labels only;
# fit() hands back the fitted encoder, so construction and fitting chain.
label_encoder = LabelEncoder().fit(df_train["category"])

In [11]:
# Tokenize both splits and one-hot encode their labels with the encoder that
# was fitted on the training categories.
encodings_train, labels_train = encode_texts(df_train, tokenizer, label_encoder)
encodings_test, labels_test = encode_texts(df_test, tokenizer, label_encoder)

In [12]:
def create_tf_dataset(
    encodings_: BatchEncoding,
    labels_: np.ndarray,
    batch_size: int = 1,
    shuffle_buffer: int = 1024,
) -> tf.data.Dataset:
    """Build a batched ``tf.data.Dataset`` of ``(inputs, labels)`` pairs.

    Args:
        encodings_: Tokenizer output; its ``input_ids`` and ``attention_mask``
            entries become the model inputs, keyed by those same names.
        labels_: Label array aligned row-by-row with the encodings.
        batch_size: Number of examples per batch. Defaults to 1, the value that
            was previously hard-coded.
        shuffle_buffer: Size of the shuffle buffer. Defaults to 1024, the value
            that was previously hard-coded. Pass 0 to skip shuffling, e.g. for
            an evaluation split.

    Returns:
        The (optionally shuffled) dataset, batched with ``batch_size``.
    """
    input_dict = {
        "input_ids": encodings_["input_ids"],
        "attention_mask": encodings_["attention_mask"],
    }

    dataset_ = tf.data.Dataset.from_tensor_slices((input_dict, labels_))
    if shuffle_buffer > 0:
        dataset_ = dataset_.shuffle(shuffle_buffer)
    return dataset_.batch(batch_size)

In [13]:
# Wrap both splits as tf.data pipelines.
# NOTE: called this way, create_tf_dataset shuffles and uses batch size 1 for
# both datasets, including the test split.
train_dataset = create_tf_dataset(encodings_train, labels_train)
test_dataset = create_tf_dataset(encodings_test, labels_test)

In [14]:
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, losses
from transformers import TFBertModel


def create_model() -> tf.keras.Model:
    """Build and compile the BERT-based text classifier.

    The model takes ``input_ids`` and ``attention_mask`` tensors of length
    ``MAX_LENGTH``, runs them through the pretrained multilingual BERT encoder,
    takes the hidden state at the first token position, and feeds it through
    dropout, a 128-unit ReLU layer and a final linear layer with
    ``CATEGORIES_NUMBER`` outputs.

    Returns:
        A compiled Keras model that outputs raw logits (no softmax); the loss is
        categorical cross-entropy configured with ``from_logits=True``.
    """
    input_ids = layers.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="input_ids")
    attention_mask = layers.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="attention_mask")

    # TFBertModel is the bare encoder without a classification head, so a
    # num_labels argument has no effect on it; the head is defined below.
    bert_model = TFBertModel.from_pretrained("bert-base-multilingual-uncased")

    sequence_output = bert_model(input_ids, attention_mask=attention_mask).last_hidden_state
    # Hidden state of the first token of every sequence.
    cls_token_state = sequence_output[:, 0, :]
    dropout = layers.Dropout(0.3)(cls_token_state)

    dense = layers.Dense(128, activation='relu')(dropout)

    # No activation: the loss below expects logits.
    logits = layers.Dense(CATEGORIES_NUMBER, activation=None)(dense)

    _model = models.Model(inputs=[input_ids, attention_mask], outputs=logits)

    _model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-5),
        loss=losses.CategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )

    return _model

In [15]:
# Build and compile the classifier; this loads the pretrained BERT weights.
model = create_model()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [16]:
# Number of full passes over the training dataset.
num_epochs = 8

# NOTE(review): the test split is passed as validation data, so the per-epoch
# validation metrics are computed on the same data that the later error
# analysis iterates over.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=num_epochs,
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


# Saving the model  (97.28%)

In [None]:
# Persist the trained model under the relative path "news-classification-model".
# NOTE(review): the on-disk format depends on the installed Keras version -- confirm.
model.save("news-classification-model")

# Analyzing errors

In [30]:
def decode_prediction(input_ids, attention_mask):
    """Classify one tokenized example and return its text and predicted label.

    Args:
        input_ids: Token ids of a single example (no batch axis).
        attention_mask: Attention mask belonging to ``input_ids``.

    Returns:
        A ``(text, label)`` pair: the text decoded from ``input_ids`` with
        special tokens skipped, and the predicted category decoded through the
        global ``label_encoder``.
    """
    decoded_text = tokenizer.decode(input_ids.numpy(), skip_special_tokens=True)

    # The model expects a batch axis, so wrap the single example in a batch of one.
    batch_of_one = {
        "input_ids": tf.expand_dims(input_ids, 0),
        "attention_mask": tf.expand_dims(attention_mask, 0),
    }
    scores = model.predict(batch_of_one, verbose=False)

    best_class = np.argmax(scores, axis=1)[0]
    decoded_label = label_encoder.inverse_transform([best_class])[0]

    return decoded_text, decoded_label

In [50]:
# Count and print every test example whose predicted category differs from
# its true category.
wrong_counter = 0

# unbatch() yields one example at a time whatever the dataset's batch size is,
# so no example is skipped (indexing [0] per batch only sees the first one).
for input_dict, label in test_dataset.unbatch():
    input_ids = input_dict['input_ids']
    attention_mask = input_dict['attention_mask']

    # The label is a one-hot vector; its argmax is the encoded class index.
    true_label_one_hot = label.numpy()
    true_label_text = label_encoder.inverse_transform([true_label_one_hot.argmax()])[0]

    text, predicted_label_text = decode_prediction(input_ids, attention_mask)

    if true_label_text != predicted_label_text:
        wrong_counter += 1
        print(f"Text: {text}")
        print(f"True Label: {true_label_text}, Predicted Label: {predicted_label_text}\n\n\n")


print("TOTAL WRONG COUNTER:", wrong_counter)

Text: путин поручил минобороны до 1 февраля доложить об обеспечении участников спецоперации вооружением, техникои, материальными средствами и экипировкои
True Label: Economical, Predicted Label: Political


Text: залишилось ще 2 дні наибільших знижок липня від « дебету - кредиту »! передплата 2023 - 65 % замовляите прямо зараз
True Label: Other, Predicted Label: Crisis


Text: в украіні продовжено карантин 19 серпня кму продовжив дію режиму надзвичаиноі ситуаціі та карантину до 31 грудня 2022 р. на всіи територіі украіни. час читання : 2 хвилини діліться посиланням на наш канал з друзями і колегами ; він буде корисним і для них теж :
True Label: Other, Predicted Label: Crisis


Text: сводка минобороны россии
True Label: Crisis, Predicted Label: Political


Text: министр обороны рф сергеи шоигу провел в штабе объединеннои группировки россииских воиск совещание по вопросам обеспечения боеприпасами.
True Label: Crisis, Predicted Label: Political


Text: в крупнеишем городе новои зеландии 

In [90]:
# One of the misclassified texts printed by the error-analysis cell
# (true label "Other", predicted "Crisis"), re-checked in isolation below.
string = "залишилось ще 2 дні наибільших знижок липня від « дебету - кредиту »! передплата 2023 - 65 % замовляите прямо зараз"

In [91]:
# Tokenize the single text with the same padding/truncation settings used in
# encode_texts, asking for TensorFlow tensors.
inputs = tokenizer([string], return_tensors="tf", padding="max_length", truncation=True, max_length=MAX_LENGTH)

In [92]:
# Take the only row of the one-text batch produced by the tokenizer.
input_ids = inputs["input_ids"][0]
attention_mask = inputs['attention_mask'][0]

In [93]:
# Re-add the batch axis and run the model on the single example; this repeats
# inline what decode_prediction does for its model call.
prediction = model.predict(
    {
        "input_ids": tf.expand_dims(input_ids, 0),
        "attention_mask": tf.expand_dims(attention_mask, 0),
    },
    verbose=True,
)



In [95]:
# argmax without an axis works on the flattened array; prediction comes from a
# batch of one here, so the flat index is the class index.
predict_idx = np.argmax(prediction)

# Map the class index back to its category name (shown as the cell output).
label_encoder.inverse_transform([predict_idx])[0]

'Crisis'