In [24]:
import keras
import pandas as pd
import tensorflow as tf
from keras import models, layers, optimizers, losses, metrics
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertForSequenceClassification, BatchEncoding, PreTrainedTokenizerBase

In [25]:
df = pd.read_csv("./data/training-data/news-for-training.csv", header=None)
df.columns = ["channel_name", "text", "date", "label", "sent"]

print("DATASET SHAPE BEFORE DROPPING NAN:", df.shape)
df = df.dropna()
print("DATASET SHAPE AFTER DROPPING NAN:", df.shape)

DATASET SHAPE BEFORE DROPPING NAN: (2861, 5)
DATASET SHAPE AFTER DROPPING NAN: (2858, 5)


In [26]:
df.head()

Unnamed: 0,channel_name,text,date,label,sent
0,экономика,большинство страна евросоюз согласовать заморо...,2022-12-13,Economical,positive
1,экономика,россия исключить список страна проект google п...,2022-12-13,Political,positive
2,экономика,глава еврокомиссия урсула фон дер ляйена надея...,2022-12-13,Economical,positive
3,экономика,банк фиксировать рост интерес россиянин к вкла...,2022-12-11,Economical,positive
4,экономика,понедельник декабрь соцсеть возвращать премиал...,2022-12-11,Humanitarian,positive


In [27]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [28]:
def encode_texts(dataset_: pd.DataFrame, pretrained_tokenizer: PreTrainedTokenizerBase) -> tuple[BatchEncoding, pd.Series, LabelEncoder]:
    _encodings = pretrained_tokenizer(dataset_["text"].tolist(), truncation=True, padding=True)

    _label_encoder = LabelEncoder()
    df["label"] = _label_encoder.fit_transform(dataset_["label"])

    return _encodings, df["label"].values, _label_encoder

In [29]:
encodings, labels, label_encoder = encode_texts(df, tokenizer)

In [30]:
def create_tf_dataset(encodings_, labels_) -> tf.data.Dataset:
    input_ids = encodings_["input_ids"]
    attention_mask = encodings_["attention_mask"]

    input_dict = {
        "input_ids": input_ids,
        "attention_mask": attention_mask
    }

    dataset_ = tf.data.Dataset.from_tensor_slices((input_dict, labels_))
    return dataset_.shuffle(1024).batch(32)

In [39]:
dataset = create_tf_dataset(encodings, labels)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, validation_dataset = dataset.take(train_size), dataset.skip(val_size)

In [48]:
def create_model() -> keras.Model:
    input_ids = layers.Input(shape=(512,), dtype=tf.int32, name="input_ids")
    attention_mask = layers.Input(shape=(512,), dtype=tf.int32, name="attention_mask")

    bert_model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
    bert_output = bert_model(input_ids, attention_mask=attention_mask).logits

    _model = models.Model(inputs=[input_ids, attention_mask], outputs=bert_output)
    _model.compile(optimizer=optimizers.Adam(learning_rate=1e-5), loss=losses.SparseCategoricalCrossentropy(from_logits=True), metrics=[metrics.Accuracy()])

    return _model

In [49]:
model = create_model()

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(name, **kwargs)


In [40]:
history = model.fit(train_dataset, validation_data=validation_dataset, epochs=3)

Epoch 1/3


2023-07-02 12:33:24.740745: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int64 and shape [2858]
	 [[{{node Placeholder/_2}}]]
2023-07-02 12:33:24.741380: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [2858,512]
	 [[{{node Placeholder/_1}}]]


ValueError: in user code:

    File "/home/kinfi4/python/envs/ds-env/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/home/kinfi4/python/envs/ds-env/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/kinfi4/python/envs/ds-env/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "/home/kinfi4/python/envs/ds-env/lib/python3.10/site-packages/keras/engine/training.py", line 1055, in train_step
        return self.compute_metrics(x, y, y_pred, sample_weight)
    File "/home/kinfi4/python/envs/ds-env/lib/python3.10/site-packages/keras/engine/training.py", line 1149, in compute_metrics
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "/home/kinfi4/python/envs/ds-env/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 605, in update_state
        metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "/home/kinfi4/python/envs/ds-env/lib/python3.10/site-packages/keras/utils/metrics_utils.py", line 77, in decorated
        update_op = update_state_fn(*args, **kwargs)
    File "/home/kinfi4/python/envs/ds-env/lib/python3.10/site-packages/keras/metrics/base_metric.py", line 140, in update_state_fn
        return ag_update_state(*args, **kwargs)
    File "/home/kinfi4/python/envs/ds-env/lib/python3.10/site-packages/keras/metrics/base_metric.py", line 691, in update_state  **
        matches = ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/kinfi4/python/envs/ds-env/lib/python3.10/site-packages/keras/metrics/accuracy_metrics.py", line 361, in accuracy  **
        y_true.shape.assert_is_compatible_with(y_pred.shape)

    ValueError: Shapes (None, 1) and (None, 4) are incompatible


In [58]:
for e in train_dataset.take(1):
    print(e[0]["input_ids"].shape)

(32, 512)


2023-07-02 12:39:01.803813: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int64 and shape [2858]
	 [[{{node Placeholder/_2}}]]
2023-07-02 12:39:01.804763: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int64 and shape [2858]
	 [[{{node Placeholder/_2}}]]
