In [29]:
import keras
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import models, layers, optimizers, losses, metrics
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertForSequenceClassification, BatchEncoding, PreTrainedTokenizerBase

# Raise TensorFlow's Python logger threshold to ERROR so lower-severity
# messages from that logger do not clutter the cell outputs.
tf.get_logger().setLevel('ERROR')

In [47]:
# Load the header-less training CSV and attach explicit column names.
raw_news = pd.read_csv("./data/training-data/news-for-training.csv", header=None)
raw_news.columns = ["channel_name", "text", "date", "label", "sent"]
print("DATASET SHAPE BEFORE DROPPING NAN:", raw_news.shape)

# Keep only rows without missing values; `df` is the frame used by later cells.
df = raw_news.dropna()
print("DATASET SHAPE AFTER DROPPING NAN:", df.shape)

DATASET SHAPE BEFORE DROPPING NAN: (2861, 5)
DATASET SHAPE AFTER DROPPING NAN: (2858, 5)


In [48]:
# Preview the first rows to sanity-check the manually assigned column names.
df.head()

Unnamed: 0,channel_name,text,date,label,sent
0,экономика,большинство страна евросоюз согласовать заморо...,2022-12-13,Economical,positive
1,экономика,россия исключить список страна проект google п...,2022-12-13,Political,positive
2,экономика,глава еврокомиссия урсула фон дер ляйена надея...,2022-12-13,Economical,positive
3,экономика,банк фиксировать рост интерес россиянин к вкла...,2022-12-11,Economical,positive
4,экономика,понедельник декабрь соцсеть возвращать премиал...,2022-12-11,Humanitarian,positive


In [49]:
# Tokenizer for the same checkpoint name that create_model() loads; the two
# must stay in sync because the model's embedding table is indexed by the
# tokenizer's vocabulary ids.
# NOTE(review): the previewed rows contain Russian text, while
# "bert-base-uncased" is an English checkpoint — consider switching BOTH this
# cell and create_model() to a multilingual/Russian checkpoint. TODO confirm.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [50]:
def encode_texts(
    dataset_: pd.DataFrame,
    pretrained_tokenizer: PreTrainedTokenizerBase,
    max_length: int = 512,
) -> tuple[BatchEncoding, np.ndarray, LabelEncoder]:
    """Tokenize the ``text`` column and one-hot encode the ``label`` column.

    Every sequence is truncated or padded to exactly ``max_length`` tokens.
    Padding only to the longest sample (``padding=True``) yields a width that
    depends on the data and is not guaranteed to match the fixed
    ``Input(shape=(512,))`` layers that ``create_model`` declares.

    Args:
        dataset_: Frame with a ``text`` column (strings) and a ``label`` column.
        pretrained_tokenizer: Tokenizer applied to the list of texts.
        max_length: Fixed token length of every encoded sequence.

    Returns:
        A tuple ``(encodings, one_hot_labels, label_encoder)`` where
        ``one_hot_labels`` is the array produced by ``to_categorical`` and
        ``label_encoder`` is the fitted encoder, usable to map class indices
        back to the original label values.
    """
    _encodings = pretrained_tokenizer(
        dataset_["text"].tolist(),
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # String labels -> integer class ids -> one-hot rows.
    _label_encoder = LabelEncoder()
    encoded_labels = _label_encoder.fit_transform(dataset_["label"])
    categorical_labels = to_categorical(encoded_labels)

    return _encodings, categorical_labels, _label_encoder

In [51]:
# Tokenize the cleaned frame; keep the fitted label encoder so class indices
# can be mapped back to label names later.
encodings, labels, label_encoder = encode_texts(df, tokenizer)

In [54]:
def create_tf_dataset(
    encodings_: BatchEncoding,
    labels_: np.ndarray,
    batch_size: int = 32,
    seed: int = 42,
) -> tf.data.Dataset:
    """Build a shuffled, batched ``tf.data.Dataset`` of (features, labels).

    The shuffle uses a buffer covering the whole dataset and a fixed order
    across iterations (``reshuffle_each_iteration=False``). A stable order is
    required when the result is later partitioned with ``take``/``skip``:
    if the order changed on every pass, the partitions would contain different
    examples each epoch and validation samples would leak into training.

    Args:
        encodings_: Tokenizer output providing ``input_ids`` and
            ``attention_mask`` of equal, fixed length per sample.
        labels_: Per-sample labels aligned with the encodings (first axis is
            the sample axis).
        batch_size: Number of samples per batch.
        seed: Seed of the one-off shuffle, for reproducible ordering.

    Returns:
        Dataset yielding ``({"input_ids", "attention_mask"}, labels)`` batches.
    """
    features = {
        "input_ids": encodings_["input_ids"],
        "attention_mask": encodings_["attention_mask"],
    }

    dataset_ = tf.data.Dataset.from_tensor_slices((features, labels_))

    # A buffer as large as the dataset gives a uniform shuffle; shuffle()
    # requires a positive buffer size, hence the lower bound of 1.
    buffer_size = max(len(labels_), 1)
    shuffled = dataset_.shuffle(buffer_size, seed=seed, reshuffle_each_iteration=False)
    return shuffled.batch(batch_size)

In [55]:
dataset = create_tf_dataset(encodings, labels)

# The dataset is already batched, so len(dataset) counts batches, not samples.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Training takes the first `train_size` batches; validation must skip exactly
# those batches. Skipping only `val_size` would leave most of the training
# batches inside the validation set.
train_dataset = dataset.take(train_size)
validation_dataset = dataset.skip(train_size)

In [56]:
def create_model(num_labels: int = 4, max_length: int = 512) -> keras.Model:
    """Build and compile a BERT sequence classifier wrapped in a Keras model.

    The model takes ``input_ids`` and ``attention_mask`` tensors of width
    ``max_length`` and outputs raw logits of width ``num_labels``.

    The loss is ``CategoricalCrossentropy(from_logits=True)`` because the
    labels in this notebook are one-hot encoded with ``to_categorical``; the
    sparse variant expects integer class ids instead. ``CategoricalAccuracy``
    compares the arg-max of the logits with the arg-max of the one-hot label,
    whereas plain ``Accuracy`` tests element-wise equality between raw logits
    and labels, which is not a meaningful classification score.

    Args:
        num_labels: Number of target classes (size of the logits vector).
        max_length: Fixed token length of each input sequence.

    Returns:
        The compiled Keras model.
    """
    input_ids = layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    attention_mask = layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

    bert_model = TFBertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=num_labels,
        classifier_dropout=0.2,
    )
    bert_output = bert_model(input_ids, attention_mask=attention_mask).logits

    _model = models.Model(inputs=[input_ids, attention_mask], outputs=bert_output)
    _model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-5),
        loss=losses.CategoricalCrossentropy(from_logits=True),
        # Named "accuracy" so the keys in the training history stay the same.
        metrics=[metrics.CategoricalAccuracy(name="accuracy")],
    )

    return _model

In [57]:
# Instantiate the compiled classifier; this loads the pretrained checkpoint.
model = create_model()

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune for 3 epochs, passing the held-out batches as validation data;
# the returned History object is kept for later inspection.
history = model.fit(train_dataset, validation_data=validation_dataset, epochs=3)

Epoch 1/3


2023-07-12 08:39:06.493756: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype float and shape [2858,4]
	 [[{{node Placeholder/_2}}]]
2023-07-12 08:39:06.494034: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype float and shape [2858,4]
	 [[{{node Placeholder/_2}}]]
