In [59]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

In [60]:
# Tiny hand-written sentiment corpus: (review text, label) pairs,
# where label 1 marks a positive review and label 0 a negative one.
reviews = [
    ('I love this movie, it is fantastic!', 1),
    ('This film was terrible, I hated it.', 0),
    ('An excellent movie with a great plot.', 1),
    ('The worst movie I have ever seen.', 0),
    ('Absolutely wonderful and thrilling!', 1),
    ('Not good, I will not recommend it.', 0),
    ('An outstanding performance by the actors.', 1),
    ('A dull and boring movie.', 0),
    ('A masterpiece of cinematography.', 1),
    ('A complete waste of time.', 0),
]

# Column-oriented view of the same records, then the DataFrame built from it.
data = {
    'text': [review_text for review_text, _ in reviews],
    'label': [review_label for _, review_label in reviews],
}

df = pd.DataFrame(data)

In [61]:
# Split dataset into training and test sets
# The test split is only two rows (20% of ten), so stratify on the label:
# an unstratified draw can put two reviews of the same class in the test set,
# which leaves binary precision/recall undefined for the missing class.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
    stratify=df['label'],
)

In [65]:
# Load the tokenizer that belongs to the pre-trained checkpoint
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)

In [82]:
# Tokenization helper
def tokenize_function(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    The tokenizer is called with padding and truncation enabled and asked to
    return NumPy tensors; its result is returned unchanged.
    """
    encode_options = dict(padding=True, truncation=True, return_tensors='np')
    return tokenizer(texts, **encode_options)

In [67]:
# Encode both splits, converting each pandas Series to a plain list first
train_encodings, test_encodings = (
    tokenize_function(split_texts.tolist())
    for split_texts in (train_texts, test_texts)
)

In [83]:
# Wrap the encoded splits as TensorFlow datasets of (features, label) pairs
def _as_tf_dataset(encodings, labels):
    """Slice every encoded field together with its label into a tf.data.Dataset."""
    features = dict(encodings)  # plain dict holding each field of the encoding
    return tf.data.Dataset.from_tensor_slices((features, labels.values))


train_dataset = _as_tf_dataset(train_encodings, train_labels)

test_dataset = _as_tf_dataset(test_encodings, test_labels)

In [84]:
# Shuffle the training split, then batch and prefetch both splits.
# NOTE: this cell rebinds train_dataset/test_dataset from themselves, so
# re-running it batches the already-batched datasets again; re-run the cell
# that builds the datasets first.
batch_size = 4
autotune = tf.data.experimental.AUTOTUNE

train_dataset = (
    train_dataset
    .shuffle(1000)
    .batch(batch_size)
    .prefetch(autotune)
)
test_dataset = (
    test_dataset
    .batch(batch_size)
    .prefetch(autotune)
)

In [85]:
# Load the pre-trained BERT checkpoint with a two-label classification head
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [86]:
# Compile model
# NOTE(review): this cell recorded "RecursionError: maximum recursion depth
# exceeded in comparison". That matches reports of the newer Keras optimizer
# implementation clashing with Transformers' TF models -- confirm against the
# installed TensorFlow/Transformers versions. Prefer the legacy Adam when this
# TensorFlow build provides a working one, and otherwise fall back to the
# optimizer the notebook originally used.
try:
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=2e-5)
except (AttributeError, ImportError):
    # No `legacy` namespace, or it is only a stub that refuses to instantiate.
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

model.compile(
    optimizer=optimizer,
    # from_logits=True: the loss receives raw logits rather than probabilities.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

RecursionError: maximum recursion depth exceeded in comparison

In [72]:
# Fine-tune the model; the held-out split is passed as validation data
num_epochs = 3
model.fit(
    train_dataset,
    epochs=num_epochs,
    validation_data=test_dataset,
)

RuntimeError: You must compile your model before training/testing. Use `model.compile(optimizer, loss)`.

In [39]:
# Predictions and metrics computation
# Predicted class = index of the largest logit for each test example.
logits = model.predict(test_dataset).logits
y_pred = np.argmax(logits, axis=1)
# Collect the true labels batch by batch from the same dataset that was
# passed to predict().
y_true = np.concatenate([batch_labels for _, batch_labels in test_dataset], axis=0)

# zero_division=0: on a test split this small the model may predict no
# positives (or the split may contain none), leaving precision/recall
# undefined; report 0 explicitly instead of emitting UndefinedMetricWarning.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average='binary', zero_division=0
)
print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

NameError: name 'trainer' is not defined