In [2]:
# Import libraries
import tensorflow as tf
import pandas as pd
from transformers import BertTokenizer, TFBertForSequenceClassification, InputExample, InputFeatures
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load dataset
data = pd.read_csv('/kaggle/input/suicide-ideation-clean/clean_text.csv')  # Adjust this to the path of your dataset

# 'clean_text' is the column with the text data and 'class' is the column with labels.
# Rows missing either value are dropped first: astype(str) would otherwise turn a
# missing text cell into the literal string "nan", which would then be tokenized
# and trained on as if it were a real document.
data_valid = data.dropna(subset=['clean_text', 'class'])
X = data_valid['clean_text'].astype(str)
y = data_valid['class']

# Encode labels as consecutive integers (0 .. n_classes - 1)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset into training and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize texts and wrap them, together with their labels, in a tf.data.Dataset
def convert_examples_to_tf_dataset(examples, labels, tokenizer, max_length=128):
    """Tokenize ``examples`` and return an unbatched ``tf.data.Dataset``.

    Args:
        examples: Iterable of text strings to tokenize.
        labels: Iterable of integer class ids, paired positionally with
            ``examples``.
        tokenizer: Tokenizer object that is called once per example with
            ``padding='max_length'`` and ``truncation=True`` and must return a
            mapping containing ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A dataset whose elements are
        ``({'input_ids': int32[max_length], 'attention_mask': int32[max_length]}, int64 label)``.
        The caller is responsible for shuffling and batching.
    """
    # Tokenize everything up front so the generator below only replays
    # already-encoded features on each pass over the dataset.
    features = []
    for example, label in zip(examples, labels):
        encoding = tokenizer(
            example,
            add_special_tokens=True,
            max_length=max_length,  # Max length of the text that can go to BERT
            padding='max_length',  # Add [PAD] tokens (replaces deprecated pad_to_max_length)
            truncation=True,
            return_attention_mask=True,
        )
        features.append((encoding['input_ids'], encoding['attention_mask'], label))

    def gen():
        """Yield one (model inputs, label) pair per tokenized example."""
        for input_ids, attention_mask, label in features:
            yield (
                {
                    "input_ids": input_ids,
                    "attention_mask": attention_mask,
                },
                label,
            )

    # output_signature supersedes the deprecated output_types/output_shapes pair.
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                'input_ids': tf.TensorSpec(shape=(max_length,), dtype=tf.int32),
                'attention_mask': tf.TensorSpec(shape=(max_length,), dtype=tf.int32),
            },
            tf.TensorSpec(shape=(), dtype=tf.int64),
        ),
    )

# Prepare the datasets
train_dataset = convert_examples_to_tf_dataset(list(X_train), list(y_train), tokenizer)
test_dataset = convert_examples_to_tf_dataset(list(X_test), list(y_test), tokenizer)

# Determine the number of labels
num_labels = len(label_encoder.classes_)

# Load pre-trained BERT model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# from_logits=True because predictions are read from the model's `.logits` output
# and argmax'ed later, i.e. the model is treated as emitting raw scores.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Fine-tune the model
# - The shuffle buffer spans the whole training set so every epoch sees a
#   genuinely new order; a small buffer only perturbs the order locally.
# - batch_size is not passed to fit(): the dataset is already batched, and
#   Keras documents that batch_size must not be specified for dataset inputs.
# NOTE(review): the test split is also used as validation data here, so the
# test metrics below are not from a split the training loop never saw.
model.fit(
    train_dataset.shuffle(len(X_train)).batch(32),
    epochs=3,
    validation_data=test_dataset.batch(32),
)

# Evaluate the model
result = model.evaluate(test_dataset.batch(32), return_dict=True)
print(f"Test loss: {result['loss']}")
print(f"Test accuracy: {result['accuracy']}")

# Save the model
model.save_pretrained("/kaggle/working/bert_finetuned_classification")

# To load the model later
# model = TFBertForSequenceClassification.from_pretrained("/kaggle/working/bert_finetuned_classification")


2024-04-07 20:37:59.370062: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-07 20:37:59.370191: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-07 20:37:59.516767: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Cause: for/else statement not yet supported


I0000 00:00:1712522954.936151      93 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/3
Epoch 3/3
Test loss: 0.14560937881469727
Test accuracy: 0.9551222920417786


In [3]:
!pip install --upgrade transformers



In [4]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
import numpy as np

# Run the fine-tuned model over the batched test split and pick, for every
# example, the class whose logit is largest.
preds = model.predict(test_dataset.batch(32), verbose=1)
pred_labels = np.argmax(preds.logits, axis=1)

# Ground-truth labels in the same order the test dataset was built from
true_labels = [label for label in y_test]

# Confusion matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_true=true_labels, y_pred=pred_labels)
print("Confusion Matrix:\n", conf_matrix)

# Overall accuracy, reported as a percentage
accuracy = accuracy_score(y_true=true_labels, y_pred=pred_labels)
print("Accuracy: {:.2f}%".format(100 * accuracy))

# The remaining scores use binary averaging, i.e. they describe the positive class only.
binary_average = {"average": "binary"}

# Recall
recall = recall_score(y_true=true_labels, y_pred=pred_labels, **binary_average)
print("Recall: {:.2f}".format(recall))

# Precision
precision = precision_score(y_true=true_labels, y_pred=pred_labels, **binary_average)
print("Precision: {:.2f}".format(precision))

# F1-score
f1 = f1_score(y_true=true_labels, y_pred=pred_labels, **binary_average)
print("F1-score: {:.2f}".format(f1))

Confusion Matrix:
 [[22231  1056]
 [ 1027 22101]]
Accuracy: 95.51%
Recall: 0.96
Precision: 0.95
F1-score: 0.95
